From e510691c36ea78e319ad9eb2f2ab89c91320901a Mon Sep 17 00:00:00 2001 From: Robert Beckebans Date: Fri, 24 May 2013 19:27:18 +0200 Subject: [PATCH 1/6] Changed model loader to not generate binary files for defaulted models. closes #39 --- neo/renderer/ModelManager.cpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/neo/renderer/ModelManager.cpp b/neo/renderer/ModelManager.cpp index 802d4770..f9398532 100644 --- a/neo/renderer/ModelManager.cpp +++ b/neo/renderer/ModelManager.cpp @@ -378,9 +378,14 @@ idRenderModel* idRenderModelManagerLocal::GetModel( const char* _modelName, bool { model->InitFromFile( canonical ); - idFileLocal outputFile( fileSystem->OpenFileWrite( generatedFileName, "fs_basepath" ) ); - idLib::Printf( "Writing %s\n", generatedFileName.c_str() ); - model->WriteBinaryModel( outputFile ); + // RB: default models shouldn't be cached as binary models + if( !model->IsDefaultModel() ) + { + idFileLocal outputFile( fileSystem->OpenFileWrite( generatedFileName, "fs_basepath" ) ); + idLib::Printf( "Writing %s\n", generatedFileName.c_str() ); + model->WriteBinaryModel( outputFile ); + } + // RB end } /* else { idLib::Printf( "loaded binary model %s from file %s\n", model->Name(), generatedFileName.c_str() ); } */ From db715535ccb3f324fd6e815fb956a34b7f67d401 Mon Sep 17 00:00:00 2001 From: Robert Beckebans Date: Sat, 1 Jun 2013 15:13:00 +0200 Subject: [PATCH 2/6] Merged generic C++ fallbacks for SSE optimized code to allow support for non-x86 based platforms like ARM --- neo/astyle-code.bat | 4 +- neo/astyle-code.sh | 4 +- neo/framework/FileSystem.cpp | 6 + neo/idlib/geometry/DrawVert.h | 29 + neo/idlib/geometry/DrawVert_intrinsics.h | 9 +- neo/idlib/geometry/RenderMatrix.cpp | 1370 ++++++++++++++++- neo/idlib/math/Lcp.cpp | 160 +- neo/idlib/math/MatX.cpp | 17 + neo/idlib/math/MatX.h | 3 + neo/idlib/math/Math.cpp | 2 + neo/idlib/math/Math.h | 60 +- neo/idlib/math/Simd.cpp | 722 +++++---- neo/idlib/math/Simd_SSE.cpp | 3 + 
neo/idlib/math/Simd_SSE.h | 5 + neo/idlib/math/VecX.h | 3 + neo/idlib/math/Vector.h | 5 + neo/idlib/sys/sys_intrinsics.h | 32 +- neo/renderer/BufferObject.cpp | 11 + neo/renderer/DXT/DXTCodec.h | 21 + neo/renderer/DXT/DXTEncoder_SSE2.cpp | 2 + neo/renderer/GLMatrix.cpp | 37 +- neo/renderer/ModelDecal.cpp | 60 +- neo/renderer/ModelOverlay.cpp | 100 +- neo/renderer/Model_md5.cpp | 25 +- neo/renderer/jobs/ShadowShared.cpp | 52 +- .../DynamicShadowVolume.cpp | 233 ++- neo/renderer/tr_trace.cpp | 105 +- neo/renderer/tr_trisurf.cpp | 4 +- 28 files changed, 2747 insertions(+), 337 deletions(-) diff --git a/neo/astyle-code.bat b/neo/astyle-code.bat index 9a7dc029..4da3086a 100644 --- a/neo/astyle-code.bat +++ b/neo/astyle-code.bat @@ -1,4 +1,4 @@ -astyle.exe -v --options=astyle-options.ini --exclude="libs" --recursive *.h -astyle.exe -v --options=astyle-options.ini --exclude="libs" --exclude="idlib/math/Simd.cpp" --exclude="d3xp/gamesys/SysCvar.cpp" --exclude="d3xp/gamesys/Callbacks.cpp" --exclude="sys/win32/win_cpu.cpp" --exclude="sys/win32/win_main.cpp" --recursive *.cpp +astyle.exe -v --formatted --options=astyle-options.ini --exclude="libs" --recursive *.h +astyle.exe -v --formatted --options=astyle-options.ini --exclude="libs" --exclude="d3xp/gamesys/SysCvar.cpp" --exclude="d3xp/gamesys/Callbacks.cpp" --exclude="sys/win32/win_cpu.cpp" --exclude="sys/win32/win_main.cpp" --recursive *.cpp pause \ No newline at end of file diff --git a/neo/astyle-code.sh b/neo/astyle-code.sh index 6b3919e3..347492b9 100755 --- a/neo/astyle-code.sh +++ b/neo/astyle-code.sh @@ -1,5 +1,5 @@ #!/bin/sh -./astyle.exe -v --options=astyle-options.ini --exclude="libs" --recursive *.h -./astyle.exe -v --options=astyle-options.ini --exclude="libs" --exclude="idlib/math/Simd.cpp" --exclude="d3xp/gamesys/SysCvar.cpp" --exclude="d3xp/gamesys/Callbacks.cpp" --exclude="sys/win32/win_cpu.cpp" --exclude="sys/win32/win_main.cpp" --recursive *.cpp +./astyle.exe -v --formatted --options=astyle-options.ini 
--exclude="libs" --recursive *.h +./astyle.exe -v --formatted --options=astyle-options.ini --exclude="libs" --exclude="d3xp/gamesys/SysCvar.cpp" --exclude="d3xp/gamesys/Callbacks.cpp" --exclude="sys/win32/win_cpu.cpp" --exclude="sys/win32/win_main.cpp" --recursive *.cpp #pause \ No newline at end of file diff --git a/neo/framework/FileSystem.cpp b/neo/framework/FileSystem.cpp index c51bd745..64b132db 100644 --- a/neo/framework/FileSystem.cpp +++ b/neo/framework/FileSystem.cpp @@ -301,7 +301,13 @@ idCVar idFileSystemLocal::fs_debugResources( "fs_debugResources", "0", CVAR_SYST idCVar idFileSystemLocal::fs_enableBGL( "fs_enableBGL", "0", CVAR_SYSTEM | CVAR_BOOL, "" ); idCVar idFileSystemLocal::fs_debugBGL( "fs_debugBGL", "0", CVAR_SYSTEM | CVAR_BOOL, "" ); idCVar idFileSystemLocal::fs_copyfiles( "fs_copyfiles", "0", CVAR_SYSTEM | CVAR_INIT | CVAR_BOOL, "Copy every file touched to fs_savepath" ); +// RB +#if defined(RETAIL) idCVar idFileSystemLocal::fs_buildResources( "fs_buildresources", "0", CVAR_SYSTEM | CVAR_BOOL | CVAR_INIT, "Copy every file touched to a resource file" ); +#else +idCVar idFileSystemLocal::fs_buildResources( "fs_buildresources", "1", CVAR_SYSTEM | CVAR_BOOL | CVAR_INIT, "Copy every file touched to a resource file" ); +#endif +// RB end idCVar idFileSystemLocal::fs_game( "fs_game", "", CVAR_SYSTEM | CVAR_INIT | CVAR_SERVERINFO, "mod path" ); idCVar idFileSystemLocal::fs_game_base( "fs_game_base", "", CVAR_SYSTEM | CVAR_INIT | CVAR_SERVERINFO, "alternate mod path, searched after the main fs_game path, before the basedir" ); diff --git a/neo/idlib/geometry/DrawVert.h b/neo/idlib/geometry/DrawVert.h index 6e8a85b0..1169d566 100644 --- a/neo/idlib/geometry/DrawVert.h +++ b/neo/idlib/geometry/DrawVert.h @@ -3,6 +3,7 @@ Doom 3 BFG Edition GPL Source Code Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company. +Copyright (C) 2013 Robert Beckebans This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code"). 
@@ -113,11 +114,20 @@ ID_INLINE halfFloat_t F32toF16( float a ) class idDrawVert { + friend class idSwap; + friend class idShadowVertSkinned; + friend class idRenderModelStatic; + + friend void TransformVertsAndTangents( idDrawVert* targetVerts, const int numVerts, const idDrawVert* baseVerts, const idJointMat* joints ); + public: idVec3 xyz; // 12 bytes +private: + // RB: don't let the old tools code mess with these values halfFloat_t st[2]; // 4 bytes byte normal[4]; // 4 bytes byte tangent[4]; // 4 bytes -- [3] is texture polarity sign +public: byte color[4]; // 4 bytes byte color2[4]; // 4 bytes -- weights for skinning @@ -187,9 +197,14 @@ public: #define DRAWVERT_COLOR_OFFSET (6*4) #define DRAWVERT_COLOR2_OFFSET (7*4) +// RB begin +assert_sizeof( idDrawVert, DRAWVERT_SIZE ); +#if 0 assert_offsetof( idDrawVert, xyz, DRAWVERT_XYZ_OFFSET ); assert_offsetof( idDrawVert, normal, DRAWVERT_NORMAL_OFFSET ); assert_offsetof( idDrawVert, tangent, DRAWVERT_TANGENT_OFFSET ); +#endif +// RB end /* ======================== @@ -202,6 +217,7 @@ ID_INLINE void VertexFloatToByte( const float& x, const float& y, const float& z { assert_4_byte_aligned( bval ); // for __stvebx +#if defined(USE_INTRINSICS) const __m128 vector_float_one = { 1.0f, 1.0f, 1.0f, 1.0f }; const __m128 vector_float_half = { 0.5f, 0.5f, 0.5f, 0.5f }; @@ -218,6 +234,13 @@ ID_INLINE void VertexFloatToByte( const float& x, const float& y, const float& z bval[1] = ( byte )_mm_extract_epi16( xyz16, 1 ); bval[2] = ( byte )_mm_extract_epi16( xyz16, 2 ); +#else + + bval[0] = VERTEX_FLOAT_TO_BYTE( x ); + bval[1] = VERTEX_FLOAT_TO_BYTE( y ); + bval[2] = VERTEX_FLOAT_TO_BYTE( z ); + +#endif } /* @@ -655,6 +678,7 @@ ID_INLINE void WriteDrawVerts16( idDrawVert* destVerts, const idDrawVert* localV assert_16_byte_aligned( destVerts ); assert_16_byte_aligned( localVerts ); +#if defined(USE_INTRINSICS) for( int i = 0; i < numVerts; i++ ) { @@ -664,6 +688,11 @@ ID_INLINE void WriteDrawVerts16( idDrawVert* destVerts, const 
idDrawVert* localV _mm_stream_si128( ( __m128i* )( ( byte* )( destVerts + i ) + 16 ), v1 ); } +#else + + memcpy( destVerts, localVerts, numVerts * sizeof( idDrawVert ) ); + +#endif } /* diff --git a/neo/idlib/geometry/DrawVert_intrinsics.h b/neo/idlib/geometry/DrawVert_intrinsics.h index 3ac36cf6..1a409453 100644 --- a/neo/idlib/geometry/DrawVert_intrinsics.h +++ b/neo/idlib/geometry/DrawVert_intrinsics.h @@ -30,6 +30,7 @@ If you have questions concerning this license or the applicable additional terms #define __DRAWVERT_INTRINSICS_H__ +#if defined(USE_INTRINSICS) static const __m128i vector_int_f32_sign_mask = _mm_set1_epi32( 1U << IEEE_FLT_SIGN_BIT ); static const __m128i vector_int_f32_exponent_mask = _mm_set1_epi32( ( ( 1U << IEEE_FLT_EXPONENT_BITS ) - 1 ) << IEEE_FLT_MANTISSA_BITS ); static const __m128i vector_int_f32_mantissa_mask = _mm_set1_epi32( ( 1U << IEEE_FLT_MANTISSA_BITS ) - 1 ); @@ -50,13 +51,14 @@ static const __m128 vector_float_last_one = { 0.0f, 0.0f, 0.0f, 1.0 static const __m128 vector_float_1_over_255 = { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }; static const __m128 vector_float_1_over_4 = { 1.0f / 4.0f, 1.0f / 4.0f, 1.0f / 4.0f, 1.0f / 4.0f }; +#endif /* ==================== FastF32toF16 ==================== */ - +#if defined(USE_INTRINSICS) ID_INLINE_EXTERN __m128i FastF32toF16( __m128i f32_bits ) { __m128i f16_sign = _mm_srli_epi32( _mm_and_si128( f32_bits, vector_int_f32_sign_mask ), f32_to_f16_sign_shift ); @@ -77,6 +79,7 @@ ID_INLINE_EXTERN __m128i FastF32toF16( __m128i f32_bits ) return _mm_packs_epi32( flt16, flt16 ); } +#endif ID_INLINE_EXTERN halfFloat_t Scalar_FastF32toF16( float f32 ) @@ -117,7 +120,7 @@ ID_INLINE_EXTERN halfFloat_t Scalar_FastF32toF16( float f32 ) LoadSkinnedDrawVertPosition ==================== */ - +#if defined(USE_INTRINSICS) ID_INLINE_EXTERN __m128 LoadSkinnedDrawVertPosition( const idDrawVert& base, const idJointMat* joints ) { const idJointMat& j0 = joints[base.color[0]]; @@ -178,7 
+181,7 @@ ID_INLINE_EXTERN __m128 LoadSkinnedDrawVertPosition( const idDrawVert& base, con return r0; } - +#endif ID_INLINE_EXTERN idVec3 Scalar_LoadSkinnedDrawVertPosition( const idDrawVert& vert, const idJointMat* joints ) { diff --git a/neo/idlib/geometry/RenderMatrix.cpp b/neo/idlib/geometry/RenderMatrix.cpp index 83dcef06..653bc2f3 100644 --- a/neo/idlib/geometry/RenderMatrix.cpp +++ b/neo/idlib/geometry/RenderMatrix.cpp @@ -92,7 +92,7 @@ SIMD constants ================================================================================================ */ - +#if defined(USE_INTRINSICS) static const __m128i vector_int_1 = _mm_set1_epi32( 1 ); static const __m128i vector_int_4 = _mm_set1_epi32( 4 ); static const __m128i vector_int_0123 = _mm_set_epi32( 3, 2, 1, 0 ); @@ -116,6 +116,7 @@ static const __m128 vector_float_one = { 1.0f, 1.0f, 1.0f, 1.0f }; static const __m128 vector_float_pos_one = { +1.0f, +1.0f, +1.0f, +1.0f }; static const __m128 vector_float_neg_one = { -1.0f, -1.0f, -1.0f, -1.0f }; static const __m128 vector_float_last_one = { 0.0f, 0.0f, 0.0f, 1.0f }; +#endif /* @@ -527,6 +528,8 @@ static const struct silhouetteVertices_t { { 0, 0, 0, 0, 0, 0, 0 }, 0 }, // 111111 = 63 invalid }; + + /* ======================== GetBoxFrontBits @@ -540,7 +543,7 @@ front bits: bit 5 = pos-Z is front facing ======================== */ - +#if defined(USE_INTRINSICS) static int GetBoxFrontBits_SSE2( const __m128& b0, const __m128& b1, const __m128& viewOrigin ) { const __m128 dir0 = _mm_sub_ps( viewOrigin, b0 ); @@ -552,6 +555,23 @@ static int GetBoxFrontBits_SSE2( const __m128& b0, const __m128& b1, const __m12 return frontBits; } +#else + +static int GetBoxFrontBits_Generic( const idBounds& bounds, const idVec3& viewOrigin ) +{ + idVec3 dir0 = viewOrigin - bounds[0]; + idVec3 dir1 = bounds[1] - viewOrigin; + int frontBits = 0; + frontBits |= IEEE_FLT_SIGNBITSET( dir0.x ) << 0; + frontBits |= IEEE_FLT_SIGNBITSET( dir0.y ) << 1; + frontBits |= IEEE_FLT_SIGNBITSET( dir0.z 
) << 2; + frontBits |= IEEE_FLT_SIGNBITSET( dir1.x ) << 3; + frontBits |= IEEE_FLT_SIGNBITSET( dir1.y ) << 4; + frontBits |= IEEE_FLT_SIGNBITSET( dir1.z ) << 5; + return frontBits; +} + +#endif /* ================================================================================================ @@ -739,7 +759,7 @@ void idRenderMatrix::OffsetScaleForBounds( const idRenderMatrix& src, const idBo { assert( &src != &out ); - +#if defined(USE_INTRINSICS) __m128 b0 = _mm_loadu_bounds_0( bounds ); __m128 b1 = _mm_loadu_bounds_1( bounds ); @@ -785,6 +805,32 @@ void idRenderMatrix::OffsetScaleForBounds( const idRenderMatrix& src, const idBo _mm_storeu_ps( out.m + 2 * 4, a2 ); _mm_storeu_ps( out.m + 3 * 4, a3 ); +#else + + const idVec3 offset = ( bounds[1] + bounds[0] ) * 0.5f; + const idVec3 scale = ( bounds[1] - bounds[0] ) * 0.5f; + + out[0][0] = src[0][0] * scale[0]; + out[0][1] = src[0][1] * scale[1]; + out[0][2] = src[0][2] * scale[2]; + out[0][3] = src[0][3] + src[0][0] * offset[0] + src[0][1] * offset[1] + src[0][2] * offset[2]; + + out[1][0] = src[1][0] * scale[0]; + out[1][1] = src[1][1] * scale[1]; + out[1][2] = src[1][2] * scale[2]; + out[1][3] = src[1][3] + src[1][0] * offset[0] + src[1][1] * offset[1] + src[1][2] * offset[2]; + + out[2][0] = src[2][0] * scale[0]; + out[2][1] = src[2][1] * scale[1]; + out[2][2] = src[2][2] * scale[2]; + out[2][3] = src[2][3] + src[2][0] * offset[0] + src[2][1] * offset[1] + src[2][2] * offset[2]; + + out[3][0] = src[3][0] * scale[0]; + out[3][1] = src[3][1] * scale[1]; + out[3][2] = src[3][2] * scale[2]; + out[3][3] = src[3][3] + src[3][0] * offset[0] + src[3][1] * offset[1] + src[3][2] * offset[2]; + +#endif } /* @@ -799,7 +845,7 @@ void idRenderMatrix::InverseOffsetScaleForBounds( const idRenderMatrix& src, con { assert( &src != &out ); - +#if defined(USE_INTRINSICS) __m128 b0 = _mm_loadu_bounds_0( bounds ); __m128 b1 = _mm_loadu_bounds_1( bounds ); @@ -830,6 +876,32 @@ void idRenderMatrix::InverseOffsetScaleForBounds( const 
idRenderMatrix& src, con _mm_storeu_ps( out.m + 2 * 4, a2 ); _mm_storeu_ps( out.m + 3 * 4, a3 ); +#else + + const idVec3 offset = -0.5f * ( bounds[1] + bounds[0] ); + const idVec3 scale = 2.0f / ( bounds[1] - bounds[0] ); + + out[0][0] = scale[0] * src[0][0]; + out[0][1] = scale[0] * src[0][1]; + out[0][2] = scale[0] * src[0][2]; + out[0][3] = scale[0] * ( src[0][3] + offset[0] ); + + out[1][0] = scale[1] * src[1][0]; + out[1][1] = scale[1] * src[1][1]; + out[1][2] = scale[1] * src[1][2]; + out[1][3] = scale[1] * ( src[1][3] + offset[1] ); + + out[2][0] = scale[2] * src[2][0]; + out[2][1] = scale[2] * src[2][1]; + out[2][2] = scale[2] * src[2][2]; + out[2][3] = scale[2] * ( src[2][3] + offset[2] ); + + out[3][0] = src[3][0]; + out[3][1] = src[3][1]; + out[3][2] = src[3][2]; + out[3][3] = src[3][3]; + +#endif } /* @@ -841,7 +913,7 @@ void idRenderMatrix::Transpose( const idRenderMatrix& src, idRenderMatrix& out ) { assert( &src != &out ); - +#if defined(USE_INTRINSICS) const __m128 a0 = _mm_loadu_ps( src.m + 0 * 4 ); const __m128 a1 = _mm_loadu_ps( src.m + 1 * 4 ); const __m128 a2 = _mm_loadu_ps( src.m + 2 * 4 ); @@ -862,6 +934,24 @@ void idRenderMatrix::Transpose( const idRenderMatrix& src, idRenderMatrix& out ) _mm_storeu_ps( out.m + 2 * 4, t2 ); _mm_storeu_ps( out.m + 3 * 4, t3 ); +#else + out.m[ 0] = src.m[ 0]; + out.m[ 1] = src.m[ 4]; + out.m[ 2] = src.m[ 8]; + out.m[ 3] = src.m[12]; + out.m[ 4] = src.m[ 1]; + out.m[ 5] = src.m[ 5]; + out.m[ 6] = src.m[ 9]; + out.m[ 7] = src.m[13]; + out.m[ 8] = src.m[ 2]; + out.m[ 9] = src.m[ 6]; + out.m[10] = src.m[10]; + out.m[11] = src.m[14]; + out.m[12] = src.m[ 3]; + out.m[13] = src.m[ 7]; + out.m[14] = src.m[11]; + out.m[15] = src.m[15]; +#endif } /* @@ -871,8 +961,7 @@ idRenderMatrix::Multiply */ void idRenderMatrix::Multiply( const idRenderMatrix& a, const idRenderMatrix& b, idRenderMatrix& out ) { - - +#if defined(USE_INTRINSICS) __m128 a0 = _mm_loadu_ps( a.m + 0 * 4 ); __m128 a1 = _mm_loadu_ps( a.m + 1 * 4 ); __m128 
a2 = _mm_loadu_ps( a.m + 2 * 4 ); @@ -908,6 +997,41 @@ void idRenderMatrix::Multiply( const idRenderMatrix& a, const idRenderMatrix& b, _mm_storeu_ps( out.m + 2 * 4, t2 ); _mm_storeu_ps( out.m + 3 * 4, t3 ); +#else + + /* + for ( int i = 0 ; i < 4 ; i++ ) { + for ( int j = 0 ; j < 4 ; j++ ) { + out.m[ i * 4 + j ] = + a.m[ i * 4 + 0 ] * b.m[ 0 * 4 + j ] + + a.m[ i * 4 + 1 ] * b.m[ 1 * 4 + j ] + + a.m[ i * 4 + 2 ] * b.m[ 2 * 4 + j ] + + a.m[ i * 4 + 3 ] * b.m[ 3 * 4 + j ]; + } + } + */ + + out.m[0 * 4 + 0] = a.m[0 * 4 + 0] * b.m[0 * 4 + 0] + a.m[0 * 4 + 1] * b.m[1 * 4 + 0] + a.m[0 * 4 + 2] * b.m[2 * 4 + 0] + a.m[0 * 4 + 3] * b.m[3 * 4 + 0]; + out.m[0 * 4 + 1] = a.m[0 * 4 + 0] * b.m[0 * 4 + 1] + a.m[0 * 4 + 1] * b.m[1 * 4 + 1] + a.m[0 * 4 + 2] * b.m[2 * 4 + 1] + a.m[0 * 4 + 3] * b.m[3 * 4 + 1]; + out.m[0 * 4 + 2] = a.m[0 * 4 + 0] * b.m[0 * 4 + 2] + a.m[0 * 4 + 1] * b.m[1 * 4 + 2] + a.m[0 * 4 + 2] * b.m[2 * 4 + 2] + a.m[0 * 4 + 3] * b.m[3 * 4 + 2]; + out.m[0 * 4 + 3] = a.m[0 * 4 + 0] * b.m[0 * 4 + 3] + a.m[0 * 4 + 1] * b.m[1 * 4 + 3] + a.m[0 * 4 + 2] * b.m[2 * 4 + 3] + a.m[0 * 4 + 3] * b.m[3 * 4 + 3]; + + out.m[1 * 4 + 0] = a.m[1 * 4 + 0] * b.m[0 * 4 + 0] + a.m[1 * 4 + 1] * b.m[1 * 4 + 0] + a.m[1 * 4 + 2] * b.m[2 * 4 + 0] + a.m[1 * 4 + 3] * b.m[3 * 4 + 0]; + out.m[1 * 4 + 1] = a.m[1 * 4 + 0] * b.m[0 * 4 + 1] + a.m[1 * 4 + 1] * b.m[1 * 4 + 1] + a.m[1 * 4 + 2] * b.m[2 * 4 + 1] + a.m[1 * 4 + 3] * b.m[3 * 4 + 1]; + out.m[1 * 4 + 2] = a.m[1 * 4 + 0] * b.m[0 * 4 + 2] + a.m[1 * 4 + 1] * b.m[1 * 4 + 2] + a.m[1 * 4 + 2] * b.m[2 * 4 + 2] + a.m[1 * 4 + 3] * b.m[3 * 4 + 2]; + out.m[1 * 4 + 3] = a.m[1 * 4 + 0] * b.m[0 * 4 + 3] + a.m[1 * 4 + 1] * b.m[1 * 4 + 3] + a.m[1 * 4 + 2] * b.m[2 * 4 + 3] + a.m[1 * 4 + 3] * b.m[3 * 4 + 3]; + + out.m[2 * 4 + 0] = a.m[2 * 4 + 0] * b.m[0 * 4 + 0] + a.m[2 * 4 + 1] * b.m[1 * 4 + 0] + a.m[2 * 4 + 2] * b.m[2 * 4 + 0] + a.m[2 * 4 + 3] * b.m[3 * 4 + 0]; + out.m[2 * 4 + 1] = a.m[2 * 4 + 0] * b.m[0 * 4 + 1] + a.m[2 * 4 + 1] * b.m[1 * 4 + 1] + a.m[2 * 4 + 
2] * b.m[2 * 4 + 1] + a.m[2 * 4 + 3] * b.m[3 * 4 + 1]; + out.m[2 * 4 + 2] = a.m[2 * 4 + 0] * b.m[0 * 4 + 2] + a.m[2 * 4 + 1] * b.m[1 * 4 + 2] + a.m[2 * 4 + 2] * b.m[2 * 4 + 2] + a.m[2 * 4 + 3] * b.m[3 * 4 + 2]; + out.m[2 * 4 + 3] = a.m[2 * 4 + 0] * b.m[0 * 4 + 3] + a.m[2 * 4 + 1] * b.m[1 * 4 + 3] + a.m[2 * 4 + 2] * b.m[2 * 4 + 3] + a.m[2 * 4 + 3] * b.m[3 * 4 + 3]; + + out.m[3 * 4 + 0] = a.m[3 * 4 + 0] * b.m[0 * 4 + 0] + a.m[3 * 4 + 1] * b.m[1 * 4 + 0] + a.m[3 * 4 + 2] * b.m[2 * 4 + 0] + a.m[3 * 4 + 3] * b.m[3 * 4 + 0]; + out.m[3 * 4 + 1] = a.m[3 * 4 + 0] * b.m[0 * 4 + 1] + a.m[3 * 4 + 1] * b.m[1 * 4 + 1] + a.m[3 * 4 + 2] * b.m[2 * 4 + 1] + a.m[3 * 4 + 3] * b.m[3 * 4 + 1]; + out.m[3 * 4 + 2] = a.m[3 * 4 + 0] * b.m[0 * 4 + 2] + a.m[3 * 4 + 1] * b.m[1 * 4 + 2] + a.m[3 * 4 + 2] * b.m[2 * 4 + 2] + a.m[3 * 4 + 3] * b.m[3 * 4 + 2]; + out.m[3 * 4 + 3] = a.m[3 * 4 + 0] * b.m[0 * 4 + 3] + a.m[3 * 4 + 1] * b.m[1 * 4 + 3] + a.m[3 * 4 + 2] * b.m[2 * 4 + 3] + a.m[3 * 4 + 3] * b.m[3 * 4 + 3]; + +#endif } /* @@ -916,7 +1040,7 @@ idRenderMatrix::Inverse inverse( M ) = ( 1 / determinant( M ) ) * transpose( cofactor( M ) ) -This code is based on the code written by C�dric Lallain, published on "Cell Performance" +This code is based on the code written by Cédric Lallain, published on "Cell Performance" (by Mike Acton) and released under the BSD 3-Clause ("BSD New" or "BSD Simplified") license. https://code.google.com/p/cellperformance-snippets/ @@ -927,7 +1051,7 @@ can get really, really small. 
*/ bool idRenderMatrix::Inverse( const idRenderMatrix& src, idRenderMatrix& out ) { - +#if defined(USE_INTRINSICS) const __m128 r0 = _mm_loadu_ps( src.m + 0 * 4 ); const __m128 r1 = _mm_loadu_ps( src.m + 1 * 4 ); @@ -1033,6 +1157,88 @@ bool idRenderMatrix::Inverse( const idRenderMatrix& src, idRenderMatrix& out ) _mm_storeu_ps( out.m + 2 * 4, _mm_mul_ps( adjoint_r2, rcpDet ) ); _mm_storeu_ps( out.m + 3 * 4, _mm_mul_ps( adjoint_r3, rcpDet ) ); +#else + + const int FRL = 4; + + // 84+4+16 = 104 multiplications + // 1 division + + // 2x2 sub-determinants required to calculate 4x4 determinant + const float det2_01_01 = src.m[0 * FRL + 0] * src.m[1 * FRL + 1] - src.m[0 * FRL + 1] * src.m[1 * FRL + 0]; + const float det2_01_02 = src.m[0 * FRL + 0] * src.m[1 * FRL + 2] - src.m[0 * FRL + 2] * src.m[1 * FRL + 0]; + const float det2_01_03 = src.m[0 * FRL + 0] * src.m[1 * FRL + 3] - src.m[0 * FRL + 3] * src.m[1 * FRL + 0]; + const float det2_01_12 = src.m[0 * FRL + 1] * src.m[1 * FRL + 2] - src.m[0 * FRL + 2] * src.m[1 * FRL + 1]; + const float det2_01_13 = src.m[0 * FRL + 1] * src.m[1 * FRL + 3] - src.m[0 * FRL + 3] * src.m[1 * FRL + 1]; + const float det2_01_23 = src.m[0 * FRL + 2] * src.m[1 * FRL + 3] - src.m[0 * FRL + 3] * src.m[1 * FRL + 2]; + + // 3x3 sub-determinants required to calculate 4x4 determinant + const float det3_201_012 = src.m[2 * FRL + 0] * det2_01_12 - src.m[2 * FRL + 1] * det2_01_02 + src.m[2 * FRL + 2] * det2_01_01; + const float det3_201_013 = src.m[2 * FRL + 0] * det2_01_13 - src.m[2 * FRL + 1] * det2_01_03 + src.m[2 * FRL + 3] * det2_01_01; + const float det3_201_023 = src.m[2 * FRL + 0] * det2_01_23 - src.m[2 * FRL + 2] * det2_01_03 + src.m[2 * FRL + 3] * det2_01_02; + const float det3_201_123 = src.m[2 * FRL + 1] * det2_01_23 - src.m[2 * FRL + 2] * det2_01_13 + src.m[2 * FRL + 3] * det2_01_12; + + const float det = ( - det3_201_123 * src.m[3 * FRL + 0] + det3_201_023 * src.m[3 * FRL + 1] - det3_201_013 * src.m[3 * FRL + 2] + det3_201_012 * src.m[3 
* FRL + 3] ); + + if( idMath::Fabs( det ) < RENDER_MATRIX_INVERSE_EPSILON ) + { + return false; + } + + const float rcpDet = 1.0f / det; + + // remaining 2x2 sub-determinants + const float det2_03_01 = src.m[0 * FRL + 0] * src.m[3 * FRL + 1] - src.m[0 * FRL + 1] * src.m[3 * FRL + 0]; + const float det2_03_02 = src.m[0 * FRL + 0] * src.m[3 * FRL + 2] - src.m[0 * FRL + 2] * src.m[3 * FRL + 0]; + const float det2_03_03 = src.m[0 * FRL + 0] * src.m[3 * FRL + 3] - src.m[0 * FRL + 3] * src.m[3 * FRL + 0]; + const float det2_03_12 = src.m[0 * FRL + 1] * src.m[3 * FRL + 2] - src.m[0 * FRL + 2] * src.m[3 * FRL + 1]; + const float det2_03_13 = src.m[0 * FRL + 1] * src.m[3 * FRL + 3] - src.m[0 * FRL + 3] * src.m[3 * FRL + 1]; + const float det2_03_23 = src.m[0 * FRL + 2] * src.m[3 * FRL + 3] - src.m[0 * FRL + 3] * src.m[3 * FRL + 2]; + + const float det2_13_01 = src.m[1 * FRL + 0] * src.m[3 * FRL + 1] - src.m[1 * FRL + 1] * src.m[3 * FRL + 0]; + const float det2_13_02 = src.m[1 * FRL + 0] * src.m[3 * FRL + 2] - src.m[1 * FRL + 2] * src.m[3 * FRL + 0]; + const float det2_13_03 = src.m[1 * FRL + 0] * src.m[3 * FRL + 3] - src.m[1 * FRL + 3] * src.m[3 * FRL + 0]; + const float det2_13_12 = src.m[1 * FRL + 1] * src.m[3 * FRL + 2] - src.m[1 * FRL + 2] * src.m[3 * FRL + 1]; + const float det2_13_13 = src.m[1 * FRL + 1] * src.m[3 * FRL + 3] - src.m[1 * FRL + 3] * src.m[3 * FRL + 1]; + const float det2_13_23 = src.m[1 * FRL + 2] * src.m[3 * FRL + 3] - src.m[1 * FRL + 3] * src.m[3 * FRL + 2]; + + // remaining 3x3 sub-determinants + const float det3_203_012 = src.m[2 * FRL + 0] * det2_03_12 - src.m[2 * FRL + 1] * det2_03_02 + src.m[2 * FRL + 2] * det2_03_01; + const float det3_203_013 = src.m[2 * FRL + 0] * det2_03_13 - src.m[2 * FRL + 1] * det2_03_03 + src.m[2 * FRL + 3] * det2_03_01; + const float det3_203_023 = src.m[2 * FRL + 0] * det2_03_23 - src.m[2 * FRL + 2] * det2_03_03 + src.m[2 * FRL + 3] * det2_03_02; + const float det3_203_123 = src.m[2 * FRL + 1] * det2_03_23 - src.m[2 * 
FRL + 2] * det2_03_13 + src.m[2 * FRL + 3] * det2_03_12; + + const float det3_213_012 = src.m[2 * FRL + 0] * det2_13_12 - src.m[2 * FRL + 1] * det2_13_02 + src.m[2 * FRL + 2] * det2_13_01; + const float det3_213_013 = src.m[2 * FRL + 0] * det2_13_13 - src.m[2 * FRL + 1] * det2_13_03 + src.m[2 * FRL + 3] * det2_13_01; + const float det3_213_023 = src.m[2 * FRL + 0] * det2_13_23 - src.m[2 * FRL + 2] * det2_13_03 + src.m[2 * FRL + 3] * det2_13_02; + const float det3_213_123 = src.m[2 * FRL + 1] * det2_13_23 - src.m[2 * FRL + 2] * det2_13_13 + src.m[2 * FRL + 3] * det2_13_12; + + const float det3_301_012 = src.m[3 * FRL + 0] * det2_01_12 - src.m[3 * FRL + 1] * det2_01_02 + src.m[3 * FRL + 2] * det2_01_01; + const float det3_301_013 = src.m[3 * FRL + 0] * det2_01_13 - src.m[3 * FRL + 1] * det2_01_03 + src.m[3 * FRL + 3] * det2_01_01; + const float det3_301_023 = src.m[3 * FRL + 0] * det2_01_23 - src.m[3 * FRL + 2] * det2_01_03 + src.m[3 * FRL + 3] * det2_01_02; + const float det3_301_123 = src.m[3 * FRL + 1] * det2_01_23 - src.m[3 * FRL + 2] * det2_01_13 + src.m[3 * FRL + 3] * det2_01_12; + + out.m[0 * FRL + 0] = - det3_213_123 * rcpDet; + out.m[1 * FRL + 0] = + det3_213_023 * rcpDet; + out.m[2 * FRL + 0] = - det3_213_013 * rcpDet; + out.m[3 * FRL + 0] = + det3_213_012 * rcpDet; + + out.m[0 * FRL + 1] = + det3_203_123 * rcpDet; + out.m[1 * FRL + 1] = - det3_203_023 * rcpDet; + out.m[2 * FRL + 1] = + det3_203_013 * rcpDet; + out.m[3 * FRL + 1] = - det3_203_012 * rcpDet; + + out.m[0 * FRL + 2] = + det3_301_123 * rcpDet; + out.m[1 * FRL + 2] = - det3_301_023 * rcpDet; + out.m[2 * FRL + 2] = + det3_301_013 * rcpDet; + out.m[3 * FRL + 2] = - det3_301_012 * rcpDet; + + out.m[0 * FRL + 3] = - det3_201_123 * rcpDet; + out.m[1 * FRL + 3] = + det3_201_023 * rcpDet; + out.m[2 * FRL + 3] = - det3_201_013 * rcpDet; + out.m[3 * FRL + 3] = + det3_201_012 * rcpDet; + +#endif return true; } @@ -1159,10 +1365,9 @@ bool idRenderMatrix::InverseByDoubles( const idRenderMatrix& src, 
idRenderMatrix DeterminantIsNegative ======================== */ - +#if defined(USE_INTRINSICS) void DeterminantIsNegative( bool& negativeDeterminant, const __m128& r0, const __m128& r1, const __m128& r2, const __m128& r3 ) { - const __m128 r1u1 = _mm_perm_ps( r1, _MM_SHUFFLE( 2, 1, 0, 3 ) ); const __m128 r1u2 = _mm_perm_ps( r1, _MM_SHUFFLE( 1, 0, 3, 2 ) ); const __m128 r1u3 = _mm_perm_ps( r1, _MM_SHUFFLE( 0, 3, 2, 1 ) ); @@ -1204,6 +1409,31 @@ void DeterminantIsNegative( bool& negativeDeterminant, const __m128& r0, const _ negativeDeterminant = _mm_movemask_ps( result ) & 1; } +#else + +void DeterminantIsNegative( bool& negativeDeterminant, const float* row0, const float* row1, const float* row2, const float* row3 ) +{ + + // 2x2 sub-determinants required to calculate 4x4 determinant + const float det2_01_01 = row0[0] * row1[1] - row0[1] * row1[0]; + const float det2_01_02 = row0[0] * row1[2] - row0[2] * row1[0]; + const float det2_01_03 = row0[0] * row1[3] - row0[3] * row1[0]; + const float det2_01_12 = row0[1] * row1[2] - row0[2] * row1[1]; + const float det2_01_13 = row0[1] * row1[3] - row0[3] * row1[1]; + const float det2_01_23 = row0[2] * row1[3] - row0[3] * row1[2]; + + // 3x3 sub-determinants required to calculate 4x4 determinant + const float det3_201_012 = row2[0] * det2_01_12 - row2[1] * det2_01_02 + row2[2] * det2_01_01; + const float det3_201_013 = row2[0] * det2_01_13 - row2[1] * det2_01_03 + row2[3] * det2_01_01; + const float det3_201_023 = row2[0] * det2_01_23 - row2[2] * det2_01_03 + row2[3] * det2_01_02; + const float det3_201_123 = row2[1] * det2_01_23 - row2[2] * det2_01_13 + row2[3] * det2_01_12; + + const float det = ( - det3_201_123 * row3[0] + det3_201_023 * row3[1] - det3_201_013 * row3[2] + det3_201_012 * row3[3] ); + + negativeDeterminant = ( det < 0.0f ); +} + +#endif /* ======================== @@ -1217,7 +1447,7 @@ void idRenderMatrix::CopyMatrix( const idRenderMatrix& matrix, idVec4& row0, idV assert_16_byte_aligned( 
row2.ToFloatPtr() ); assert_16_byte_aligned( row3.ToFloatPtr() ); - +#if defined(USE_INTRINSICS) const __m128 r0 = _mm_loadu_ps( matrix.m + 0 * 4 ); const __m128 r1 = _mm_loadu_ps( matrix.m + 1 * 4 ); const __m128 r2 = _mm_loadu_ps( matrix.m + 2 * 4 ); @@ -1227,7 +1457,24 @@ void idRenderMatrix::CopyMatrix( const idRenderMatrix& matrix, idVec4& row0, idV _mm_store_ps( row1.ToFloatPtr(), r1 ); _mm_store_ps( row2.ToFloatPtr(), r2 ); _mm_store_ps( row3.ToFloatPtr(), r3 ); - +#else + row0[ 0] = matrix.m[ 0]; + row0[1] = matrix.m[ 1]; + row0[2] = matrix.m[ 2]; + row0[3] = matrix.m[ 3]; + row1[ 0] = matrix.m[ 4]; + row1[1] = matrix.m[ 5]; + row1[2] = matrix.m[ 6]; + row1[3] = matrix.m[ 7]; + row2[ 0] = matrix.m[ 8]; + row2[1] = matrix.m[ 9]; + row2[2] = matrix.m[10]; + row2[3] = matrix.m[11]; + row3[ 0] = matrix.m[12]; + row3[1] = matrix.m[13]; + row3[2] = matrix.m[14]; + row3[3] = matrix.m[15]; +#endif } /* @@ -1242,7 +1489,7 @@ void idRenderMatrix::SetMVP( const idRenderMatrix& mvp, idVec4& row0, idVec4& ro assert_16_byte_aligned( row2.ToFloatPtr() ); assert_16_byte_aligned( row3.ToFloatPtr() ); - +#if defined(USE_INTRINSICS) const __m128 r0 = _mm_loadu_ps( mvp.m + 0 * 4 ); const __m128 r1 = _mm_loadu_ps( mvp.m + 1 * 4 ); const __m128 r2 = _mm_loadu_ps( mvp.m + 2 * 4 ); @@ -1254,7 +1501,27 @@ void idRenderMatrix::SetMVP( const idRenderMatrix& mvp, idVec4& row0, idVec4& ro _mm_store_ps( row3.ToFloatPtr(), r3 ); DeterminantIsNegative( negativeDeterminant, r0, r1, r2, r3 ); +#else + row0[0] = mvp.m[ 0]; + row0[1] = mvp.m[ 1]; + row0[2] = mvp.m[ 2]; + row0[3] = mvp.m[ 3]; + row1[0] = mvp.m[ 4]; + row1[1] = mvp.m[ 5]; + row1[2] = mvp.m[ 6]; + row1[3] = mvp.m[ 7]; + row2[0] = mvp.m[ 8]; + row2[1] = mvp.m[ 9]; + row2[2] = mvp.m[10]; + row2[3] = mvp.m[11]; + row3[0] = mvp.m[12]; + row3[1] = mvp.m[13]; + row3[2] = mvp.m[14]; + row3[3] = mvp.m[15]; + DeterminantIsNegative( negativeDeterminant, mvp[0], mvp[1], mvp[2], mvp[3] ); + +#endif } /* @@ -1269,6 +1536,7 @@ void 
idRenderMatrix::SetMVPForBounds( const idRenderMatrix& mvp, const idBounds& assert_16_byte_aligned( row2.ToFloatPtr() ); assert_16_byte_aligned( row3.ToFloatPtr() ); +#if defined(USE_INTRINSICS) __m128 b0 = _mm_loadu_bounds_0( bounds ); __m128 b1 = _mm_loadu_bounds_1( bounds ); @@ -1317,6 +1585,34 @@ void idRenderMatrix::SetMVPForBounds( const idRenderMatrix& mvp, const idBounds& DeterminantIsNegative( negativeDeterminant, r0, r1, r2, r3 ); +#else + + const idVec3 offset = ( bounds[1] + bounds[0] ) * 0.5f; + const idVec3 scale = ( bounds[1] - bounds[0] ) * 0.5f; + + row0[0] = mvp[0][0] * scale[0]; + row0[1] = mvp[0][1] * scale[1]; + row0[2] = mvp[0][2] * scale[2]; + row0[3] = mvp[0][3] + mvp[0][0] * offset[0] + mvp[0][1] * offset[1] + mvp[0][2] * offset[2]; + + row1[0] = mvp[1][0] * scale[0]; + row1[1] = mvp[1][1] * scale[1]; + row1[2] = mvp[1][2] * scale[2]; + row1[3] = mvp[1][3] + mvp[1][0] * offset[0] + mvp[1][1] * offset[1] + mvp[1][2] * offset[2]; + + row2[0] = mvp[2][0] * scale[0]; + row2[1] = mvp[2][1] * scale[1]; + row2[2] = mvp[2][2] * scale[2]; + row2[3] = mvp[2][3] + mvp[2][0] * offset[0] + mvp[2][1] * offset[1] + mvp[2][2] * offset[2]; + + row3[0] = mvp[3][0] * scale[0]; + row3[1] = mvp[3][1] * scale[1]; + row3[2] = mvp[3][2] * scale[2]; + row3[3] = mvp[3][3] + mvp[3][0] * offset[0] + mvp[3][1] * offset[1] + mvp[3][2] * offset[2]; + + DeterminantIsNegative( negativeDeterminant, row0.ToFloatPtr(), row1.ToFloatPtr(), row2.ToFloatPtr(), row3.ToFloatPtr() ); + +#endif } /* @@ -1331,6 +1627,7 @@ void idRenderMatrix::SetMVPForInverseProject( const idRenderMatrix& mvp, const i assert_16_byte_aligned( row2.ToFloatPtr() ); assert_16_byte_aligned( row3.ToFloatPtr() ); +#if defined(USE_INTRINSICS) __m128 r0 = _mm_loadu_ps( mvp.m + 0 * 4 ); __m128 r1 = _mm_loadu_ps( mvp.m + 1 * 4 ); @@ -1369,6 +1666,31 @@ void idRenderMatrix::SetMVPForInverseProject( const idRenderMatrix& mvp, const i DeterminantIsNegative( negativeDeterminant, t0, t1, t2, t3 ); +#else + + row0[0] 
= mvp.m[0 * 4 + 0] * inverseProject.m[0 * 4 + 0] + mvp.m[0 * 4 + 1] * inverseProject.m[1 * 4 + 0] + mvp.m[0 * 4 + 2] * inverseProject.m[2 * 4 + 0] + mvp.m[0 * 4 + 3] * inverseProject.m[3 * 4 + 0]; + row0[1] = mvp.m[0 * 4 + 0] * inverseProject.m[0 * 4 + 1] + mvp.m[0 * 4 + 1] * inverseProject.m[1 * 4 + 1] + mvp.m[0 * 4 + 2] * inverseProject.m[2 * 4 + 1] + mvp.m[0 * 4 + 3] * inverseProject.m[3 * 4 + 1]; + row0[2] = mvp.m[0 * 4 + 0] * inverseProject.m[0 * 4 + 2] + mvp.m[0 * 4 + 1] * inverseProject.m[1 * 4 + 2] + mvp.m[0 * 4 + 2] * inverseProject.m[2 * 4 + 2] + mvp.m[0 * 4 + 3] * inverseProject.m[3 * 4 + 2]; + row0[3] = mvp.m[0 * 4 + 0] * inverseProject.m[0 * 4 + 3] + mvp.m[0 * 4 + 1] * inverseProject.m[1 * 4 + 3] + mvp.m[0 * 4 + 2] * inverseProject.m[2 * 4 + 3] + mvp.m[0 * 4 + 3] * inverseProject.m[3 * 4 + 3]; + + row1[0] = mvp.m[1 * 4 + 0] * inverseProject.m[0 * 4 + 0] + mvp.m[1 * 4 + 1] * inverseProject.m[1 * 4 + 0] + mvp.m[1 * 4 + 2] * inverseProject.m[2 * 4 + 0] + mvp.m[1 * 4 + 3] * inverseProject.m[3 * 4 + 0]; + row1[1] = mvp.m[1 * 4 + 0] * inverseProject.m[0 * 4 + 1] + mvp.m[1 * 4 + 1] * inverseProject.m[1 * 4 + 1] + mvp.m[1 * 4 + 2] * inverseProject.m[2 * 4 + 1] + mvp.m[1 * 4 + 3] * inverseProject.m[3 * 4 + 1]; + row1[2] = mvp.m[1 * 4 + 0] * inverseProject.m[0 * 4 + 2] + mvp.m[1 * 4 + 1] * inverseProject.m[1 * 4 + 2] + mvp.m[1 * 4 + 2] * inverseProject.m[2 * 4 + 2] + mvp.m[1 * 4 + 3] * inverseProject.m[3 * 4 + 2]; + row1[3] = mvp.m[1 * 4 + 0] * inverseProject.m[0 * 4 + 3] + mvp.m[1 * 4 + 1] * inverseProject.m[1 * 4 + 3] + mvp.m[1 * 4 + 2] * inverseProject.m[2 * 4 + 3] + mvp.m[1 * 4 + 3] * inverseProject.m[3 * 4 + 3]; + + row2[0] = mvp.m[2 * 4 + 0] * inverseProject.m[0 * 4 + 0] + mvp.m[2 * 4 + 1] * inverseProject.m[1 * 4 + 0] + mvp.m[2 * 4 + 2] * inverseProject.m[2 * 4 + 0] + mvp.m[2 * 4 + 3] * inverseProject.m[3 * 4 + 0]; + row2[1] = mvp.m[2 * 4 + 0] * inverseProject.m[0 * 4 + 1] + mvp.m[2 * 4 + 1] * inverseProject.m[1 * 4 + 1] + mvp.m[2 * 4 + 2] * 
inverseProject.m[2 * 4 + 1] + mvp.m[2 * 4 + 3] * inverseProject.m[3 * 4 + 1]; + row2[2] = mvp.m[2 * 4 + 0] * inverseProject.m[0 * 4 + 2] + mvp.m[2 * 4 + 1] * inverseProject.m[1 * 4 + 2] + mvp.m[2 * 4 + 2] * inverseProject.m[2 * 4 + 2] + mvp.m[2 * 4 + 3] * inverseProject.m[3 * 4 + 2]; + row2[3] = mvp.m[2 * 4 + 0] * inverseProject.m[0 * 4 + 3] + mvp.m[2 * 4 + 1] * inverseProject.m[1 * 4 + 3] + mvp.m[2 * 4 + 2] * inverseProject.m[2 * 4 + 3] + mvp.m[2 * 4 + 3] * inverseProject.m[3 * 4 + 3]; + + row3[0] = mvp.m[3 * 4 + 0] * inverseProject.m[0 * 4 + 0] + mvp.m[3 * 4 + 1] * inverseProject.m[1 * 4 + 0] + mvp.m[3 * 4 + 2] * inverseProject.m[2 * 4 + 0] + mvp.m[3 * 4 + 3] * inverseProject.m[3 * 4 + 0]; + row3[1] = mvp.m[3 * 4 + 0] * inverseProject.m[0 * 4 + 1] + mvp.m[3 * 4 + 1] * inverseProject.m[1 * 4 + 1] + mvp.m[3 * 4 + 2] * inverseProject.m[2 * 4 + 1] + mvp.m[3 * 4 + 3] * inverseProject.m[3 * 4 + 1]; + row3[2] = mvp.m[3 * 4 + 0] * inverseProject.m[0 * 4 + 2] + mvp.m[3 * 4 + 1] * inverseProject.m[1 * 4 + 2] + mvp.m[3 * 4 + 2] * inverseProject.m[2 * 4 + 2] + mvp.m[3 * 4 + 3] * inverseProject.m[3 * 4 + 2]; + row3[3] = mvp.m[3 * 4 + 0] * inverseProject.m[0 * 4 + 3] + mvp.m[3 * 4 + 1] * inverseProject.m[1 * 4 + 3] + mvp.m[3 * 4 + 2] * inverseProject.m[2 * 4 + 3] + mvp.m[3 * 4 + 3] * inverseProject.m[3 * 4 + 3]; + + DeterminantIsNegative( negativeDeterminant, row0.ToFloatPtr(), row1.ToFloatPtr(), row2.ToFloatPtr(), row3.ToFloatPtr() ); + +#endif } /* @@ -1449,8 +1771,7 @@ frustum plane, but only while also being behind another one. 
*/ bool idRenderMatrix::CullBoundsToMVPbits( const idRenderMatrix& mvp, const idBounds& bounds, byte* outBits, bool zeroToOne ) { - - +#if defined(USE_INTRINSICS) __m128 mvp0 = _mm_loadu_ps( mvp[0] ); __m128 mvp1 = _mm_loadu_ps( mvp[1] ); __m128 mvp2 = _mm_loadu_ps( mvp[2] ); @@ -1549,6 +1870,70 @@ bool idRenderMatrix::CullBoundsToMVPbits( const idRenderMatrix& mvp, const idBou return ( bits != 63 ); +#else + + int bits = 0; + + idVec3 v; + for( int x = 0; x < 2; x++ ) + { + v[0] = bounds[x][0]; + for( int y = 0; y < 2; y++ ) + { + v[1] = bounds[y][1]; + for( int z = 0; z < 2; z++ ) + { + v[2] = bounds[z][2]; + + idVec4 c; + for( int i = 0; i < 4; i++ ) + { + c[i] = v[0] * mvp[i][0] + v[1] * mvp[i][1] + v[2] * mvp[i][2] + mvp[i][3]; + } + + const float minW = zeroToOne ? 0.0f : -c[3]; + const float maxW = c[3]; +#if defined( CLIP_SPACE_D3D ) // the D3D clip space Z is in the range [0,1] so always compare Z vs zero whether 'zeroToOne' is true or false + const float minZ = 0.0f; +#else + const float minZ = minW; +#endif + + if( c[0] > minW ) + { + bits |= ( 1 << 0 ); + } + if( c[0] < maxW ) + { + bits |= ( 1 << 1 ); + } + if( c[1] > minW ) + { + bits |= ( 1 << 2 ); + } + if( c[1] < maxW ) + { + bits |= ( 1 << 3 ); + } + if( c[2] > minZ ) + { + bits |= ( 1 << 4 ); // NOTE: using minZ + } + if( c[2] < maxW ) + { + bits |= ( 1 << 5 ); + } + } + } + } + + // store out a bit set for each side where the bounds is outside the clip space + *outBits = ( byte )( bits ^ 63 ); + + // if any bits weren't set, the bounds is completely off one side of the frustum + return ( bits != 63 ); + +#endif } /* @@ -1572,6 +1957,7 @@ bool idRenderMatrix::CullExtrudedBoundsToMVPbits( const idRenderMatrix& mvp, con { assert( idMath::Fabs( extrudeDirection * clipPlane.Normal() ) >= idMath::FLT_SMALLEST_NON_DENORMAL ); +#if defined(USE_INTRINSICS) __m128 mvp0 = _mm_loadu_ps( mvp[0] ); __m128 mvp1 = _mm_loadu_ps( mvp[1] ); @@ -1789,6 +2175,88 @@ bool idRenderMatrix::CullExtrudedBoundsToMVPbits( 
const idRenderMatrix& mvp, con return ( bits != 63 ); +#else + + int bits = 0; + + float closing = extrudeDirection * clipPlane.Normal(); + float invClosing = -1.0f / closing; + + idVec3 v; + for( int x = 0; x < 2; x++ ) + { + v[0] = bounds[x][0]; + for( int y = 0; y < 2; y++ ) + { + v[1] = bounds[y][1]; + for( int z = 0; z < 2; z++ ) + { + v[2] = bounds[z][2]; + + for( int extrude = 0; extrude <= 1; extrude++ ) + { + + idVec3 test; + if( extrude ) + { + const float extrudeDist = clipPlane.Distance( v ) * invClosing; + test = v + extrudeDirection * extrudeDist; + } + else + { + test = v; + } + + idVec4 c; + for( int i = 0; i < 4; i++ ) + { + c[i] = test[0] * mvp[i][0] + test[1] * mvp[i][1] + test[2] * mvp[i][2] + mvp[i][3]; + } + + const float minW = zeroToOne ? 0.0f : -c[3]; + const float maxW = c[3]; +#if defined( CLIP_SPACE_D3D ) // the D3D clip space Z is in the range [0,1] so always compare Z vs zero whether 'zeroToOne' is true or false + const float minZ = 0.0f; +#else + const float minZ = minW; +#endif + + if( c[0] > minW ) + { + bits |= ( 1 << 0 ); + } + if( c[0] < maxW ) + { + bits |= ( 1 << 1 ); + } + if( c[1] > minW ) + { + bits |= ( 1 << 2 ); + } + if( c[1] < maxW ) + { + bits |= ( 1 << 3 ); + } + if( c[2] > minZ ) + { + bits |= ( 1 << 4 ); // NOTE: using minZ + } + if( c[2] < maxW ) + { + bits |= ( 1 << 5 ); + } + } + } + } + } + + // store out a bit set for each side where the bounds is outside the clip space + *outBits = ( byte )( bits ^ 63 ); + + // if any bits weren't set, the bounds is completely off one side of the frustum + return ( bits != 63 ); + +#endif } /* @@ -1807,6 +2275,7 @@ is W=0 clipped. 
*/ void idRenderMatrix::ProjectedBounds( idBounds& projected, const idRenderMatrix& mvp, const idBounds& bounds, bool windowSpace ) { +#if defined(USE_INTRINSICS) __m128 mvp0 = _mm_loadu_ps( mvp[0] ); __m128 mvp1 = _mm_loadu_ps( mvp[1] ); @@ -1940,6 +2409,84 @@ void idRenderMatrix::ProjectedBounds( idBounds& projected, const idRenderMatrix& _mm_store_ss( & projected[1].y, maxY ); _mm_store_ss( & projected[1].z, maxZ ); +#else + + for( int i = 0; i < 3; i++ ) + { + projected[0][i] = RENDER_MATRIX_INFINITY; + projected[1][i] = - RENDER_MATRIX_INFINITY; + } + + idVec3 v; + for( int x = 0; x < 2; x++ ) + { + v[0] = bounds[x][0]; + for( int y = 0; y < 2; y++ ) + { + v[1] = bounds[y][1]; + for( int z = 0; z < 2; z++ ) + { + v[2] = bounds[z][2]; + + float tx = v[0] * mvp[0][0] + v[1] * mvp[0][1] + v[2] * mvp[0][2] + mvp[0][3]; + float ty = v[0] * mvp[1][0] + v[1] * mvp[1][1] + v[2] * mvp[1][2] + mvp[1][3]; + float tz = v[0] * mvp[2][0] + v[1] * mvp[2][1] + v[2] * mvp[2][2] + mvp[2][3]; + float tw = v[0] * mvp[3][0] + v[1] * mvp[3][1] + v[2] * mvp[3][2] + mvp[3][3]; + + if( tw <= idMath::FLT_SMALLEST_NON_DENORMAL ) + { + projected[0][0] = -RENDER_MATRIX_INFINITY; + projected[0][1] = -RENDER_MATRIX_INFINITY; + projected[0][2] = -RENDER_MATRIX_INFINITY; + projected[1][0] = RENDER_MATRIX_INFINITY; + projected[1][1] = RENDER_MATRIX_INFINITY; + // NOTE: projected[1][2] (the maximum Z) is the one component not overwritten here and is still valid + continue; + } + + float rw = 1.0f / tw; + + tx = tx * rw; + ty = ty * rw; + tz = tz * rw; + + projected[0][0] = Min( projected[0][0], tx ); + projected[0][1] = Min( projected[0][1], ty ); + projected[0][2] = Min( projected[0][2], tz ); + + projected[1][0] = Max( projected[1][0], tx ); + projected[1][1] = Max( projected[1][1], ty ); + projected[1][2] = Max( projected[1][2], tz ); + } + } + } + + if( windowSpace ) + { + // convert to window coords + projected[0][0] = projected[0][0] * 0.5f + 0.5f; + projected[1][0] = projected[1][0] * 0.5f + 0.5f; + + projected[0][1] = projected[0][1] * 0.5f +
0.5f; + projected[1][1] = projected[1][1] * 0.5f + 0.5f; + +#if !defined( CLIP_SPACE_D3D ) // the D3D clip space Z is already in the range [0,1] + projected[0][2] = projected[0][2] * 0.5f + 0.5f; + projected[1][2] = projected[1][2] * 0.5f + 0.5f; +#endif + + // clamp to [0, 1] range + projected[0][0] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][0] ); + projected[1][0] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][0] ); + + projected[0][1] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][1] ); + projected[1][1] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][1] ); + + projected[0][2] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][2] ); + projected[1][2] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][2] ); + } + +#endif } /* @@ -1976,6 +2523,7 @@ void idRenderMatrix::ProjectedNearClippedBounds( idBounds& projected, const idRe - X + */ +#if defined(USE_INTRINSICS) const __m128 mvp0 = _mm_loadu_ps( mvp[0] ); const __m128 mvp1 = _mm_loadu_ps( mvp[1] ); @@ -2238,6 +2786,320 @@ void idRenderMatrix::ProjectedNearClippedBounds( idBounds& projected, const idRe _mm_store_ss( & projected[1].y, maxY ); _mm_store_ss( & projected[1].z, maxZ ); +#elif 0 // verification-only block: it only asserts against 'projected' and never writes it, so it must stay disabled or builds without USE_INTRINSICS would leave 'projected' unset instead of compiling the generic implementation in the #else branch + + { + const idVec3 points[8] = + { + idVec3( bounds[0][0], bounds[0][1], bounds[0][2] ), + idVec3( bounds[1][0], bounds[0][1], bounds[0][2] ), + idVec3( bounds[1][0], bounds[1][1], bounds[0][2] ), + idVec3( bounds[0][0], bounds[1][1], bounds[0][2] ), + idVec3( bounds[0][0], bounds[0][1], bounds[1][2] ), + idVec3( bounds[1][0], bounds[0][1], bounds[1][2] ), + idVec3( bounds[1][0], bounds[1][1], bounds[1][2] ), + idVec3( bounds[0][0], bounds[1][1], bounds[1][2] ) + }; + + idVec4 projectedPoints[8]; + for( int i = 0; i < 8; i++ ) + { + const idVec3& v = points[i]; + projectedPoints[i].x = v[0] * mvp[0][0] + v[1] * mvp[0][1] + v[2] * mvp[0][2] + mvp[0][3]; + projectedPoints[i].y = v[0] * mvp[1][0] + v[1] * mvp[1][1] + v[2] * mvp[1][2] + mvp[1][3]; + projectedPoints[i].z = v[0] * mvp[2][0] + v[1] * mvp[2][1] + v[2] * mvp[2][2] +
mvp[2][3]; + projectedPoints[i].w = v[0] * mvp[3][0] + v[1] * mvp[3][1] + v[2] * mvp[3][2] + mvp[3][3]; + } + + const idVec4& p0 = projectedPoints[0]; + const idVec4& p1 = projectedPoints[1]; + const idVec4& p2 = projectedPoints[2]; + const idVec4& p3 = projectedPoints[3]; + const idVec4& p4 = projectedPoints[4]; + const idVec4& p5 = projectedPoints[5]; + const idVec4& p6 = projectedPoints[6]; + const idVec4& p7 = projectedPoints[7]; + +#if defined( CLIP_SPACE_D3D ) // the D3D near plane is at Z=0 instead of Z=-1 + const float d0 = p0.z; + const float d1 = p1.z; + const float d2 = p2.z; + const float d3 = p3.z; + const float d4 = p4.z; + const float d5 = p5.z; + const float d6 = p6.z; + const float d7 = p7.z; +#else + const float d0 = p0.z + p0.w; + const float d1 = p1.z + p1.w; + const float d2 = p2.z + p2.w; + const float d3 = p3.z + p3.w; + const float d4 = p4.z + p4.w; + const float d5 = p5.z + p5.w; + const float d6 = p6.z + p6.w; + const float d7 = p7.z + p7.w; +#endif + + const float deltaA = d0 - d1; + const float deltaB = d1 - d2; + const float deltaC = d2 - d3; + const float deltaD = d3 - d0; + + const float deltaE = d4 - d5; + const float deltaF = d5 - d6; + const float deltaG = d6 - d7; + const float deltaH = d7 - d4; + + const float deltaI = d0 - d4; + const float deltaJ = d1 - d5; + const float deltaK = d2 - d6; + const float deltaL = d3 - d7; + + const float fractionA = ( fabs( deltaA ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d0 / deltaA ) : 0.0f; + const float fractionB = ( fabs( deltaB ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d1 / deltaB ) : 0.0f; + const float fractionC = ( fabs( deltaC ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d2 / deltaC ) : 0.0f; + const float fractionD = ( fabs( deltaD ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d3 / deltaD ) : 0.0f; + + const float fractionE = ( fabs( deltaE ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? 
( d4 / deltaE ) : 0.0f; + const float fractionF = ( fabs( deltaF ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d5 / deltaF ) : 0.0f; + const float fractionG = ( fabs( deltaG ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d6 / deltaG ) : 0.0f; + const float fractionH = ( fabs( deltaH ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d7 / deltaH ) : 0.0f; + + const float fractionI = ( fabs( deltaI ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d0 / deltaI ) : 0.0f; + const float fractionJ = ( fabs( deltaJ ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d1 / deltaJ ) : 0.0f; + const float fractionK = ( fabs( deltaK ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d2 / deltaK ) : 0.0f; + const float fractionL = ( fabs( deltaL ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d3 / deltaL ) : 0.0f; + + const bool clipA = ( fractionA > 0.0f && fractionA < 1.0f ); + const bool clipB = ( fractionB > 0.0f && fractionB < 1.0f ); + const bool clipC = ( fractionC > 0.0f && fractionC < 1.0f ); + const bool clipD = ( fractionD > 0.0f && fractionD < 1.0f ); + + const bool clipE = ( fractionE > 0.0f && fractionE < 1.0f ); + const bool clipF = ( fractionF > 0.0f && fractionF < 1.0f ); + const bool clipG = ( fractionG > 0.0f && fractionG < 1.0f ); + const bool clipH = ( fractionH > 0.0f && fractionH < 1.0f ); + + const bool clipI = ( fractionI > 0.0f && fractionI < 1.0f ); + const bool clipJ = ( fractionJ > 0.0f && fractionJ < 1.0f ); + const bool clipK = ( fractionK > 0.0f && fractionK < 1.0f ); + const bool clipL = ( fractionL > 0.0f && fractionL < 1.0f ); + + const idVec4 intersectionA = p0 + fractionA * ( p1 - p0 ); + const idVec4 intersectionB = p1 + fractionB * ( p2 - p1 ); + const idVec4 intersectionC = p2 + fractionC * ( p3 - p2 ); + const idVec4 intersectionD = p3 + fractionD * ( p0 - p3 ); + + const idVec4 intersectionE = p4 + fractionE * ( p5 - p4 ); + const idVec4 intersectionF = p5 + fractionF * ( p6 - p5 ); + const idVec4 intersectionG = p6 + fractionG * ( p7 - p6 ); + const idVec4 intersectionH = 
p7 + fractionH * ( p4 - p7 ); + + const idVec4 intersectionI = p0 + fractionI * ( p4 - p0 ); + const idVec4 intersectionJ = p1 + fractionJ * ( p5 - p1 ); + const idVec4 intersectionK = p2 + fractionK * ( p6 - p2 ); + const idVec4 intersectionL = p3 + fractionL * ( p7 - p3 ); + + idVec4 edgeVerts[24]; + + edgeVerts[ 0] = ( clipA && d0 < 0.0f ) ? intersectionA : p0; + edgeVerts[ 2] = ( clipB && d1 < 0.0f ) ? intersectionB : p1; + edgeVerts[ 4] = ( clipC && d2 < 0.0f ) ? intersectionC : p2; + edgeVerts[ 6] = ( clipD && d3 < 0.0f ) ? intersectionD : p3; + + edgeVerts[ 1] = ( clipA && d1 < 0.0f ) ? intersectionA : p1; + edgeVerts[ 3] = ( clipB && d2 < 0.0f ) ? intersectionB : p2; + edgeVerts[ 5] = ( clipC && d3 < 0.0f ) ? intersectionC : p3; + edgeVerts[ 7] = ( clipD && d0 < 0.0f ) ? intersectionD : p0; + + edgeVerts[ 8] = ( clipE && d4 < 0.0f ) ? intersectionE : p4; + edgeVerts[10] = ( clipF && d5 < 0.0f ) ? intersectionF : p5; + edgeVerts[12] = ( clipG && d6 < 0.0f ) ? intersectionG : p6; + edgeVerts[14] = ( clipH && d7 < 0.0f ) ? intersectionH : p7; + + edgeVerts[ 9] = ( clipE && d5 < 0.0f ) ? intersectionE : p5; + edgeVerts[11] = ( clipF && d6 < 0.0f ) ? intersectionF : p6; + edgeVerts[13] = ( clipG && d7 < 0.0f ) ? intersectionG : p7; + edgeVerts[15] = ( clipH && d4 < 0.0f ) ? intersectionH : p4; + + edgeVerts[16] = ( clipI && d0 < 0.0f ) ? intersectionI : p0; + edgeVerts[18] = ( clipJ && d1 < 0.0f ) ? intersectionJ : p1; + edgeVerts[20] = ( clipK && d2 < 0.0f ) ? intersectionK : p2; + edgeVerts[22] = ( clipL && d3 < 0.0f ) ? intersectionL : p3; + + edgeVerts[17] = ( clipI && d4 < 0.0f ) ? intersectionI : p4; + edgeVerts[19] = ( clipJ && d5 < 0.0f ) ? intersectionJ : p5; + edgeVerts[21] = ( clipK && d6 < 0.0f ) ? intersectionK : p6; + edgeVerts[23] = ( clipL && d7 < 0.0f ) ? 
intersectionL : p7; + + idBounds projBnds; + for( int i = 0; i < 3; i++ ) + { + projBnds[0][i] = RENDER_MATRIX_INFINITY; + projBnds[1][i] = - RENDER_MATRIX_INFINITY; + } + + for( int i = 0; i < 24; i++ ) + { + const idVec4& v = edgeVerts[i]; + + if( v.w <= idMath::FLT_SMALLEST_NON_DENORMAL ) + { + continue; + } + + const float rw = 1.0f / v.w; + + const float px = v.x * rw; + const float py = v.y * rw; + const float pz = v.z * rw; + + projBnds[0][0] = Min( projBnds[0][0], px ); + projBnds[0][1] = Min( projBnds[0][1], py ); + projBnds[0][2] = Min( projBnds[0][2], pz ); + + projBnds[1][0] = Max( projBnds[1][0], px ); + projBnds[1][1] = Max( projBnds[1][1], py ); + projBnds[1][2] = Max( projBnds[1][2], pz ); + } + + if( windowSpace ) + { + // convert to window coords + projBnds[0][0] = projBnds[0][0] * 0.5f + 0.5f; + projBnds[1][0] = projBnds[1][0] * 0.5f + 0.5f; + + projBnds[0][1] = projBnds[0][1] * 0.5f + 0.5f; + projBnds[1][1] = projBnds[1][1] * 0.5f + 0.5f; + +#if !defined( CLIP_SPACE_D3D ) // the D3D clip space Z is already in the range [0,1] + projBnds[0][2] = projBnds[0][2] * 0.5f + 0.5f; + projBnds[1][2] = projBnds[1][2] * 0.5f + 0.5f; +#endif + + // clamp to [0, 1] range + projBnds[0][0] = idMath::ClampFloat( 0.0f, 1.0f, projBnds[0][0] ); + projBnds[1][0] = idMath::ClampFloat( 0.0f, 1.0f, projBnds[1][0] ); + + projBnds[0][1] = idMath::ClampFloat( 0.0f, 1.0f, projBnds[0][1] ); + projBnds[1][1] = idMath::ClampFloat( 0.0f, 1.0f, projBnds[1][1] ); + + projBnds[0][2] = idMath::ClampFloat( 0.0f, 1.0f, projBnds[0][2] ); + projBnds[1][2] = idMath::ClampFloat( 0.0f, 1.0f, projBnds[1][2] ); + } + + assert( projected[0].Compare( projBnds[0], 0.01f ) ); + assert( projected[1].Compare( projBnds[1], 0.01f ) ); + } + +#else + + const idVec3 points[8] = + { + idVec3( bounds[0][0], bounds[0][1], bounds[0][2] ), + idVec3( bounds[1][0], bounds[0][1], bounds[0][2] ), + idVec3( bounds[1][0], bounds[1][1], bounds[0][2] ), + idVec3( bounds[0][0], bounds[1][1], bounds[0][2] ), + 
idVec3( bounds[0][0], bounds[0][1], bounds[1][2] ), + idVec3( bounds[1][0], bounds[0][1], bounds[1][2] ), + idVec3( bounds[1][0], bounds[1][1], bounds[1][2] ), + idVec3( bounds[0][0], bounds[1][1], bounds[1][2] ) + }; + + idVec4 projectedPoints[8]; + for( int i = 0; i < 8; i++ ) + { + const idVec3& v = points[i]; + projectedPoints[i].x = v[0] * mvp[0][0] + v[1] * mvp[0][1] + v[2] * mvp[0][2] + mvp[0][3]; + projectedPoints[i].y = v[0] * mvp[1][0] + v[1] * mvp[1][1] + v[2] * mvp[1][2] + mvp[1][3]; + projectedPoints[i].z = v[0] * mvp[2][0] + v[1] * mvp[2][1] + v[2] * mvp[2][2] + mvp[2][3]; + projectedPoints[i].w = v[0] * mvp[3][0] + v[1] * mvp[3][1] + v[2] * mvp[3][2] + mvp[3][3]; + } + + idVec4 edgeVerts[24]; + for( int i = 0; i < 3; i++ ) + { + int offset0 = ( i & 1 ) * 4; + int offset1 = ( i & 1 ) * 4 + ( i & 2 ) * 2; + int offset3 = ~( i >> 1 ) & 1; + for( int j = 0; j < 4; j++ ) + { + const idVec4 p0 = projectedPoints[offset0 + ( ( j + 0 ) & 3 )]; + const idVec4 p1 = projectedPoints[offset1 + ( ( j + offset3 ) & 3 )]; + +#if defined( CLIP_SPACE_D3D ) // the D3D near plane is at Z=0 instead of Z=-1 + const float d0 = p0.z; + const float d1 = p1.z; +#else + const float d0 = p0.z + p0.w; + const float d1 = p1.z + p1.w; +#endif + const float delta = d0 - d1; + const float fraction = idMath::Fabs( delta ) > idMath::FLT_SMALLEST_NON_DENORMAL ? ( d0 / delta ) : 1.0f; + const bool clip = ( fraction > 0.0f && fraction < 1.0f ); + const idVec4 intersection = p0 + fraction * ( p1 - p0 ); + + edgeVerts[i * 8 + j * 2 + 0] = ( clip && d0 < 0.0f ) ? intersection : p0; + edgeVerts[i * 8 + j * 2 + 1] = ( clip && d1 < 0.0f ) ? 
intersection : p1; + } + } + + for( int i = 0; i < 3; i++ ) + { + projected[0][i] = RENDER_MATRIX_INFINITY; + projected[1][i] = - RENDER_MATRIX_INFINITY; + } + + for( int i = 0; i < 24; i++ ) + { + const idVec4& v = edgeVerts[i]; + + if( v.w <= idMath::FLT_SMALLEST_NON_DENORMAL ) + { + continue; + } + + const float rw = 1.0f / v.w; + + const float px = v.x * rw; + const float py = v.y * rw; + const float pz = v.z * rw; + + projected[0][0] = Min( projected[0][0], px ); + projected[0][1] = Min( projected[0][1], py ); + projected[0][2] = Min( projected[0][2], pz ); + + projected[1][0] = Max( projected[1][0], px ); + projected[1][1] = Max( projected[1][1], py ); + projected[1][2] = Max( projected[1][2], pz ); + } + + if( windowSpace ) + { + // convert to window coords + projected[0][0] = projected[0][0] * 0.5f + 0.5f; + projected[1][0] = projected[1][0] * 0.5f + 0.5f; + + projected[0][1] = projected[0][1] * 0.5f + 0.5f; + projected[1][1] = projected[1][1] * 0.5f + 0.5f; + +#if !defined( CLIP_SPACE_D3D ) // the D3D clip space Z is already in the range [0,1] + projected[0][2] = projected[0][2] * 0.5f + 0.5f; + projected[1][2] = projected[1][2] * 0.5f + 0.5f; +#endif + + // clamp to [0, 1] range + projected[0][0] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][0] ); + projected[1][0] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][0] ); + + projected[0][1] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][1] ); + projected[1][1] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][1] ); + + projected[0][2] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][2] ); + projected[1][2] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][2] ); + } + +#endif } #if 0 @@ -2300,6 +3162,7 @@ ClipHomogeneousPolygonToSide Clips a polygon with homogeneous coordinates to the axis aligned plane[axis] = sign * offset. 
======================== */ +#if defined(USE_INTRINSICS) static void ClipHomogeneousPolygonToSide_SSE2( idVec4* __restrict newPoints, idVec4* __restrict points, int& numPoints, const int axis, const __m128& sign, const __m128& offset ) { @@ -2446,6 +3309,97 @@ static int ClipHomogeneousPolygonToUnitCube_SSE2( idVec4* points, int numPoints return numPoints; } +#else + +/* +======================== +ClipHomogeneousLineToSide + +Returns the point on the homogeneous segment p0 -> p1 that lies on the axis aligned plane[axis] = side. +The interpolation fraction is clamped to [0, 1] and defaults to 1 (the p1 end) when both end points +are practically equally far from the plane. +======================== +*/ +static idVec4 ClipHomogeneousLineToSide( const idVec4& p0, const idVec4& p1, int axis, float side ) +{ + // plane distances of both end points, scaled by their homogeneous w + const float dist0 = p0.w * side - p0[axis]; + const float dist1 = p1.w * side - p1[axis]; + const float range = dist0 - dist1; + + // only divide when the range is large enough to give a meaningful fraction + float fraction = 1.0f; + if( idMath::Fabs( range ) > idMath::FLT_SMALLEST_NON_DENORMAL ) + { + fraction = dist0 / range; + } + fraction = idMath::ClampFloat( 0.0f, 1.0f, fraction ); + + return p0 + fraction * ( p1 - p0 ); +} + +/* +======================== +ClipHomogeneousPolygonToSide + +Clips a polygon with homogeneous coordinates to the axis aligned plane[axis] = sign * offset.
+======================== +*/ +static int ClipHomogeneousPolygonToSide_Generic( idVec4* __restrict newPoints, idVec4* __restrict points, int numPoints, int axis, float sign, float offset ) +{ + // newPoints receives the clipped polygon; it needs room for 2 * numPoints entries because every + // input point temporarily contributes itself plus one edge intersection. + // Returns the number of points written to newPoints. + assert( newPoints != points ); + + assert( numPoints < 16 ); + + // nothing left to clip; this also avoids reading the never written sides[0] further down + if( numPoints <= 0 ) + { + return 0; + } + + int sides[16]; + + const float side = sign * offset; + + // calculate the plane side for each original point and calculate all potential new points + for( int i = 0; i < numPoints; i++ ) + { + // index of the next point, wrapping around to the first one; written as a plain compare + // because right shifting a negative int is implementation defined + const int j = ( i + 1 < numPoints ) ? ( i + 1 ) : 0; + sides[i] = sign * points[i][axis] < offset * points[i].w; + newPoints[i * 2 + 0] = points[i]; + newPoints[i * 2 + 1] = ClipHomogeneousLineToSide( points[i], points[j], axis, side ); + } + + // repeat the first side at the end to avoid having to wrap around + sides[numPoints] = sides[0]; + + // compact the array of points: keep every point flagged in sides[] and the intersection + // of every edge whose two end points have different flags + int numNewPoints = 0; + for( int i = 0; i < numPoints; i++ ) + { + if( sides[i + 0] != 0 ) + { + newPoints[numNewPoints++] = newPoints[i * 2 + 0]; + } + if( ( sides[i + 0] ^ sides[i + 1] ) != 0 ) + { + newPoints[numNewPoints++] = newPoints[i * 2 + 1]; + } + } + + assert( numNewPoints <= 16 ); + return numNewPoints; +} + +/* +======================== +ClipHomogeneousPolygonToUnitCube + +Clips a polygon with homogeneous coordinates to all six axis aligned unit cube planes.
+======================== +*/ +static int ClipHomogeneousPolygonToUnitCube_Generic( idVec4* points, int numPoints ) +{ + assert( numPoints < 16 - 6 ); + ALIGNTYPE16 idVec4 newPoints[2 * 16]; // the C clip code temporarily doubles the points + + // The six passes below alternate between 'points' and 'newPoints' as source and destination. + // Because the number of passes is even, the final clipped polygon ends up back in 'points' + // and the returned value is its point count. + // NOTE(review): the 'numPoints < 16 - 6' assert above presumably reserves one extra point per clip plane - confirm +#if defined( CLIP_SPACE_D3D ) // the D3D near plane is at Z=0 instead of Z=-1 + numPoints = ClipHomogeneousPolygonToSide_Generic( newPoints, points, numPoints, 2, -1.0f, 0.0f ); // near +#else + numPoints = ClipHomogeneousPolygonToSide_Generic( newPoints, points, numPoints, 2, -1.0f, 1.0f ); // near +#endif + numPoints = ClipHomogeneousPolygonToSide_Generic( points, newPoints, numPoints, 2, +1.0f, 1.0f ); // far + numPoints = ClipHomogeneousPolygonToSide_Generic( newPoints, points, numPoints, 1, -1.0f, 1.0f ); // bottom + numPoints = ClipHomogeneousPolygonToSide_Generic( points, newPoints, numPoints, 1, +1.0f, 1.0f ); // top + numPoints = ClipHomogeneousPolygonToSide_Generic( newPoints, points, numPoints, 0, -1.0f, 1.0f ); // left + numPoints = ClipHomogeneousPolygonToSide_Generic( points, newPoints, numPoints, 0, +1.0f, 1.0f ); // right + return numPoints; +} + +#endif /* ======================== @@ -2465,6 +3419,7 @@ the given bounds in which case the projected bounds should be set to fully cover */ void idRenderMatrix::ProjectedFullyClippedBounds( idBounds& projected, const idRenderMatrix& mvp, const idBounds& bounds, bool windowSpace ) { +#if defined(USE_INTRINSICS) const __m128 mvp0 = _mm_loadu_ps( mvp[0] ); const __m128 mvp1 = _mm_loadu_ps( mvp[1] ); @@ -2619,6 +3574,102 @@ void idRenderMatrix::ProjectedFullyClippedBounds( idBounds& projected, const idR _mm_store_ss( & projected[1].y, maxY ); _mm_store_ss( & projected[1].z, maxZ ); +#else + + const idVec3 points[8] = + { + idVec3( bounds[0][0], bounds[0][1], bounds[0][2] ), + idVec3( bounds[1][0], bounds[0][1], bounds[0][2] ), + idVec3( bounds[1][0], bounds[1][1], bounds[0][2] ), + idVec3( bounds[0][0], bounds[1][1], bounds[0][2] ), + idVec3( bounds[0][0], bounds[0][1],
bounds[1][2] ), + idVec3( bounds[1][0], bounds[0][1], bounds[1][2] ), + idVec3( bounds[1][0], bounds[1][1], bounds[1][2] ), + idVec3( bounds[0][0], bounds[1][1], bounds[1][2] ) + }; + + idVec4 projectedPoints[8]; + for( int i = 0; i < 8; i++ ) + { + const idVec3& v = points[i]; + projectedPoints[i].x = v[0] * mvp[0][0] + v[1] * mvp[0][1] + v[2] * mvp[0][2] + mvp[0][3]; + projectedPoints[i].y = v[0] * mvp[1][0] + v[1] * mvp[1][1] + v[2] * mvp[1][2] + mvp[1][3]; + projectedPoints[i].z = v[0] * mvp[2][0] + v[1] * mvp[2][1] + v[2] * mvp[2][2] + mvp[2][3]; + projectedPoints[i].w = v[0] * mvp[3][0] + v[1] * mvp[3][1] + v[2] * mvp[3][2] + mvp[3][3]; + } + + idVec4 clippedPoints[6 * 16]; + int numClippedPoints = 0; + for( int i = 0; i < 6; i++ ) + { + clippedPoints[numClippedPoints + 0] = projectedPoints[boxPolygonVertices[i][0]]; + clippedPoints[numClippedPoints + 1] = projectedPoints[boxPolygonVertices[i][1]]; + clippedPoints[numClippedPoints + 2] = projectedPoints[boxPolygonVertices[i][2]]; + clippedPoints[numClippedPoints + 3] = projectedPoints[boxPolygonVertices[i][3]]; + numClippedPoints += ClipHomogeneousPolygonToUnitCube_Generic( &clippedPoints[numClippedPoints], 4 ); + } + + // test if the center of the near clip plane is inside the given bounding box + const idVec3 localNearClipCenter = LocalNearClipCenterFromMVP( mvp ); + const bool inside = bounds.Expand( RENDER_MATRIX_PROJECTION_EPSILON ).ContainsPoint( localNearClipCenter ); + + for( int i = 0; i < 3; i++ ) + { + projected[0][i] = RENDER_MATRIX_INFINITY; + projected[1][i] = - RENDER_MATRIX_INFINITY; + } + if( inside ) + { + projected[0][2] = -1.0f; + } + + for( int i = 0; i < numClippedPoints; i++ ) + { + const idVec4& c = clippedPoints[i]; + + assert( c.w > idMath::FLT_SMALLEST_NON_DENORMAL ); + + const float rw = 1.0f / c.w; + + const float px = c.x * rw; + const float py = c.y * rw; + const float pz = c.z * rw; + + projected[0][0] = Min( projected[0][0], px ); + projected[0][1] = Min( projected[0][1], py 
); + projected[0][2] = Min( projected[0][2], pz ); + + projected[1][0] = Max( projected[1][0], px ); + projected[1][1] = Max( projected[1][1], py ); + projected[1][2] = Max( projected[1][2], pz ); + } + + if( windowSpace ) + { + // convert to window coords + projected[0][0] = projected[0][0] * 0.5f + 0.5f; + projected[1][0] = projected[1][0] * 0.5f + 0.5f; + + projected[0][1] = projected[0][1] * 0.5f + 0.5f; + projected[1][1] = projected[1][1] * 0.5f + 0.5f; + +#if !defined( CLIP_SPACE_D3D ) // the D3D clip space Z is already in the range [0,1] + projected[0][2] = projected[0][2] * 0.5f + 0.5f; + projected[1][2] = projected[1][2] * 0.5f + 0.5f; +#endif + + // clamp to [0, 1] range + projected[0][0] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][0] ); + projected[1][0] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][0] ); + + projected[0][1] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][1] ); + projected[1][1] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][1] ); + + projected[0][2] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][2] ); + projected[1][2] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][2] ); + } + +#endif } /* @@ -2633,6 +3684,7 @@ The given bounding box is not clipped to the MVP so the depth bounds may not be */ void idRenderMatrix::DepthBoundsForBounds( float& min, float& max, const idRenderMatrix& mvp, const idBounds& bounds, bool windowSpace ) { +#if defined(USE_INTRINSICS) __m128 mvp2 = _mm_loadu_ps( mvp[2] ); __m128 mvp3 = _mm_loadu_ps( mvp[3] ); @@ -2700,6 +3752,53 @@ void idRenderMatrix::DepthBoundsForBounds( float& min, float& max, const idRende _mm_store_ss( & min, minv ); _mm_store_ss( & max, maxv ); +#else + + float localMin = RENDER_MATRIX_INFINITY; + float localMax = - RENDER_MATRIX_INFINITY; + + idVec3 v; + for( int x = 0; x < 2; x++ ) + { + v[0] = bounds[x][0]; + for( int y = 0; y < 2; y++ ) + { + v[1] = bounds[y][1]; + for( int z = 0; z < 2; z++ ) + { + v[2] = bounds[z][2]; + + float tz = v[0] * mvp[2][0] + v[1] * mvp[2][1] + v[2] * 
mvp[2][2] + mvp[2][3]; + float tw = v[0] * mvp[3][0] + v[1] * mvp[3][1] + v[2] * mvp[3][2] + mvp[3][3]; + + if( tw > idMath::FLT_SMALLEST_NON_DENORMAL ) + { + tz = tz / tw; + } + else + { + tz = -RENDER_MATRIX_INFINITY; + } + + localMin = Min( localMin, tz ); + localMax = Max( localMax, tz ); + } + } + } + + if( windowSpace ) + { + // convert to window coords +#if !defined( CLIP_SPACE_D3D ) // the D3D clip space Z is already in the range [0,1] + localMin = localMin * 0.5f + 0.5f; + localMax = localMax * 0.5f + 0.5f; +#endif + // clamp to the [0, 1] range + localMin = Max( localMin, 0.0f ); + localMax = Min( localMax, 1.0f ); + } + + // always write the results back, like the intrinsics path which stores min / max unconditionally; + // previously the outputs were left untouched when windowSpace was false and were read before + // being written when CLIP_SPACE_D3D was defined + min = localMin; + max = localMax; + +#endif } /* @@ -2717,6 +3816,7 @@ void idRenderMatrix::DepthBoundsForExtrudedBounds( float& min, float& max, const { assert( idMath::Fabs( extrudeDirection * clipPlane.Normal() ) >= idMath::FLT_SMALLEST_NON_DENORMAL ); +#if defined(USE_INTRINSICS) __m128 mvp2 = _mm_loadu_ps( mvp[2] ); __m128 mvp3 = _mm_loadu_ps( mvp[3] ); @@ -2872,6 +3972,71 @@ void idRenderMatrix::DepthBoundsForExtrudedBounds( float& min, float& max, const _mm_store_ss( & min, minv ); _mm_store_ss( & max, maxv ); +#else + + const float closing = extrudeDirection * clipPlane.Normal(); + const float invClosing = -1.0f / closing; + + float localMin = RENDER_MATRIX_INFINITY; + float localMax = - RENDER_MATRIX_INFINITY; + + idVec3 v; + for( int x = 0; x < 2; x++ ) + { + v[0] = bounds[x][0]; + for( int y = 0; y < 2; y++ ) + { + v[1] = bounds[y][1]; + for( int z = 0; z < 2; z++ ) + { + v[2] = bounds[z][2]; + + for( int extrude = 0; extrude <= 1; extrude++ ) + { + + idVec3 test; + if( extrude ) + { + float extrudeDist = clipPlane.Distance( v ) * invClosing; + test = v + extrudeDirection * extrudeDist; + } + else + { + test = v; + } + + float tz = test[0] * mvp[2][0] + test[1] * mvp[2][1] + test[2] * mvp[2][2] + mvp[2][3]; + float tw = test[0] * mvp[3][0] + test[1] * mvp[3][1] + test[2] * mvp[3][2] + mvp[3][3]; + + if( tw > idMath::FLT_SMALLEST_NON_DENORMAL ) + { + tz = tz / tw; + } + else + { + tz =
-RENDER_MATRIX_INFINITY; + } + + localMin = Min( localMin, tz ); + localMax = Max( localMax, tz ); + } + } + } + } + + if( windowSpace ) + { + // convert to window coords +#if !defined( CLIP_SPACE_D3D ) // the D3D clip space Z is already in the range [0,1] + localMin = localMin * 0.5f + 0.5f; + localMax = localMax * 0.5f + 0.5f; +#endif + // clamp to the [0, 1] range + localMin = Max( localMin, 0.0f ); + localMax = Min( localMax, 1.0f ); + } + + // always write the results back, like the intrinsics path which stores min / max unconditionally; + // previously the outputs were left untouched when windowSpace was false and were read before + // being written when CLIP_SPACE_D3D was defined + min = localMin; + max = localMax; + +#endif } /* @@ -2932,6 +4097,7 @@ testing if the center of the far clipping plane is contained inside the shadow v */ void idRenderMatrix::DepthBoundsForShadowBounds( float& min, float& max, const idRenderMatrix& mvp, const idBounds& bounds, const idVec3& localLightOrigin, bool windowSpace ) { +#if defined(USE_INTRINSICS) const __m128 mvp0 = _mm_loadu_ps( mvp[0] ); const __m128 mvp1 = _mm_loadu_ps( mvp[1] ); @@ -3112,6 +4278,116 @@ void idRenderMatrix::DepthBoundsForShadowBounds( float& min, float& max, const i _mm_store_ss( & min, minZ ); _mm_store_ss( & max, maxZ ); +#else + + const idVec3 points[8] = + { + idVec3( bounds[0][0], bounds[0][1], bounds[0][2] ), + idVec3( bounds[1][0], bounds[0][1], bounds[0][2] ), + idVec3( bounds[1][0], bounds[1][1], bounds[0][2] ), + idVec3( bounds[0][0], bounds[1][1], bounds[0][2] ), + idVec3( bounds[0][0], bounds[0][1], bounds[1][2] ), + idVec3( bounds[1][0], bounds[0][1], bounds[1][2] ), + idVec3( bounds[1][0], bounds[1][1], bounds[1][2] ), + idVec3( bounds[0][0], bounds[1][1], bounds[1][2] ) + }; + + // calculate the front facing polygon bits + int frontBits = GetBoxFrontBits_Generic( bounds, localLightOrigin ); + + // bounding box corners + ALIGNTYPE16 idVec4 projectedNearPoints[8]; + for( int i = 0; i < 8; i++ ) + { + const idVec3& v = points[i]; + projectedNearPoints[i].x = v[0] * mvp[0][0] + v[1] * mvp[0][1] + v[2] * mvp[0][2] + mvp[0][3]; + projectedNearPoints[i].y = v[0] * mvp[1][0] + v[1] * mvp[1][1] + v[2] * mvp[1][2] + mvp[1][3]; + projectedNearPoints[i].z = v[0] * mvp[2][0] + v[1] * mvp[2][1] + v[2] *
mvp[2][2] + mvp[2][3]; + projectedNearPoints[i].w = v[0] * mvp[3][0] + v[1] * mvp[3][1] + v[2] * mvp[3][2] + mvp[3][3]; + } + + // bounding box corners projected to infinity from the light position + ALIGNTYPE16 idVec4 projectedFarPoints[8]; + for( int i = 0; i < 8; i++ ) + { + const idVec3 v = points[i] - localLightOrigin; + projectedFarPoints[i].x = v[0] * mvp[0][0] + v[1] * mvp[0][1] + v[2] * mvp[0][2]; + projectedFarPoints[i].y = v[0] * mvp[1][0] + v[1] * mvp[1][1] + v[2] * mvp[1][2]; + projectedFarPoints[i].z = v[0] * mvp[2][0] + v[1] * mvp[2][1] + v[2] * mvp[2][2]; + projectedFarPoints[i].w = v[0] * mvp[3][0] + v[1] * mvp[3][1] + v[2] * mvp[3][2]; + } + + ALIGNTYPE16 idVec4 clippedPoints[( 6 + 12 ) * 16]; + int numClippedPoints = 0; + + // clip the front facing bounding box polygons at the near cap + const frontPolygons_t& frontPolygons = boxFrontPolygonsForFrontBits[frontBits]; + for( int i = 0; i < frontPolygons.count; i++ ) + { + const int polygon = frontPolygons.indices[i]; + clippedPoints[numClippedPoints + 0] = projectedNearPoints[boxPolygonVertices[polygon][0]]; + clippedPoints[numClippedPoints + 1] = projectedNearPoints[boxPolygonVertices[polygon][1]]; + clippedPoints[numClippedPoints + 2] = projectedNearPoints[boxPolygonVertices[polygon][2]]; + clippedPoints[numClippedPoints + 3] = projectedNearPoints[boxPolygonVertices[polygon][3]]; + numClippedPoints += ClipHomogeneousPolygonToUnitCube_Generic( &clippedPoints[numClippedPoints], 4 ); + } + + // clip the front facing bounding box polygons projected to the far cap + for( int i = 0; i < frontPolygons.count; i++ ) + { + const int polygon = frontPolygons.indices[i]; + clippedPoints[numClippedPoints + 0] = projectedFarPoints[boxPolygonVertices[polygon][0]]; + clippedPoints[numClippedPoints + 1] = projectedFarPoints[boxPolygonVertices[polygon][1]]; + clippedPoints[numClippedPoints + 2] = projectedFarPoints[boxPolygonVertices[polygon][2]]; + clippedPoints[numClippedPoints + 3] = 
projectedFarPoints[boxPolygonVertices[polygon][3]]; + numClippedPoints += ClipHomogeneousPolygonToUnitCube_Generic( &clippedPoints[numClippedPoints], 4 ); + } + + // clip the silhouette edge polygons that stretch to infinity + const silhouetteEdges_t& silhouetteEdges = boxSilhouetteEdgesForFrontBits[frontBits]; + for( int i = 0; i < silhouetteEdges.count; i++ ) + { + const int edge = silhouetteEdges.indices[i]; + clippedPoints[numClippedPoints + 0] = projectedNearPoints[boxEdgeVertices[edge][0]]; + clippedPoints[numClippedPoints + 1] = projectedNearPoints[boxEdgeVertices[edge][1]]; + clippedPoints[numClippedPoints + 2] = projectedFarPoints[boxEdgeVertices[edge][1]]; + clippedPoints[numClippedPoints + 3] = projectedFarPoints[boxEdgeVertices[edge][0]]; + numClippedPoints += ClipHomogeneousPolygonToUnitCube_Generic( &clippedPoints[numClippedPoints], 4 ); + } + + // test if the center of the near clip plane is inside the infinite shadow volume + const idVec3 localNearClipCenter = LocalNearClipCenterFromMVP( mvp ); + const bool inside = PointInsideInfiniteShadow( bounds, localLightOrigin, localNearClipCenter, RENDER_MATRIX_PROJECTION_EPSILON ); + + min = inside ? 
-1.0f : RENDER_MATRIX_INFINITY; + max = - RENDER_MATRIX_INFINITY; + + for( int i = 0; i < numClippedPoints; i++ ) + { + const idVec4& c = clippedPoints[i]; + + assert( c.w > idMath::FLT_SMALLEST_NON_DENORMAL ); + + const float rw = 1.0f / c.w; + const float pz = c.z * rw; + + min = Min( min, pz ); + max = Max( max, pz ); + } + + if( windowSpace ) + { + // convert to window coords +#if !defined( CLIP_SPACE_D3D ) // the D3D clip space Z is already in the range [0,1] + min = min * 0.5f + 0.5f; + max = max * 0.5f + 0.5f; +#endif + // clamp to [0, 1] range + min = idMath::ClampFloat( 0.0f, 1.0f, min ); + max = idMath::ClampFloat( 0.0f, 1.0f, max ); + } + +#endif } /* @@ -3212,6 +4488,7 @@ void idRenderMatrix::GetFrustumCorners( frustumCorners_t& corners, const idRende { assert_16_byte_aligned( &corners ); +#if defined(USE_INTRINSICS) __m128 mvp0 = _mm_loadu_ps( frustumTransform[0] ); __m128 mvp1 = _mm_loadu_ps( frustumTransform[1] ); @@ -3284,6 +4561,36 @@ void idRenderMatrix::GetFrustumCorners( frustumCorners_t& corners, const idRende _mm_store_ps( corners.z + 0, z0 ); _mm_store_ps( corners.z + 4, z1 ); +#else + + idVec3 v; + for( int x = 0; x < 2; x++ ) + { + v[0] = frustumBounds[x][0]; + for( int y = 0; y < 2; y++ ) + { + v[1] = frustumBounds[y][1]; + for( int z = 0; z < 2; z++ ) + { + v[2] = frustumBounds[z][2]; + + float tx = v[0] * frustumTransform[0][0] + v[1] * frustumTransform[0][1] + v[2] * frustumTransform[0][2] + frustumTransform[0][3]; + float ty = v[0] * frustumTransform[1][0] + v[1] * frustumTransform[1][1] + v[2] * frustumTransform[1][2] + frustumTransform[1][3]; + float tz = v[0] * frustumTransform[2][0] + v[1] * frustumTransform[2][1] + v[2] * frustumTransform[2][2] + frustumTransform[2][3]; + float tw = v[0] * frustumTransform[3][0] + v[1] * frustumTransform[3][1] + v[2] * frustumTransform[3][2] + frustumTransform[3][3]; + + assert( tw > idMath::FLT_SMALLEST_NON_DENORMAL ); + + float rw = 1.0f / tw; + + corners.x[( z << 2 ) | ( y << 1 ) | ( x << 0 )] 
= tx * rw; + corners.y[( z << 2 ) | ( y << 1 ) | ( x << 0 )] = ty * rw; + corners.z[( z << 2 ) | ( y << 1 ) | ( x << 0 )] = tz * rw; + } + } + } + +#endif } /* @@ -3295,6 +4602,7 @@ frustumCull_t idRenderMatrix::CullFrustumCornersToPlane( const frustumCorners_t& { assert_16_byte_aligned( &corners ); +#if defined(USE_INTRINSICS) __m128 vp = _mm_loadu_ps( plane.ToFloatPtr() ); @@ -3326,4 +4634,34 @@ frustumCull_t idRenderMatrix::CullFrustumCornersToPlane( const frustumCorners_t& return ( frustumCull_t )( front | ( back << 1 ) ); +#else + + bool front = false; + bool back = false; + for( int i = 0; i < 8; i++ ) + { + const float d = corners.x[i] * plane[0] + corners.y[i] * plane[1] + corners.z[i] * plane[2] + plane[3]; + if( d >= 0.0f ) + { + front = true; + } + else if( d <= 0.0f ) + { + back = true; + } + if( back && front ) + { + return FRUSTUM_CULL_CROSS; + } + } + if( front ) + { + return FRUSTUM_CULL_FRONT; + } + else + { + return FRUSTUM_CULL_BACK; + } + +#endif } diff --git a/neo/idlib/math/Lcp.cpp b/neo/idlib/math/Lcp.cpp index 09976247..eb8c5354 100644 --- a/neo/idlib/math/Lcp.cpp +++ b/neo/idlib/math/Lcp.cpp @@ -44,8 +44,11 @@ const float LCP_DELTA_FORCE_EPSILON = 1e-9f; #define IGNORE_UNSATISFIABLE_VARIABLES +#if defined(USE_INTRINSICS) +#define LCP_SIMD +#endif - +#if defined(LCP_SIMD) ALIGN16( const __m128 SIMD_SP_zero ) = { 0.0f, 0.0f, 0.0f, 0.0f }; ALIGN16( const __m128 SIMD_SP_one ) = { 1.0f, 1.0f, 1.0f, 1.0f }; ALIGN16( const __m128 SIMD_SP_two ) = { 2.0f, 2.0f, 2.0f, 2.0f }; @@ -70,7 +73,7 @@ ALIGN16( const unsigned int SIMD_DW_one[4] ) = { 1, 1, 1, 1 }; ALIGN16( const unsigned int SIMD_DW_four[4] ) = { 4, 4, 4, 4 }; ALIGN16( const unsigned int SIMD_DW_index[4] ) = { 0, 1, 2, 3 }; ALIGN16( const int SIMD_DW_not3[4] ) = { ~3, ~3, ~3, ~3 }; - +#endif // #if defined(LCP_SIMD) /* ======================== Multiply_SIMD @@ -91,6 +94,7 @@ static void Multiply_SIMD( float* dst, const float* src0, const float* src1, con dst[i] = src0[i] * src1[i]; } +#if 
defined(LCP_SIMD) for( ; i + 4 <= count; i += 4 ) { @@ -104,6 +108,21 @@ static void Multiply_SIMD( float* dst, const float* src0, const float* src1, con _mm_store_ps( dst + i, s0 ); } +#else + + for( ; i + 4 <= count; i += 4 ) + { + assert_16_byte_aligned( &dst[i] ); + assert_16_byte_aligned( &src0[i] ); + assert_16_byte_aligned( &src1[i] ); + + dst[i + 0] = src0[i + 0] * src1[i + 0]; + dst[i + 1] = src0[i + 1] * src1[i + 1]; + dst[i + 2] = src0[i + 2] * src1[i + 2]; + dst[i + 3] = src0[i + 3] * src1[i + 3]; + } + +#endif for( ; i < count; i++ ) { @@ -124,6 +143,7 @@ static void MultiplyAdd_SIMD( float* dst, const float constant, const float* src { int i = 0; + // RB: changed unsigned int to uintptr_t for( ; ( ( uintptr_t )dst & 0xF ) != 0 && i < count; i++ ) // RB end @@ -131,6 +151,7 @@ static void MultiplyAdd_SIMD( float* dst, const float constant, const float* src dst[i] += constant * src[i]; } +#if defined(LCP_SIMD) __m128 c = _mm_load1_ps( & constant ); for( ; i + 4 <= count; i += 4 ) @@ -144,6 +165,20 @@ static void MultiplyAdd_SIMD( float* dst, const float constant, const float* src _mm_store_ps( dst + i, s ); } +#else + + for( ; i + 4 <= count; i += 4 ) + { + assert_16_byte_aligned( &src[i] ); + assert_16_byte_aligned( &dst[i] ); + + dst[i + 0] += constant * src[i + 0]; + dst[i + 1] += constant * src[i + 1]; + dst[i + 2] += constant * src[i + 2]; + dst[i + 3] += constant * src[i + 3]; + } + +#endif for( ; i < count; i++ ) { @@ -163,7 +198,7 @@ static float DotProduct_SIMD( const float* src0, const float* src1, const int co assert_16_byte_aligned( src0 ); assert_16_byte_aligned( src1 ); -#ifndef _lint +#if defined(LCP_SIMD) __m128 sum = ( __m128& ) SIMD_SP_zero; int i = 0; @@ -328,7 +363,7 @@ static void LowerTriangularSolve_SIMD( const idMatX& L, float* x, const float* b int i = skip; -#ifndef _lint +#if defined(LCP_SIMD) // work up to a multiple of 4 rows for( ; ( i & 3 ) != 0 && i < n; i++ ) @@ -601,7 +636,7 @@ static void 
LowerTriangularSolveTranspose_SIMD( const idMatX& L, float* x, const const float* lptr = L.ToFloatPtr() + m * nc + m - 4; float* xptr = x + m; -#ifndef _lint +#if defined(LCP_SIMD) // process 4 rows at a time for( int i = m; i >= 4; i -= 4 ) @@ -982,7 +1017,7 @@ static bool LDLT_Factor_SIMD( idMatX& mat, idVecX& invDiag, const int n ) mptr[j * nc + 3] = ( mptr[j * nc + 3] - v[0] * mptr[j * nc + 0] - v[1] * mptr[j * nc + 1] - v[2] * mptr[j * nc + 2] ) * d; } -#ifndef _lint +#if defined(LCP_SIMD) __m128 vzero = _mm_setzero_ps(); for( int i = 4; i < n; i += 4 ) @@ -1360,7 +1395,7 @@ static void GetMaxStep_SIMD( const float* f, const float* a, const float* delta_ int d, float dir, float& maxStep, int& limit, int& limitSide ) { - +#if defined(LCP_SIMD) __m128 vMaxStep; __m128i vLimit; __m128i vLimitSide; @@ -1484,6 +1519,117 @@ static void GetMaxStep_SIMD( const float* f, const float* a, const float* delta_ _mm_store_ss( & maxStep, vMaxStep ); limit = _mm_cvtsi128_si32( vLimit ); limitSide = _mm_cvtsi128_si32( vLimitSide ); +#else + int i; + float s; + + // default to a full step for the current variable + if( idMath::Fabs( delta_a[d] ) > LCP_DELTA_ACCEL_EPSILON ) + { + maxStep = -a[d] / delta_a[d]; + } + else + { + maxStep = 0.0f; + } + limit = d; + limitSide = 0; + + // test the current variable + if( dir < 0.0f ) + { + if( lo[d] != -idMath::INFINITY ) + { + s = ( lo[d] - f[d] ) / dir; + if( s < maxStep ) + { + maxStep = s; + limitSide = -1; + } + } + } + else + { + if( hi[d] != idMath::INFINITY ) + { + s = ( hi[d] - f[d] ) / dir; + if( s < maxStep ) + { + maxStep = s; + limitSide = 1; + } + } + } + + // test the clamped bounded variables + for( i = numUnbounded; i < numClamped; i++ ) + { + if( delta_f[i] < -LCP_DELTA_FORCE_EPSILON ) + { + // if there is a low boundary + if( lo[i] != -idMath::INFINITY ) + { + s = ( lo[i] - f[i] ) / delta_f[i]; + if( s < maxStep ) + { + maxStep = s; + limit = i; + limitSide = -1; + } + } + } + else if( delta_f[i] > 
LCP_DELTA_FORCE_EPSILON ) + { + // if there is a high boundary + if( hi[i] != idMath::INFINITY ) + { + s = ( hi[i] - f[i] ) / delta_f[i]; + if( s < maxStep ) + { + maxStep = s; + limit = i; + limitSide = 1; + } + } + } + } + + // test the not clamped bounded variables + for( i = numClamped; i < d; i++ ) + { + if( side[i] == -1 ) + { + if( delta_a[i] >= -LCP_DELTA_ACCEL_EPSILON ) + { + continue; + } + } + else if( side[i] == 1 ) + { + if( delta_a[i] <= LCP_DELTA_ACCEL_EPSILON ) + { + continue; + } + } + else + { + continue; + } + // ignore variables for which the force is not allowed to take any substantial value + if( lo[i] >= -LCP_BOUND_EPSILON && hi[i] <= LCP_BOUND_EPSILON ) + { + continue; + } + s = -a[i] / delta_a[i]; + if( s < maxStep ) + { + maxStep = s; + limit = i; + limitSide = 0; + } + } + +#endif } /* diff --git a/neo/idlib/math/MatX.cpp b/neo/idlib/math/MatX.cpp index 0f936246..e4d009cc 100644 --- a/neo/idlib/math/MatX.cpp +++ b/neo/idlib/math/MatX.cpp @@ -201,6 +201,7 @@ void idMatX::CopyLowerToUpperTriangle() assert( ( GetNumColumns() & 3 ) == 0 ); assert( GetNumColumns() >= GetNumRows() ); +#if defined(USE_INTRINSICS) const int n = GetNumColumns(); const int m = GetNumRows(); @@ -341,6 +342,22 @@ void idMatX::CopyLowerToUpperTriangle() _mm_store_ps( basePtr + n0, r0 ); } +#else + + const int n = GetNumColumns(); + const int m = GetNumRows(); + for( int i = 0; i < m; i++ ) + { + const float* __restrict ptr = ToFloatPtr() + ( i + 1 ) * n + i; + float* __restrict dstPtr = ToFloatPtr() + i * n; + for( int j = i + 1; j < m; j++ ) + { + dstPtr[j] = ptr[0]; + ptr += n; + } + } + +#endif #ifdef _DEBUG for( int i = 0; i < numRows; i++ ) diff --git a/neo/idlib/math/MatX.h b/neo/idlib/math/MatX.h index 6a7583ef..16559f8d 100644 --- a/neo/idlib/math/MatX.h +++ b/neo/idlib/math/MatX.h @@ -46,7 +46,10 @@ NOTE: due to the temporary memory pool idMatX cannot be used by multiple threads #define MATX_CLEAREND() int s = numRows * numColumns; while( s < ( ( s + 3 ) & ~3 
) ) { mat[s++] = 0.0f; } #define MATX_ALLOCA( n ) ( (float *) _alloca16( MATX_QUAD( n ) ) ) #define MATX_ALLOCA_CACHE_LINES( n ) ( (float *) _alloca128( ( ( n ) * sizeof( float ) + CACHE_LINE_SIZE - 1 ) & ~ ( CACHE_LINE_SIZE - 1 ) ) ) + +#if defined(USE_INTRINSICS) #define MATX_SIMD +#endif class idMatX { diff --git a/neo/idlib/math/Math.cpp b/neo/idlib/math/Math.cpp index 3b8e4b3b..c6fa513f 100644 --- a/neo/idlib/math/Math.cpp +++ b/neo/idlib/math/Math.cpp @@ -51,6 +51,7 @@ const float idMath::INFINITY = 1e30f; const float idMath::FLT_EPSILON = 1.192092896e-07f; const float idMath::FLT_SMALLEST_NON_DENORMAL = * reinterpret_cast< const float* >( & SMALLEST_NON_DENORMAL ); // 1.1754944e-038f +#if defined(USE_INTRINSICS) const __m128 idMath::SIMD_SP_zero = { 0.0f, 0.0f, 0.0f, 0.0f }; const __m128 idMath::SIMD_SP_255 = { 255.0f, 255.0f, 255.0f, 255.0f }; const __m128 idMath::SIMD_SP_min_char = { -128.0f, -128.0f, -128.0f, -128.0f }; @@ -61,6 +62,7 @@ const __m128 idMath::SIMD_SP_smallestNonDenorm = { FLT_SMALLEST_NON_DENORMAL, FL const __m128 idMath::SIMD_SP_tiny = { 1e-4f, 1e-4f, 1e-4f, 1e-4f }; const __m128 idMath::SIMD_SP_rsqrt_c0 = { 3.0f, 3.0f, 3.0f, 3.0f }; const __m128 idMath::SIMD_SP_rsqrt_c1 = { -0.5f, -0.5f, -0.5f, -0.5f }; +#endif bool idMath::initialized = false; dword idMath::iSqrt[SQRT_TABLE_SIZE]; // inverse square root lookup table diff --git a/neo/idlib/math/Math.h b/neo/idlib/math/Math.h index 9daf72e4..0870e09b 100644 --- a/neo/idlib/math/Math.h +++ b/neo/idlib/math/Math.h @@ -469,6 +469,7 @@ public: static const float FLT_EPSILON; // smallest positive number such that 1.0+FLT_EPSILON != 1.0 static const float FLT_SMALLEST_NON_DENORMAL; // smallest non-denormal 32-bit floating point value +#if defined(USE_INTRINSICS) static const __m128 SIMD_SP_zero; static const __m128 SIMD_SP_255; static const __m128 SIMD_SP_min_char; @@ -479,6 +480,7 @@ public: static const __m128 SIMD_SP_tiny; static const __m128 SIMD_SP_rsqrt_c0; static const __m128 
SIMD_SP_rsqrt_c1; +#endif private: enum @@ -526,9 +528,7 @@ idMath::InvSqrt16 */ ID_INLINE float idMath::InvSqrt16( float x ) { - return ( x > FLT_SMALLEST_NON_DENORMAL ) ? sqrtf( 1.0f / x ) : INFINITY; - } /* @@ -1321,8 +1321,21 @@ ID_INLINE int idMath::Ftoi( float f ) // If a converted result is larger than the maximum signed doubleword integer, // the floating-point invalid exception is raised, and if this exception is masked, // the indefinite integer value (80000000H) is returned. +#if defined(USE_INTRINSICS) __m128 x = _mm_load_ss( &f ); return _mm_cvttss_si32( x ); +#elif 0 // round chop (C/C++ standard) + int i, s, e, m, shift; + i = *reinterpret_cast( &f ); + s = i >> IEEE_FLT_SIGN_BIT; + e = ( ( i >> IEEE_FLT_MANTISSA_BITS ) & ( ( 1 << IEEE_FLT_EXPONENT_BITS ) - 1 ) ) - IEEE_FLT_EXPONENT_BIAS; + m = ( i & ( ( 1 << IEEE_FLT_MANTISSA_BITS ) - 1 ) ) | ( 1 << IEEE_FLT_MANTISSA_BITS ); + shift = e - IEEE_FLT_MANTISSA_BITS; + return ( ( ( ( m >> -shift ) | ( m << shift ) ) & ~( e >> INT32_SIGN_BIT ) ) ^ s ) - s; +#else + // If a converted result is larger than the maximum signed doubleword integer the result is undefined. + return C_FLOAT_TO_INT( f ); +#endif } /* @@ -1332,10 +1345,24 @@ idMath::Ftoi8 */ ID_INLINE char idMath::Ftoi8( float f ) { +#if defined(USE_INTRINSICS) __m128 x = _mm_load_ss( &f ); x = _mm_max_ss( x, SIMD_SP_min_char ); x = _mm_min_ss( x, SIMD_SP_max_char ); return static_cast( _mm_cvttss_si32( x ) ); +#else + // The converted result is clamped to the range [-128,127]. 
+ int i = C_FLOAT_TO_INT( f ); + if( i < -128 ) + { + return -128; + } + else if( i > 127 ) + { + return 127; + } + return static_cast( i ); +#endif } /* @@ -1345,10 +1372,24 @@ idMath::Ftoi16 */ ID_INLINE short idMath::Ftoi16( float f ) { +#if defined(USE_INTRINSICS) __m128 x = _mm_load_ss( &f ); x = _mm_max_ss( x, SIMD_SP_min_short ); x = _mm_min_ss( x, SIMD_SP_max_short ); return static_cast( _mm_cvttss_si32( x ) ); +#else + // The converted result is clamped to the range [-32768,32767]. + int i = C_FLOAT_TO_INT( f ); + if( i < -32768 ) + { + return -32768; + } + else if( i > 32767 ) + { + return 32767; + } + return static_cast( i ); +#endif } /* @@ -1382,10 +1423,25 @@ ID_INLINE byte idMath::Ftob( float f ) { // If a converted result is negative the value (0) is returned and if the // converted result is larger than the maximum byte the value (255) is returned. + +#if defined(USE_INTRINSICS) __m128 x = _mm_load_ss( &f ); x = _mm_max_ss( x, SIMD_SP_zero ); x = _mm_min_ss( x, SIMD_SP_255 ); return static_cast( _mm_cvttss_si32( x ) ); +#else + // The converted result is clamped to the range [0,255]. + int i = C_FLOAT_TO_INT( f ); + if( i < 0 ) + { + return 0; + } + else if( i > 255 ) + { + return 255; + } + return static_cast( i ); +#endif } /* diff --git a/neo/idlib/math/Simd.cpp b/neo/idlib/math/Simd.cpp index 49deedfd..abd620d7 100644 --- a/neo/idlib/math/Simd.cpp +++ b/neo/idlib/math/Simd.cpp @@ -2,10 +2,10 @@ =========================================================================== Doom 3 BFG Edition GPL Source Code -Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company. +Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company. Copyright (C) 2012 Robert Beckebans -This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code"). +This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code"). 
Doom 3 BFG Edition Source Code is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -33,17 +33,18 @@ If you have questions concerning this license or the applicable additional terms #include "Simd_Generic.h" #include "Simd_SSE.h" -idSIMDProcessor * processor = NULL; // pointer to SIMD processor -idSIMDProcessor * generic = NULL; // pointer to generic SIMD implementation -idSIMDProcessor * SIMDProcessor = NULL; +idSIMDProcessor* processor = NULL; // pointer to SIMD processor +idSIMDProcessor* generic = NULL; // pointer to generic SIMD implementation +idSIMDProcessor* SIMDProcessor = NULL; /* ================ idSIMD::Init ================ */ -void idSIMD::Init() { - generic = new (TAG_MATH) idSIMD_Generic; +void idSIMD::Init() +{ + generic = new( TAG_MATH ) idSIMD_Generic; generic->cpuid = CPUID_GENERIC; processor = NULL; SIMDProcessor = generic; @@ -54,41 +55,54 @@ void idSIMD::Init() { idSIMD::InitProcessor ============ */ -void idSIMD::InitProcessor( const char *module, bool forceGeneric ) { +void idSIMD::InitProcessor( const char* module, bool forceGeneric ) +{ cpuid_t cpuid; - idSIMDProcessor *newProcessor; - + idSIMDProcessor* newProcessor; + cpuid = idLib::sys->GetProcessorId(); - - if ( forceGeneric ) { - + + if( forceGeneric ) + { + newProcessor = generic; - - } else { - - if ( processor == NULL ) { - if ( ( cpuid & CPUID_MMX ) && ( cpuid & CPUID_SSE ) ) { - processor = new (TAG_MATH) idSIMD_SSE; - } else { + + } + else + { + + if( processor == NULL ) + { +#if defined(USE_INTRINSICS) + if( ( cpuid & CPUID_MMX ) && ( cpuid & CPUID_SSE ) ) + { + processor = new( TAG_MATH ) idSIMD_SSE; + } + else +#endif + { processor = generic; } processor->cpuid = cpuid; } - + newProcessor = processor; } - - if ( newProcessor != SIMDProcessor ) { + + if( newProcessor != SIMDProcessor ) + { SIMDProcessor = newProcessor; idLib::common->Printf( "%s using %s for SIMD processing\n", module, 
SIMDProcessor->GetName() ); } - - if ( cpuid & CPUID_FTZ ) { + + if( cpuid & CPUID_FTZ ) + { idLib::sys->FPU_SetFTZ( true ); idLib::common->Printf( "enabled Flush-To-Zero mode\n" ); } - - if ( cpuid & CPUID_DAZ ) { + + if( cpuid & CPUID_DAZ ) + { idLib::sys->FPU_SetDAZ( true ); idLib::common->Printf( "enabled Denormals-Are-Zero mode\n" ); } @@ -99,8 +113,10 @@ void idSIMD::InitProcessor( const char *module, bool forceGeneric ) { idSIMD::Shutdown ================ */ -void idSIMD::Shutdown() { - if ( processor != generic ) { +void idSIMD::Shutdown() +{ + if( processor != generic ) + { delete processor; } delete generic; @@ -122,8 +138,8 @@ void idSIMD::Shutdown() { #define RANDOM_SEED 1013904223L //((int)idLib::sys->GetClockTicks()) -idSIMDProcessor *p_simd; -idSIMDProcessor *p_generic; +idSIMDProcessor* p_simd; +idSIMDProcessor* p_generic; int baseClocks = 0; // DG: use int instead of long for 64bit compatibility #if defined(_MSC_VER) && defined(_M_IX86) @@ -186,19 +202,24 @@ double ticksPerNanosecond; PrintClocks ============ */ -void PrintClocks( const char *string, int dataCount, int clocks, int otherClocks = 0 ) { +void PrintClocks( const char* string, int dataCount, int clocks, int otherClocks = 0 ) +{ int i; - + idLib::common->Printf( string ); - for ( i = idStr::LengthWithoutColors(string); i < 48; i++ ) { - idLib::common->Printf(" "); + for( i = idStr::LengthWithoutColors( string ); i < 48; i++ ) + { + idLib::common->Printf( " " ); } clocks -= baseClocks; - if ( otherClocks && clocks ) { + if( otherClocks && clocks ) + { otherClocks -= baseClocks; - float p = (float)otherClocks / (float)clocks; + float p = ( float )otherClocks / ( float )clocks; idLib::common->Printf( "c = %4d, clcks = %5d, %.1fX\n", dataCount, clocks, p ); - } else { + } + else + { idLib::common->Printf( "c = %4d, clcks = %5d\n", dataCount, clocks ); } } @@ -208,11 +229,13 @@ void PrintClocks( const char *string, int dataCount, int clocks, int otherClocks GetBaseClocks ============ */ -void 
GetBaseClocks() { +void GetBaseClocks() +{ int i, start, end, bestClocks; - + bestClocks = 0; - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { StartRecordTime( start ); StopRecordTime( end ); GetBest( start, end, bestClocks ); @@ -225,7 +248,8 @@ void GetBaseClocks() { TestMinMax ============ */ -void TestMinMax() { +void TestMinMax() +{ int i; TIME_TYPE start, end, bestClocksGeneric, bestClocksSIMD; ALIGN16( float fsrc0[COUNT] ); @@ -236,11 +260,12 @@ void TestMinMax() { float min = 0.0f, max = 0.0f, min2 = 0.0f, max2 = 0.0f; idVec2 v2min, v2max, v2min2, v2max2; idVec3 vmin, vmax, vmin2, vmax2; - const char *result; - + const char* result; + idRandom srnd( RANDOM_SEED ); - - for ( i = 0; i < COUNT; i++ ) { + + for( i = 0; i < COUNT; i++ ) + { fsrc0[i] = srnd.CRandomFloat() * 10.0f; v2src0[i][0] = srnd.CRandomFloat() * 10.0f; v2src0[i][1] = srnd.CRandomFloat() * 10.0f; @@ -250,11 +275,12 @@ void TestMinMax() { drawVerts[i].xyz = v3src0[i]; indexes[i] = i; } - - idLib::common->Printf("====================================\n" ); - + + idLib::common->Printf( "====================================\n" ); + bestClocksGeneric = 0; - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { min = idMath::INFINITY; max = -idMath::INFINITY; StartRecordTime( start ); @@ -263,95 +289,104 @@ void TestMinMax() { GetBest( start, end, bestClocksGeneric ); } PrintClocks( "generic->MinMax( float[] )", COUNT, bestClocksGeneric ); - + bestClocksSIMD = 0; - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { StartRecordTime( start ); p_simd->MinMax( min2, max2, fsrc0, COUNT ); StopRecordTime( end ); GetBest( start, end, bestClocksSIMD ); } - + result = ( min == min2 && max == max2 ) ? 
"ok" : S_COLOR_RED"X"; PrintClocks( va( " simd->MinMax( float[] ) %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric ); - + bestClocksGeneric = 0; - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { StartRecordTime( start ); p_generic->MinMax( v2min, v2max, v2src0, COUNT ); StopRecordTime( end ); GetBest( start, end, bestClocksGeneric ); } PrintClocks( "generic->MinMax( idVec2[] )", COUNT, bestClocksGeneric ); - + bestClocksSIMD = 0; - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { StartRecordTime( start ); p_simd->MinMax( v2min2, v2max2, v2src0, COUNT ); StopRecordTime( end ); GetBest( start, end, bestClocksSIMD ); } - + result = ( v2min == v2min2 && v2max == v2max2 ) ? "ok" : S_COLOR_RED"X"; PrintClocks( va( " simd->MinMax( idVec2[] ) %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric ); - + bestClocksGeneric = 0; - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { StartRecordTime( start ); p_generic->MinMax( vmin, vmax, v3src0, COUNT ); StopRecordTime( end ); GetBest( start, end, bestClocksGeneric ); } PrintClocks( "generic->MinMax( idVec3[] )", COUNT, bestClocksGeneric ); - + bestClocksSIMD = 0; - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { StartRecordTime( start ); p_simd->MinMax( vmin2, vmax2, v3src0, COUNT ); StopRecordTime( end ); GetBest( start, end, bestClocksSIMD ); } - + result = ( vmin == vmin2 && vmax == vmax2 ) ? 
"ok" : S_COLOR_RED"X"; PrintClocks( va( " simd->MinMax( idVec3[] ) %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric ); - + bestClocksGeneric = 0; - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { StartRecordTime( start ); p_generic->MinMax( vmin, vmax, drawVerts, COUNT ); StopRecordTime( end ); GetBest( start, end, bestClocksGeneric ); } PrintClocks( "generic->MinMax( idDrawVert[] )", COUNT, bestClocksGeneric ); - + bestClocksSIMD = 0; - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { StartRecordTime( start ); p_simd->MinMax( vmin2, vmax2, drawVerts, COUNT ); StopRecordTime( end ); GetBest( start, end, bestClocksSIMD ); } - + result = ( vmin == vmin2 && vmax == vmax2 ) ? "ok" : S_COLOR_RED"X"; PrintClocks( va( " simd->MinMax( idDrawVert[] ) %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric ); - + bestClocksGeneric = 0; - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { StartRecordTime( start ); p_generic->MinMax( vmin, vmax, drawVerts, indexes, COUNT ); StopRecordTime( end ); GetBest( start, end, bestClocksGeneric ); } PrintClocks( "generic->MinMax( idDrawVert[], indexes[] )", COUNT, bestClocksGeneric ); - + bestClocksSIMD = 0; - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { StartRecordTime( start ); p_simd->MinMax( vmin2, vmax2, drawVerts, indexes, COUNT ); StopRecordTime( end ); GetBest( start, end, bestClocksSIMD ); } - + result = ( vmin == vmin2 && vmax == vmax2 ) ? 
"ok" : S_COLOR_RED"X"; PrintClocks( va( " simd->MinMax( idDrawVert[], indexes[] ) %s", result ), COUNT, bestClocksSIMD, bestClocksGeneric ); } @@ -361,47 +396,54 @@ void TestMinMax() { TestMemcpy ============ */ -void TestMemcpy() { +void TestMemcpy() +{ TIME_TYPE start, end, bestClocksGeneric, bestClocksSIMD; int i; byte test0[BIG_COUNT]; byte test1[BIG_COUNT]; - const char * result; - + const char* result; + idRandom random( RANDOM_SEED ); - for ( i = 0; i < BIG_COUNT; i++ ) { + for( i = 0; i < BIG_COUNT; i++ ) + { test0[i] = random.RandomInt( 255 ); } - - idLib::common->Printf("====================================\n" ); - + + idLib::common->Printf( "====================================\n" ); + bestClocksGeneric = 0; - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { StartRecordTime( start ); p_generic->Memcpy( test1, test0, BIG_COUNT ); StopRecordTime( end ); GetBest( start, end, bestClocksGeneric ); } PrintClocks( "generic->Memcpy()", BIG_COUNT, bestClocksGeneric ); - - for ( i = 0; i < BIG_COUNT; i++ ) { + + for( i = 0; i < BIG_COUNT; i++ ) + { test0[i] = random.RandomInt( 255 ); } - + bestClocksSIMD = 0; - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { StartRecordTime( start ); p_simd->Memcpy( test1, test0, BIG_COUNT ); StopRecordTime( end ); GetBest( start, end, bestClocksSIMD ); } - for ( i = 0; i < BIG_COUNT; i++ ) { - if ( test1[i] != test0[i] ) { + for( i = 0; i < BIG_COUNT; i++ ) + { + if( test1[i] != test0[i] ) + { break; } } result = ( i >= BIG_COUNT ) ? 
"ok" : S_COLOR_RED"X"; - PrintClocks( va( " simd->Memcpy() %s", result), BIG_COUNT, bestClocksSIMD, bestClocksGeneric ); + PrintClocks( va( " simd->Memcpy() %s", result ), BIG_COUNT, bestClocksSIMD, bestClocksGeneric ); } /* @@ -409,68 +451,77 @@ void TestMemcpy() { TestMemset ============ */ -void TestMemset() { +void TestMemset() +{ TIME_TYPE start, end, bestClocksGeneric, bestClocksSIMD; int i, j; - const char * result; + const char* result; byte test0[BIG_COUNT]; - + idRandom random( RANDOM_SEED ); j = 1 + random.RandomInt( 254 ); - - idLib::common->Printf("====================================\n" ); - + + idLib::common->Printf( "====================================\n" ); + bestClocksGeneric = 0; - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { StartRecordTime( start ); p_generic->Memset( test0, j, BIG_COUNT ); StopRecordTime( end ); GetBest( start, end, bestClocksGeneric ); } PrintClocks( "generic->Memset()", BIG_COUNT, bestClocksGeneric ); - + j = 1 + random.RandomInt( 254 ); - + bestClocksSIMD = 0; - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { StartRecordTime( start ); p_simd->Memset( test0, j, BIG_COUNT ); StopRecordTime( end ); GetBest( start, end, bestClocksSIMD ); } - for ( i = 0; i < BIG_COUNT; i++ ) { - if ( test0[i] != j ) { + for( i = 0; i < BIG_COUNT; i++ ) + { + if( test0[i] != j ) + { break; } } result = ( i >= BIG_COUNT ) ? 
"ok" : S_COLOR_RED"X"; - PrintClocks( va( " simd->Memset() %s", result), BIG_COUNT, bestClocksSIMD, bestClocksGeneric ); - + PrintClocks( va( " simd->Memset() %s", result ), BIG_COUNT, bestClocksSIMD, bestClocksGeneric ); + j = 0; - + bestClocksGeneric = 0; - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { StartRecordTime( start ); p_generic->Memset( test0, j, BIG_COUNT ); StopRecordTime( end ); GetBest( start, end, bestClocksGeneric ); } PrintClocks( "generic->Memset( 0 )", BIG_COUNT, bestClocksGeneric ); - + bestClocksSIMD = 0; - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { StartRecordTime( start ); p_simd->Memset( test0, j, BIG_COUNT ); StopRecordTime( end ); GetBest( start, end, bestClocksSIMD ); } - for ( i = 0; i < BIG_COUNT; i++ ) { - if ( test0[i] != j ) { + for( i = 0; i < BIG_COUNT; i++ ) + { + if( test0[i] != j ) + { break; } } result = ( i >= BIG_COUNT ) ? "ok" : S_COLOR_RED"X"; - PrintClocks( va( " simd->Memset( 0 ) %s", result), BIG_COUNT, bestClocksSIMD, bestClocksGeneric ); + PrintClocks( va( " simd->Memset( 0 ) %s", result ), BIG_COUNT, bestClocksSIMD, bestClocksGeneric ); } /* @@ -478,7 +529,8 @@ void TestMemset() { TestBlendJoints ============ */ -void TestBlendJoints() { +void TestBlendJoints() +{ int i, j; TIME_TYPE start, end, bestClocksGeneric, bestClocksSIMD; idTempArray< idJointQuat > baseJoints( COUNT ); @@ -487,11 +539,12 @@ void TestBlendJoints() { idTempArray< idJointQuat > blendJoints( COUNT ); idTempArray< int > index( COUNT ); float lerp = 0.3f; - const char *result; - + const char* result; + idRandom srnd( RANDOM_SEED ); - - for ( i = 0; i < COUNT; i++ ) { + + for( i = 0; i < COUNT; i++ ) + { idAngles angles; angles[0] = srnd.CRandomFloat() * 180.0f; angles[1] = srnd.CRandomFloat() * 180.0f; @@ -511,10 +564,12 @@ void TestBlendJoints() { blendJoints[i].w = 0.0f; index[i] = i; } - + bestClocksGeneric = 0; - for ( i = 0; i < NUMTESTS; i++ ) { - for ( j = 0; j < COUNT; j++ ) { + for( 
i = 0; i < NUMTESTS; i++ ) + { + for( j = 0; j < COUNT; j++ ) + { joints1[j] = baseJoints[j]; } StartRecordTime( start ); @@ -523,10 +578,12 @@ void TestBlendJoints() { GetBest( start, end, bestClocksGeneric ); } PrintClocks( "generic->BlendJoints()", COUNT, bestClocksGeneric ); - + bestClocksSIMD = 0; - for ( i = 0; i < NUMTESTS; i++ ) { - for ( j = 0; j < COUNT; j++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { + for( j = 0; j < COUNT; j++ ) + { joints2[j] = baseJoints[j]; } StartRecordTime( start ); @@ -534,12 +591,15 @@ void TestBlendJoints() { StopRecordTime( end ); GetBest( start, end, bestClocksSIMD ); } - - for ( i = 0; i < COUNT; i++ ) { - if ( !joints1[i].t.Compare( joints2[i].t, 1e-3f ) ) { + + for( i = 0; i < COUNT; i++ ) + { + if( !joints1[i].t.Compare( joints2[i].t, 1e-3f ) ) + { break; } - if ( !joints1[i].q.Compare( joints2[i].q, 1e-2f ) ) { + if( !joints1[i].q.Compare( joints2[i].q, 1e-2f ) ) + { break; } } @@ -552,7 +612,8 @@ void TestBlendJoints() { TestBlendJoints ============ */ -void TestBlendJointsFast() { +void TestBlendJointsFast() +{ int i, j; TIME_TYPE start, end, bestClocksGeneric, bestClocksSIMD; idTempArray< idJointQuat > baseJoints( COUNT ); @@ -561,11 +622,12 @@ void TestBlendJointsFast() { idTempArray< idJointQuat > blendJoints( COUNT ); idTempArray< int > index( COUNT ); float lerp = 0.3f; - const char *result; - + const char* result; + idRandom srnd( RANDOM_SEED ); - - for ( i = 0; i < COUNT; i++ ) { + + for( i = 0; i < COUNT; i++ ) + { idAngles angles; angles[0] = srnd.CRandomFloat() * 180.0f; angles[1] = srnd.CRandomFloat() * 180.0f; @@ -585,10 +647,12 @@ void TestBlendJointsFast() { blendJoints[i].w = 0.0f; index[i] = i; } - + bestClocksGeneric = 0; - for ( i = 0; i < NUMTESTS; i++ ) { - for ( j = 0; j < COUNT; j++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { + for( j = 0; j < COUNT; j++ ) + { joints1[j] = baseJoints[j]; } StartRecordTime( start ); @@ -597,10 +661,12 @@ void TestBlendJointsFast() { GetBest( start, end, 
bestClocksGeneric ); } PrintClocks( "generic->BlendJointsFast()", COUNT, bestClocksGeneric ); - + bestClocksSIMD = 0; - for ( i = 0; i < NUMTESTS; i++ ) { - for ( j = 0; j < COUNT; j++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { + for( j = 0; j < COUNT; j++ ) + { joints2[j] = baseJoints[j]; } StartRecordTime( start ); @@ -608,12 +674,15 @@ void TestBlendJointsFast() { StopRecordTime( end ); GetBest( start, end, bestClocksSIMD ); } - - for ( i = 0; i < COUNT; i++ ) { - if ( !joints1[i].t.Compare( joints2[i].t, 1e-3f ) ) { + + for( i = 0; i < COUNT; i++ ) + { + if( !joints1[i].t.Compare( joints2[i].t, 1e-3f ) ) + { break; } - if ( !joints1[i].q.Compare( joints2[i].q, 1e-2f ) ) { + if( !joints1[i].q.Compare( joints2[i].q, 1e-2f ) ) + { break; } } @@ -626,17 +695,19 @@ void TestBlendJointsFast() { TestConvertJointQuatsToJointMats ============ */ -void TestConvertJointQuatsToJointMats() { +void TestConvertJointQuatsToJointMats() +{ int i; TIME_TYPE start, end, bestClocksGeneric, bestClocksSIMD; idTempArray< idJointQuat > baseJoints( COUNT ); idTempArray< idJointMat > joints1( COUNT ); idTempArray< idJointMat > joints2( COUNT ); - const char *result; - + const char* result; + idRandom srnd( RANDOM_SEED ); - - for ( i = 0; i < COUNT; i++ ) { + + for( i = 0; i < COUNT; i++ ) + { idAngles angles; angles[0] = srnd.CRandomFloat() * 180.0f; angles[1] = srnd.CRandomFloat() * 180.0f; @@ -646,26 +717,30 @@ void TestConvertJointQuatsToJointMats() { baseJoints[i].t[1] = srnd.CRandomFloat() * 10.0f; baseJoints[i].t[2] = srnd.CRandomFloat() * 10.0f; } - + bestClocksGeneric = 0; - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { StartRecordTime( start ); p_generic->ConvertJointQuatsToJointMats( joints1.Ptr(), baseJoints.Ptr(), COUNT ); StopRecordTime( end ); GetBest( start, end, bestClocksGeneric ); } PrintClocks( "generic->ConvertJointQuatsToJointMats()", COUNT, bestClocksGeneric ); - + bestClocksSIMD = 0; - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < 
NUMTESTS; i++ ) + { StartRecordTime( start ); p_simd->ConvertJointQuatsToJointMats( joints2.Ptr(), baseJoints.Ptr(), COUNT ); StopRecordTime( end ); GetBest( start, end, bestClocksSIMD ); } - - for ( i = 0; i < COUNT; i++ ) { - if ( !joints1[i].Compare( joints2[i], 1e-4f ) ) { + + for( i = 0; i < COUNT; i++ ) + { + if( !joints1[i].Compare( joints2[i], 1e-4f ) ) + { break; } } @@ -678,17 +753,19 @@ void TestConvertJointQuatsToJointMats() { TestConvertJointMatsToJointQuats ============ */ -void TestConvertJointMatsToJointQuats() { +void TestConvertJointMatsToJointQuats() +{ int i; TIME_TYPE start, end, bestClocksGeneric, bestClocksSIMD; idTempArray< idJointMat > baseJoints( COUNT ); idTempArray< idJointQuat > joints1( COUNT ); idTempArray< idJointQuat > joints2( COUNT ); - const char *result; - + const char* result; + idRandom srnd( RANDOM_SEED ); - - for ( i = 0; i < COUNT; i++ ) { + + for( i = 0; i < COUNT; i++ ) + { idAngles angles; angles[0] = srnd.CRandomFloat() * 180.0f; angles[1] = srnd.CRandomFloat() * 180.0f; @@ -700,29 +777,34 @@ void TestConvertJointMatsToJointQuats() { v[2] = srnd.CRandomFloat() * 10.0f; baseJoints[i].SetTranslation( v ); } - + bestClocksGeneric = 0; - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { StartRecordTime( start ); p_generic->ConvertJointMatsToJointQuats( joints1.Ptr(), baseJoints.Ptr(), COUNT ); StopRecordTime( end ); GetBest( start, end, bestClocksGeneric ); } PrintClocks( "generic->ConvertJointMatsToJointQuats()", COUNT, bestClocksGeneric ); - + bestClocksSIMD = 0; - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { StartRecordTime( start ); p_simd->ConvertJointMatsToJointQuats( joints2.Ptr(), baseJoints.Ptr(), COUNT ); StopRecordTime( end ); GetBest( start, end, bestClocksSIMD ); } - - for ( i = 0; i < COUNT; i++ ) { - if ( !joints1[i].q.Compare( joints2[i].q, 1e-4f ) ) { + + for( i = 0; i < COUNT; i++ ) + { + if( !joints1[i].q.Compare( joints2[i].q, 1e-4f ) ) + { break; } - if 
( !joints1[i].t.Compare( joints2[i].t, 1e-4f ) ) { + if( !joints1[i].t.Compare( joints2[i].t, 1e-4f ) ) + { break; } } @@ -735,18 +817,20 @@ void TestConvertJointMatsToJointQuats() { TestTransformJoints ============ */ -void TestTransformJoints() { +void TestTransformJoints() +{ int i, j; TIME_TYPE start, end, bestClocksGeneric, bestClocksSIMD; - idTempArray< idJointMat > joints( COUNT+1 ); - idTempArray< idJointMat > joints1( COUNT+1 ); - idTempArray< idJointMat > joints2( COUNT+1 ); - idTempArray< int > parents( COUNT+1 ); - const char *result; - + idTempArray< idJointMat > joints( COUNT + 1 ); + idTempArray< idJointMat > joints1( COUNT + 1 ); + idTempArray< idJointMat > joints2( COUNT + 1 ); + idTempArray< int > parents( COUNT + 1 ); + const char* result; + idRandom srnd( RANDOM_SEED ); - - for ( i = 0; i <= COUNT; i++ ) { + + for( i = 0; i <= COUNT; i++ ) + { idAngles angles; angles[0] = srnd.CRandomFloat() * 180.0f; angles[1] = srnd.CRandomFloat() * 180.0f; @@ -759,10 +843,12 @@ void TestTransformJoints() { joints[i].SetTranslation( v ); parents[i] = i - 1; } - + bestClocksGeneric = 0; - for ( i = 0; i < NUMTESTS; i++ ) { - for ( j = 0; j <= COUNT; j++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { + for( j = 0; j <= COUNT; j++ ) + { joints1[j] = joints[j]; } StartRecordTime( start ); @@ -771,10 +857,12 @@ void TestTransformJoints() { GetBest( start, end, bestClocksGeneric ); } PrintClocks( "generic->TransformJoints()", COUNT, bestClocksGeneric ); - + bestClocksSIMD = 0; - for ( i = 0; i < NUMTESTS; i++ ) { - for ( j = 0; j <= COUNT; j++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { + for( j = 0; j <= COUNT; j++ ) + { joints2[j] = joints[j]; } StartRecordTime( start ); @@ -782,9 +870,11 @@ void TestTransformJoints() { StopRecordTime( end ); GetBest( start, end, bestClocksSIMD ); } - - for ( i = 1; i <= COUNT; i++ ) { - if ( !joints1[i].Compare( joints2[i], 1e-3f ) ) { + + for( i = 1; i <= COUNT; i++ ) + { + if( !joints1[i].Compare( joints2[i], 1e-3f ) ) + { break; } } 
@@ -797,18 +887,20 @@ void TestTransformJoints() { TestUntransformJoints ============ */ -void TestUntransformJoints() { +void TestUntransformJoints() +{ int i, j; TIME_TYPE start, end, bestClocksGeneric, bestClocksSIMD; - idTempArray< idJointMat > joints( COUNT+1 ); - idTempArray< idJointMat > joints1( COUNT+1 ); - idTempArray< idJointMat > joints2( COUNT+1 ); - idTempArray< int > parents( COUNT+1 ); - const char *result; - + idTempArray< idJointMat > joints( COUNT + 1 ); + idTempArray< idJointMat > joints1( COUNT + 1 ); + idTempArray< idJointMat > joints2( COUNT + 1 ); + idTempArray< int > parents( COUNT + 1 ); + const char* result; + idRandom srnd( RANDOM_SEED ); - - for ( i = 0; i <= COUNT; i++ ) { + + for( i = 0; i <= COUNT; i++ ) + { idAngles angles; angles[0] = srnd.CRandomFloat() * 180.0f; angles[1] = srnd.CRandomFloat() * 180.0f; @@ -821,10 +913,12 @@ void TestUntransformJoints() { joints[i].SetTranslation( v ); parents[i] = i - 1; } - + bestClocksGeneric = 0; - for ( i = 0; i < NUMTESTS; i++ ) { - for ( j = 0; j <= COUNT; j++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { + for( j = 0; j <= COUNT; j++ ) + { joints1[j] = joints[j]; } StartRecordTime( start ); @@ -833,10 +927,12 @@ void TestUntransformJoints() { GetBest( start, end, bestClocksGeneric ); } PrintClocks( "generic->UntransformJoints()", COUNT, bestClocksGeneric ); - + bestClocksSIMD = 0; - for ( i = 0; i < NUMTESTS; i++ ) { - for ( j = 0; j <= COUNT; j++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { + for( j = 0; j <= COUNT; j++ ) + { joints2[j] = joints[j]; } StartRecordTime( start ); @@ -844,9 +940,11 @@ void TestUntransformJoints() { StopRecordTime( end ); GetBest( start, end, bestClocksSIMD ); } - - for ( i = 1; i <= COUNT; i++ ) { - if ( !joints1[i].Compare( joints2[i], 1e-3f ) ) { + + for( i = 1; i <= COUNT; i++ ) + { + if( !joints1[i].Compare( joints2[i], 1e-3f ) ) + { break; } } @@ -859,20 +957,22 @@ void TestUntransformJoints() { TestMath ============ */ -void TestMath() { +void TestMath() +{ 
int i; TIME_TYPE start, end, bestClocks; - - idLib::common->Printf("====================================\n" ); - + + idLib::common->Printf( "====================================\n" ); + float tst = -1.0f; float tst2 = 1.0f; float testvar = 1.0f; idRandom rnd; - + bestClocks = 0; tst = rnd.CRandomFloat(); - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { StartRecordTime( start ); tst = fabs( tst ); StopRecordTime( end ); @@ -881,24 +981,26 @@ void TestMath() { tst = rnd.CRandomFloat(); } PrintClocks( " fabs( tst )", 1, bestClocks ); - + bestClocks = 0; tst = rnd.CRandomFloat(); - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { StartRecordTime( start ); - int tmp = * ( int * ) &tst; + int tmp = * ( int* ) &tst; tmp &= 0x7FFFFFFF; - tst = * ( float * ) &tmp; + tst = * ( float* ) &tmp; StopRecordTime( end ); GetBest( start, end, bestClocks ); testvar = ( testvar + tst ) * tst; tst = rnd.CRandomFloat(); } PrintClocks( " idMath::Fabs( tst )", 1, bestClocks ); - + bestClocks = 0; tst = 10.0f + 100.0f * rnd.RandomFloat(); - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { StartRecordTime( start ); tst = sqrt( tst ); StopRecordTime( end ); @@ -907,10 +1009,11 @@ void TestMath() { tst = 10.0f + 100.0f * rnd.RandomFloat(); } PrintClocks( " sqrt( tst )", 1, bestClocks ); - + bestClocks = 0; tst = rnd.RandomFloat(); - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { StartRecordTime( start ); tst = idMath::Sqrt( tst ); StopRecordTime( end ); @@ -919,10 +1022,11 @@ void TestMath() { tst = rnd.RandomFloat(); } PrintClocks( " idMath::Sqrt( tst )", 1, bestClocks ); - + bestClocks = 0; tst = rnd.RandomFloat(); - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { StartRecordTime( start ); tst = idMath::Sqrt16( tst ); StopRecordTime( end ); @@ -931,10 +1035,11 @@ void TestMath() { tst = rnd.RandomFloat(); } PrintClocks( " idMath::Sqrt16( tst )", 1, bestClocks ); - + 
bestClocks = 0; tst = rnd.CRandomFloat(); - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { StartRecordTime( start ); tst = idMath::Sin( tst ); StopRecordTime( end ); @@ -943,10 +1048,11 @@ void TestMath() { tst = rnd.CRandomFloat(); } PrintClocks( " idMath::Sin( tst )", 1, bestClocks ); - + bestClocks = 0; tst = rnd.CRandomFloat(); - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { StartRecordTime( start ); tst = idMath::Sin16( tst ); StopRecordTime( end ); @@ -955,10 +1061,11 @@ void TestMath() { tst = rnd.CRandomFloat(); } PrintClocks( " idMath::Sin16( tst )", 1, bestClocks ); - + bestClocks = 0; tst = rnd.CRandomFloat(); - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { StartRecordTime( start ); tst = idMath::Cos( tst ); StopRecordTime( end ); @@ -967,10 +1074,11 @@ void TestMath() { tst = rnd.CRandomFloat(); } PrintClocks( " idMath::Cos( tst )", 1, bestClocks ); - + bestClocks = 0; tst = rnd.CRandomFloat(); - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { StartRecordTime( start ); tst = idMath::Cos16( tst ); StopRecordTime( end ); @@ -979,10 +1087,11 @@ void TestMath() { tst = rnd.CRandomFloat(); } PrintClocks( " idMath::Cos16( tst )", 1, bestClocks ); - + bestClocks = 0; tst = rnd.CRandomFloat(); - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { StartRecordTime( start ); idMath::SinCos( tst, tst, tst2 ); StopRecordTime( end ); @@ -991,10 +1100,11 @@ void TestMath() { tst = rnd.CRandomFloat(); } PrintClocks( " idMath::SinCos( tst )", 1, bestClocks ); - + bestClocks = 0; tst = rnd.CRandomFloat(); - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { StartRecordTime( start ); idMath::SinCos16( tst, tst, tst2 ); StopRecordTime( end ); @@ -1003,10 +1113,11 @@ void TestMath() { tst = rnd.CRandomFloat(); } PrintClocks( "idMath::SinCos16( tst )", 1, bestClocks ); - + bestClocks = 0; tst = rnd.CRandomFloat(); - for ( i = 0; 
i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { StartRecordTime( start ); tst = idMath::Tan( tst ); StopRecordTime( end ); @@ -1015,10 +1126,11 @@ void TestMath() { tst = rnd.CRandomFloat(); } PrintClocks( " idMath::Tan( tst )", 1, bestClocks ); - + bestClocks = 0; tst = rnd.CRandomFloat(); - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { StartRecordTime( start ); tst = idMath::Tan16( tst ); StopRecordTime( end ); @@ -1027,10 +1139,11 @@ void TestMath() { tst = rnd.CRandomFloat(); } PrintClocks( " idMath::Tan16( tst )", 1, bestClocks ); - + bestClocks = 0; tst = rnd.CRandomFloat(); - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { StartRecordTime( start ); tst = idMath::ASin( tst ); StopRecordTime( end ); @@ -1039,10 +1152,11 @@ void TestMath() { tst = rnd.CRandomFloat(); } PrintClocks( " idMath::ASin( tst )", 1, bestClocks ); - + bestClocks = 0; tst = rnd.CRandomFloat(); - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { StartRecordTime( start ); tst = idMath::ASin16( tst ); StopRecordTime( end ); @@ -1051,10 +1165,11 @@ void TestMath() { tst = rnd.CRandomFloat(); } PrintClocks( " idMath::ASin16( tst )", 1, bestClocks ); - + bestClocks = 0; tst = rnd.CRandomFloat(); - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { StartRecordTime( start ); tst = idMath::ACos( tst ); StopRecordTime( end ); @@ -1063,10 +1178,11 @@ void TestMath() { tst = rnd.CRandomFloat(); } PrintClocks( " idMath::ACos( tst )", 1, bestClocks ); - + bestClocks = 0; tst = rnd.CRandomFloat(); - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { StartRecordTime( start ); tst = idMath::ACos16( tst ); StopRecordTime( end ); @@ -1075,10 +1191,11 @@ void TestMath() { tst = rnd.CRandomFloat(); } PrintClocks( " idMath::ACos16( tst )", 1, bestClocks ); - + bestClocks = 0; tst = rnd.CRandomFloat(); - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { 
StartRecordTime( start ); tst = idMath::ATan( tst ); StopRecordTime( end ); @@ -1087,10 +1204,11 @@ void TestMath() { tst = rnd.CRandomFloat(); } PrintClocks( " idMath::ATan( tst )", 1, bestClocks ); - + bestClocks = 0; tst = rnd.CRandomFloat(); - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { StartRecordTime( start ); tst = idMath::ATan16( tst ); StopRecordTime( end ); @@ -1099,10 +1217,11 @@ void TestMath() { tst = rnd.CRandomFloat(); } PrintClocks( " idMath::ATan16( tst )", 1, bestClocks ); - + bestClocks = 0; tst = rnd.CRandomFloat(); - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { StartRecordTime( start ); tst = idMath::Pow( 2.7f, tst ); StopRecordTime( end ); @@ -1111,10 +1230,11 @@ void TestMath() { tst = rnd.CRandomFloat(); } PrintClocks( " idMath::Pow( tst )", 1, bestClocks ); - + bestClocks = 0; tst = rnd.CRandomFloat(); - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { StartRecordTime( start ); tst = idMath::Pow16( 2.7f, tst ); StopRecordTime( end ); @@ -1123,10 +1243,11 @@ void TestMath() { tst = rnd.CRandomFloat(); } PrintClocks( " idMath::Pow16( tst )", 1, bestClocks ); - + bestClocks = 0; tst = rnd.CRandomFloat(); - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { StartRecordTime( start ); tst = idMath::Exp( tst ); StopRecordTime( end ); @@ -1135,10 +1256,11 @@ void TestMath() { tst = rnd.CRandomFloat(); } PrintClocks( " idMath::Exp( tst )", 1, bestClocks ); - + bestClocks = 0; tst = rnd.CRandomFloat(); - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { StartRecordTime( start ); tst = idMath::Exp16( tst ); StopRecordTime( end ); @@ -1147,10 +1269,11 @@ void TestMath() { tst = rnd.CRandomFloat(); } PrintClocks( " idMath::Exp16( tst )", 1, bestClocks ); - + bestClocks = 0; tst = rnd.CRandomFloat(); - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { tst = fabs( tst ) + 1.0f; StartRecordTime( start ); tst = 
idMath::Log( tst ); @@ -1160,10 +1283,11 @@ void TestMath() { tst = rnd.CRandomFloat(); } PrintClocks( " idMath::Log( tst )", 1, bestClocks ); - + bestClocks = 0; tst = rnd.CRandomFloat(); - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { tst = fabs( tst ) + 1.0f; StartRecordTime( start ); tst = idMath::Log16( tst ); @@ -1173,57 +1297,62 @@ void TestMath() { tst = rnd.CRandomFloat(); } PrintClocks( " idMath::Log16( tst )", 1, bestClocks ); - + idLib::common->Printf( "testvar = %f\n", testvar ); - + idMat3 resultMat3; idQuat fromQuat, toQuat, resultQuat; idCQuat cq; idAngles ang; - + fromQuat = idAngles( 30, 45, 0 ).ToQuat(); toQuat = idAngles( 45, 0, 0 ).ToQuat(); cq = idAngles( 30, 45, 0 ).ToQuat().ToCQuat(); ang = idAngles( 30, 40, 50 ); - + bestClocks = 0; - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { StartRecordTime( start ); resultMat3 = fromQuat.ToMat3(); StopRecordTime( end ); GetBest( start, end, bestClocks ); } PrintClocks( " idQuat::ToMat3()", 1, bestClocks ); - + bestClocks = 0; - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { StartRecordTime( start ); resultQuat.Slerp( fromQuat, toQuat, 0.3f ); StopRecordTime( end ); GetBest( start, end, bestClocks ); } PrintClocks( " idQuat::Slerp()", 1, bestClocks ); - + bestClocks = 0; - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { StartRecordTime( start ); resultQuat = cq.ToQuat(); StopRecordTime( end ); GetBest( start, end, bestClocks ); } PrintClocks( " idCQuat::ToQuat()", 1, bestClocks ); - + bestClocks = 0; - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { StartRecordTime( start ); resultQuat = ang.ToQuat(); StopRecordTime( end ); GetBest( start, end, bestClocks ); } PrintClocks( " idAngles::ToQuat()", 1, bestClocks ); - + bestClocks = 0; - for ( i = 0; i < NUMTESTS; i++ ) { + for( i = 0; i < NUMTESTS; i++ ) + { StartRecordTime( start ); resultMat3 = ang.ToMat3(); StopRecordTime( 
end ); @@ -1237,65 +1366,74 @@ void TestMath() { idSIMD::Test_f ============ */ -void idSIMD::Test_f( const idCmdArgs &args ) { +void idSIMD::Test_f( const idCmdArgs& args ) +{ // RB begin #if defined(_WIN32) SetThreadPriority( GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL ); #endif // RB end - + p_simd = processor; p_generic = generic; - - if ( idStr::Length( args.Argv( 1 ) ) != 0 ) { + + if( idStr::Length( args.Argv( 1 ) ) != 0 ) + { cpuid_t cpuid = idLib::sys->GetProcessorId(); idStr argString = args.Args(); - + argString.Replace( " ", "" ); - - if ( idStr::Icmp( argString, "SSE" ) == 0 ) { - if ( !( cpuid & CPUID_MMX ) || !( cpuid & CPUID_SSE ) ) { + +#if defined(USE_INTRINSICS) + if( idStr::Icmp( argString, "SSE" ) == 0 ) + { + if( !( cpuid & CPUID_MMX ) || !( cpuid & CPUID_SSE ) ) + { common->Printf( "CPU does not support MMX & SSE\n" ); return; } - p_simd = new (TAG_MATH) idSIMD_SSE; - } else { + p_simd = new( TAG_MATH ) idSIMD_SSE; + } + else +#endif + { common->Printf( "invalid argument, use: MMX, 3DNow, SSE, SSE2, SSE3, AltiVec\n" ); return; } } - + idLib::common->SetRefreshOnPrint( true ); - + idLib::common->Printf( "using %s for SIMD processing\n", p_simd->GetName() ); - + GetBaseClocks(); - + TestMath(); TestMinMax(); TestMemcpy(); TestMemset(); - - idLib::common->Printf("====================================\n" ); - + + idLib::common->Printf( "====================================\n" ); + TestBlendJoints(); TestBlendJointsFast(); TestConvertJointQuatsToJointMats(); TestConvertJointMatsToJointQuats(); TestTransformJoints(); TestUntransformJoints(); - - idLib::common->Printf("====================================\n" ); - + + idLib::common->Printf( "====================================\n" ); + idLib::common->SetRefreshOnPrint( false ); - - if ( p_simd != processor ) { + + if( p_simd != processor ) + { delete p_simd; } p_simd = NULL; p_generic = NULL; - + // RB begin #if defined(_WIN32) SetThreadPriority( GetCurrentThread(), THREAD_PRIORITY_NORMAL ); 
diff --git a/neo/idlib/math/Simd_SSE.cpp b/neo/idlib/math/Simd_SSE.cpp index ed333479..fda623af 100644 --- a/neo/idlib/math/Simd_SSE.cpp +++ b/neo/idlib/math/Simd_SSE.cpp @@ -38,6 +38,7 @@ If you have questions concerning this license or the applicable additional terms // E //=============================================================== +#if defined(USE_INTRINSICS) #include @@ -973,3 +974,5 @@ void VPCALL idSIMD_SSE::UntransformJoints( idJointMat* jointMats, const int* par } } +#endif // #if defined(USE_INTRINSICS) + diff --git a/neo/idlib/math/Simd_SSE.h b/neo/idlib/math/Simd_SSE.h index c42f3f27..855f4c3e 100644 --- a/neo/idlib/math/Simd_SSE.h +++ b/neo/idlib/math/Simd_SSE.h @@ -3,6 +3,7 @@ Doom 3 BFG Edition GPL Source Code Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company. +Copyright (C) 2013 Robert Beckebans This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code"). @@ -37,6 +38,8 @@ If you have questions concerning this license or the applicable additional terms =============================================================================== */ +#if defined(USE_INTRINSICS) + class idSIMD_SSE : public idSIMD_Generic { public: @@ -50,4 +53,6 @@ public: virtual void VPCALL UntransformJoints( idJointMat* jointMats, const int* parents, const int firstJoint, const int lastJoint ); }; +#endif + #endif /* !__MATH_SIMD_SSE_H__ */ diff --git a/neo/idlib/math/VecX.h b/neo/idlib/math/VecX.h index 775d7afa..c852d85c 100644 --- a/neo/idlib/math/VecX.h +++ b/neo/idlib/math/VecX.h @@ -45,7 +45,10 @@ NOTE: due to the temporary memory pool idVecX cannot be used by multiple threads #define VECX_QUAD( x ) ( ( ( ( x ) + 3 ) & ~3 ) * sizeof( float ) ) #define VECX_CLEAREND() int s = size; while( s < ( ( s + 3) & ~3 ) ) { p[s++] = 0.0f; } #define VECX_ALLOCA( n ) ( (float *) _alloca16( VECX_QUAD( n ) ) ) + +#if defined(USE_INTRINSICS) #define VECX_SIMD +#endif class idVecX { diff --git a/neo/idlib/math/Vector.h 
b/neo/idlib/math/Vector.h index d1fe7f97..4f988ac8 100644 --- a/neo/idlib/math/Vector.h +++ b/neo/idlib/math/Vector.h @@ -516,6 +516,11 @@ ID_INLINE idVec3 operator*( const float a, const idVec3 b ) return idVec3( b.x * a, b.y * a, b.z * a ); } +ID_INLINE idVec3 operator/( const float a, const idVec3 b ) +{ + return idVec3( a / b.x, a / b.y, a / b.z ); +} + ID_INLINE idVec3 idVec3::operator+( const idVec3& a ) const { return idVec3( x + a.x, y + a.y, z + a.z ); diff --git a/neo/idlib/sys/sys_intrinsics.h b/neo/idlib/sys/sys_intrinsics.h index 6d301e0b..6cf1beca 100644 --- a/neo/idlib/sys/sys_intrinsics.h +++ b/neo/idlib/sys/sys_intrinsics.h @@ -3,6 +3,7 @@ Doom 3 BFG Edition GPL Source Code Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company. +Copyright (C) 2013 Robert Beckebans This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code"). @@ -28,8 +29,11 @@ If you have questions concerning this license or the applicable additional terms #ifndef __SYS_INTRIINSICS_H__ #define __SYS_INTRIINSICS_H__ -#include +#define USE_INTRINSICS +#if defined(USE_INTRINSICS) +#include +#endif /* ================================================================================================ @@ -91,6 +95,7 @@ ID_INLINE_EXTERN float __frndz( float x ) ================================================================================================ */ +#if defined(USE_INTRINSICS) // The code below assumes that a cache line is 64 bytes. // We specify the cache line size as 128 here to make the code consistent with the consoles. 
#define CACHE_LINE_SIZE 128 @@ -122,6 +127,26 @@ ID_FORCE_INLINE void FlushCacheLine( const void* ptr, int offset ) _mm_clflush( bytePtr + 64 ); } +/* +================================================ +#endif + Other +================================================ +*/ +#else + +#define CACHE_LINE_SIZE 128 + +ID_INLINE void Prefetch( const void* ptr, int offset ) {} +ID_INLINE void ZeroCacheLine( void* ptr, int offset ) +{ + byte* bytePtr = ( byte* )( ( ( ( uintptr_t )( ptr ) ) + ( offset ) ) & ~( CACHE_LINE_SIZE - 1 ) ); + memset( bytePtr, 0, CACHE_LINE_SIZE ); +} +ID_INLINE void FlushCacheLine( const void* ptr, int offset ) {} + +#endif + /* ================================================ Block Clear Macros @@ -168,6 +193,8 @@ ID_INLINE_EXTERN int CACHE_LINE_CLEAR_OVERFLOW_COUNT( int size ) ================================================================================================ */ +#if defined(USE_INTRINSICS) + /* ================================================ PC Windows @@ -194,6 +221,7 @@ ID_INLINE_EXTERN int CACHE_LINE_CLEAR_OVERFLOW_COUNT( int size ) #endif // DG end + // make the intrinsics "type unsafe" typedef union DECLSPEC_INTRINTYPE _CRT_ALIGN( 16 ) __m128c { @@ -275,4 +303,6 @@ ID_FORCE_INLINE_EXTERN __m128 _mm_div16_ps( __m128 x, __m128 y ) // load idBounds::GetMaxs() #define _mm_loadu_bounds_1( bounds ) _mm_perm_ps( _mm_loadh_pi( _mm_load_ss( & bounds[1].x ), (__m64 *) & bounds[1].y ), _MM_SHUFFLE( 1, 3, 2, 0 ) ) +#endif // #if defined(USE_INTRINSICS) + #endif // !__SYS_INTRIINSICS_H__ diff --git a/neo/renderer/BufferObject.cpp b/neo/renderer/BufferObject.cpp index a6f9986a..f511df23 100644 --- a/neo/renderer/BufferObject.cpp +++ b/neo/renderer/BufferObject.cpp @@ -79,6 +79,7 @@ void UnbindBufferObjects() qglBindBufferARB( GL_ELEMENT_ARRAY_BUFFER_ARB, 0 ); } +#if defined(USE_INTRINSICS) void CopyBuffer( byte* dst, const byte* src, int numBytes ) { @@ -121,6 +122,16 @@ void CopyBuffer( byte* dst, const byte* src, int numBytes ) 
_mm_sfence(); } +#else + +void CopyBuffer( byte* dst, const byte* src, int numBytes ) +{ + assert_16_byte_aligned( dst ); + assert_16_byte_aligned( src ); + memcpy( dst, src, numBytes ); +} + +#endif /* ================================================================================================ diff --git a/neo/renderer/DXT/DXTCodec.h b/neo/renderer/DXT/DXTCodec.h index 2734b786..2ac137b2 100644 --- a/neo/renderer/DXT/DXTCodec.h +++ b/neo/renderer/DXT/DXTCodec.h @@ -3,6 +3,7 @@ Doom 3 BFG Edition GPL Source Code Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company. +Copyright (C) 2013 Robert Beckebans This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code"). @@ -305,7 +306,11 @@ idDxtEncoder::CompressImageDXT1Fast */ ID_INLINE void idDxtEncoder::CompressImageDXT1Fast( const byte* inBuf, byte* outBuf, int width, int height ) { +#if defined(USE_INTRINSICS) CompressImageDXT1Fast_SSE2( inBuf, outBuf, width, height ); +#else + CompressImageDXT1Fast_Generic( inBuf, outBuf, width, height ); +#endif } /* @@ -315,7 +320,11 @@ idDxtEncoder::CompressImageDXT1AlphaFast */ ID_INLINE void idDxtEncoder::CompressImageDXT1AlphaFast( const byte* inBuf, byte* outBuf, int width, int height ) { +#if defined(USE_INTRINSICS) CompressImageDXT1AlphaFast_SSE2( inBuf, outBuf, width, height ); +#else + CompressImageDXT1AlphaFast_Generic( inBuf, outBuf, width, height ); +#endif } /* @@ -325,7 +334,11 @@ idDxtEncoder::CompressImageDXT5Fast */ ID_INLINE void idDxtEncoder::CompressImageDXT5Fast( const byte* inBuf, byte* outBuf, int width, int height ) { +#if defined(USE_INTRINSICS) CompressImageDXT5Fast_SSE2( inBuf, outBuf, width, height ); +#else + CompressImageDXT5Fast_Generic( inBuf, outBuf, width, height ); +#endif } /* @@ -345,7 +358,11 @@ idDxtEncoder::CompressYCoCgDXT5Fast */ ID_INLINE void idDxtEncoder::CompressYCoCgDXT5Fast( const byte* inBuf, byte* outBuf, int width, int height ) { +#if defined(USE_INTRINSICS) 
CompressYCoCgDXT5Fast_SSE2( inBuf, outBuf, width, height ); +#else + CompressYCoCgDXT5Fast_Generic( inBuf, outBuf, width, height ); +#endif } /* @@ -365,7 +382,11 @@ idDxtEncoder::CompressNormalMapDXT5Fast */ ID_INLINE void idDxtEncoder::CompressNormalMapDXT5Fast( const byte* inBuf, byte* outBuf, int width, int height ) { +#if defined(USE_INTRINSICS) CompressNormalMapDXT5Fast_SSE2( inBuf, outBuf, width, height ); +#else + CompressNormalMapDXT5Fast_Generic( inBuf, outBuf, width, height ); +#endif } /* diff --git a/neo/renderer/DXT/DXTEncoder_SSE2.cpp b/neo/renderer/DXT/DXTEncoder_SSE2.cpp index 3d597c59..74d59dd7 100644 --- a/neo/renderer/DXT/DXTEncoder_SSE2.cpp +++ b/neo/renderer/DXT/DXTEncoder_SSE2.cpp @@ -34,6 +34,7 @@ Contains the DxtEncoder implementation for SSE2. #include "DXTCodec_local.h" #include "DXTCodec.h" +#if defined(USE_INTRINSICS) //#define TEST_COMPRESSION #ifdef TEST_COMPRESSION @@ -1627,3 +1628,4 @@ void idDxtEncoder::CompressNormalMapDXT5Fast_SSE2( const byte* inBuf, byte* outB #endif } +#endif // #if defined(USE_INTRINSICS) \ No newline at end of file diff --git a/neo/renderer/GLMatrix.cpp b/neo/renderer/GLMatrix.cpp index 152be6de..4fa6fc6a 100644 --- a/neo/renderer/GLMatrix.cpp +++ b/neo/renderer/GLMatrix.cpp @@ -74,7 +74,7 @@ R_MatrixMultiply */ void R_MatrixMultiply( const float a[16], const float b[16], float out[16] ) { - +#if defined(USE_INTRINSICS) __m128 a0 = _mm_loadu_ps( a + 0 * 4 ); __m128 a1 = _mm_loadu_ps( a + 1 * 4 ); __m128 a2 = _mm_loadu_ps( a + 2 * 4 ); @@ -110,6 +110,41 @@ void R_MatrixMultiply( const float a[16], const float b[16], float out[16] ) _mm_storeu_ps( out + 2 * 4, t2 ); _mm_storeu_ps( out + 3 * 4, t3 ); +#else + + /* + for ( int i = 0; i < 4; i++ ) { + for ( int j = 0; j < 4; j++ ) { + out[ i * 4 + j ] = + a[ i * 4 + 0 ] * b[ 0 * 4 + j ] + + a[ i * 4 + 1 ] * b[ 1 * 4 + j ] + + a[ i * 4 + 2 ] * b[ 2 * 4 + j ] + + a[ i * 4 + 3 ] * b[ 3 * 4 + j ]; + } + } + */ + + out[0 * 4 + 0] = a[0 * 4 + 0] * b[0 * 4 + 0] + a[0 * 
4 + 1] * b[1 * 4 + 0] + a[0 * 4 + 2] * b[2 * 4 + 0] + a[0 * 4 + 3] * b[3 * 4 + 0]; + out[0 * 4 + 1] = a[0 * 4 + 0] * b[0 * 4 + 1] + a[0 * 4 + 1] * b[1 * 4 + 1] + a[0 * 4 + 2] * b[2 * 4 + 1] + a[0 * 4 + 3] * b[3 * 4 + 1]; + out[0 * 4 + 2] = a[0 * 4 + 0] * b[0 * 4 + 2] + a[0 * 4 + 1] * b[1 * 4 + 2] + a[0 * 4 + 2] * b[2 * 4 + 2] + a[0 * 4 + 3] * b[3 * 4 + 2]; + out[0 * 4 + 3] = a[0 * 4 + 0] * b[0 * 4 + 3] + a[0 * 4 + 1] * b[1 * 4 + 3] + a[0 * 4 + 2] * b[2 * 4 + 3] + a[0 * 4 + 3] * b[3 * 4 + 3]; + + out[1 * 4 + 0] = a[1 * 4 + 0] * b[0 * 4 + 0] + a[1 * 4 + 1] * b[1 * 4 + 0] + a[1 * 4 + 2] * b[2 * 4 + 0] + a[1 * 4 + 3] * b[3 * 4 + 0]; + out[1 * 4 + 1] = a[1 * 4 + 0] * b[0 * 4 + 1] + a[1 * 4 + 1] * b[1 * 4 + 1] + a[1 * 4 + 2] * b[2 * 4 + 1] + a[1 * 4 + 3] * b[3 * 4 + 1]; + out[1 * 4 + 2] = a[1 * 4 + 0] * b[0 * 4 + 2] + a[1 * 4 + 1] * b[1 * 4 + 2] + a[1 * 4 + 2] * b[2 * 4 + 2] + a[1 * 4 + 3] * b[3 * 4 + 2]; + out[1 * 4 + 3] = a[1 * 4 + 0] * b[0 * 4 + 3] + a[1 * 4 + 1] * b[1 * 4 + 3] + a[1 * 4 + 2] * b[2 * 4 + 3] + a[1 * 4 + 3] * b[3 * 4 + 3]; + + out[2 * 4 + 0] = a[2 * 4 + 0] * b[0 * 4 + 0] + a[2 * 4 + 1] * b[1 * 4 + 0] + a[2 * 4 + 2] * b[2 * 4 + 0] + a[2 * 4 + 3] * b[3 * 4 + 0]; + out[2 * 4 + 1] = a[2 * 4 + 0] * b[0 * 4 + 1] + a[2 * 4 + 1] * b[1 * 4 + 1] + a[2 * 4 + 2] * b[2 * 4 + 1] + a[2 * 4 + 3] * b[3 * 4 + 1]; + out[2 * 4 + 2] = a[2 * 4 + 0] * b[0 * 4 + 2] + a[2 * 4 + 1] * b[1 * 4 + 2] + a[2 * 4 + 2] * b[2 * 4 + 2] + a[2 * 4 + 3] * b[3 * 4 + 2]; + out[2 * 4 + 3] = a[2 * 4 + 0] * b[0 * 4 + 3] + a[2 * 4 + 1] * b[1 * 4 + 3] + a[2 * 4 + 2] * b[2 * 4 + 3] + a[2 * 4 + 3] * b[3 * 4 + 3]; + + out[3 * 4 + 0] = a[3 * 4 + 0] * b[0 * 4 + 0] + a[3 * 4 + 1] * b[1 * 4 + 0] + a[3 * 4 + 2] * b[2 * 4 + 0] + a[3 * 4 + 3] * b[3 * 4 + 0]; + out[3 * 4 + 1] = a[3 * 4 + 0] * b[0 * 4 + 1] + a[3 * 4 + 1] * b[1 * 4 + 1] + a[3 * 4 + 2] * b[2 * 4 + 1] + a[3 * 4 + 3] * b[3 * 4 + 1]; + out[3 * 4 + 2] = a[3 * 4 + 0] * b[0 * 4 + 2] + a[3 * 4 + 1] * b[1 * 4 + 2] + a[3 * 4 + 2] * b[2 * 4 + 2] + a[3 * 4 
+ 3] * b[3 * 4 + 2]; + out[3 * 4 + 3] = a[3 * 4 + 0] * b[0 * 4 + 3] + a[3 * 4 + 1] * b[1 * 4 + 3] + a[3 * 4 + 2] * b[2 * 4 + 3] + a[3 * 4 + 3] * b[3 * 4 + 3]; + +#endif } /* diff --git a/neo/renderer/ModelDecal.cpp b/neo/renderer/ModelDecal.cpp index c635a593..85ad9bb2 100644 --- a/neo/renderer/ModelDecal.cpp +++ b/neo/renderer/ModelDecal.cpp @@ -302,10 +302,10 @@ static void R_DecalPointCullStatic( byte* cullBits, const idPlane* planes, const assert_16_byte_aligned( cullBits ); assert_16_byte_aligned( verts ); - +#if defined(USE_INTRINSICS) idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts ); - const __m128 vector_float_zero = { 0.0f, 0.0f, 0.0f, 0.0f }; + const __m128 vector_float_zero = _mm_setzero_ps(); const __m128i vector_int_mask0 = _mm_set1_epi32( 1 << 0 ); const __m128i vector_int_mask1 = _mm_set1_epi32( 1 << 1 ); const __m128i vector_int_mask2 = _mm_set1_epi32( 1 << 2 ); @@ -406,6 +406,39 @@ static void R_DecalPointCullStatic( byte* cullBits, const idPlane* planes, const } } +#else + + idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 1 > vertsODS( verts, numVerts ); + + for( int i = 0; i < numVerts; ) + { + + const int nextNumVerts = vertsODS.FetchNextBatch() - 1; + + for( ; i <= nextNumVerts; i++ ) + { + const idVec3& v = vertsODS[i].xyz; + + const float d0 = planes[0].Distance( v ); + const float d1 = planes[1].Distance( v ); + const float d2 = planes[2].Distance( v ); + const float d3 = planes[3].Distance( v ); + const float d4 = planes[4].Distance( v ); + const float d5 = planes[5].Distance( v ); + + byte bits; + bits = IEEE_FLT_SIGNBITNOTSET( d0 ) << 0; + bits |= IEEE_FLT_SIGNBITNOTSET( d1 ) << 1; + bits |= IEEE_FLT_SIGNBITNOTSET( d2 ) << 2; + bits |= IEEE_FLT_SIGNBITNOTSET( d3 ) << 3; + bits |= IEEE_FLT_SIGNBITNOTSET( d4 ) << 4; + bits |= IEEE_FLT_SIGNBITNOTSET( d5 ) << 5; + + cullBits[i] = bits; + } + } + +#endif } /* @@ -637,6 +670,7 @@ static void R_CopyDecalSurface( idDrawVert* verts, int numVerts, triIndex_t* ind 
assert( ( ( decal->numIndexes * sizeof( triIndex_t ) ) & 15 ) == 0 ); assert_16_byte_aligned( fadeColor ); +#if defined(USE_INTRINSICS) const __m128i vector_int_num_verts = _mm_shuffle_epi32( _mm_cvtsi32_si128( numVerts ), 0 ); const __m128i vector_short_num_verts = _mm_packs_epi32( vector_int_num_verts, vector_int_num_verts ); @@ -678,6 +712,28 @@ static void R_CopyDecalSurface( idDrawVert* verts, int numVerts, triIndex_t* ind _mm_sfence(); +#else + + // copy vertices and apply depth/time based fading + for( int i = 0; i < decal->numVerts; i++ ) + { + // NOTE: bad out-of-order write-combined write, SIMD code does the right thing + verts[numVerts + i] = decal->verts[i]; + for( int j = 0; j < 4; j++ ) + { + verts[numVerts + i].color[j] = idMath::Ftob( fadeColor[j] * decal->vertDepthFade[i] ); + } + } + + // copy indices + assert( ( decal->numIndexes & 1 ) == 0 ); + for( int i = 0; i < decal->numIndexes; i += 2 ) + { + assert( decal->indexes[i + 0] < decal->numVerts && decal->indexes[i + 1] < decal->numVerts ); + WriteIndexPair( &indexes[numIndexes + i], numVerts + decal->indexes[i + 0], numVerts + decal->indexes[i + 1] ); + } + +#endif } /* diff --git a/neo/renderer/ModelOverlay.cpp b/neo/renderer/ModelOverlay.cpp index c7610850..d2cf2569 100644 --- a/neo/renderer/ModelOverlay.cpp +++ b/neo/renderer/ModelOverlay.cpp @@ -3,6 +3,7 @@ Doom 3 BFG Edition GPL Source Code Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company. +Copyright (C) 2013 Robert Beckebans This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code"). 
@@ -111,7 +112,7 @@ static void R_OverlayPointCullStatic( byte* cullBits, halfFloat_t* texCoordS, ha assert_16_byte_aligned( texCoordT ); assert_16_byte_aligned( verts ); - +#if defined(USE_INTRINSICS) idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts ); const __m128 vector_float_zero = { 0.0f, 0.0f, 0.0f, 0.0f }; @@ -187,6 +188,41 @@ static void R_OverlayPointCullStatic( byte* cullBits, halfFloat_t* texCoordS, ha } } +#else + + idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 1 > vertsODS( verts, numVerts ); + + for( int i = 0; i < numVerts; ) + { + + const int nextNumVerts = vertsODS.FetchNextBatch() - 1; + + for( ; i <= nextNumVerts; i++ ) + { + const idVec3& v = vertsODS[i].xyz; + + const float d0 = planes[0].Distance( v ); + const float d1 = planes[1].Distance( v ); + const float d2 = 1.0f - d0; + const float d3 = 1.0f - d1; + + halfFloat_t s = Scalar_FastF32toF16( d0 ); + halfFloat_t t = Scalar_FastF32toF16( d1 ); + + texCoordS[i] = s; + texCoordT[i] = t; + + byte bits; + bits = IEEE_FLT_SIGNBITSET( d0 ) << 0; + bits |= IEEE_FLT_SIGNBITSET( d1 ) << 1; + bits |= IEEE_FLT_SIGNBITSET( d2 ) << 2; + bits |= IEEE_FLT_SIGNBITSET( d3 ) << 3; + + cullBits[i] = bits; + } + } + +#endif } /* @@ -201,7 +237,7 @@ static void R_OverlayPointCullSkinned( byte* cullBits, halfFloat_t* texCoordS, h assert_16_byte_aligned( texCoordT ); assert_16_byte_aligned( verts ); - +#if defined(USE_INTRINSICS) idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts ); const __m128 vector_float_zero = { 0.0f, 0.0f, 0.0f, 0.0f }; @@ -277,6 +313,41 @@ static void R_OverlayPointCullSkinned( byte* cullBits, halfFloat_t* texCoordS, h } } +#else + + idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 1 > vertsODS( verts, numVerts ); + + for( int i = 0; i < numVerts; ) + { + + const int nextNumVerts = vertsODS.FetchNextBatch() - 1; + + for( ; i <= nextNumVerts; i++ ) + { + const idVec3 transformed = Scalar_LoadSkinnedDrawVertPosition( vertsODS[i], joints 
); + + const float d0 = planes[0].Distance( transformed ); + const float d1 = planes[1].Distance( transformed ); + const float d2 = 1.0f - d0; + const float d3 = 1.0f - d1; + + halfFloat_t s = Scalar_FastF32toF16( d0 ); + halfFloat_t t = Scalar_FastF32toF16( d1 ); + + texCoordS[i] = s; + texCoordT[i] = t; + + byte bits; + bits = IEEE_FLT_SIGNBITSET( d0 ) << 0; + bits |= IEEE_FLT_SIGNBITSET( d1 ) << 1; + bits |= IEEE_FLT_SIGNBITSET( d2 ) << 2; + bits |= IEEE_FLT_SIGNBITSET( d3 ) << 3; + + cullBits[i] = bits; + } + } + +#endif } /* @@ -486,6 +557,7 @@ static void R_CopyOverlaySurface( idDrawVert* verts, int numVerts, triIndex_t* i assert( ( ( overlay->numVerts * sizeof( idDrawVert ) ) & 15 ) == 0 ); assert( ( ( overlay->numIndexes * sizeof( triIndex_t ) ) & 15 ) == 0 ); +#if defined(USE_INTRINSICS) const __m128i vector_int_clear_last = _mm_set_epi32( 0, -1, -1, -1 ); const __m128i vector_int_num_verts = _mm_shuffle_epi32( _mm_cvtsi32_si128( numVerts ), 0 ); @@ -524,6 +596,30 @@ static void R_CopyOverlaySurface( idDrawVert* verts, int numVerts, triIndex_t* i _mm_sfence(); +#else + + // copy vertices + for( int i = 0; i < overlay->numVerts; i++ ) + { + const overlayVertex_t& overlayVert = overlay->verts[i]; + + // NOTE: bad out-of-order write-combined write, SIMD code does the right thing + verts[numVerts + i] = sourceVerts[overlayVert.vertexNum]; + + // RB begin + verts[numVerts + i].SetTexCoordS( overlayVert.st[0] ); + verts[numVerts + i].SetTexCoordT( overlayVert.st[1] ); + // RB end + } + + // copy indexes + for( int i = 0; i < overlay->numIndexes; i += 2 ) + { + assert( overlay->indexes[i + 0] < overlay->numVerts && overlay->indexes[i + 1] < overlay->numVerts ); + WriteIndexPair( &indexes[numIndexes + i], numVerts + overlay->indexes[i + 0], numVerts + overlay->indexes[i + 1] ); + } + +#endif } /* diff --git a/neo/renderer/Model_md5.cpp b/neo/renderer/Model_md5.cpp index 8621b7ac..6d132ded 100644 --- a/neo/renderer/Model_md5.cpp +++ b/neo/renderer/Model_md5.cpp @@ 
-32,10 +32,10 @@ If you have questions concerning this license or the applicable additional terms #include "tr_local.h" #include "Model_local.h" - +#if defined(USE_INTRINSICS) static const __m128 vector_float_posInfinity = { idMath::INFINITY, idMath::INFINITY, idMath::INFINITY, idMath::INFINITY }; static const __m128 vector_float_negInfinity = { -idMath::INFINITY, -idMath::INFINITY, -idMath::INFINITY, -idMath::INFINITY }; - +#endif static const char* MD5_SnapshotName = "_MD5_Snapshot_"; @@ -561,6 +561,7 @@ idMD5Mesh::CalculateBounds */ void idMD5Mesh::CalculateBounds( const idJointMat* entJoints, idBounds& bounds ) const { +#if defined(USE_INTRINSICS) __m128 minX = vector_float_posInfinity; __m128 minY = vector_float_posInfinity; @@ -595,6 +596,17 @@ void idMD5Mesh::CalculateBounds( const idJointMat* entJoints, idBounds& bounds ) _mm_store_ss( bounds.ToFloatPtr() + 4, _mm_splat_ps( maxY, 3 ) ); _mm_store_ss( bounds.ToFloatPtr() + 5, _mm_splat_ps( maxZ, 3 ) ); +#else + + bounds.Clear(); + for( int i = 0; i < numMeshJoints; i++ ) + { + const idJointMat& joint = entJoints[meshJoints[i]]; + bounds.AddPoint( joint.GetTranslation() ); + } + bounds.ExpandSelf( maxJointVertDist ); + +#endif } /* @@ -1220,6 +1232,7 @@ static void TransformJoints( idJointMat* __restrict outJoints, const int numJoin assert_16_byte_aligned( inFloats1 ); assert_16_byte_aligned( inFloats2 ); +#if defined(USE_INTRINSICS) const __m128 mask_keep_last = __m128c( _mm_set_epi32( 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 ) ); @@ -1296,6 +1309,14 @@ static void TransformJoints( idJointMat* __restrict outJoints, const int numJoin _mm_store_ps( outFloats + 1 * 12 + 8, ri1 ); } +#else + + for( int i = 0; i < numJoints; i++ ) + { + idJointMat::Multiply( outJoints[i], inJoints1[i], inJoints2[i] ); + } + +#endif } /* diff --git a/neo/renderer/jobs/ShadowShared.cpp b/neo/renderer/jobs/ShadowShared.cpp index 843f9b4b..d983a84e 100644 --- a/neo/renderer/jobs/ShadowShared.cpp +++ 
b/neo/renderer/jobs/ShadowShared.cpp @@ -92,7 +92,7 @@ static void R_ShadowVolumeCullBits( byte* cullBits, byte& totalOr, const float r assert_16_byte_aligned( cullBits ); assert_16_byte_aligned( verts ); - +#if defined(USE_INTRINSICS) idODSStreamedArray< idShadowVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts ); const __m128 vector_float_radius = _mm_splat_ps( _mm_load_ss( &radius ), 0 ); @@ -215,6 +215,56 @@ static void R_ShadowVolumeCullBits( byte* cullBits, byte& totalOr, const float r totalOr = ( byte ) _mm_cvtsi128_si32( vecTotalOrByte ); +#else + + idODSStreamedArray< idShadowVert, 16, SBT_DOUBLE, 1 > vertsODS( verts, numVerts ); + + byte tOr = 0; + for( int i = 0; i < numVerts; ) + { + + const int nextNumVerts = vertsODS.FetchNextBatch() - 1; + + for( ; i <= nextNumVerts; i++ ) + { + const idVec3& v = vertsODS[i].xyzw.ToVec3(); + + const float d0 = planes[0].Distance( v ); + const float d1 = planes[1].Distance( v ); + const float d2 = planes[2].Distance( v ); + const float d3 = planes[3].Distance( v ); + + const float t0 = d0 + radius; + const float t1 = d1 + radius; + const float t2 = d2 + radius; + const float t3 = d3 + radius; + + const float s0 = d0 - radius; + const float s1 = d1 - radius; + const float s2 = d2 - radius; + const float s3 = d3 - radius; + + byte bits; + bits = IEEE_FLT_SIGNBITSET( t0 ) << 0; + bits |= IEEE_FLT_SIGNBITSET( t1 ) << 1; + bits |= IEEE_FLT_SIGNBITSET( t2 ) << 2; + bits |= IEEE_FLT_SIGNBITSET( t3 ) << 3; + + bits |= IEEE_FLT_SIGNBITSET( s0 ) << 4; + bits |= IEEE_FLT_SIGNBITSET( s1 ) << 5; + bits |= IEEE_FLT_SIGNBITSET( s2 ) << 6; + bits |= IEEE_FLT_SIGNBITSET( s3 ) << 7; + + bits ^= 0x0F; // flip lower four bits + + tOr |= bits; + cullBits[i] = bits; + } + } + + totalOr = tOr; + +#endif } /* diff --git a/neo/renderer/jobs/dynamicshadowvolume/DynamicShadowVolume.cpp b/neo/renderer/jobs/dynamicshadowvolume/DynamicShadowVolume.cpp index e2d0ae45..e4de3142 100644 --- 
a/neo/renderer/jobs/dynamicshadowvolume/DynamicShadowVolume.cpp +++ b/neo/renderer/jobs/dynamicshadowvolume/DynamicShadowVolume.cpp @@ -31,14 +31,16 @@ If you have questions concerning this license or the applicable additional terms #include "../../../idlib/sys/sys_intrinsics.h" #include "../../../idlib/geometry/DrawVert_intrinsics.h" - +#if defined(USE_INTRINSICS) static const __m128i vector_int_neg_one = _mm_set_epi32( -1, -1, -1, -1 ); +#endif /* ===================== TriangleFacing_SSE2 ===================== */ +#if defined(USE_INTRINSICS) static ID_FORCE_INLINE __m128i TriangleFacing_SSE2( const __m128& vert0X, const __m128& vert0Y, const __m128& vert0Z, const __m128& vert1X, const __m128& vert1Y, const __m128& vert1Z, const __m128& vert2X, const __m128& vert2Y, const __m128& vert2Z, @@ -60,6 +62,7 @@ static ID_FORCE_INLINE __m128i TriangleFacing_SSE2( const __m128& vert0X, const const __m128 delta = _mm_nmsub_ps( lightOriginX, normalX, _mm_nmsub_ps( lightOriginY, normalY, _mm_nmsub_ps( lightOriginZ, normalZ, normalW ) ) ); return _mm_castps_si128( _mm_cmplt_ps( delta, _mm_setzero_ps() ) ); } +#endif /* ===================== @@ -68,6 +71,7 @@ TriangleCulled The clip space of the 'lightProject' is assumed to be in the range [0, 1]. ===================== */ +#if defined(USE_INTRINSICS) static ID_FORCE_INLINE __m128i TriangleCulled_SSE2( const __m128& vert0X, const __m128& vert0Y, const __m128& vert0Z, const __m128& vert1X, const __m128& vert1Y, const __m128& vert1Z, const __m128& vert2X, const __m128& vert2Y, const __m128& vert2Z, @@ -128,6 +132,92 @@ static ID_FORCE_INLINE __m128i TriangleCulled_SSE2( const __m128& vert0X, const return _mm_castps_si128( _mm_cmpeq_ps( b0, zero ) ); } +#else + +/* +===================== +TriangleFacing + +Returns 255 if the triangle is facing the light origin, otherwise returns 0. 
+===================== +*/ +static byte TriangleFacing_Generic( const idVec3& v1, const idVec3& v2, const idVec3& v3, const idVec3& lightOrigin ) +{ + const float sx = v2.x - v1.x; + const float sy = v2.y - v1.y; + const float sz = v2.z - v1.z; + + const float tx = v3.x - v1.x; + const float ty = v3.y - v1.y; + const float tz = v3.z - v1.z; + + const float normalX = ty * sz - tz * sy; + const float normalY = tz * sx - tx * sz; + const float normalZ = tx * sy - ty * sx; + const float normalW = normalX * v1.x + normalY * v1.y + normalZ * v1.z; + + const float d = lightOrigin.x * normalX + lightOrigin.y * normalY + lightOrigin.z * normalZ - normalW; + return ( d > 0.0f ) ? 255 : 0; +} + +/* +===================== +TriangleCulled + +Returns 255 if the triangle is culled to the light projection matrix, otherwise returns 0. +The clip space of the 'lightProject' is assumed to be in the range [0, 1]. +===================== +*/ +static byte TriangleCulled_Generic( const idVec3& v1, const idVec3& v2, const idVec3& v3, const idRenderMatrix& lightProject ) +{ + // transform the triangle + idVec4 c[3]; + for( int i = 0; i < 4; i++ ) + { + c[0][i] = v1[0] * lightProject[i][0] + v1[1] * lightProject[i][1] + v1[2] * lightProject[i][2] + lightProject[i][3]; + c[1][i] = v2[0] * lightProject[i][0] + v2[1] * lightProject[i][1] + v2[2] * lightProject[i][2] + lightProject[i][3]; + c[2][i] = v3[0] * lightProject[i][0] + v3[1] * lightProject[i][1] + v3[2] * lightProject[i][2] + lightProject[i][3]; + } + + // calculate the culled bits + int bits = 0; + for( int i = 0; i < 3; i++ ) + { + const float minW = 0.0f; + const float maxW = c[i][3]; + + if( c[i][0] > minW ) + { + bits |= ( 1 << 0 ); + } + if( c[i][0] < maxW ) + { + bits |= ( 1 << 1 ); + } + if( c[i][1] > minW ) + { + bits |= ( 1 << 2 ); + } + if( c[i][1] < maxW ) + { + bits |= ( 1 << 3 ); + } + if( c[i][2] > minW ) + { + bits |= ( 1 << 4 ); + } + if( c[i][2] < maxW ) + { + bits |= ( 1 << 5 ); + } + } + + // if any bits weren't set, 
the triangle is completely off one side of the frustum + return ( bits != 63 ) ? 255 : 0; +} + +#endif + /* ===================== @@ -159,6 +249,7 @@ static int CalculateTriangleFacingCulledStatic( byte* __restrict facing, byte* _ const idVec3 lineDir = lineDelta * lineLengthRcp; const float lineLength = lineLengthSqr * lineLengthRcp; +#if defined(USE_INTRINSICS) idODSStreamedIndexedArray< idDrawVert, triIndex_t, 32, SBT_QUAD, 4* 3 > indexedVertsODS( verts, numVerts, indexes, numIndexes ); @@ -271,6 +362,61 @@ static int CalculateTriangleFacingCulledStatic( byte* __restrict facing, byte* _ return _mm_cvtsi128_si32( numFrontFacing ); +#else + + idODSStreamedIndexedArray< idDrawVert, triIndex_t, 32, SBT_QUAD, 1 > indexedVertsODS( verts, numVerts, indexes, numIndexes ); + + const byte cullShadowTrianglesToLightMask = cullShadowTrianglesToLight ? 255 : 0; + + int numFrontFacing = 0; + + for( int i = 0, j = 0; i < numIndexes; ) + { + + const int batchStart = i; + const int batchEnd = indexedVertsODS.FetchNextBatch(); + const int indexStart = j; + + for( ; i <= batchEnd - 3; i += 3, j++ ) + { + const idVec3& v1 = indexedVertsODS[i + 0].xyz; + const idVec3& v2 = indexedVertsODS[i + 1].xyz; + const idVec3& v3 = indexedVertsODS[i + 2].xyz; + + const byte triangleCulled = TriangleCulled_Generic( v1, v2, v3, lightProject ); + + byte triangleFacing = TriangleFacing_Generic( v1, v2, v3, lightOrigin ); + + // optionally make triangles that are outside the light frustum facing so they do not contribute to the shadow volume + triangleFacing |= ( triangleCulled & cullShadowTrianglesToLightMask ); + + culled[j] = triangleCulled; + facing[j] = triangleFacing; + + // count the number of facing triangles + numFrontFacing += ( triangleFacing & 1 ); + } + + if( insideShadowVolume != NULL ) + { + for( int k = batchStart, n = indexStart; k <= batchEnd - 3; k += 3, n++ ) + { + if( !facing[n] ) + { + if( R_LineIntersectsTriangleExpandedWithSphere( lineStart, lineEnd, lineDir, lineLength, 
radius, indexedVertsODS[k + 2].xyz, indexedVertsODS[k + 1].xyz, indexedVertsODS[k + 0].xyz ) ) + { + *insideShadowVolume = true; + insideShadowVolume = NULL; + break; + } + } + } + } + } + + return numFrontFacing; + +#endif } /* @@ -303,6 +449,7 @@ static int CalculateTriangleFacingCulledSkinned( byte* __restrict facing, byte* const idVec3 lineDir = lineDelta * lineLengthRcp; const float lineLength = lineLengthSqr * lineLengthRcp; +#if defined(USE_INTRINSICS) idODSStreamedArray< idDrawVert, 32, SBT_DOUBLE, 1 > vertsODS( verts, numVerts ); @@ -448,6 +595,82 @@ static int CalculateTriangleFacingCulledSkinned( byte* __restrict facing, byte* return _mm_cvtsi128_si32( numFrontFacing ); +#else + + idODSStreamedArray< idDrawVert, 32, SBT_DOUBLE, 1 > vertsODS( verts, numVerts ); + + for( int i = 0; i < numVerts; ) + { + + const int nextNumVerts = vertsODS.FetchNextBatch() - 1; + + for( ; i <= nextNumVerts; i++ ) + { + tempVerts[i].ToVec3() = Scalar_LoadSkinnedDrawVertPosition( vertsODS[i], joints ); + tempVerts[i].w = 1.0f; + } + } + + idODSStreamedArray< triIndex_t, 256, SBT_QUAD, 1 > indexesODS( indexes, numIndexes ); + + const byte cullShadowTrianglesToLightMask = cullShadowTrianglesToLight ? 
255 : 0; + + int numFrontFacing = 0; + + for( int i = 0, j = 0; i < numIndexes; ) + { + + const int batchStart = i; + const int batchEnd = indexesODS.FetchNextBatch(); + const int indexStart = j; + + for( ; i <= batchEnd - 3; i += 3, j++ ) + { + const int i0 = indexesODS[i + 0]; + const int i1 = indexesODS[i + 1]; + const int i2 = indexesODS[i + 2]; + + const idVec3& v1 = tempVerts[i0].ToVec3(); + const idVec3& v2 = tempVerts[i1].ToVec3(); + const idVec3& v3 = tempVerts[i2].ToVec3(); + + const byte triangleCulled = TriangleCulled_Generic( v1, v2, v3, lightProject ); + + byte triangleFacing = TriangleFacing_Generic( v1, v2, v3, lightOrigin ); + + // optionally make triangles that are outside the light frustum facing so they do not contribute to the shadow volume + triangleFacing |= ( triangleCulled & cullShadowTrianglesToLightMask ); + + culled[j] = triangleCulled; + facing[j] = triangleFacing; + + // count the number of facing triangles + numFrontFacing += ( triangleFacing & 1 ); + } + + if( insideShadowVolume != NULL ) + { + for( int k = batchStart, n = indexStart; k <= batchEnd - 3; k += 3, n++ ) + { + if( !facing[n] ) + { + const int i0 = indexesODS[k + 0]; + const int i1 = indexesODS[k + 1]; + const int i2 = indexesODS[k + 2]; + if( R_LineIntersectsTriangleExpandedWithSphere( lineStart, lineEnd, lineDir, lineLength, radius, tempVerts[i2].ToVec3(), tempVerts[i1].ToVec3(), tempVerts[i0].ToVec3() ) ) + { + *insideShadowVolume = true; + insideShadowVolume = NULL; + break; + } + } + } + } + } + + return numFrontFacing; + +#endif } /* @@ -461,6 +684,7 @@ static void StreamOut( void* dst, const void* src, int numBytes ) assert_16_byte_aligned( dst ); assert_16_byte_aligned( src ); +#if defined(USE_INTRINSICS) int i = 0; for( ; i + 128 <= numBytes; i += 128 ) { @@ -486,6 +710,9 @@ static void StreamOut( void* dst, const void* src, int numBytes ) __m128i d = _mm_load_si128( ( __m128i* )( ( byte* )src + i ) ); _mm_stream_si128( ( __m128i* )( ( byte* )dst + i ), d ); } 
+#else + memcpy( dst, src, numBytes ); +#endif } /* @@ -706,7 +933,9 @@ static void R_CreateShadowVolumeTriangles( triIndex_t* __restrict shadowIndices, numShadowIndexesTotal = numShadowIndices; +#if defined(USE_INTRINSICS) _mm_sfence(); +#endif #else // NOTE: this code will not work on the SPU because it tries to write directly to the destination @@ -893,7 +1122,9 @@ void R_CreateLightTriangles( triIndex_t* __restrict lightIndices, triIndex_t* __ numLightIndicesTotal = numLightIndices; +#if defined(USE_INTRINSICS) _mm_sfence(); +#endif #else // NOTE: this code will not work on the SPU because it tries to write directly to the destination diff --git a/neo/renderer/tr_trace.cpp b/neo/renderer/tr_trace.cpp index ce886462..c4bffb4b 100644 --- a/neo/renderer/tr_trace.cpp +++ b/neo/renderer/tr_trace.cpp @@ -3,6 +3,7 @@ Doom 3 BFG Edition GPL Source Code Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company. +Copyright (C) 2013 Robert Beckebans This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code"). 
@@ -44,7 +45,7 @@ static void R_TracePointCullStatic( byte* cullBits, byte& totalOr, const float r assert_16_byte_aligned( cullBits ); assert_16_byte_aligned( verts ); - +#if defined(USE_INTRINSICS) idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts ); const __m128 vector_float_radius = _mm_splat_ps( _mm_load_ss( &radius ), 0 ); @@ -167,6 +168,56 @@ static void R_TracePointCullStatic( byte* cullBits, byte& totalOr, const float r totalOr = ( byte ) _mm_cvtsi128_si32( vecTotalOrByte ); +#else + + idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 1 > vertsODS( verts, numVerts ); + + byte tOr = 0; + for( int i = 0; i < numVerts; ) + { + + const int nextNumVerts = vertsODS.FetchNextBatch() - 1; + + for( ; i <= nextNumVerts; i++ ) + { + const idVec3& v = vertsODS[i].xyz; + + const float d0 = planes[0].Distance( v ); + const float d1 = planes[1].Distance( v ); + const float d2 = planes[2].Distance( v ); + const float d3 = planes[3].Distance( v ); + + const float t0 = d0 + radius; + const float t1 = d1 + radius; + const float t2 = d2 + radius; + const float t3 = d3 + radius; + + const float s0 = d0 - radius; + const float s1 = d1 - radius; + const float s2 = d2 - radius; + const float s3 = d3 - radius; + + byte bits; + bits = IEEE_FLT_SIGNBITSET( t0 ) << 0; + bits |= IEEE_FLT_SIGNBITSET( t1 ) << 1; + bits |= IEEE_FLT_SIGNBITSET( t2 ) << 2; + bits |= IEEE_FLT_SIGNBITSET( t3 ) << 3; + + bits |= IEEE_FLT_SIGNBITSET( s0 ) << 4; + bits |= IEEE_FLT_SIGNBITSET( s1 ) << 5; + bits |= IEEE_FLT_SIGNBITSET( s2 ) << 6; + bits |= IEEE_FLT_SIGNBITSET( s3 ) << 7; + + bits ^= 0x0F; // flip lower four bits + + tOr |= bits; + cullBits[i] = bits; + } + } + + totalOr = tOr; + +#endif } /* @@ -179,7 +230,7 @@ static void R_TracePointCullSkinned( byte* cullBits, byte& totalOr, const float assert_16_byte_aligned( cullBits ); assert_16_byte_aligned( verts ); - +#if defined(USE_INTRINSICS) idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts ); 
const __m128 vector_float_radius = _mm_splat_ps( _mm_load_ss( &radius ), 0 ); @@ -302,6 +353,56 @@ static void R_TracePointCullSkinned( byte* cullBits, byte& totalOr, const float totalOr = ( byte ) _mm_cvtsi128_si32( vecTotalOrByte ); +#else + + idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 1 > vertsODS( verts, numVerts ); + + byte tOr = 0; + for( int i = 0; i < numVerts; ) + { + + const int nextNumVerts = vertsODS.FetchNextBatch() - 1; + + for( ; i <= nextNumVerts; i++ ) + { + const idVec3 v = Scalar_LoadSkinnedDrawVertPosition( vertsODS[i], joints ); + + const float d0 = planes[0].Distance( v ); + const float d1 = planes[1].Distance( v ); + const float d2 = planes[2].Distance( v ); + const float d3 = planes[3].Distance( v ); + + const float t0 = d0 + radius; + const float t1 = d1 + radius; + const float t2 = d2 + radius; + const float t3 = d3 + radius; + + const float s0 = d0 - radius; + const float s1 = d1 - radius; + const float s2 = d2 - radius; + const float s3 = d3 - radius; + + byte bits; + bits = IEEE_FLT_SIGNBITSET( t0 ) << 0; + bits |= IEEE_FLT_SIGNBITSET( t1 ) << 1; + bits |= IEEE_FLT_SIGNBITSET( t2 ) << 2; + bits |= IEEE_FLT_SIGNBITSET( t3 ) << 3; + + bits |= IEEE_FLT_SIGNBITSET( s0 ) << 4; + bits |= IEEE_FLT_SIGNBITSET( s1 ) << 5; + bits |= IEEE_FLT_SIGNBITSET( s2 ) << 6; + bits |= IEEE_FLT_SIGNBITSET( s3 ) << 7; + + bits ^= 0x0F; // flip lower four bits + + tOr |= bits; + cullBits[i] = bits; + } + } + + totalOr = tOr; + +#endif } /* diff --git a/neo/renderer/tr_trisurf.cpp b/neo/renderer/tr_trisurf.cpp index 97a2533b..d698d4f2 100644 --- a/neo/renderer/tr_trisurf.cpp +++ b/neo/renderer/tr_trisurf.cpp @@ -1722,10 +1722,12 @@ void R_TestDegenerateTextureSpace( srfTriangles_t* tri ) const idDrawVert& b = tri->verts[tri->indexes[i + 1]]; const idDrawVert& c = tri->verts[tri->indexes[i + 2]]; - if( a.st == b.st || b.st == c.st || c.st == a.st ) + // RB: compare texcoords instead of pointers + if( a.GetTexCoord() == b.GetTexCoord() || b.GetTexCoord() == 
c.GetTexCoord() || c.GetTexCoord() == a.GetTexCoord() ) { c_degenerate++; } + // RB end } if( c_degenerate ) From 2a4970c86c74eceda2b06e90c797ce935149deaf Mon Sep 17 00:00:00 2001 From: Robert Beckebans Date: Sat, 1 Jun 2013 15:15:18 +0200 Subject: [PATCH 3/6] Added CMake batch file for OpenAL --- neo/cmake-vs2012-32bit-openal.bat | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 neo/cmake-vs2012-32bit-openal.bat diff --git a/neo/cmake-vs2012-32bit-openal.bat b/neo/cmake-vs2012-32bit-openal.bat new file mode 100644 index 00000000..8064e4c5 --- /dev/null +++ b/neo/cmake-vs2012-32bit-openal.bat @@ -0,0 +1,6 @@ +cd .. +del /s /q build +mkdir build +cd build +cmake -G "Visual Studio 11" -DCMAKE_INSTALL_PREFIX=../bin/win8-32 -DOPENAL=ON ../neo +pause \ No newline at end of file From 3b67eabf79807201efc0da6f0d56fc89f7b899f3 Mon Sep 17 00:00:00 2001 From: Robert Beckebans Date: Sat, 1 Jun 2013 18:29:12 +0200 Subject: [PATCH 4/6] Fixed critical bug in the generic C++ code of DotProduct_SIMD that caused massive errors in the physics system --- neo/idlib/math/Lcp.cpp | 166 +++++++++++++++++------------------------ 1 file changed, 67 insertions(+), 99 deletions(-) diff --git a/neo/idlib/math/Lcp.cpp b/neo/idlib/math/Lcp.cpp index eb8c5354..807bb270 100644 --- a/neo/idlib/math/Lcp.cpp +++ b/neo/idlib/math/Lcp.cpp @@ -220,31 +220,49 @@ static float DotProduct_SIMD( const float* src0, const float* src1, const int co #else + // RB: the old loop caused completely broken rigid body physics and NaN errors +#if 1 float s0 = 0.0f; float s1 = 0.0f; float s2 = 0.0f; float s3 = 0.0f; + int i = 0; - for( ; i < count - 3; i += 4 ) + for( ; i + 4 <= count; i += 4 ) { - s0 += src0[i + 4] * src1[i + 4]; - s1 += src0[i + 5] * src1[i + 5]; - s2 += src0[i + 6] * src1[i + 6]; - s3 += src0[i + 7] * src1[i + 7]; + s0 += src0[i + 0] * src1[i + 0]; + s1 += src0[i + 1] * src1[i + 1]; + s2 += src0[i + 2] * src1[i + 2]; + s3 += src0[i + 3] * src1[i + 3]; } + switch( count - i ) { NODEFAULT; 
+ + case 4: + s3 += src0[i + 3] * src1[i + 3]; case 3: - s0 += src0[i + 2] * src1[i + 2]; + s2 += src0[i + 2] * src1[i + 2]; case 2: s1 += src0[i + 1] * src1[i + 1]; case 1: - s2 += src0[i + 0] * src1[i + 0]; + s0 += src0[i + 0] * src1[i + 0]; case 0: break; } return s0 + s1 + s2 + s3; +#else + + float s = 0; + for( int i = 0; i < count; i++ ) + { + s += src0[i] * src1[i]; + } + + return s; +#endif + // RB end #endif } @@ -1519,114 +1537,64 @@ static void GetMaxStep_SIMD( const float* f, const float* a, const float* delta_ _mm_store_ss( & maxStep, vMaxStep ); limit = _mm_cvtsi128_si32( vLimit ); limitSide = _mm_cvtsi128_si32( vLimitSide ); + #else - int i; - float s; // default to a full step for the current variable - if( idMath::Fabs( delta_a[d] ) > LCP_DELTA_ACCEL_EPSILON ) { - maxStep = -a[d] / delta_a[d]; + float negAccel = -a[d]; + float deltaAccel = delta_a[d]; + int m0 = ( fabs( deltaAccel ) > LCP_DELTA_ACCEL_EPSILON ); + float step = negAccel / ( m0 ? deltaAccel : 1.0f ); + maxStep = m0 ? step : 0.0f; + limit = d; + limitSide = 0; } - else - { - maxStep = 0.0f; - } - limit = d; - limitSide = 0; // test the current variable - if( dir < 0.0f ) { - if( lo[d] != -idMath::INFINITY ) - { - s = ( lo[d] - f[d] ) / dir; - if( s < maxStep ) - { - maxStep = s; - limitSide = -1; - } - } - } - else - { - if( hi[d] != idMath::INFINITY ) - { - s = ( hi[d] - f[d] ) / dir; - if( s < maxStep ) - { - maxStep = s; - limitSide = 1; - } - } + float deltaForce = dir; + float forceLimit = ( deltaForce < 0.0f ) ? lo[d] : hi[d]; + float step = ( forceLimit - f[d] ) / deltaForce; + int setSide = ( deltaForce < 0.0f ) ? -1 : 1; + int m0 = ( fabs( deltaForce ) > LCP_DELTA_FORCE_EPSILON ); + int m1 = ( fabs( forceLimit ) != idMath::INFINITY ); + int m2 = ( step < maxStep ); + int m3 = ( m0 & m1 & m2 ); + maxStep = m3 ? step : maxStep; + limit = m3 ? d : limit; + limitSide = m3 ? 
setSide : limitSide; } // test the clamped bounded variables - for( i = numUnbounded; i < numClamped; i++ ) + for( int i = numUnbounded; i < numClamped; i++ ) { - if( delta_f[i] < -LCP_DELTA_FORCE_EPSILON ) - { - // if there is a low boundary - if( lo[i] != -idMath::INFINITY ) - { - s = ( lo[i] - f[i] ) / delta_f[i]; - if( s < maxStep ) - { - maxStep = s; - limit = i; - limitSide = -1; - } - } - } - else if( delta_f[i] > LCP_DELTA_FORCE_EPSILON ) - { - // if there is a high boundary - if( hi[i] != idMath::INFINITY ) - { - s = ( hi[i] - f[i] ) / delta_f[i]; - if( s < maxStep ) - { - maxStep = s; - limit = i; - limitSide = 1; - } - } - } + float deltaForce = delta_f[i]; + float forceLimit = ( deltaForce < 0.0f ) ? lo[i] : hi[i]; + int m0 = ( fabs( deltaForce ) > LCP_DELTA_FORCE_EPSILON ); + float step = ( forceLimit - f[i] ) / ( m0 ? deltaForce : 1.0f ); + int setSide = ( deltaForce < 0.0f ) ? -1 : 1; + int m1 = ( fabs( forceLimit ) != idMath::INFINITY ); + int m2 = ( step < maxStep ); + int m3 = ( m0 & m1 & m2 ); + maxStep = m3 ? step : maxStep; + limit = m3 ? i : limit; + limitSide = m3 ? setSide : limitSide; } // test the not clamped bounded variables - for( i = numClamped; i < d; i++ ) + for( int i = numClamped; i < d; i++ ) { - if( side[i] == -1 ) - { - if( delta_a[i] >= -LCP_DELTA_ACCEL_EPSILON ) - { - continue; - } - } - else if( side[i] == 1 ) - { - if( delta_a[i] <= LCP_DELTA_ACCEL_EPSILON ) - { - continue; - } - } - else - { - continue; - } - // ignore variables for which the force is not allowed to take any substantial value - if( lo[i] >= -LCP_BOUND_EPSILON && hi[i] <= LCP_BOUND_EPSILON ) - { - continue; - } - s = -a[i] / delta_a[i]; - if( s < maxStep ) - { - maxStep = s; - limit = i; - limitSide = 0; - } + float negAccel = -a[i]; + float deltaAccel = delta_a[i]; + int m0 = ( side[i] * deltaAccel > LCP_DELTA_ACCEL_EPSILON ); + float step = negAccel / ( m0 ? 
deltaAccel : 1.0f ); + int m1 = ( lo[i] < -LCP_BOUND_EPSILON || hi[i] > LCP_BOUND_EPSILON ); + int m2 = ( step < maxStep ); + int m3 = ( m0 & m1 & m2 ); + maxStep = m3 ? step : maxStep; + limit = m3 ? i : limit; + limitSide = m3 ? 0 : limitSide; } #endif From e6b4326f86dad46c0c0115355be449c4deef8983 Mon Sep 17 00:00:00 2001 From: Dmitry Shapovalov Date: Mon, 3 Jun 2013 14:56:25 +0600 Subject: [PATCH 5/6] Code duplication in idWeapon::Clear --- neo/d3xp/Weapon.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/neo/d3xp/Weapon.cpp b/neo/d3xp/Weapon.cpp index 1afd7caf..babad778 100644 --- a/neo/d3xp/Weapon.cpp +++ b/neo/d3xp/Weapon.cpp @@ -747,11 +747,6 @@ void idWeapon::Clear() WEAPON_RAISEWEAPON.Unlink(); WEAPON_LOWERWEAPON.Unlink(); - if( muzzleFlashHandle != -1 ) - { - gameRenderWorld->FreeLightDef( muzzleFlashHandle ); - muzzleFlashHandle = -1; - } if( muzzleFlashHandle != -1 ) { gameRenderWorld->FreeLightDef( muzzleFlashHandle ); From 88b23611f9f74cde7a0b9de034ee83273e182590 Mon Sep 17 00:00:00 2001 From: Daniel Gibson Date: Sun, 23 Jun 2013 19:16:23 +0200 Subject: [PATCH 6/6] Fix DEBUG_THREADS on FreeBSD There was a typo in an #include, furthermore FreeBSD has no pthread_getname_np equivalent. I added Sys_GetThreadName() so the code is a bit cleaner. 
--- neo/idlib/sys/posix/posix_thread.cpp | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/neo/idlib/sys/posix/posix_thread.cpp b/neo/idlib/sys/posix/posix_thread.cpp index a0bd8fca..d51a93f0 100644 --- a/neo/idlib/sys/posix/posix_thread.cpp +++ b/neo/idlib/sys/posix/posix_thread.cpp @@ -31,7 +31,7 @@ If you have questions concerning this license or the applicable additional terms #include "../../precompiled.h" #ifdef __FreeBSD__ -#include // for pthread_set_name_np +#include <pthread_np.h> // for pthread_set_name_np #endif // DG: Note: On Linux you need at least (e)glibc 2.12 to be able to set the threadname @@ -63,18 +63,15 @@ static int Sys_SetThreadName( pthread_t handle, const char* name ) ret = pthread_setname_np( handle, name ); if( ret != 0 ) idLib::common->Printf( "Setting threadname \"%s\" failed, reason: %s (%i)\n", name, strerror( errno ), errno ); - // pthread_getname_np(pthread_t, char*, size_t) #elif defined(__FreeBSD__) // according to http://www.freebsd.org/cgi/man.cgi?query=pthread_set_name_np&sektion=3 // the interface is void pthread_set_name_np(pthread_t tid, const char *name); pthread_set_name_np( handle, name ); // doesn't return anything - // seems like there is no get_name equivalent #endif /* TODO: OSX: // according to http://stackoverflow.com/a/7989973 // this needs to be called in the thread to be named! ret = pthread_setname_np(name); - // int pthread_getname_np(pthread_t, char*, size_t); // so we'd have to wrap the xthread_t function in Sys_CreateThread and set the name in the wrapping function... */ @@ -82,7 +79,24 @@ static int Sys_SetThreadName( pthread_t handle, const char* name ) return ret; } -// TODO: Sys_GetThreadName() ?
+static int Sys_GetThreadName( pthread_t handle, char* namebuf, size_t buflen ) +{ + int ret = 0; +#ifdef __linux__ + ret = pthread_getname_np( handle, namebuf, buflen ); + if( ret != 0 ) + idLib::common->Printf( "Getting threadname failed, reason: %s (%i)\n", strerror( errno ), errno ); +#elif defined(__FreeBSD__) + // seems like there is no pthread_getname_np equivalent on FreeBSD + idStr::snPrintf( namebuf, buflen, "Can't read threadname on this platform!" ); +#endif + /* TODO: OSX: + // int pthread_getname_np(pthread_t, char*, size_t); + */ + + return ret; +} + #endif // DEBUG_THREADS @@ -228,7 +242,7 @@ void Sys_DestroyThread( uintptr_t threadHandle ) name[0] = '\0'; #if defined(DEBUG_THREADS) - pthread_getname_np( threadHandle, name, sizeof( name ) ); + Sys_GetThreadName( ( pthread_t )threadHandle, name, sizeof( name ) ); #endif #if 0 //!defined(__ANDROID__)