diff --git a/neo/idlib/geometry/DrawVert.h b/neo/idlib/geometry/DrawVert.h index 2797d427..0d1d1dbf 100644 --- a/neo/idlib/geometry/DrawVert.h +++ b/neo/idlib/geometry/DrawVert.h @@ -193,6 +193,7 @@ Assumes input is in the range [-1, 1] ID_INLINE void VertexFloatToByte( const float & x, const float & y, const float & z, byte * bval ) { assert_4_byte_aligned( bval ); // for __stvebx +#ifdef ID_WIN_X86_SSE2_INTRIN const __m128 vector_float_one = { 1.0f, 1.0f, 1.0f, 1.0f }; const __m128 vector_float_half = { 0.5f, 0.5f, 0.5f, 0.5f }; @@ -209,6 +210,13 @@ ID_INLINE void VertexFloatToByte( const float & x, const float & y, const float bval[1] = (byte)_mm_extract_epi16( xyz16, 1 ); bval[2] = (byte)_mm_extract_epi16( xyz16, 2 ); +#else + + bval[0] = VERTEX_FLOAT_TO_BYTE( x ); + bval[1] = VERTEX_FLOAT_TO_BYTE( y ); + bval[2] = VERTEX_FLOAT_TO_BYTE( z ); + +#endif } /* @@ -609,6 +617,7 @@ ID_INLINE void WriteDrawVerts16( idDrawVert * destVerts, const idDrawVert * loca assert_16_byte_aligned( destVerts ); assert_16_byte_aligned( localVerts ); +#ifdef ID_WIN_X86_SSE2_INTRIN for ( int i = 0; i < numVerts; i++ ) { __m128i v0 = _mm_load_si128( (const __m128i *)( (byte *)( localVerts + i ) + 0 ) ); @@ -617,6 +626,11 @@ ID_INLINE void WriteDrawVerts16( idDrawVert * destVerts, const idDrawVert * loca _mm_stream_si128( (__m128i *)( (byte *)( destVerts + i ) + 16 ), v1 ); } +#else + + memcpy( destVerts, localVerts, numVerts * sizeof( idDrawVert ) ); + +#endif } /* diff --git a/neo/idlib/geometry/DrawVert_intrinsics.h b/neo/idlib/geometry/DrawVert_intrinsics.h index 97df61c3..dd5a1aba 100644 --- a/neo/idlib/geometry/DrawVert_intrinsics.h +++ b/neo/idlib/geometry/DrawVert_intrinsics.h @@ -29,6 +29,7 @@ If you have questions concerning this license or the applicable additional terms #ifndef __DRAWVERT_INTRINSICS_H__ #define __DRAWVERT_INTRINSICS_H__ +#ifdef ID_WIN_X86_SSE2_INTRIN static const __m128i vector_int_f32_sign_mask = _mm_set1_epi32( 1U << IEEE_FLT_SIGN_BIT ); static const 
__m128i vector_int_f32_exponent_mask = _mm_set1_epi32( ( ( 1U << IEEE_FLT_EXPONENT_BITS ) - 1 ) << IEEE_FLT_MANTISSA_BITS ); @@ -50,12 +51,14 @@ static const __m128 vector_float_last_one = { 0.0f, 0.0f, 0.0f, 1.0 static const __m128 vector_float_1_over_255 = { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }; static const __m128 vector_float_1_over_4 = { 1.0f / 4.0f, 1.0f / 4.0f, 1.0f / 4.0f, 1.0f / 4.0f }; +#endif /* ==================== FastF32toF16 ==================== */ +#ifdef ID_WIN_X86_SSE2_INTRIN ID_INLINE_EXTERN __m128i FastF32toF16( __m128i f32_bits ) { __m128i f16_sign = _mm_srli_epi32( _mm_and_si128( f32_bits, vector_int_f32_sign_mask ), f32_to_f16_sign_shift ); @@ -77,6 +80,7 @@ ID_INLINE_EXTERN __m128i FastF32toF16( __m128i f32_bits ) { return _mm_packs_epi32( flt16, flt16 ); } +#endif ID_INLINE_EXTERN halfFloat_t Scalar_FastF32toF16( float f32 ) { const int f32_sign_mask = 1U << IEEE_FLT_SIGN_BIT; @@ -115,6 +119,7 @@ ID_INLINE_EXTERN halfFloat_t Scalar_FastF32toF16( float f32 ) { LoadSkinnedDrawVertPosition ==================== */ +#ifdef ID_WIN_X86_SSE2_INTRIN ID_INLINE_EXTERN __m128 LoadSkinnedDrawVertPosition( const idDrawVert & base, const idJointMat * joints ) { const idJointMat & j0 = joints[base.color[0]]; @@ -176,6 +181,7 @@ ID_INLINE_EXTERN __m128 LoadSkinnedDrawVertPosition( const idDrawVert & base, co return r0; } +#endif ID_INLINE_EXTERN idVec3 Scalar_LoadSkinnedDrawVertPosition( const idDrawVert & vert, const idJointMat * joints ) { const idJointMat & j0 = joints[vert.color[0]]; diff --git a/neo/idlib/geometry/RenderMatrix.cpp b/neo/idlib/geometry/RenderMatrix.cpp index 618cc16c..b15e702a 100644 --- a/neo/idlib/geometry/RenderMatrix.cpp +++ b/neo/idlib/geometry/RenderMatrix.cpp @@ -92,6 +92,7 @@ SIMD constants ================================================================================================ */ +#ifdef ID_WIN_X86_SSE2_INTRIN static const __m128i vector_int_1 = _mm_set1_epi32( 1 ); static const __m128i 
vector_int_4 = _mm_set1_epi32( 4 ); @@ -117,6 +118,7 @@ static const __m128 vector_float_pos_one = { +1.0f, +1.0f, +1.0f, +1.0f }; static const __m128 vector_float_neg_one = { -1.0f, -1.0f, -1.0f, -1.0f }; static const __m128 vector_float_last_one = { 0.0f, 0.0f, 0.0f, 1.0f }; +#endif /* ================================================================================================ @@ -531,6 +533,7 @@ front bits: bit 5 = pos-Z is front facing ======================== */ +#ifdef ID_WIN_X86_SSE2_INTRIN static int GetBoxFrontBits_SSE2( const __m128 & b0, const __m128 & b1, const __m128 & viewOrigin ) { const __m128 dir0 = _mm_sub_ps( viewOrigin, b0 ); @@ -542,6 +545,22 @@ static int GetBoxFrontBits_SSE2( const __m128 & b0, const __m128 & b1, const __m return frontBits; } +#else + +static int GetBoxFrontBits_Generic( const idBounds & bounds, const idVec3 & viewOrigin ) { + idVec3 dir0 = viewOrigin - bounds[0]; + idVec3 dir1 = bounds[1] - viewOrigin; + int frontBits = 0; + frontBits |= IEEE_FLT_SIGNBITSET( dir0.x ) << 0; + frontBits |= IEEE_FLT_SIGNBITSET( dir0.y ) << 1; + frontBits |= IEEE_FLT_SIGNBITSET( dir0.z ) << 2; + frontBits |= IEEE_FLT_SIGNBITSET( dir1.x ) << 3; + frontBits |= IEEE_FLT_SIGNBITSET( dir1.y ) << 4; + frontBits |= IEEE_FLT_SIGNBITSET( dir1.z ) << 5; + return frontBits; +} + +#endif /* ================================================================================================ @@ -720,6 +739,7 @@ The result matrix will transform the unit-cube to exactly cover the bounds. 
void idRenderMatrix::OffsetScaleForBounds( const idRenderMatrix & src, const idBounds & bounds, idRenderMatrix & out ) { assert( &src != &out ); +#ifdef ID_WIN_X86_SSE2_INTRIN __m128 b0 = _mm_loadu_bounds_0( bounds ); __m128 b1 = _mm_loadu_bounds_1( bounds ); @@ -766,6 +786,32 @@ void idRenderMatrix::OffsetScaleForBounds( const idRenderMatrix & src, const idB _mm_storeu_ps( out.m + 2*4, a2 ); _mm_storeu_ps( out.m + 3*4, a3 ); +#else + + const idVec3 offset = ( bounds[1] + bounds[0] ) * 0.5f; + const idVec3 scale = ( bounds[1] - bounds[0] ) * 0.5f; + + out[0][0] = src[0][0] * scale[0]; + out[0][1] = src[0][1] * scale[1]; + out[0][2] = src[0][2] * scale[2]; + out[0][3] = src[0][3] + src[0][0] * offset[0] + src[0][1] * offset[1] + src[0][2] * offset[2]; + + out[1][0] = src[1][0] * scale[0]; + out[1][1] = src[1][1] * scale[1]; + out[1][2] = src[1][2] * scale[2]; + out[1][3] = src[1][3] + src[1][0] * offset[0] + src[1][1] * offset[1] + src[1][2] * offset[2]; + + out[2][0] = src[2][0] * scale[0]; + out[2][1] = src[2][1] * scale[1]; + out[2][2] = src[2][2] * scale[2]; + out[2][3] = src[2][3] + src[2][0] * offset[0] + src[2][1] * offset[1] + src[2][2] * offset[2]; + + out[3][0] = src[3][0] * scale[0]; + out[3][1] = src[3][1] * scale[1]; + out[3][2] = src[3][2] * scale[2]; + out[3][3] = src[3][3] + src[3][0] * offset[0] + src[3][1] * offset[1] + src[3][2] * offset[2]; + +#endif } /* @@ -779,6 +825,7 @@ The result matrix will transform the bounds to exactly cover the unit-cube. 
void idRenderMatrix::InverseOffsetScaleForBounds( const idRenderMatrix & src, const idBounds & bounds, idRenderMatrix & out ) { assert( &src != &out ); +#ifdef ID_WIN_X86_SSE2_INTRIN __m128 b0 = _mm_loadu_bounds_0( bounds ); __m128 b1 = _mm_loadu_bounds_1( bounds ); @@ -810,6 +857,32 @@ void idRenderMatrix::InverseOffsetScaleForBounds( const idRenderMatrix & src, co _mm_storeu_ps( out.m + 2*4, a2 ); _mm_storeu_ps( out.m + 3*4, a3 ); +#else + + const idVec3 offset = -0.5f * ( bounds[1] + bounds[0] ); + const idVec3 scale = 2.0f / ( bounds[1] - bounds[0] ); + + out[0][0] = scale[0] * src[0][0]; + out[0][1] = scale[0] * src[0][1]; + out[0][2] = scale[0] * src[0][2]; + out[0][3] = scale[0] * ( src[0][3] + offset[0] ); + + out[1][0] = scale[1] * src[1][0]; + out[1][1] = scale[1] * src[1][1]; + out[1][2] = scale[1] * src[1][2]; + out[1][3] = scale[1] * ( src[1][3] + offset[1] ); + + out[2][0] = scale[2] * src[2][0]; + out[2][1] = scale[2] * src[2][1]; + out[2][2] = scale[2] * src[2][2]; + out[2][3] = scale[2] * ( src[2][3] + offset[2] ); + + out[3][0] = src[3][0]; + out[3][1] = src[3][1]; + out[3][2] = src[3][2]; + out[3][3] = src[3][3]; + +#endif } /* @@ -820,6 +893,7 @@ idRenderMatrix::Transpose void idRenderMatrix::Transpose( const idRenderMatrix & src, idRenderMatrix & out ) { assert( &src != &out ); +#ifdef ID_WIN_X86_SSE2_INTRIN const __m128 a0 = _mm_loadu_ps( src.m + 0*4 ); const __m128 a1 = _mm_loadu_ps( src.m + 1*4 ); @@ -841,6 +915,15 @@ void idRenderMatrix::Transpose( const idRenderMatrix & src, idRenderMatrix & out _mm_storeu_ps( out.m + 2*4, t2 ); _mm_storeu_ps( out.m + 3*4, t3 ); +#else + + for ( int i = 0; i < 4; i++ ) { + for ( int j = 0; j < 4; j++ ) { + out[i][j] = src[j][i]; + } + } + +#endif } /* @@ -850,6 +933,7 @@ idRenderMatrix::Multiply */ void idRenderMatrix::Multiply( const idRenderMatrix & a, const idRenderMatrix & b, idRenderMatrix & out ) { +#ifdef ID_WIN_X86_SSE2_INTRIN __m128 a0 = _mm_loadu_ps( a.m + 0*4 ); __m128 a1 = _mm_loadu_ps( a.m + 
1*4 ); @@ -886,6 +970,41 @@ void idRenderMatrix::Multiply( const idRenderMatrix & a, const idRenderMatrix & _mm_storeu_ps( out.m + 2*4, t2 ); _mm_storeu_ps( out.m + 3*4, t3 ); +#else + + /* + for ( int i = 0 ; i < 4 ; i++ ) { + for ( int j = 0 ; j < 4 ; j++ ) { + out.m[ i * 4 + j ] = + a.m[ i * 4 + 0 ] * b.m[ 0 * 4 + j ] + + a.m[ i * 4 + 1 ] * b.m[ 1 * 4 + j ] + + a.m[ i * 4 + 2 ] * b.m[ 2 * 4 + j ] + + a.m[ i * 4 + 3 ] * b.m[ 3 * 4 + j ]; + } + } + */ + + out.m[0*4+0] = a.m[0*4+0]*b.m[0*4+0] + a.m[0*4+1]*b.m[1*4+0] + a.m[0*4+2]*b.m[2*4+0] + a.m[0*4+3]*b.m[3*4+0]; + out.m[0*4+1] = a.m[0*4+0]*b.m[0*4+1] + a.m[0*4+1]*b.m[1*4+1] + a.m[0*4+2]*b.m[2*4+1] + a.m[0*4+3]*b.m[3*4+1]; + out.m[0*4+2] = a.m[0*4+0]*b.m[0*4+2] + a.m[0*4+1]*b.m[1*4+2] + a.m[0*4+2]*b.m[2*4+2] + a.m[0*4+3]*b.m[3*4+2]; + out.m[0*4+3] = a.m[0*4+0]*b.m[0*4+3] + a.m[0*4+1]*b.m[1*4+3] + a.m[0*4+2]*b.m[2*4+3] + a.m[0*4+3]*b.m[3*4+3]; + + out.m[1*4+0] = a.m[1*4+0]*b.m[0*4+0] + a.m[1*4+1]*b.m[1*4+0] + a.m[1*4+2]*b.m[2*4+0] + a.m[1*4+3]*b.m[3*4+0]; + out.m[1*4+1] = a.m[1*4+0]*b.m[0*4+1] + a.m[1*4+1]*b.m[1*4+1] + a.m[1*4+2]*b.m[2*4+1] + a.m[1*4+3]*b.m[3*4+1]; + out.m[1*4+2] = a.m[1*4+0]*b.m[0*4+2] + a.m[1*4+1]*b.m[1*4+2] + a.m[1*4+2]*b.m[2*4+2] + a.m[1*4+3]*b.m[3*4+2]; + out.m[1*4+3] = a.m[1*4+0]*b.m[0*4+3] + a.m[1*4+1]*b.m[1*4+3] + a.m[1*4+2]*b.m[2*4+3] + a.m[1*4+3]*b.m[3*4+3]; + + out.m[2*4+0] = a.m[2*4+0]*b.m[0*4+0] + a.m[2*4+1]*b.m[1*4+0] + a.m[2*4+2]*b.m[2*4+0] + a.m[2*4+3]*b.m[3*4+0]; + out.m[2*4+1] = a.m[2*4+0]*b.m[0*4+1] + a.m[2*4+1]*b.m[1*4+1] + a.m[2*4+2]*b.m[2*4+1] + a.m[2*4+3]*b.m[3*4+1]; + out.m[2*4+2] = a.m[2*4+0]*b.m[0*4+2] + a.m[2*4+1]*b.m[1*4+2] + a.m[2*4+2]*b.m[2*4+2] + a.m[2*4+3]*b.m[3*4+2]; + out.m[2*4+3] = a.m[2*4+0]*b.m[0*4+3] + a.m[2*4+1]*b.m[1*4+3] + a.m[2*4+2]*b.m[2*4+3] + a.m[2*4+3]*b.m[3*4+3]; + + out.m[3*4+0] = a.m[3*4+0]*b.m[0*4+0] + a.m[3*4+1]*b.m[1*4+0] + a.m[3*4+2]*b.m[2*4+0] + a.m[3*4+3]*b.m[3*4+0]; + out.m[3*4+1] = a.m[3*4+0]*b.m[0*4+1] + a.m[3*4+1]*b.m[1*4+1] + 
a.m[3*4+2]*b.m[2*4+1] + a.m[3*4+3]*b.m[3*4+1]; + out.m[3*4+2] = a.m[3*4+0]*b.m[0*4+2] + a.m[3*4+1]*b.m[1*4+2] + a.m[3*4+2]*b.m[2*4+2] + a.m[3*4+3]*b.m[3*4+2]; + out.m[3*4+3] = a.m[3*4+0]*b.m[0*4+3] + a.m[3*4+1]*b.m[1*4+3] + a.m[3*4+2]*b.m[2*4+3] + a.m[3*4+3]*b.m[3*4+3]; + +#endif } /* @@ -905,6 +1024,7 @@ can get really, really small. */ bool idRenderMatrix::Inverse( const idRenderMatrix & src, idRenderMatrix & out ) { +#ifdef ID_WIN_X86_SSE2_INTRIN const __m128 r0 = _mm_loadu_ps( src.m + 0 * 4 ); const __m128 r1 = _mm_loadu_ps( src.m + 1 * 4 ); @@ -1009,6 +1129,87 @@ bool idRenderMatrix::Inverse( const idRenderMatrix & src, idRenderMatrix & out ) _mm_storeu_ps( out.m + 2 * 4, _mm_mul_ps( adjoint_r2, rcpDet ) ); _mm_storeu_ps( out.m + 3 * 4, _mm_mul_ps( adjoint_r3, rcpDet ) ); +#else + + const int FRL = 4; + + // 84+4+16 = 104 multiplications + // 1 division + + // 2x2 sub-determinants required to calculate 4x4 determinant + const float det2_01_01 = src.m[0*FRL+0] * src.m[1*FRL+1] - src.m[0*FRL+1] * src.m[1*FRL+0]; + const float det2_01_02 = src.m[0*FRL+0] * src.m[1*FRL+2] - src.m[0*FRL+2] * src.m[1*FRL+0]; + const float det2_01_03 = src.m[0*FRL+0] * src.m[1*FRL+3] - src.m[0*FRL+3] * src.m[1*FRL+0]; + const float det2_01_12 = src.m[0*FRL+1] * src.m[1*FRL+2] - src.m[0*FRL+2] * src.m[1*FRL+1]; + const float det2_01_13 = src.m[0*FRL+1] * src.m[1*FRL+3] - src.m[0*FRL+3] * src.m[1*FRL+1]; + const float det2_01_23 = src.m[0*FRL+2] * src.m[1*FRL+3] - src.m[0*FRL+3] * src.m[1*FRL+2]; + + // 3x3 sub-determinants required to calculate 4x4 determinant + const float det3_201_012 = src.m[2*FRL+0] * det2_01_12 - src.m[2*FRL+1] * det2_01_02 + src.m[2*FRL+2] * det2_01_01; + const float det3_201_013 = src.m[2*FRL+0] * det2_01_13 - src.m[2*FRL+1] * det2_01_03 + src.m[2*FRL+3] * det2_01_01; + const float det3_201_023 = src.m[2*FRL+0] * det2_01_23 - src.m[2*FRL+2] * det2_01_03 + src.m[2*FRL+3] * det2_01_02; + const float det3_201_123 = src.m[2*FRL+1] * det2_01_23 - src.m[2*FRL+2] * 
det2_01_13 + src.m[2*FRL+3] * det2_01_12; + + const float det = ( - det3_201_123 * src.m[3*FRL+0] + det3_201_023 * src.m[3*FRL+1] - det3_201_013 * src.m[3*FRL+2] + det3_201_012 * src.m[3*FRL+3] ); + + if ( idMath::Fabs( det ) < RENDER_MATRIX_INVERSE_EPSILON ) { + return false; + } + + const float rcpDet = 1.0f / det; + + // remaining 2x2 sub-determinants + const float det2_03_01 = src.m[0*FRL+0] * src.m[3*FRL+1] - src.m[0*FRL+1] * src.m[3*FRL+0]; + const float det2_03_02 = src.m[0*FRL+0] * src.m[3*FRL+2] - src.m[0*FRL+2] * src.m[3*FRL+0]; + const float det2_03_03 = src.m[0*FRL+0] * src.m[3*FRL+3] - src.m[0*FRL+3] * src.m[3*FRL+0]; + const float det2_03_12 = src.m[0*FRL+1] * src.m[3*FRL+2] - src.m[0*FRL+2] * src.m[3*FRL+1]; + const float det2_03_13 = src.m[0*FRL+1] * src.m[3*FRL+3] - src.m[0*FRL+3] * src.m[3*FRL+1]; + const float det2_03_23 = src.m[0*FRL+2] * src.m[3*FRL+3] - src.m[0*FRL+3] * src.m[3*FRL+2]; + + const float det2_13_01 = src.m[1*FRL+0] * src.m[3*FRL+1] - src.m[1*FRL+1] * src.m[3*FRL+0]; + const float det2_13_02 = src.m[1*FRL+0] * src.m[3*FRL+2] - src.m[1*FRL+2] * src.m[3*FRL+0]; + const float det2_13_03 = src.m[1*FRL+0] * src.m[3*FRL+3] - src.m[1*FRL+3] * src.m[3*FRL+0]; + const float det2_13_12 = src.m[1*FRL+1] * src.m[3*FRL+2] - src.m[1*FRL+2] * src.m[3*FRL+1]; + const float det2_13_13 = src.m[1*FRL+1] * src.m[3*FRL+3] - src.m[1*FRL+3] * src.m[3*FRL+1]; + const float det2_13_23 = src.m[1*FRL+2] * src.m[3*FRL+3] - src.m[1*FRL+3] * src.m[3*FRL+2]; + + // remaining 3x3 sub-determinants + const float det3_203_012 = src.m[2*FRL+0] * det2_03_12 - src.m[2*FRL+1] * det2_03_02 + src.m[2*FRL+2] * det2_03_01; + const float det3_203_013 = src.m[2*FRL+0] * det2_03_13 - src.m[2*FRL+1] * det2_03_03 + src.m[2*FRL+3] * det2_03_01; + const float det3_203_023 = src.m[2*FRL+0] * det2_03_23 - src.m[2*FRL+2] * det2_03_03 + src.m[2*FRL+3] * det2_03_02; + const float det3_203_123 = src.m[2*FRL+1] * det2_03_23 - src.m[2*FRL+2] * det2_03_13 + src.m[2*FRL+3] * det2_03_12; + 
+ const float det3_213_012 = src.m[2*FRL+0] * det2_13_12 - src.m[2*FRL+1] * det2_13_02 + src.m[2*FRL+2] * det2_13_01; + const float det3_213_013 = src.m[2*FRL+0] * det2_13_13 - src.m[2*FRL+1] * det2_13_03 + src.m[2*FRL+3] * det2_13_01; + const float det3_213_023 = src.m[2*FRL+0] * det2_13_23 - src.m[2*FRL+2] * det2_13_03 + src.m[2*FRL+3] * det2_13_02; + const float det3_213_123 = src.m[2*FRL+1] * det2_13_23 - src.m[2*FRL+2] * det2_13_13 + src.m[2*FRL+3] * det2_13_12; + + const float det3_301_012 = src.m[3*FRL+0] * det2_01_12 - src.m[3*FRL+1] * det2_01_02 + src.m[3*FRL+2] * det2_01_01; + const float det3_301_013 = src.m[3*FRL+0] * det2_01_13 - src.m[3*FRL+1] * det2_01_03 + src.m[3*FRL+3] * det2_01_01; + const float det3_301_023 = src.m[3*FRL+0] * det2_01_23 - src.m[3*FRL+2] * det2_01_03 + src.m[3*FRL+3] * det2_01_02; + const float det3_301_123 = src.m[3*FRL+1] * det2_01_23 - src.m[3*FRL+2] * det2_01_13 + src.m[3*FRL+3] * det2_01_12; + + out.m[0*FRL+0] = - det3_213_123 * rcpDet; + out.m[1*FRL+0] = + det3_213_023 * rcpDet; + out.m[2*FRL+0] = - det3_213_013 * rcpDet; + out.m[3*FRL+0] = + det3_213_012 * rcpDet; + + out.m[0*FRL+1] = + det3_203_123 * rcpDet; + out.m[1*FRL+1] = - det3_203_023 * rcpDet; + out.m[2*FRL+1] = + det3_203_013 * rcpDet; + out.m[3*FRL+1] = - det3_203_012 * rcpDet; + + out.m[0*FRL+2] = + det3_301_123 * rcpDet; + out.m[1*FRL+2] = - det3_301_023 * rcpDet; + out.m[2*FRL+2] = + det3_301_013 * rcpDet; + out.m[3*FRL+2] = - det3_301_012 * rcpDet; + + out.m[0*FRL+3] = - det3_201_123 * rcpDet; + out.m[1*FRL+3] = + det3_201_023 * rcpDet; + out.m[2*FRL+3] = - det3_201_013 * rcpDet; + out.m[3*FRL+3] = + det3_201_012 * rcpDet; + +#endif return true; } @@ -1133,6 +1334,7 @@ bool idRenderMatrix::InverseByDoubles( const idRenderMatrix & src, idRenderMatri DeterminantIsNegative ======================== */ +#ifdef ID_WIN_X86_SSE2_INTRIN void DeterminantIsNegative( bool & negativeDeterminant, const __m128 & r0, const __m128 & r1, const __m128 & r2, const __m128 & r3 ) 
{ @@ -1177,6 +1379,30 @@ void DeterminantIsNegative( bool & negativeDeterminant, const __m128 & r0, const negativeDeterminant = _mm_movemask_ps( result ) & 1; } +#else + +void DeterminantIsNegative( bool & negativeDeterminant, const float * row0, const float * row1, const float * row2, const float * row3 ) { + + // 2x2 sub-determinants required to calculate 4x4 determinant + const float det2_01_01 = row0[0] * row1[1] - row0[1] * row1[0]; + const float det2_01_02 = row0[0] * row1[2] - row0[2] * row1[0]; + const float det2_01_03 = row0[0] * row1[3] - row0[3] * row1[0]; + const float det2_01_12 = row0[1] * row1[2] - row0[2] * row1[1]; + const float det2_01_13 = row0[1] * row1[3] - row0[3] * row1[1]; + const float det2_01_23 = row0[2] * row1[3] - row0[3] * row1[2]; + + // 3x3 sub-determinants required to calculate 4x4 determinant + const float det3_201_012 = row2[0] * det2_01_12 - row2[1] * det2_01_02 + row2[2] * det2_01_01; + const float det3_201_013 = row2[0] * det2_01_13 - row2[1] * det2_01_03 + row2[3] * det2_01_01; + const float det3_201_023 = row2[0] * det2_01_23 - row2[2] * det2_01_03 + row2[3] * det2_01_02; + const float det3_201_123 = row2[1] * det2_01_23 - row2[2] * det2_01_13 + row2[3] * det2_01_12; + + const float det = ( - det3_201_123 * row3[0] + det3_201_023 * row3[1] - det3_201_013 * row3[2] + det3_201_012 * row3[3] ); + + negativeDeterminant = ( det < 0.0f ); +} + +#endif /* ======================== @@ -1189,6 +1415,7 @@ void idRenderMatrix::CopyMatrix( const idRenderMatrix & matrix, idVec4 & row0, i assert_16_byte_aligned( row2.ToFloatPtr() ); assert_16_byte_aligned( row3.ToFloatPtr() ); +#ifdef ID_WIN_X86_SSE2_INTRIN const __m128 r0 = _mm_loadu_ps( matrix.m + 0 * 4 ); const __m128 r1 = _mm_loadu_ps( matrix.m + 1 * 4 ); @@ -1200,6 +1427,14 @@ void idRenderMatrix::CopyMatrix( const idRenderMatrix & matrix, idVec4 & row0, i _mm_store_ps( row2.ToFloatPtr(), r2 ); _mm_store_ps( row3.ToFloatPtr(), r3 ); +#else + + memcpy( row0.ToFloatPtr(), matrix[0], 
sizeof( idVec4 ) ); + memcpy( row1.ToFloatPtr(), matrix[1], sizeof( idVec4 ) ); + memcpy( row2.ToFloatPtr(), matrix[2], sizeof( idVec4 ) ); + memcpy( row3.ToFloatPtr(), matrix[3], sizeof( idVec4 ) ); + +#endif } /* @@ -1213,6 +1448,7 @@ void idRenderMatrix::SetMVP( const idRenderMatrix & mvp, idVec4 & row0, idVec4 & assert_16_byte_aligned( row2.ToFloatPtr() ); assert_16_byte_aligned( row3.ToFloatPtr() ); +#ifdef ID_WIN_X86_SSE2_INTRIN const __m128 r0 = _mm_loadu_ps( mvp.m + 0 * 4 ); const __m128 r1 = _mm_loadu_ps( mvp.m + 1 * 4 ); @@ -1226,6 +1462,16 @@ void idRenderMatrix::SetMVP( const idRenderMatrix & mvp, idVec4 & row0, idVec4 & DeterminantIsNegative( negativeDeterminant, r0, r1, r2, r3 ); +#else + + memcpy( row0.ToFloatPtr(), mvp[0], sizeof( idVec4 ) ); + memcpy( row1.ToFloatPtr(), mvp[1], sizeof( idVec4 ) ); + memcpy( row2.ToFloatPtr(), mvp[2], sizeof( idVec4 ) ); + memcpy( row3.ToFloatPtr(), mvp[3], sizeof( idVec4 ) ); + + DeterminantIsNegative( negativeDeterminant, mvp[0], mvp[1], mvp[2], mvp[3] ); + +#endif } /* @@ -1239,6 +1485,7 @@ void idRenderMatrix::SetMVPForBounds( const idRenderMatrix & mvp, const idBounds assert_16_byte_aligned( row2.ToFloatPtr() ); assert_16_byte_aligned( row3.ToFloatPtr() ); +#ifdef ID_WIN_X86_SSE2_INTRIN __m128 b0 = _mm_loadu_bounds_0( bounds ); __m128 b1 = _mm_loadu_bounds_1( bounds ); @@ -1287,6 +1534,34 @@ void idRenderMatrix::SetMVPForBounds( const idRenderMatrix & mvp, const idBounds DeterminantIsNegative( negativeDeterminant, r0, r1, r2, r3 ); +#else + + const idVec3 offset = ( bounds[1] + bounds[0] ) * 0.5f; + const idVec3 scale = ( bounds[1] - bounds[0] ) * 0.5f; + + row0[0] = mvp[0][0] * scale[0]; + row0[1] = mvp[0][1] * scale[1]; + row0[2] = mvp[0][2] * scale[2]; + row0[3] = mvp[0][3] + mvp[0][0] * offset[0] + mvp[0][1] * offset[1] + mvp[0][2] * offset[2]; + + row1[0] = mvp[1][0] * scale[0]; + row1[1] = mvp[1][1] * scale[1]; + row1[2] = mvp[1][2] * scale[2]; + row1[3] = mvp[1][3] + mvp[1][0] * offset[0] + mvp[1][1] * 
offset[1] + mvp[1][2] * offset[2]; + + row2[0] = mvp[2][0] * scale[0]; + row2[1] = mvp[2][1] * scale[1]; + row2[2] = mvp[2][2] * scale[2]; + row2[3] = mvp[2][3] + mvp[2][0] * offset[0] + mvp[2][1] * offset[1] + mvp[2][2] * offset[2]; + + row3[0] = mvp[3][0] * scale[0]; + row3[1] = mvp[3][1] * scale[1]; + row3[2] = mvp[3][2] * scale[2]; + row3[3] = mvp[3][3] + mvp[3][0] * offset[0] + mvp[3][1] * offset[1] + mvp[3][2] * offset[2]; + + DeterminantIsNegative( negativeDeterminant, row0.ToFloatPtr(), row1.ToFloatPtr(), row2.ToFloatPtr(), row3.ToFloatPtr() ); + +#endif } /* @@ -1300,6 +1575,7 @@ void idRenderMatrix::SetMVPForInverseProject( const idRenderMatrix & mvp, const assert_16_byte_aligned( row2.ToFloatPtr() ); assert_16_byte_aligned( row3.ToFloatPtr() ); +#ifdef ID_WIN_X86_SSE2_INTRIN __m128 r0 = _mm_loadu_ps( mvp.m + 0 * 4 ); __m128 r1 = _mm_loadu_ps( mvp.m + 1 * 4 ); @@ -1338,6 +1614,31 @@ void idRenderMatrix::SetMVPForInverseProject( const idRenderMatrix & mvp, const DeterminantIsNegative( negativeDeterminant, t0, t1, t2, t3 ); +#else + + row0[0] = mvp.m[0*4+0]*inverseProject.m[0*4+0] + mvp.m[0*4+1]*inverseProject.m[1*4+0] + mvp.m[0*4+2]*inverseProject.m[2*4+0] + mvp.m[0*4+3]*inverseProject.m[3*4+0]; + row0[1] = mvp.m[0*4+0]*inverseProject.m[0*4+1] + mvp.m[0*4+1]*inverseProject.m[1*4+1] + mvp.m[0*4+2]*inverseProject.m[2*4+1] + mvp.m[0*4+3]*inverseProject.m[3*4+1]; + row0[2] = mvp.m[0*4+0]*inverseProject.m[0*4+2] + mvp.m[0*4+1]*inverseProject.m[1*4+2] + mvp.m[0*4+2]*inverseProject.m[2*4+2] + mvp.m[0*4+3]*inverseProject.m[3*4+2]; + row0[3] = mvp.m[0*4+0]*inverseProject.m[0*4+3] + mvp.m[0*4+1]*inverseProject.m[1*4+3] + mvp.m[0*4+2]*inverseProject.m[2*4+3] + mvp.m[0*4+3]*inverseProject.m[3*4+3]; + + row1[0] = mvp.m[1*4+0]*inverseProject.m[0*4+0] + mvp.m[1*4+1]*inverseProject.m[1*4+0] + mvp.m[1*4+2]*inverseProject.m[2*4+0] + mvp.m[1*4+3]*inverseProject.m[3*4+0]; + row1[1] = mvp.m[1*4+0]*inverseProject.m[0*4+1] + mvp.m[1*4+1]*inverseProject.m[1*4+1] + 
mvp.m[1*4+2]*inverseProject.m[2*4+1] + mvp.m[1*4+3]*inverseProject.m[3*4+1]; + row1[2] = mvp.m[1*4+0]*inverseProject.m[0*4+2] + mvp.m[1*4+1]*inverseProject.m[1*4+2] + mvp.m[1*4+2]*inverseProject.m[2*4+2] + mvp.m[1*4+3]*inverseProject.m[3*4+2]; + row1[3] = mvp.m[1*4+0]*inverseProject.m[0*4+3] + mvp.m[1*4+1]*inverseProject.m[1*4+3] + mvp.m[1*4+2]*inverseProject.m[2*4+3] + mvp.m[1*4+3]*inverseProject.m[3*4+3]; + + row2[0] = mvp.m[2*4+0]*inverseProject.m[0*4+0] + mvp.m[2*4+1]*inverseProject.m[1*4+0] + mvp.m[2*4+2]*inverseProject.m[2*4+0] + mvp.m[2*4+3]*inverseProject.m[3*4+0]; + row2[1] = mvp.m[2*4+0]*inverseProject.m[0*4+1] + mvp.m[2*4+1]*inverseProject.m[1*4+1] + mvp.m[2*4+2]*inverseProject.m[2*4+1] + mvp.m[2*4+3]*inverseProject.m[3*4+1]; + row2[2] = mvp.m[2*4+0]*inverseProject.m[0*4+2] + mvp.m[2*4+1]*inverseProject.m[1*4+2] + mvp.m[2*4+2]*inverseProject.m[2*4+2] + mvp.m[2*4+3]*inverseProject.m[3*4+2]; + row2[3] = mvp.m[2*4+0]*inverseProject.m[0*4+3] + mvp.m[2*4+1]*inverseProject.m[1*4+3] + mvp.m[2*4+2]*inverseProject.m[2*4+3] + mvp.m[2*4+3]*inverseProject.m[3*4+3]; + + row3[0] = mvp.m[3*4+0]*inverseProject.m[0*4+0] + mvp.m[3*4+1]*inverseProject.m[1*4+0] + mvp.m[3*4+2]*inverseProject.m[2*4+0] + mvp.m[3*4+3]*inverseProject.m[3*4+0]; + row3[1] = mvp.m[3*4+0]*inverseProject.m[0*4+1] + mvp.m[3*4+1]*inverseProject.m[1*4+1] + mvp.m[3*4+2]*inverseProject.m[2*4+1] + mvp.m[3*4+3]*inverseProject.m[3*4+1]; + row3[2] = mvp.m[3*4+0]*inverseProject.m[0*4+2] + mvp.m[3*4+1]*inverseProject.m[1*4+2] + mvp.m[3*4+2]*inverseProject.m[2*4+2] + mvp.m[3*4+3]*inverseProject.m[3*4+2]; + row3[3] = mvp.m[3*4+0]*inverseProject.m[0*4+3] + mvp.m[3*4+1]*inverseProject.m[1*4+3] + mvp.m[3*4+2]*inverseProject.m[2*4+3] + mvp.m[3*4+3]*inverseProject.m[3*4+3]; + + DeterminantIsNegative( negativeDeterminant, row0.ToFloatPtr(), row1.ToFloatPtr(), row2.ToFloatPtr(), row3.ToFloatPtr() ); + +#endif } /* @@ -1398,6 +1699,7 @@ frustum plane, but only while also being behind another one. 
*/ bool idRenderMatrix::CullBoundsToMVPbits( const idRenderMatrix & mvp, const idBounds & bounds, byte * outBits, bool zeroToOne ) { +#ifdef ID_WIN_X86_SSE2_INTRIN __m128 mvp0 = _mm_loadu_ps( mvp[0] ); __m128 mvp1 = _mm_loadu_ps( mvp[1] ); @@ -1497,6 +1799,48 @@ bool idRenderMatrix::CullBoundsToMVPbits( const idRenderMatrix & mvp, const idBo return ( bits != 63 ); +#else + + int bits = 0; + + idVec3 v; + for ( int x = 0; x < 2; x++ ) { + v[0] = bounds[x][0]; + for ( int y = 0; y < 2; y++ ) { + v[1] = bounds[y][1]; + for ( int z = 0; z < 2; z++ ) { + v[2] = bounds[z][2]; + + idVec4 c; + for ( int i = 0; i < 4; i++ ) { + c[i] = v[0] * mvp[i][0] + v[1] * mvp[i][1] + v[2] * mvp[i][2] + mvp[i][3]; + } + + const float minW = zeroToOne ? 0.0f : -c[3]; + const float maxW = c[3]; +#if defined( CLIP_SPACE_D3D ) // the D3D clip space Z is in the range [0,1] so always compare Z vs zero whether 'zeroToOne' is true or false + const float minZ = 0.0f; +#else + const float minZ = minW; +#endif + + if ( c[0] > minW ) { bits |= ( 1 << 0 ); } + if ( c[0] < maxW ) { bits |= ( 1 << 1 ); } + if ( c[1] > minW ) { bits |= ( 1 << 2 ); } + if ( c[1] < maxW ) { bits |= ( 1 << 3 ); } + if ( c[2] > minZ ) { bits |= ( 1 << 4 ); } // NOTE: using minZ + if ( c[2] < maxW ) { bits |= ( 1 << 5 ); } + } + } + } + + // store out a bit set for each side where the bounds is outside the clip space + *outBits = (byte)( bits ^ 63 ); + + // if any bits weren't set, the bounds is completely off one side of the frustum + return ( bits != 63 ); + +#endif } /* @@ -1519,6 +1863,7 @@ frustum plane, but only while also being behind another one. 
bool idRenderMatrix::CullExtrudedBoundsToMVPbits( const idRenderMatrix & mvp, const idBounds & bounds, const idVec3 & extrudeDirection, const idPlane & clipPlane, byte * outBits, bool zeroToOne ) { assert( idMath::Fabs( extrudeDirection * clipPlane.Normal() ) >= idMath::FLT_SMALLEST_NON_DENORMAL ); +#ifdef ID_WIN_X86_SSE2_INTRIN __m128 mvp0 = _mm_loadu_ps( mvp[0] ); __m128 mvp1 = _mm_loadu_ps( mvp[1] ); @@ -1736,6 +2081,62 @@ bool idRenderMatrix::CullExtrudedBoundsToMVPbits( const idRenderMatrix & mvp, co return ( bits != 63 ); +#else + + int bits = 0; + + float closing = extrudeDirection * clipPlane.Normal(); + float invClosing = -1.0f / closing; + + idVec3 v; + for ( int x = 0; x < 2; x++ ) { + v[0] = bounds[x][0]; + for ( int y = 0; y < 2; y++ ) { + v[1] = bounds[y][1]; + for ( int z = 0; z < 2; z++ ) { + v[2] = bounds[z][2]; + + for ( int extrude = 0; extrude <= 1; extrude++ ) { + + idVec3 test; + if ( extrude ) { + const float extrudeDist = clipPlane.Distance( v ) * invClosing; + test = v + extrudeDirection * extrudeDist; + } else { + test = v; + } + + idVec4 c; + for ( int i = 0; i < 4; i++ ) { + c[i] = test[0] * mvp[i][0] + test[1] * mvp[i][1] + test[2] * mvp[i][2] + mvp[i][3]; + } + + const float minW = zeroToOne ? 
0.0f : -c[3]; + const float maxW = c[3]; +#if defined( CLIP_SPACE_D3D ) // the D3D clip space Z is in the range [0,1] so always compare Z vs zero whether 'zeroToOne' is true or false + const float minZ = 0.0f; +#else + const float minZ = minW; +#endif + + if ( c[0] > minW ) { bits |= ( 1 << 0 ); } + if ( c[0] < maxW ) { bits |= ( 1 << 1 ); } + if ( c[1] > minW ) { bits |= ( 1 << 2 ); } + if ( c[1] < maxW ) { bits |= ( 1 << 3 ); } + if ( c[2] > minZ ) { bits |= ( 1 << 4 ); } // NOTE: using minZ + if ( c[2] < maxW ) { bits |= ( 1 << 5 ); } + } + } + } + } + + // store out a bit set for each side where the bounds is outside the clip space + *outBits = (byte)(bits ^ 63); + + // if any bits weren't set, the bounds is completely off one side of the frustum + return ( bits != 63 ); + +#endif } /* @@ -1753,6 +2154,7 @@ is W=0 clipped. ======================== */ void idRenderMatrix::ProjectedBounds( idBounds & projected, const idRenderMatrix & mvp, const idBounds & bounds, bool windowSpace ) { +#ifdef ID_WIN_X86_SSE2_INTRIN __m128 mvp0 = _mm_loadu_ps( mvp[0] ); __m128 mvp1 = _mm_loadu_ps( mvp[1] ); @@ -1885,6 +2287,78 @@ void idRenderMatrix::ProjectedBounds( idBounds & projected, const idRenderMatrix _mm_store_ss( & projected[1].y, maxY ); _mm_store_ss( & projected[1].z, maxZ ); +#else + + for ( int i = 0; i < 3; i++ ) { + projected[0][i] = RENDER_MATRIX_INFINITY; + projected[1][i] = - RENDER_MATRIX_INFINITY; + } + + idVec3 v; + for ( int x = 0; x < 2; x++ ) { + v[0] = bounds[x][0]; + for ( int y = 0; y < 2; y++ ) { + v[1] = bounds[y][1]; + for ( int z = 0; z < 2; z++ ) { + v[2] = bounds[z][2]; + + float tx = v[0] * mvp[0][0] + v[1] * mvp[0][1] + v[2] * mvp[0][2] + mvp[0][3]; + float ty = v[0] * mvp[1][0] + v[1] * mvp[1][1] + v[2] * mvp[1][2] + mvp[1][3]; + float tz = v[0] * mvp[2][0] + v[1] * mvp[2][1] + v[2] * mvp[2][2] + mvp[2][3]; + float tw = v[0] * mvp[3][0] + v[1] * mvp[3][1] + v[2] * mvp[3][2] + mvp[3][3]; + + if ( tw <= idMath::FLT_SMALLEST_NON_DENORMAL ) { + 
projected[0][0] = -RENDER_MATRIX_INFINITY; + projected[0][1] = -RENDER_MATRIX_INFINITY; + projected[0][2] = -RENDER_MATRIX_INFINITY; + projected[1][0] = RENDER_MATRIX_INFINITY; + projected[1][1] = RENDER_MATRIX_INFINITY; + // NOTE: projected[1][1] is still valid + continue; + } + + float rw = 1.0f / tw; + + tx = tx * rw; + ty = ty * rw; + tz = tz * rw; + + projected[0][0] = Min( projected[0][0], tx ); + projected[0][1] = Min( projected[0][1], ty ); + projected[0][2] = Min( projected[0][2], tz ); + + projected[1][0] = Max( projected[1][0], tx ); + projected[1][1] = Max( projected[1][1], ty ); + projected[1][2] = Max( projected[1][2], tz ); + } + } + } + + if ( windowSpace ) { + // convert to window coords + projected[0][0] = projected[0][0] * 0.5f + 0.5f; + projected[1][0] = projected[1][0] * 0.5f + 0.5f; + + projected[0][1] = projected[0][1] * 0.5f + 0.5f; + projected[1][1] = projected[1][1] * 0.5f + 0.5f; + +#if !defined( CLIP_SPACE_D3D ) // the D3D clip space Z is already in the range [0,1] + projected[0][2] = projected[0][2] * 0.5f + 0.5f; + projected[1][2] = projected[1][2] * 0.5f + 0.5f; +#endif + + // clamp to [0, 1] range + projected[0][0] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][0] ); + projected[1][0] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][0] ); + + projected[0][1] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][1] ); + projected[1][1] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][1] ); + + projected[0][2] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][2] ); + projected[1][2] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][2] ); + } + +#endif } /* @@ -1920,6 +2394,7 @@ void idRenderMatrix::ProjectedNearClippedBounds( idBounds & projected, const idR - X + */ +#ifdef ID_WIN_X86_SSE2_INTRIN const __m128 mvp0 = _mm_loadu_ps( mvp[0] ); const __m128 mvp1 = _mm_loadu_ps( mvp[1] ); @@ -2181,6 +2656,306 @@ void idRenderMatrix::ProjectedNearClippedBounds( idBounds & projected, const idR _mm_store_ss( & projected[1].y, maxY ); _mm_store_ss( & 
projected[1].z, maxZ ); +#elif 1 + +{ + const idVec3 points[8] = { + idVec3( bounds[0][0], bounds[0][1], bounds[0][2] ), + idVec3( bounds[1][0], bounds[0][1], bounds[0][2] ), + idVec3( bounds[1][0], bounds[1][1], bounds[0][2] ), + idVec3( bounds[0][0], bounds[1][1], bounds[0][2] ), + idVec3( bounds[0][0], bounds[0][1], bounds[1][2] ), + idVec3( bounds[1][0], bounds[0][1], bounds[1][2] ), + idVec3( bounds[1][0], bounds[1][1], bounds[1][2] ), + idVec3( bounds[0][0], bounds[1][1], bounds[1][2] ) + }; + + idVec4 projectedPoints[8]; + for ( int i = 0; i < 8; i++ ) { + const idVec3 & v = points[i]; + projectedPoints[i].x = v[0] * mvp[0][0] + v[1] * mvp[0][1] + v[2] * mvp[0][2] + mvp[0][3]; + projectedPoints[i].y = v[0] * mvp[1][0] + v[1] * mvp[1][1] + v[2] * mvp[1][2] + mvp[1][3]; + projectedPoints[i].z = v[0] * mvp[2][0] + v[1] * mvp[2][1] + v[2] * mvp[2][2] + mvp[2][3]; + projectedPoints[i].w = v[0] * mvp[3][0] + v[1] * mvp[3][1] + v[2] * mvp[3][2] + mvp[3][3]; + } + + const idVec4 & p0 = projectedPoints[0]; + const idVec4 & p1 = projectedPoints[1]; + const idVec4 & p2 = projectedPoints[2]; + const idVec4 & p3 = projectedPoints[3]; + const idVec4 & p4 = projectedPoints[4]; + const idVec4 & p5 = projectedPoints[5]; + const idVec4 & p6 = projectedPoints[6]; + const idVec4 & p7 = projectedPoints[7]; + +#if defined( CLIP_SPACE_D3D ) // the D3D near plane is at Z=0 instead of Z=-1 + const float d0 = p0.z; + const float d1 = p1.z; + const float d2 = p2.z; + const float d3 = p3.z; + const float d4 = p4.z; + const float d5 = p5.z; + const float d6 = p6.z; + const float d7 = p7.z; +#else + const float d0 = p0.z + p0.w; + const float d1 = p1.z + p1.w; + const float d2 = p2.z + p2.w; + const float d3 = p3.z + p3.w; + const float d4 = p4.z + p4.w; + const float d5 = p5.z + p5.w; + const float d6 = p6.z + p6.w; + const float d7 = p7.z + p7.w; +#endif + + const float deltaA = d0 - d1; + const float deltaB = d1 - d2; + const float deltaC = d2 - d3; + const float deltaD = d3 - d0; + + 
const float deltaE = d4 - d5; + const float deltaF = d5 - d6; + const float deltaG = d6 - d7; + const float deltaH = d7 - d4; + + const float deltaI = d0 - d4; + const float deltaJ = d1 - d5; + const float deltaK = d2 - d6; + const float deltaL = d3 - d7; + + const float fractionA = ( fabs( deltaA ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d0 / deltaA ) : 0.0f; + const float fractionB = ( fabs( deltaB ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d1 / deltaB ) : 0.0f; + const float fractionC = ( fabs( deltaC ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d2 / deltaC ) : 0.0f; + const float fractionD = ( fabs( deltaD ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d3 / deltaD ) : 0.0f; + + const float fractionE = ( fabs( deltaE ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d4 / deltaE ) : 0.0f; + const float fractionF = ( fabs( deltaF ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d5 / deltaF ) : 0.0f; + const float fractionG = ( fabs( deltaG ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d6 / deltaG ) : 0.0f; + const float fractionH = ( fabs( deltaH ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d7 / deltaH ) : 0.0f; + + const float fractionI = ( fabs( deltaI ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d0 / deltaI ) : 0.0f; + const float fractionJ = ( fabs( deltaJ ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d1 / deltaJ ) : 0.0f; + const float fractionK = ( fabs( deltaK ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d2 / deltaK ) : 0.0f; + const float fractionL = ( fabs( deltaL ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? 
( d3 / deltaL ) : 0.0f; + + const bool clipA = ( fractionA > 0.0f && fractionA < 1.0f ); + const bool clipB = ( fractionB > 0.0f && fractionB < 1.0f ); + const bool clipC = ( fractionC > 0.0f && fractionC < 1.0f ); + const bool clipD = ( fractionD > 0.0f && fractionD < 1.0f ); + + const bool clipE = ( fractionE > 0.0f && fractionE < 1.0f ); + const bool clipF = ( fractionF > 0.0f && fractionF < 1.0f ); + const bool clipG = ( fractionG > 0.0f && fractionG < 1.0f ); + const bool clipH = ( fractionH > 0.0f && fractionH < 1.0f ); + + const bool clipI = ( fractionI > 0.0f && fractionI < 1.0f ); + const bool clipJ = ( fractionJ > 0.0f && fractionJ < 1.0f ); + const bool clipK = ( fractionK > 0.0f && fractionK < 1.0f ); + const bool clipL = ( fractionL > 0.0f && fractionL < 1.0f ); + + const idVec4 intersectionA = p0 + fractionA * ( p1 - p0 ); + const idVec4 intersectionB = p1 + fractionB * ( p2 - p1 ); + const idVec4 intersectionC = p2 + fractionC * ( p3 - p2 ); + const idVec4 intersectionD = p3 + fractionD * ( p0 - p3 ); + + const idVec4 intersectionE = p4 + fractionE * ( p5 - p4 ); + const idVec4 intersectionF = p5 + fractionF * ( p6 - p5 ); + const idVec4 intersectionG = p6 + fractionG * ( p7 - p6 ); + const idVec4 intersectionH = p7 + fractionH * ( p4 - p7 ); + + const idVec4 intersectionI = p0 + fractionI * ( p4 - p0 ); + const idVec4 intersectionJ = p1 + fractionJ * ( p5 - p1 ); + const idVec4 intersectionK = p2 + fractionK * ( p6 - p2 ); + const idVec4 intersectionL = p3 + fractionL * ( p7 - p3 ); + + idVec4 edgeVerts[24]; + + edgeVerts[ 0] = ( clipA && d0 < 0.0f ) ? intersectionA : p0; + edgeVerts[ 2] = ( clipB && d1 < 0.0f ) ? intersectionB : p1; + edgeVerts[ 4] = ( clipC && d2 < 0.0f ) ? intersectionC : p2; + edgeVerts[ 6] = ( clipD && d3 < 0.0f ) ? intersectionD : p3; + + edgeVerts[ 1] = ( clipA && d1 < 0.0f ) ? intersectionA : p1; + edgeVerts[ 3] = ( clipB && d2 < 0.0f ) ? intersectionB : p2; + edgeVerts[ 5] = ( clipC && d3 < 0.0f ) ? 
intersectionC : p3; + edgeVerts[ 7] = ( clipD && d0 < 0.0f ) ? intersectionD : p0; + + edgeVerts[ 8] = ( clipE && d4 < 0.0f ) ? intersectionE : p4; + edgeVerts[10] = ( clipF && d5 < 0.0f ) ? intersectionF : p5; + edgeVerts[12] = ( clipG && d6 < 0.0f ) ? intersectionG : p6; + edgeVerts[14] = ( clipH && d7 < 0.0f ) ? intersectionH : p7; + + edgeVerts[ 9] = ( clipE && d5 < 0.0f ) ? intersectionE : p5; + edgeVerts[11] = ( clipF && d6 < 0.0f ) ? intersectionF : p6; + edgeVerts[13] = ( clipG && d7 < 0.0f ) ? intersectionG : p7; + edgeVerts[15] = ( clipH && d4 < 0.0f ) ? intersectionH : p4; + + edgeVerts[16] = ( clipI && d0 < 0.0f ) ? intersectionI : p0; + edgeVerts[18] = ( clipJ && d1 < 0.0f ) ? intersectionJ : p1; + edgeVerts[20] = ( clipK && d2 < 0.0f ) ? intersectionK : p2; + edgeVerts[22] = ( clipL && d3 < 0.0f ) ? intersectionL : p3; + + edgeVerts[17] = ( clipI && d4 < 0.0f ) ? intersectionI : p4; + edgeVerts[19] = ( clipJ && d5 < 0.0f ) ? intersectionJ : p5; + edgeVerts[21] = ( clipK && d6 < 0.0f ) ? intersectionK : p6; + edgeVerts[23] = ( clipL && d7 < 0.0f ) ? 
intersectionL : p7; + + idBounds projBnds; + for ( int i = 0; i < 3; i++ ) { + projBnds[0][i] = RENDER_MATRIX_INFINITY; + projBnds[1][i] = - RENDER_MATRIX_INFINITY; + } + + for ( int i = 0; i < 24; i++ ) { + const idVec4 & v = edgeVerts[i]; + + if ( v.w <= idMath::FLT_SMALLEST_NON_DENORMAL ) { + continue; + } + + const float rw = 1.0f / v.w; + + const float px = v.x * rw; + const float py = v.y * rw; + const float pz = v.z * rw; + + projBnds[0][0] = Min( projBnds[0][0], px ); + projBnds[0][1] = Min( projBnds[0][1], py ); + projBnds[0][2] = Min( projBnds[0][2], pz ); + + projBnds[1][0] = Max( projBnds[1][0], px ); + projBnds[1][1] = Max( projBnds[1][1], py ); + projBnds[1][2] = Max( projBnds[1][2], pz ); + } + + if ( windowSpace ) { + // convert to window coords + projBnds[0][0] = projBnds[0][0] * 0.5f + 0.5f; + projBnds[1][0] = projBnds[1][0] * 0.5f + 0.5f; + + projBnds[0][1] = projBnds[0][1] * 0.5f + 0.5f; + projBnds[1][1] = projBnds[1][1] * 0.5f + 0.5f; + +#if !defined( CLIP_SPACE_D3D ) // the D3D clip space Z is already in the range [0,1] + projBnds[0][2] = projBnds[0][2] * 0.5f + 0.5f; + projBnds[1][2] = projBnds[1][2] * 0.5f + 0.5f; +#endif + + // clamp to [0, 1] range + projBnds[0][0] = idMath::ClampFloat( 0.0f, 1.0f, projBnds[0][0] ); + projBnds[1][0] = idMath::ClampFloat( 0.0f, 1.0f, projBnds[1][0] ); + + projBnds[0][1] = idMath::ClampFloat( 0.0f, 1.0f, projBnds[0][1] ); + projBnds[1][1] = idMath::ClampFloat( 0.0f, 1.0f, projBnds[1][1] ); + + projBnds[0][2] = idMath::ClampFloat( 0.0f, 1.0f, projBnds[0][2] ); + projBnds[1][2] = idMath::ClampFloat( 0.0f, 1.0f, projBnds[1][2] ); + } + + assert( projected[0].Compare( projBnds[0], 0.01f ) ); + assert( projected[1].Compare( projBnds[1], 0.01f ) ); +} + +#else + + const idVec3 points[8] = { + idVec3( bounds[0][0], bounds[0][1], bounds[0][2] ), + idVec3( bounds[1][0], bounds[0][1], bounds[0][2] ), + idVec3( bounds[1][0], bounds[1][1], bounds[0][2] ), + idVec3( bounds[0][0], bounds[1][1], bounds[0][2] ), + idVec3( 
bounds[0][0], bounds[0][1], bounds[1][2] ), + idVec3( bounds[1][0], bounds[0][1], bounds[1][2] ), + idVec3( bounds[1][0], bounds[1][1], bounds[1][2] ), + idVec3( bounds[0][0], bounds[1][1], bounds[1][2] ) + }; + + idVec4 projectedPoints[8]; + for ( int i = 0; i < 8; i++ ) { + const idVec3 & v = points[i]; + projectedPoints[i].x = v[0] * mvp[0][0] + v[1] * mvp[0][1] + v[2] * mvp[0][2] + mvp[0][3]; + projectedPoints[i].y = v[0] * mvp[1][0] + v[1] * mvp[1][1] + v[2] * mvp[1][2] + mvp[1][3]; + projectedPoints[i].z = v[0] * mvp[2][0] + v[1] * mvp[2][1] + v[2] * mvp[2][2] + mvp[2][3]; + projectedPoints[i].w = v[0] * mvp[3][0] + v[1] * mvp[3][1] + v[2] * mvp[3][2] + mvp[3][3]; + } + + idVec4 edgeVerts[24]; + for ( int i = 0; i < 3; i++ ) { + int offset0 = ( i & 1 ) * 4; + int offset1 = ( i & 1 ) * 4 + ( i & 2 ) * 2; + int offset3 = ~( i >> 1 ) & 1; + for ( int j = 0; j < 4; j++ ) { + const idVec4 p0 = projectedPoints[offset0 + ( ( j + 0 ) & 3 )]; + const idVec4 p1 = projectedPoints[offset1 + ( ( j + offset3 ) & 3 )]; + +#if defined( CLIP_SPACE_D3D ) // the D3D near plane is at Z=0 instead of Z=-1 + const float d0 = p0.z; + const float d1 = p1.z; +#else + const float d0 = p0.z + p0.w; + const float d1 = p1.z + p1.w; +#endif + const float delta = d0 - d1; + const float fraction = idMath::Fabs( delta ) > idMath::FLT_SMALLEST_NON_DENORMAL ? ( d0 / delta ) : 1.0f; + const bool clip = ( fraction > 0.0f && fraction < 1.0f ); + const idVec4 intersection = p0 + fraction * ( p1 - p0 ); + + edgeVerts[i * 8 + j * 2 + 0] = ( clip && d0 < 0.0f ) ? intersection : p0; + edgeVerts[i * 8 + j * 2 + 1] = ( clip && d1 < 0.0f ) ? 
intersection : p1; + } + } + + for ( int i = 0; i < 3; i++ ) { + projected[0][i] = RENDER_MATRIX_INFINITY; + projected[1][i] = - RENDER_MATRIX_INFINITY; + } + + for ( int i = 0; i < 24; i++ ) { + const idVec4 & v = edgeVerts[i]; + + if ( v.w <= idMath::FLT_SMALLEST_NON_DENORMAL ) { + continue; + } + + const float rw = 1.0f / v.w; + + const float px = v.x * rw; + const float py = v.y * rw; + const float pz = v.z * rw; + + projected[0][0] = Min( projected[0][0], px ); + projected[0][1] = Min( projected[0][1], py ); + projected[0][2] = Min( projected[0][2], pz ); + + projected[1][0] = Max( projected[1][0], px ); + projected[1][1] = Max( projected[1][1], py ); + projected[1][2] = Max( projected[1][2], pz ); + } + + if ( windowSpace ) { + // convert to window coords + projected[0][0] = projected[0][0] * 0.5f + 0.5f; + projected[1][0] = projected[1][0] * 0.5f + 0.5f; + + projected[0][1] = projected[0][1] * 0.5f + 0.5f; + projected[1][1] = projected[1][1] * 0.5f + 0.5f; + +#if !defined( CLIP_SPACE_D3D ) // the D3D clip space Z is already in the range [0,1] + projected[0][2] = projected[0][2] * 0.5f + 0.5f; + projected[1][2] = projected[1][2] * 0.5f + 0.5f; +#endif + + // clamp to [0, 1] range + projected[0][0] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][0] ); + projected[1][0] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][0] ); + + projected[0][1] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][1] ); + projected[1][1] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][1] ); + + projected[0][2] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][2] ); + projected[1][2] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][2] ); + } + +#endif } #if 0 @@ -2233,6 +3008,7 @@ static idVec3 LocalNearClipCenterFromMVP( const idRenderMatrix & mvp ) { return idVec3( x * invW, y * invW, z * invW ); } +#ifdef ID_WIN_X86_SSE2_INTRIN /* ======================== @@ -2383,6 +3159,90 @@ static int ClipHomogeneousPolygonToUnitCube_SSE2( idVec4 * points, int numPoints return numPoints; } +#else + +/* 
+======================== +ClipHomogeneousLineToSide + +Clips a line with homogeneous coordinates to the axis aligned plane[axis] = side. +======================== +*/ +static idVec4 ClipHomogeneousLineToSide( const idVec4 & p0, const idVec4 & p1, int axis, float side ) { + const float d0 = p0.w * side - p0[axis]; + const float d1 = p1.w * side - p1[axis]; + const float delta = d0 - d1; + const float f = idMath::Fabs( delta ) > idMath::FLT_SMALLEST_NON_DENORMAL ? ( d0 / delta ) : 1.0f; + const float c = idMath::ClampFloat( 0.0f, 1.0f, f ); + return p0 + c * ( p1 - p0 ); +} + +/* +======================== +ClipHomogeneousPolygonToSide + +Clips a polygon with homogeneous coordinates to the axis aligned plane[axis] = sign * offset. +======================== +*/ +static int ClipHomogeneousPolygonToSide_Generic( idVec4 * __restrict newPoints, idVec4 * __restrict points, int numPoints, int axis, float sign, float offset ) { + assert( newPoints != points ); + + assert( numPoints < 16 ); + int sides[16]; + + const float side = sign * offset; + + // calculate the plane side for each original point and calculate all potential new points + for ( int i = 0; i < numPoints; i++ ) { + int j = ( i + 1 ) & ( ( i + 1 - numPoints ) >> 31 ); + sides[i] = sign * points[i][axis] < offset * points[i].w; + newPoints[i * 2 + 0] = points[i]; + newPoints[i * 2 + 1] = ClipHomogeneousLineToSide( points[i], points[j], axis, side ); + }; + + // repeat the first side at the end to avoid having to wrap around + sides[numPoints] = sides[0]; + + // compact the array of points + int numNewPoints = 0; + for ( int i = 0; i < numPoints; i++ ) { + if ( sides[i + 0] != 0 ) { + newPoints[numNewPoints++] = newPoints[i * 2 + 0]; + } + if ( ( sides[i + 0] ^ sides[i + 1] ) != 0 ) { + newPoints[numNewPoints++] = newPoints[i * 2 + 1]; + } + } + + assert( numNewPoints <= 16 ); + return numNewPoints; +} + +/* +======================== +ClipHomogeneousPolygonToUnitCube + +Clips a polygon with homogeneous 
coordinates to all six axis aligned unit cube planes. +======================== +*/ +static int ClipHomogeneousPolygonToUnitCube_Generic( idVec4 * points, int numPoints ) { + assert( numPoints < 16 - 6 ); + ALIGNTYPE16 idVec4 newPoints[2 * 16]; // the C clip code temporarily doubles the points + +#if defined( CLIP_SPACE_D3D ) // the D3D near plane is at Z=0 instead of Z=-1 + numPoints = ClipHomogeneousPolygonToSide_Generic( newPoints, points, numPoints, 2, -1.0f, 0.0f ); // near +#else + numPoints = ClipHomogeneousPolygonToSide_Generic( newPoints, points, numPoints, 2, -1.0f, 1.0f ); // near +#endif + numPoints = ClipHomogeneousPolygonToSide_Generic( points, newPoints, numPoints, 2, +1.0f, 1.0f ); // far + numPoints = ClipHomogeneousPolygonToSide_Generic( newPoints, points, numPoints, 1, -1.0f, 1.0f ); // bottom + numPoints = ClipHomogeneousPolygonToSide_Generic( points, newPoints, numPoints, 1, +1.0f, 1.0f ); // top + numPoints = ClipHomogeneousPolygonToSide_Generic( newPoints, points, numPoints, 0, -1.0f, 1.0f ); // left + numPoints = ClipHomogeneousPolygonToSide_Generic( points, newPoints, numPoints, 0, +1.0f, 1.0f ); // right + return numPoints; +} + +#endif /* ======================== @@ -2401,6 +3261,7 @@ the given bounds in which case the projected bounds should be set to fully cover ======================== */ void idRenderMatrix::ProjectedFullyClippedBounds( idBounds & projected, const idRenderMatrix & mvp, const idBounds & bounds, bool windowSpace ) { +#ifdef ID_WIN_X86_SSE2_INTRIN const __m128 mvp0 = _mm_loadu_ps( mvp[0] ); const __m128 mvp1 = _mm_loadu_ps( mvp[1] ); @@ -2551,6 +3412,95 @@ void idRenderMatrix::ProjectedFullyClippedBounds( idBounds & projected, const id _mm_store_ss( & projected[1].y, maxY ); _mm_store_ss( & projected[1].z, maxZ ); +#else + + const idVec3 points[8] = { + idVec3( bounds[0][0], bounds[0][1], bounds[0][2] ), + idVec3( bounds[1][0], bounds[0][1], bounds[0][2] ), + idVec3( bounds[1][0], bounds[1][1], bounds[0][2] ), + idVec3( 
bounds[0][0], bounds[1][1], bounds[0][2] ), + idVec3( bounds[0][0], bounds[0][1], bounds[1][2] ), + idVec3( bounds[1][0], bounds[0][1], bounds[1][2] ), + idVec3( bounds[1][0], bounds[1][1], bounds[1][2] ), + idVec3( bounds[0][0], bounds[1][1], bounds[1][2] ) + }; + + idVec4 projectedPoints[8]; + for ( int i = 0; i < 8; i++ ) { + const idVec3 & v = points[i]; + projectedPoints[i].x = v[0] * mvp[0][0] + v[1] * mvp[0][1] + v[2] * mvp[0][2] + mvp[0][3]; + projectedPoints[i].y = v[0] * mvp[1][0] + v[1] * mvp[1][1] + v[2] * mvp[1][2] + mvp[1][3]; + projectedPoints[i].z = v[0] * mvp[2][0] + v[1] * mvp[2][1] + v[2] * mvp[2][2] + mvp[2][3]; + projectedPoints[i].w = v[0] * mvp[3][0] + v[1] * mvp[3][1] + v[2] * mvp[3][2] + mvp[3][3]; + } + + idVec4 clippedPoints[6 * 16]; + int numClippedPoints = 0; + for ( int i = 0; i < 6; i++ ) { + clippedPoints[numClippedPoints + 0] = projectedPoints[boxPolygonVertices[i][0]]; + clippedPoints[numClippedPoints + 1] = projectedPoints[boxPolygonVertices[i][1]]; + clippedPoints[numClippedPoints + 2] = projectedPoints[boxPolygonVertices[i][2]]; + clippedPoints[numClippedPoints + 3] = projectedPoints[boxPolygonVertices[i][3]]; + numClippedPoints += ClipHomogeneousPolygonToUnitCube_Generic( &clippedPoints[numClippedPoints], 4 ); + } + + // test if the center of the near clip plane is inside the given bounding box + const idVec3 localNearClipCenter = LocalNearClipCenterFromMVP( mvp ); + const bool inside = bounds.Expand( RENDER_MATRIX_PROJECTION_EPSILON ).ContainsPoint( localNearClipCenter ); + + for ( int i = 0; i < 3; i++ ) { + projected[0][i] = RENDER_MATRIX_INFINITY; + projected[1][i] = - RENDER_MATRIX_INFINITY; + } + if ( inside ) { + projected[0][2] = -1.0f; + } + + for ( int i = 0; i < numClippedPoints; i++ ) { + const idVec4 & c = clippedPoints[i]; + + assert( c.w > idMath::FLT_SMALLEST_NON_DENORMAL ); + + const float rw = 1.0f / c.w; + + const float px = c.x * rw; + const float py = c.y * rw; + const float pz = c.z * rw; + + 
projected[0][0] = Min( projected[0][0], px ); + projected[0][1] = Min( projected[0][1], py ); + projected[0][2] = Min( projected[0][2], pz ); + + projected[1][0] = Max( projected[1][0], px ); + projected[1][1] = Max( projected[1][1], py ); + projected[1][2] = Max( projected[1][2], pz ); + } + + if ( windowSpace ) { + // convert to window coords + projected[0][0] = projected[0][0] * 0.5f + 0.5f; + projected[1][0] = projected[1][0] * 0.5f + 0.5f; + + projected[0][1] = projected[0][1] * 0.5f + 0.5f; + projected[1][1] = projected[1][1] * 0.5f + 0.5f; + +#if !defined( CLIP_SPACE_D3D ) // the D3D clip space Z is already in the range [0,1] + projected[0][2] = projected[0][2] * 0.5f + 0.5f; + projected[1][2] = projected[1][2] * 0.5f + 0.5f; +#endif + + // clamp to [0, 1] range + projected[0][0] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][0] ); + projected[1][0] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][0] ); + + projected[0][1] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][1] ); + projected[1][1] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][1] ); + + projected[0][2] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][2] ); + projected[1][2] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][2] ); + } + +#endif } /* @@ -2564,6 +3514,7 @@ The given bounding box is not clipped to the MVP so the depth bounds may not be ======================== */ void idRenderMatrix::DepthBoundsForBounds( float & min, float & max, const idRenderMatrix & mvp, const idBounds & bounds, bool windowSpace ) { +#ifdef ID_WIN_X86_SSE2_INTRIN __m128 mvp2 = _mm_loadu_ps( mvp[2] ); __m128 mvp3 = _mm_loadu_ps( mvp[3] ); @@ -2630,6 +3581,46 @@ void idRenderMatrix::DepthBoundsForBounds( float & min, float & max, const idRen _mm_store_ss( & min, minv ); _mm_store_ss( & max, maxv ); +#else + + float localMin = RENDER_MATRIX_INFINITY; + float localMax = - RENDER_MATRIX_INFINITY; + + idVec3 v; + for ( int x = 0; x < 2; x++ ) { + v[0] = bounds[x][0]; + for ( int y = 0; y < 2; y++ ) { + v[1] = bounds[y][1]; + 
for ( int z = 0; z < 2; z++ ) { + v[2] = bounds[z][2]; + + float tz = v[0] * mvp[2][0] + v[1] * mvp[2][1] + v[2] * mvp[2][2] + mvp[2][3]; + float tw = v[0] * mvp[3][0] + v[1] * mvp[3][1] + v[2] * mvp[3][2] + mvp[3][3]; + + if ( tw > idMath::FLT_SMALLEST_NON_DENORMAL ) { + tz = tz / tw; + } else { + tz = -RENDER_MATRIX_INFINITY; + } + + localMin = Min( localMin, tz ); + localMax = Max( localMax, tz ); + } + } + } + + if ( windowSpace ) { + // convert to window coords +#if !defined( CLIP_SPACE_D3D ) // the D3D clip space Z is already in the range [0,1] + min = localMin * 0.5f + 0.5f; + max = localMax * 0.5f + 0.5f; +#endif + // clamp to the [0, 1] range + min = Max( min, 0.0f ); + max = Min( max, 1.0f ); + } + +#endif } /* @@ -2646,6 +3637,7 @@ The extruded bounding box is not clipped to the MVP so the depth bounds may not void idRenderMatrix::DepthBoundsForExtrudedBounds( float & min, float & max, const idRenderMatrix & mvp, const idBounds & bounds, const idVec3 & extrudeDirection, const idPlane & clipPlane, bool windowSpace ) { assert( idMath::Fabs( extrudeDirection * clipPlane.Normal() ) >= idMath::FLT_SMALLEST_NON_DENORMAL ); +#ifdef ID_WIN_X86_SSE2_INTRIN __m128 mvp2 = _mm_loadu_ps( mvp[2] ); __m128 mvp3 = _mm_loadu_ps( mvp[3] ); @@ -2800,6 +3792,60 @@ void idRenderMatrix::DepthBoundsForExtrudedBounds( float & min, float & max, con _mm_store_ss( & min, minv ); _mm_store_ss( & max, maxv ); +#else + + const float closing = extrudeDirection * clipPlane.Normal(); + const float invClosing = -1.0f / closing; + + float localMin = RENDER_MATRIX_INFINITY; + float localMax = - RENDER_MATRIX_INFINITY; + + idVec3 v; + for ( int x = 0; x < 2; x++ ) { + v[0] = bounds[x][0]; + for ( int y = 0; y < 2; y++ ) { + v[1] = bounds[y][1]; + for ( int z = 0; z < 2; z++ ) { + v[2] = bounds[z][2]; + + for ( int extrude = 0; extrude <= 1; extrude++ ) { + + idVec3 test; + if ( extrude ) { + float extrudeDist = clipPlane.Distance( v ) * invClosing; + test = v + extrudeDirection * 
extrudeDist; + } else { + test = v; + } + + float tz = test[0] * mvp[2][0] + test[1] * mvp[2][1] + test[2] * mvp[2][2] + mvp[2][3]; + float tw = test[0] * mvp[3][0] + test[1] * mvp[3][1] + test[2] * mvp[3][2] + mvp[3][3]; + + if ( tw > idMath::FLT_SMALLEST_NON_DENORMAL ) { + tz = tz / tw; + } else { + tz = -RENDER_MATRIX_INFINITY; + } + + localMin = Min( localMin, tz ); + localMax = Max( localMax, tz ); + } + } + } + } + + if ( windowSpace ) { + // convert to window coords +#if !defined( CLIP_SPACE_D3D ) // the D3D clip space Z is already in the range [0,1] + min = localMin * 0.5f + 0.5f; + max = localMax * 0.5f + 0.5f; +#endif + // clamp to the [0, 1] range + min = Max( min, 0.0f ); + max = Min( max, 1.0f ); + } + +#endif } /* @@ -2855,6 +3901,7 @@ testing if the center of the far clipping plane is contained inside the shadow v ======================== */ void idRenderMatrix::DepthBoundsForShadowBounds( float & min, float & max, const idRenderMatrix & mvp, const idBounds & bounds, const idVec3 & localLightOrigin, bool windowSpace ) { +#ifdef ID_WIN_X86_SSE2_INTRIN const __m128 mvp0 = _mm_loadu_ps( mvp[0] ); const __m128 mvp1 = _mm_loadu_ps( mvp[1] ); @@ -3029,6 +4076,108 @@ void idRenderMatrix::DepthBoundsForShadowBounds( float & min, float & max, const _mm_store_ss( & min, minZ ); _mm_store_ss( & max, maxZ ); +#else + + const idVec3 points[8] = { + idVec3( bounds[0][0], bounds[0][1], bounds[0][2] ), + idVec3( bounds[1][0], bounds[0][1], bounds[0][2] ), + idVec3( bounds[1][0], bounds[1][1], bounds[0][2] ), + idVec3( bounds[0][0], bounds[1][1], bounds[0][2] ), + idVec3( bounds[0][0], bounds[0][1], bounds[1][2] ), + idVec3( bounds[1][0], bounds[0][1], bounds[1][2] ), + idVec3( bounds[1][0], bounds[1][1], bounds[1][2] ), + idVec3( bounds[0][0], bounds[1][1], bounds[1][2] ) + }; + + // calculate the front facing polygon bits + int frontBits = GetBoxFrontBits_Generic( bounds, localLightOrigin ); + + // bounding box corners + ALIGNTYPE16 idVec4 projectedNearPoints[8]; + 
for ( int i = 0; i < 8; i++ ) { + const idVec3 & v = points[i]; + projectedNearPoints[i].x = v[0] * mvp[0][0] + v[1] * mvp[0][1] + v[2] * mvp[0][2] + mvp[0][3]; + projectedNearPoints[i].y = v[0] * mvp[1][0] + v[1] * mvp[1][1] + v[2] * mvp[1][2] + mvp[1][3]; + projectedNearPoints[i].z = v[0] * mvp[2][0] + v[1] * mvp[2][1] + v[2] * mvp[2][2] + mvp[2][3]; + projectedNearPoints[i].w = v[0] * mvp[3][0] + v[1] * mvp[3][1] + v[2] * mvp[3][2] + mvp[3][3]; + } + + // bounding box corners projected to infinity from the light position + ALIGNTYPE16 idVec4 projectedFarPoints[8]; + for ( int i = 0; i < 8; i++ ) { + const idVec3 v = points[i] - localLightOrigin; + projectedFarPoints[i].x = v[0] * mvp[0][0] + v[1] * mvp[0][1] + v[2] * mvp[0][2]; + projectedFarPoints[i].y = v[0] * mvp[1][0] + v[1] * mvp[1][1] + v[2] * mvp[1][2]; + projectedFarPoints[i].z = v[0] * mvp[2][0] + v[1] * mvp[2][1] + v[2] * mvp[2][2]; + projectedFarPoints[i].w = v[0] * mvp[3][0] + v[1] * mvp[3][1] + v[2] * mvp[3][2]; + } + + ALIGNTYPE16 idVec4 clippedPoints[( 6 + 12 ) * 16]; + int numClippedPoints = 0; + + // clip the front facing bounding box polygons at the near cap + const frontPolygons_t & frontPolygons = boxFrontPolygonsForFrontBits[frontBits]; + for ( int i = 0; i < frontPolygons.count; i++ ) { + const int polygon = frontPolygons.indices[i]; + clippedPoints[numClippedPoints + 0] = projectedNearPoints[boxPolygonVertices[polygon][0]]; + clippedPoints[numClippedPoints + 1] = projectedNearPoints[boxPolygonVertices[polygon][1]]; + clippedPoints[numClippedPoints + 2] = projectedNearPoints[boxPolygonVertices[polygon][2]]; + clippedPoints[numClippedPoints + 3] = projectedNearPoints[boxPolygonVertices[polygon][3]]; + numClippedPoints += ClipHomogeneousPolygonToUnitCube_Generic( &clippedPoints[numClippedPoints], 4 ); + } + + // clip the front facing bounding box polygons projected to the far cap + for ( int i = 0; i < frontPolygons.count; i++ ) { + const int polygon = frontPolygons.indices[i]; + 
clippedPoints[numClippedPoints + 0] = projectedFarPoints[boxPolygonVertices[polygon][0]]; + clippedPoints[numClippedPoints + 1] = projectedFarPoints[boxPolygonVertices[polygon][1]]; + clippedPoints[numClippedPoints + 2] = projectedFarPoints[boxPolygonVertices[polygon][2]]; + clippedPoints[numClippedPoints + 3] = projectedFarPoints[boxPolygonVertices[polygon][3]]; + numClippedPoints += ClipHomogeneousPolygonToUnitCube_Generic( &clippedPoints[numClippedPoints], 4 ); + } + + // clip the silhouette edge polygons that stretch to infinity + const silhouetteEdges_t & silhouetteEdges = boxSilhouetteEdgesForFrontBits[frontBits]; + for ( int i = 0; i < silhouetteEdges.count; i++ ) { + const int edge = silhouetteEdges.indices[i]; + clippedPoints[numClippedPoints + 0] = projectedNearPoints[boxEdgeVertices[edge][0]]; + clippedPoints[numClippedPoints + 1] = projectedNearPoints[boxEdgeVertices[edge][1]]; + clippedPoints[numClippedPoints + 2] = projectedFarPoints[boxEdgeVertices[edge][1]]; + clippedPoints[numClippedPoints + 3] = projectedFarPoints[boxEdgeVertices[edge][0]]; + numClippedPoints += ClipHomogeneousPolygonToUnitCube_Generic( &clippedPoints[numClippedPoints], 4 ); + } + + // test if the center of the near clip plane is inside the infinite shadow volume + const idVec3 localNearClipCenter = LocalNearClipCenterFromMVP( mvp ); + const bool inside = PointInsideInfiniteShadow( bounds, localLightOrigin, localNearClipCenter, RENDER_MATRIX_PROJECTION_EPSILON ); + + min = inside ? 
-1.0f : RENDER_MATRIX_INFINITY; + max = - RENDER_MATRIX_INFINITY; + + for ( int i = 0; i < numClippedPoints; i++ ) { + const idVec4 & c = clippedPoints[i]; + + assert( c.w > idMath::FLT_SMALLEST_NON_DENORMAL ); + + const float rw = 1.0f / c.w; + const float pz = c.z * rw; + + min = Min( min, pz ); + max = Max( max, pz ); + } + + if ( windowSpace ) { + // convert to window coords +#if !defined( CLIP_SPACE_D3D ) // the D3D clip space Z is already in the range [0,1] + min = min * 0.5f + 0.5f; + max = max * 0.5f + 0.5f; +#endif + // clamp to [0, 1] range + min = idMath::ClampFloat( 0.0f, 1.0f, min ); + max = idMath::ClampFloat( 0.0f, 1.0f, max ); + } + +#endif } /* @@ -3122,6 +4271,7 @@ idRenderMatrix::GetFrustumCorners void idRenderMatrix::GetFrustumCorners( frustumCorners_t & corners, const idRenderMatrix & frustumTransform, const idBounds & frustumBounds ) { assert_16_byte_aligned( &corners ); +#ifdef ID_WIN_X86_SSE2_INTRIN __m128 mvp0 = _mm_loadu_ps( frustumTransform[0] ); __m128 mvp1 = _mm_loadu_ps( frustumTransform[1] ); @@ -3194,6 +4344,33 @@ void idRenderMatrix::GetFrustumCorners( frustumCorners_t & corners, const idRend _mm_store_ps( corners.z + 0, z0 ); _mm_store_ps( corners.z + 4, z1 ); +#else + + idVec3 v; + for ( int x = 0; x < 2; x++ ) { + v[0] = frustumBounds[x][0]; + for ( int y = 0; y < 2; y++ ) { + v[1] = frustumBounds[y][1]; + for ( int z = 0; z < 2; z++ ) { + v[2] = frustumBounds[z][2]; + + float tx = v[0] * frustumTransform[0][0] + v[1] * frustumTransform[0][1] + v[2] * frustumTransform[0][2] + frustumTransform[0][3]; + float ty = v[0] * frustumTransform[1][0] + v[1] * frustumTransform[1][1] + v[2] * frustumTransform[1][2] + frustumTransform[1][3]; + float tz = v[0] * frustumTransform[2][0] + v[1] * frustumTransform[2][1] + v[2] * frustumTransform[2][2] + frustumTransform[2][3]; + float tw = v[0] * frustumTransform[3][0] + v[1] * frustumTransform[3][1] + v[2] * frustumTransform[3][2] + frustumTransform[3][3]; + + assert( tw > 
idMath::FLT_SMALLEST_NON_DENORMAL ); + + float rw = 1.0f / tw; + + corners.x[(z<<2)|(y<<1)|(x<<0)] = tx * rw; + corners.y[(z<<2)|(y<<1)|(x<<0)] = ty * rw; + corners.z[(z<<2)|(y<<1)|(x<<0)] = tz * rw; + } + } + } + +#endif } /* @@ -3204,6 +4381,7 @@ idRenderMatrix::CullFrustumCornersToPlane frustumCull_t idRenderMatrix::CullFrustumCornersToPlane( const frustumCorners_t & corners, const idPlane & plane ) { assert_16_byte_aligned( &corners ); +#ifdef ID_WIN_X86_SSE2_INTRIN __m128 vp = _mm_loadu_ps( plane.ToFloatPtr() ); @@ -3235,4 +4413,26 @@ frustumCull_t idRenderMatrix::CullFrustumCornersToPlane( const frustumCorners_t return (frustumCull_t) ( front | ( back << 1 ) ); +#else + + bool front = false; + bool back = false; + for ( int i = 0; i < 8; i++ ) { + const float d = corners.x[i] * plane[0] + corners.y[i] * plane[1] + corners.z[i] * plane[2] + plane[3]; + if ( d >= 0.0f ) { + front = true; + } else if ( d <= 0.0f ) { + back = true; + } + if ( back && front ) { + return FRUSTUM_CULL_CROSS; + } + } + if ( front ) { + return FRUSTUM_CULL_FRONT; + } else { + return FRUSTUM_CULL_BACK; + } + +#endif } diff --git a/neo/idlib/math/Lcp.cpp b/neo/idlib/math/Lcp.cpp index 01d4c5a5..c24037f5 100644 --- a/neo/idlib/math/Lcp.cpp +++ b/neo/idlib/math/Lcp.cpp @@ -44,6 +44,7 @@ const float LCP_DELTA_FORCE_EPSILON = 1e-9f; #define IGNORE_UNSATISFIABLE_VARIABLES +#if defined( ID_WIN_X86_SSE_ASM ) || defined( ID_WIN_X86_SSE_INTRIN ) ALIGN16( const __m128 SIMD_SP_zero ) = { 0.0f, 0.0f, 0.0f, 0.0f }; ALIGN16( const __m128 SIMD_SP_one ) = { 1.0f, 1.0f, 1.0f, 1.0f }; @@ -67,6 +68,8 @@ ALIGN16( const unsigned int SIMD_DW_four[4] ) = { 4, 4, 4, 4 }; ALIGN16( const unsigned int SIMD_DW_index[4] ) = { 0, 1, 2, 3 }; ALIGN16( const int SIMD_DW_not3[4] ) = { ~3, ~3, ~3, ~3 }; +#endif + /* ======================== Multiply_SIMD @@ -82,6 +85,7 @@ static void Multiply_SIMD( float * dst, const float * src0, const float * src1, dst[i] = src0[i] * src1[i]; } +#ifdef ID_WIN_X86_SSE_INTRIN for ( ; i + 
4 <= count; i += 4 ) { assert_16_byte_aligned( &dst[i] ); @@ -94,6 +98,20 @@ static void Multiply_SIMD( float * dst, const float * src0, const float * src1, _mm_store_ps( dst + i, s0 ); } +#else + + for ( ; i + 4 <= count; i += 4 ) { + assert_16_byte_aligned( &dst[i] ); + assert_16_byte_aligned( &src0[i] ); + assert_16_byte_aligned( &src1[i] ); + + dst[i+0] = src0[i+0] * src1[i+0]; + dst[i+1] = src0[i+1] * src1[i+1]; + dst[i+2] = src0[i+2] * src1[i+2]; + dst[i+3] = src0[i+3] * src1[i+3]; + } + +#endif for ( ; i < count; i++ ) { dst[i] = src0[i] * src1[i]; @@ -115,6 +133,7 @@ static void MultiplyAdd_SIMD( float * dst, const float constant, const float * s dst[i] += constant * src[i]; } +#ifdef ID_WIN_X86_SSE_INTRIN __m128 c = _mm_load1_ps( & constant ); for ( ; i + 4 <= count; i += 4 ) { @@ -127,6 +146,19 @@ static void MultiplyAdd_SIMD( float * dst, const float constant, const float * s _mm_store_ps( dst + i, s ); } +#else + + for ( ; i + 4 <= count; i += 4 ) { + assert_16_byte_aligned( &src[i] ); + assert_16_byte_aligned( &dst[i] ); + + dst[i+0] += constant * src[i+0]; + dst[i+1] += constant * src[i+1]; + dst[i+2] += constant * src[i+2]; + dst[i+3] += constant * src[i+3]; + } + +#endif for ( ; i < count; i++ ) { dst[i] += constant * src[i]; @@ -144,7 +176,7 @@ static float DotProduct_SIMD( const float * src0, const float * src1, const int assert_16_byte_aligned( src0 ); assert_16_byte_aligned( src1 ); -#ifndef _lint +#ifdef ID_WIN_X86_SSE_INTRIN __m128 sum = (__m128 &) SIMD_SP_zero; int i = 0; @@ -266,7 +298,7 @@ static void LowerTriangularSolve_SIMD( const idMatX & L, float * x, const float int i = skip; -#ifndef _lint +#ifdef ID_WIN_X86_SSE_INTRIN // work up to a multiple of 4 rows for ( ; ( i & 3 ) != 0 && i < n; i++ ) { @@ -520,7 +552,7 @@ static void LowerTriangularSolveTranspose_SIMD( const idMatX & L, float * x, con const float * lptr = L.ToFloatPtr() + m * nc + m - 4; float * xptr = x + m; -#ifndef _lint +#ifdef ID_WIN_X86_SSE2_INTRIN // process 4 rows at 
a time for ( int i = m; i >= 4; i -= 4 ) { @@ -850,7 +882,7 @@ static bool LDLT_Factor_SIMD( idMatX & mat, idVecX & invDiag, const int n ) { mptr[j*nc+3] = ( mptr[j*nc+3] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] - v[2] * mptr[j*nc+2] ) * d; } -#ifndef _lint +#ifdef ID_WIN_X86_SSE2_INTRIN __m128 vzero = _mm_setzero_ps(); for ( int i = 4; i < n; i += 4 ) { @@ -1210,6 +1242,7 @@ static void GetMaxStep_SIMD( const float * f, const float * a, const float * del const float * lo, const float * hi, const int * side, int numUnbounded, int numClamped, int d, float dir, float & maxStep, int & limit, int & limitSide ) { +#ifdef ID_WIN_X86_SSE2_INTRIN __m128 vMaxStep; __m128i vLimit; @@ -1332,6 +1365,65 @@ static void GetMaxStep_SIMD( const float * f, const float * a, const float * del _mm_store_ss( & maxStep, vMaxStep ); limit = _mm_cvtsi128_si32( vLimit ); limitSide = _mm_cvtsi128_si32( vLimitSide ); + +#else + + // default to a full step for the current variable + { + float negAccel = -a[d]; + float deltaAccel = delta_a[d]; + int m0 = ( fabs( deltaAccel ) > LCP_DELTA_ACCEL_EPSILON ); + float step = negAccel / ( m0 ? deltaAccel : 1.0f ); + maxStep = m0 ? step : 0.0f; + limit = d; + limitSide = 0; + } + + // test the current variable + { + float deltaForce = dir; + float forceLimit = ( deltaForce < 0.0f ) ? lo[d] : hi[d]; + float step = ( forceLimit - f[d] ) / deltaForce; + int setSide = ( deltaForce < 0.0f ) ? -1 : 1; + int m0 = ( fabs( deltaForce ) > LCP_DELTA_FORCE_EPSILON ); + int m1 = ( fabs( forceLimit ) != idMath::INFINITY ); + int m2 = ( step < maxStep ); + int m3 = ( m0 & m1 & m2 ); + maxStep = m3 ? step : maxStep; + limit = m3 ? d : limit; + limitSide = m3 ? setSide : limitSide; + } + + // test the clamped bounded variables + for ( int i = numUnbounded; i < numClamped; i++ ) { + float deltaForce = delta_f[i]; + float forceLimit = ( deltaForce < 0.0f ) ? 
lo[i] : hi[i]; + int m0 = ( fabs( deltaForce ) > LCP_DELTA_FORCE_EPSILON ); + float step = ( forceLimit - f[i] ) / ( m0 ? deltaForce : 1.0f ); + int setSide = ( deltaForce < 0.0f ) ? -1 : 1; + int m1 = ( fabs( forceLimit ) != idMath::INFINITY ); + int m2 = ( step < maxStep ); + int m3 = ( m0 & m1 & m2 ); + maxStep = m3 ? step : maxStep; + limit = m3 ? i : limit; + limitSide = m3 ? setSide : limitSide; + } + + // test the not clamped bounded variables + for ( int i = numClamped; i < d; i++ ) { + float negAccel = -a[i]; + float deltaAccel = delta_a[i]; + int m0 = ( side[i] * deltaAccel > LCP_DELTA_ACCEL_EPSILON ); + float step = negAccel / ( m0 ? deltaAccel : 1.0f ); + int m1 = ( lo[i] < -LCP_BOUND_EPSILON || hi[i] > LCP_BOUND_EPSILON ); + int m2 = ( step < maxStep ); + int m3 = ( m0 & m1 & m2 ); + maxStep = m3 ? step : maxStep; + limit = m3 ? i : limit; + limitSide = m3 ? 0 : limitSide; + } + +#endif } /* diff --git a/neo/idlib/math/MatX.cpp b/neo/idlib/math/MatX.cpp index 78cbff33..dc1702c1 100644 --- a/neo/idlib/math/MatX.cpp +++ b/neo/idlib/math/MatX.cpp @@ -171,6 +171,7 @@ void idMatX::CopyLowerToUpperTriangle() { assert( ( GetNumColumns() & 3 ) == 0 ); assert( GetNumColumns() >= GetNumRows() ); +#ifdef ID_WIN_X86_SSE_INTRIN const int n = GetNumColumns(); const int m = GetNumRows(); @@ -307,6 +308,20 @@ void idMatX::CopyLowerToUpperTriangle() { _mm_store_ps( basePtr + n0, r0 ); } +#else + + const int n = GetNumColumns(); + const int m = GetNumRows(); + for ( int i = 0; i < m; i++ ) { + const float * __restrict ptr = ToFloatPtr() + ( i + 1 ) * n + i; + float * __restrict dstPtr = ToFloatPtr() + i * n; + for ( int j = i + 1; j < m; j++ ) { + dstPtr[j] = ptr[0]; + ptr += n; + } + } + +#endif #ifdef _DEBUG for ( int i = 0; i < numRows; i++ ) { diff --git a/neo/idlib/math/MatX.h b/neo/idlib/math/MatX.h index 059a5f3f..876a9091 100644 --- a/neo/idlib/math/MatX.h +++ b/neo/idlib/math/MatX.h @@ -389,7 +389,7 @@ idMatX::operator= ID_INLINE idMatX &idMatX::operator=( 
const idMatX &a ) { SetSize( a.numRows, a.numColumns ); int s = a.numRows * a.numColumns; -#ifdef MATX_SIMD +#if defined(ID_WIN_X86_SSE_INTRIN) && defined(MATX_SIMD) for ( int i = 0; i < s; i += 4 ) { _mm_store_ps( mat + i, _mm_load_ps( a.mat + i ) ); } @@ -410,7 +410,7 @@ ID_INLINE idMatX idMatX::operator*( const float a ) const { m.SetTempSize( numRows, numColumns ); int s = numRows * numColumns; -#ifdef MATX_SIMD +#if defined(ID_WIN_X86_SSE_INTRIN) && defined(MATX_SIMD) __m128 va = _mm_load1_ps( & a ); for ( int i = 0; i < s; i += 4 ) { _mm_store_ps( m.mat + i, _mm_mul_ps( _mm_load_ps( mat + i ), va ) ); @@ -462,7 +462,7 @@ ID_INLINE idMatX idMatX::operator+( const idMatX &a ) const { assert( numRows == a.numRows && numColumns == a.numColumns ); m.SetTempSize( numRows, numColumns ); int s = numRows * numColumns; -#ifdef MATX_SIMD +#if defined(ID_WIN_X86_SSE_INTRIN) && defined(MATX_SIMD) for ( int i = 0; i < s; i += 4 ) { _mm_store_ps( m.mat + i, _mm_add_ps( _mm_load_ps( mat + i ), _mm_load_ps( a.mat + i ) ) ); } @@ -485,7 +485,7 @@ ID_INLINE idMatX idMatX::operator-( const idMatX &a ) const { assert( numRows == a.numRows && numColumns == a.numColumns ); m.SetTempSize( numRows, numColumns ); int s = numRows * numColumns; -#ifdef MATX_SIMD +#if defined(ID_WIN_X86_SSE_INTRIN) && defined(MATX_SIMD) for ( int i = 0; i < s; i += 4 ) { _mm_store_ps( m.mat + i, _mm_sub_ps( _mm_load_ps( mat + i ), _mm_load_ps( a.mat + i ) ) ); } @@ -504,7 +504,7 @@ idMatX::operator*= */ ID_INLINE idMatX &idMatX::operator*=( const float a ) { int s = numRows * numColumns; -#ifdef MATX_SIMD +#if defined(ID_WIN_X86_SSE_INTRIN) && defined(MATX_SIMD) __m128 va = _mm_load1_ps( & a ); for ( int i = 0; i < s; i += 4 ) { _mm_store_ps( mat + i, _mm_mul_ps( _mm_load_ps( mat + i ), va ) ); @@ -537,7 +537,7 @@ idMatX::operator+= ID_INLINE idMatX &idMatX::operator+=( const idMatX &a ) { assert( numRows == a.numRows && numColumns == a.numColumns ); int s = numRows * numColumns; -#ifdef MATX_SIMD +#if 
defined(ID_WIN_X86_SSE_INTRIN) && defined(MATX_SIMD) for ( int i = 0; i < s; i += 4 ) { _mm_store_ps( mat + i, _mm_add_ps( _mm_load_ps( mat + i ), _mm_load_ps( a.mat + i ) ) ); } @@ -558,7 +558,7 @@ idMatX::operator-= ID_INLINE idMatX &idMatX::operator-=( const idMatX &a ) { assert( numRows == a.numRows && numColumns == a.numColumns ); int s = numRows * numColumns; -#ifdef MATX_SIMD +#if defined(ID_WIN_X86_SSE_INTRIN) && defined(MATX_SIMD) for ( int i = 0; i < s; i += 4 ) { _mm_store_ps( mat + i, _mm_sub_ps( _mm_load_ps( mat + i ), _mm_load_ps( a.mat + i ) ) ); } @@ -744,7 +744,7 @@ idMatX::Zero */ ID_INLINE void idMatX::Zero() { int s = numRows * numColumns; -#ifdef MATX_SIMD +#if defined(ID_WIN_X86_SSE_INTRIN) && defined(MATX_SIMD) for ( int i = 0; i < s; i += 4 ) { _mm_store_ps( mat + i, _mm_setzero_ps() ); } @@ -838,7 +838,7 @@ idMatX::Negate */ ID_INLINE void idMatX::Negate() { int s = numRows * numColumns; -#ifdef MATX_SIMD +#if defined(ID_WIN_X86_SSE_INTRIN) && defined(MATX_SIMD) ALIGN16( const unsigned int signBit[4] ) = { IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK }; for ( int i = 0; i < s; i += 4 ) { _mm_store_ps( mat + i, _mm_xor_ps( _mm_load_ps( mat + i ), (__m128 &) signBit[0] ) ); diff --git a/neo/idlib/math/Math.cpp b/neo/idlib/math/Math.cpp index ce338b5a..7f7d44e5 100644 --- a/neo/idlib/math/Math.cpp +++ b/neo/idlib/math/Math.cpp @@ -51,6 +51,7 @@ const float idMath::INFINITY = 1e30f; const float idMath::FLT_EPSILON = 1.192092896e-07f; const float idMath::FLT_SMALLEST_NON_DENORMAL = * reinterpret_cast< const float * >( & SMALLEST_NON_DENORMAL ); // 1.1754944e-038f +#if defined( ID_WIN_X86_SSE_INTRIN ) const __m128 idMath::SIMD_SP_zero = { 0.0f, 0.0f, 0.0f, 0.0f }; const __m128 idMath::SIMD_SP_255 = { 255.0f, 255.0f, 255.0f, 255.0f }; const __m128 idMath::SIMD_SP_min_char = { -128.0f, -128.0f, -128.0f, -128.0f }; @@ -61,6 +62,7 @@ const __m128 idMath::SIMD_SP_smallestNonDenorm = { FLT_SMALLEST_NON_DENORMAL, FL 
const __m128 idMath::SIMD_SP_tiny = { 1e-4f, 1e-4f, 1e-4f, 1e-4f }; const __m128 idMath::SIMD_SP_rsqrt_c0 = { 3.0f, 3.0f, 3.0f, 3.0f }; const __m128 idMath::SIMD_SP_rsqrt_c1 = { -0.5f, -0.5f, -0.5f, -0.5f }; +#endif bool idMath::initialized = false; dword idMath::iSqrt[SQRT_TABLE_SIZE]; // inverse square root lookup table diff --git a/neo/idlib/math/Math.h b/neo/idlib/math/Math.h index d08fcac4..abf90b5b 100644 --- a/neo/idlib/math/Math.h +++ b/neo/idlib/math/Math.h @@ -419,6 +419,7 @@ public: static const float FLT_EPSILON; // smallest positive number such that 1.0+FLT_EPSILON != 1.0 static const float FLT_SMALLEST_NON_DENORMAL; // smallest non-denormal 32-bit floating point value +#if defined( ID_WIN_X86_SSE_INTRIN ) static const __m128 SIMD_SP_zero; static const __m128 SIMD_SP_255; static const __m128 SIMD_SP_min_char; @@ -429,6 +430,7 @@ public: static const __m128 SIMD_SP_tiny; static const __m128 SIMD_SP_rsqrt_c0; static const __m128 SIMD_SP_rsqrt_c1; +#endif private: enum { @@ -460,9 +462,15 @@ idMath::InvSqrt ======================== */ ID_INLINE float idMath::InvSqrt( float x ) { +#ifdef ID_WIN_X86_SSE_INTRIN return ( x > FLT_SMALLEST_NON_DENORMAL ) ? sqrtf( 1.0f / x ) : INFINITY; +#else + + return ( x > FLT_SMALLEST_NON_DENORMAL ) ? sqrtf( 1.0f / x ) : INFINITY; + +#endif } /* @@ -471,9 +479,15 @@ idMath::InvSqrt16 ======================== */ ID_INLINE float idMath::InvSqrt16( float x ) { +#ifdef ID_WIN_X86_SSE_INTRIN return ( x > FLT_SMALLEST_NON_DENORMAL ) ? sqrtf( 1.0f / x ) : INFINITY; +#else + + return ( x > FLT_SMALLEST_NON_DENORMAL ) ? sqrtf( 1.0f / x ) : INFINITY; + +#endif } /* @@ -482,7 +496,11 @@ idMath::Sqrt ======================== */ ID_INLINE float idMath::Sqrt( float x ) { +#ifdef ID_WIN_X86_SSE_INTRIN return ( x >= 0.0f ) ? x * InvSqrt( x ) : 0.0f; +#else + return ( x >= 0.0f ) ? 
sqrtf( x ) : 0.0f; +#endif } /* @@ -491,7 +509,11 @@ idMath::Sqrt16 ======================== */ ID_INLINE float idMath::Sqrt16( float x ) { +#ifdef ID_WIN_X86_SSE_INTRIN return ( x >= 0.0f ) ? x * InvSqrt16( x ) : 0.0f; +#else + return ( x >= 0.0f ) ? sqrtf( x ) : 0.0f; +#endif } /* @@ -601,6 +623,7 @@ idMath::SinCos ======================== */ ID_INLINE void idMath::SinCos( float a, float &s, float &c ) { +#if defined( ID_WIN_X86_ASM ) _asm { fld a fsincos @@ -609,6 +632,10 @@ ID_INLINE void idMath::SinCos( float a, float &s, float &c ) { fstp dword ptr [ecx] fstp dword ptr [edx] } +#else + s = sinf( a ); + c = cosf( a ); +#endif } /* @@ -1128,11 +1155,24 @@ idMath::Ftoi ======================== */ ID_INLINE int idMath::Ftoi( float f ) { +#ifdef ID_WIN_X86_SSE_INTRIN // If a converted result is larger than the maximum signed doubleword integer, // the floating-point invalid exception is raised, and if this exception is masked, // the indefinite integer value (80000000H) is returned. __m128 x = _mm_load_ss( &f ); return _mm_cvttss_si32( x ); +#elif 0 // round chop (C/C++ standard) + int i, s, e, m, shift; + i = *reinterpret_cast<int *>(&f); + s = i >> IEEE_FLT_SIGN_BIT; + e = ( ( i >> IEEE_FLT_MANTISSA_BITS ) & ( ( 1 << IEEE_FLT_EXPONENT_BITS ) - 1 ) ) - IEEE_FLT_EXPONENT_BIAS; + m = ( i & ( ( 1 << IEEE_FLT_MANTISSA_BITS ) - 1 ) ) | ( 1 << IEEE_FLT_MANTISSA_BITS ); + shift = e - IEEE_FLT_MANTISSA_BITS; + return ( ( ( ( m >> -shift ) | ( m << shift ) ) & ~( e >> INT32_SIGN_BIT ) ) ^ s ) - s; +#else + // If a converted result is larger than the maximum signed doubleword integer the result is undefined.
+ return C_FLOAT_TO_INT( f ); +#endif } /* @@ -1141,10 +1181,21 @@ idMath::Ftoi8 ======================== */ ID_INLINE char idMath::Ftoi8( float f ) { +#ifdef ID_WIN_X86_SSE_INTRIN __m128 x = _mm_load_ss( &f ); x = _mm_max_ss( x, SIMD_SP_min_char ); x = _mm_min_ss( x, SIMD_SP_max_char ); return static_cast<char>( _mm_cvttss_si32( x ) ); +#else + // The converted result is clamped to the range [-128,127]. + int i = C_FLOAT_TO_INT( f ); + if ( i < -128 ) { + return -128; + } else if ( i > 127 ) { + return 127; + } + return static_cast<char>( i ); +#endif } /* @@ -1153,10 +1204,21 @@ idMath::Ftoi16 ======================== */ ID_INLINE short idMath::Ftoi16( float f ) { +#ifdef ID_WIN_X86_SSE_INTRIN __m128 x = _mm_load_ss( &f ); x = _mm_max_ss( x, SIMD_SP_min_short ); x = _mm_min_ss( x, SIMD_SP_max_short ); return static_cast<short>( _mm_cvttss_si32( x ) ); +#else + // The converted result is clamped to the range [-32768,32767]. + int i = C_FLOAT_TO_INT( f ); + if ( i < -32768 ) { + return -32768; + } else if ( i > 32767 ) { + return 32767; + } + return static_cast<short>( i ); +#endif } /* @@ -1183,12 +1245,23 @@ idMath::Ftob ======================== */ ID_INLINE byte idMath::Ftob( float f ) { +#ifdef ID_WIN_X86_SSE_INTRIN // If a converted result is negative the value (0) is returned and if the // converted result is larger than the maximum byte the value (255) is returned. __m128 x = _mm_load_ss( &f ); x = _mm_max_ss( x, SIMD_SP_zero ); x = _mm_min_ss( x, SIMD_SP_255 ); return static_cast<byte>( _mm_cvttss_si32( x ) ); +#else + // The converted result is clamped to the range [0,255].
+ int i = C_FLOAT_TO_INT( f ); + if ( i < 0 ) { + return 0; + } else if ( i > 255 ) { + return 255; + } + return static_cast<byte>( i ); +#endif } /* diff --git a/neo/idlib/math/VecX.h b/neo/idlib/math/VecX.h index 4086a1a9..e8ff421a 100644 --- a/neo/idlib/math/VecX.h +++ b/neo/idlib/math/VecX.h @@ -213,7 +213,7 @@ ID_INLINE idVecX idVecX::operator-() const { idVecX m; m.SetTempSize( size ); -#ifdef VECX_SIMD +#if defined(ID_WIN_X86_SSE_INTRIN) && defined(VECX_SIMD) ALIGN16( unsigned int signBit[4] ) = { IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK }; for ( int i = 0; i < size; i += 4 ) { _mm_store_ps( m.p + i, _mm_xor_ps( _mm_load_ps( p + i ), (__m128 &) signBit[0] ) ); @@ -233,7 +233,7 @@ idVecX::operator= */ ID_INLINE idVecX &idVecX::operator=( const idVecX &a ) { SetSize( a.size ); -#ifdef VECX_SIMD +#if defined(ID_WIN_X86_SSE_INTRIN) && defined(VECX_SIMD) for ( int i = 0; i < a.size; i += 4 ) { _mm_store_ps( p + i, _mm_load_ps( a.p + i ) ); } @@ -254,7 +254,7 @@ ID_INLINE idVecX idVecX::operator+( const idVecX &a ) const { assert( size == a.size ); m.SetTempSize( size ); -#ifdef VECX_SIMD +#if defined(ID_WIN_X86_SSE_INTRIN) && defined(VECX_SIMD) for ( int i = 0; i < size; i += 4 ) { _mm_store_ps( m.p + i, _mm_add_ps( _mm_load_ps( p + i ), _mm_load_ps( a.p + i ) ) ); } @@ -276,7 +276,7 @@ ID_INLINE idVecX idVecX::operator-( const idVecX &a ) const { assert( size == a.size ); m.SetTempSize( size ); -#ifdef VECX_SIMD +#if defined(ID_WIN_X86_SSE_INTRIN) && defined(VECX_SIMD) for ( int i = 0; i < size; i += 4 ) { _mm_store_ps( m.p + i, _mm_sub_ps( _mm_load_ps( p + i ), _mm_load_ps( a.p + i ) ) ); } @@ -295,7 +295,7 @@ idVecX::operator+= */ ID_INLINE idVecX &idVecX::operator+=( const idVecX &a ) { assert( size == a.size ); -#ifdef VECX_SIMD +#if defined(ID_WIN_X86_SSE_INTRIN) && defined(VECX_SIMD) for ( int i = 0; i < size; i += 4 ) { _mm_store_ps( p + i, _mm_add_ps( _mm_load_ps( p + i ), _mm_load_ps( a.p + i ) ) ); } @@ -315,7 +315,7 @@
idVecX::operator-= */ ID_INLINE idVecX &idVecX::operator-=( const idVecX &a ) { assert( size == a.size ); -#ifdef VECX_SIMD +#if defined(ID_WIN_X86_SSE_INTRIN) && defined(VECX_SIMD) for ( int i = 0; i < size; i += 4 ) { _mm_store_ps( p + i, _mm_sub_ps( _mm_load_ps( p + i ), _mm_load_ps( a.p + i ) ) ); } @@ -337,7 +337,7 @@ ID_INLINE idVecX idVecX::operator*( const float a ) const { idVecX m; m.SetTempSize( size ); -#ifdef VECX_SIMD +#if defined(ID_WIN_X86_SSE_INTRIN) && defined(VECX_SIMD) __m128 va = _mm_load1_ps( & a ); for ( int i = 0; i < size; i += 4 ) { _mm_store_ps( m.p + i, _mm_mul_ps( _mm_load_ps( p + i ), va ) ); @@ -356,7 +356,7 @@ idVecX::operator*= ======================== */ ID_INLINE idVecX &idVecX::operator*=( const float a ) { -#ifdef VECX_SIMD +#if defined(ID_WIN_X86_SSE_INTRIN) && defined(VECX_SIMD) __m128 va = _mm_load1_ps( & a ); for ( int i = 0; i < size; i += 4 ) { _mm_store_ps( p + i, _mm_mul_ps( _mm_load_ps( p + i ), va ) ); @@ -551,7 +551,7 @@ idVecX::Zero ======================== */ ID_INLINE void idVecX::Zero() { -#ifdef VECX_SIMD +#if defined(ID_WIN_X86_SSE_INTRIN) && defined(VECX_SIMD) for ( int i = 0; i < size; i += 4 ) { _mm_store_ps( p + i, _mm_setzero_ps() ); } @@ -567,7 +567,7 @@ idVecX::Zero */ ID_INLINE void idVecX::Zero( int length ) { SetSize( length ); -#ifdef VECX_SIMD +#if defined(ID_WIN_X86_SSE_INTRIN) && defined(VECX_SIMD) for ( int i = 0; i < length; i += 4 ) { _mm_store_ps( p + i, _mm_setzero_ps() ); } @@ -611,7 +611,7 @@ idVecX::Negate ======================== */ ID_INLINE void idVecX::Negate() { -#ifdef VECX_SIMD +#if defined(ID_WIN_X86_SSE_INTRIN) && defined(VECX_SIMD) ALIGN16( const unsigned int signBit[4] ) = { IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK }; for ( int i = 0; i < size; i += 4 ) { _mm_store_ps( p + i, _mm_xor_ps( _mm_load_ps( p + i ), (__m128 &) signBit[0] ) ); diff --git a/neo/idlib/math/Vector.h b/neo/idlib/math/Vector.h index f194618d..c0a010ea 100644 --- 
a/neo/idlib/math/Vector.h +++ b/neo/idlib/math/Vector.h @@ -451,6 +451,10 @@ ID_INLINE idVec3 operator*( const float a, const idVec3 b ) { return idVec3( b.x * a, b.y * a, b.z * a ); } +ID_INLINE idVec3 operator/( const float a, const idVec3 b ) { + return idVec3( a / b.x, a / b.y, a / b.z ); +} + ID_INLINE idVec3 idVec3::operator+( const idVec3 &a ) const { return idVec3( x + a.x, y + a.y, z + a.z ); } diff --git a/neo/idlib/sys/sys_defines.h b/neo/idlib/sys/sys_defines.h index 4a052a00..b20e6169 100644 --- a/neo/idlib/sys/sys_defines.h +++ b/neo/idlib/sys/sys_defines.h @@ -28,6 +28,58 @@ If you have questions concerning this license or the applicable additional terms #ifndef SYS_DEFINES_H #define SYS_DEFINES_H +/* +================================================================================================ + + Platform Specific ID_ Defines + + The ID_ defines are the only platform defines we should be using. + +================================================================================================ +*/ + +#undef ID_PC +#undef ID_PC_WIN +#undef ID_PC_WIN64 +#undef ID_CONSOLE +#undef ID_WIN32 +#undef ID_LITTLE_ENDIAN + +#if defined(_WIN32) + // _WIN32 always defined + // _WIN64 also defined for x64 target +/* + #if !defined( _MANAGED ) + #if !defined( _WIN64 ) + #define ID_WIN_X86_ASM + #define ID_WIN_X86_MMX_ASM + #define ID_WIN_X86_MMX_INTRIN + #define ID_WIN_X86_SSE_ASM + #define ID_WIN_X86_SSE_INTRIN + #define ID_WIN_X86_SSE2_ASM + #define ID_WIN_X86_SSE2_INTRIN + // the 32 bit build is now as close to the console builds as possible + #define ID_CONSOLE + #else + #define ID_PC_WIN64 + #define ID_WIN_X86_MMX_INTRIN + #define ID_WIN_X86_SSE_INTRIN + #define ID_WIN_X86_SSE2_INTRIN + #define ID_WIN_X86_SSE3_INTRIN + #endif + #endif +*/ + + #define ID_PC + #define ID_PC_WIN + #define ID_WIN32 + #define ID_LITTLE_ENDIAN +#else +#error Unknown Platform +#endif + +#define ID_OPENGL + /* 
================================================================================================ @@ -36,6 +88,7 @@ If you have questions concerning this license or the applicable additional terms ================================================================================================ */ +#ifdef ID_PC_WIN #define CPUSTRING "x86" @@ -69,6 +122,8 @@ If you have questions concerning this license or the applicable additional terms #define WIN32 #endif +#endif + /* ================================================================================================ @@ -108,6 +163,8 @@ bulk of the codebase, so it is the best place for analyze pragmas. ================================================================================================ */ +#if defined( ID_WIN32 ) + // disable some /analyze warnings here #pragma warning( disable: 6255 ) // warning C6255: _alloca indicates failure by raising a stack overflow exception. Consider using _malloca instead. (Note: _malloca requires _freea.) #pragma warning( disable: 6262 ) // warning C6262: Function uses '36924' bytes of stack: exceeds /analyze:stacksize'32768'. Consider moving some data to heap @@ -135,6 +192,7 @@ bulk of the codebase, so it is the best place for analyze pragmas. // guaranteed to be false in the following code #define NO_RETURN __declspec(noreturn) +#endif // I don't want to disable "warning C6031: Return value ignored" from /analyze // but there are several cases with sprintf where we pre-initialized the variables diff --git a/neo/idlib/sys/sys_intrinsics.h b/neo/idlib/sys/sys_intrinsics.h index 12ad78dd..93e5a515 100644 --- a/neo/idlib/sys/sys_intrinsics.h +++ b/neo/idlib/sys/sys_intrinsics.h @@ -56,6 +56,8 @@ ID_INLINE_EXTERN float __frndz( float x ) { return (float)( (int)( x ) ); } ================================================================================================ */ +#ifdef ID_WIN_X86_SSE2_INTRIN + // The code below assumes that a cache line is 64 bytes. 
// We specify the cache line size as 128 here to make the code consistent with the consoles. #define CACHE_LINE_SIZE 128 @@ -84,6 +86,24 @@ ID_FORCE_INLINE void FlushCacheLine( const void * ptr, int offset ) { _mm_clflush( bytePtr + 64 ); } +/* +================================================ + Other +================================================ +*/ +#else + +#define CACHE_LINE_SIZE 128 + +ID_INLINE void Prefetch( const void * ptr, int offset ) {} +ID_INLINE void ZeroCacheLine( void * ptr, int offset ) { + byte * bytePtr = (byte *)( ( ( (UINT_PTR) ( ptr ) ) + ( offset ) ) & ~( CACHE_LINE_SIZE - 1 ) ); + memset( bytePtr, 0, CACHE_LINE_SIZE ); +} +ID_INLINE void FlushCacheLine( const void * ptr, int offset ) {} + +#endif + /* ================================================ Block Clear Macros diff --git a/neo/renderer/BufferObject.cpp b/neo/renderer/BufferObject.cpp index 2e42d2fb..4a537aa1 100644 --- a/neo/renderer/BufferObject.cpp +++ b/neo/renderer/BufferObject.cpp @@ -72,6 +72,7 @@ void UnbindBufferObjects() { qglBindBufferARB( GL_ELEMENT_ARRAY_BUFFER_ARB, 0 ); } +#ifdef ID_WIN_X86_SSE2_INTRIN void CopyBuffer( byte * dst, const byte * src, int numBytes ) { assert_16_byte_aligned( dst ); @@ -109,6 +110,15 @@ void CopyBuffer( byte * dst, const byte * src, int numBytes ) { _mm_sfence(); } +#else + +void CopyBuffer( byte * dst, const byte * src, int numBytes ) { + assert_16_byte_aligned( dst ); + assert_16_byte_aligned( src ); + memcpy( dst, src, numBytes ); +} + +#endif /* ================================================================================================ diff --git a/neo/renderer/DXT/DXTCodec.h b/neo/renderer/DXT/DXTCodec.h index cd84d33a..76d76daf 100644 --- a/neo/renderer/DXT/DXTCodec.h +++ b/neo/renderer/DXT/DXTCodec.h @@ -258,7 +258,11 @@ idDxtEncoder::CompressImageDXT1Fast ======================== */ ID_INLINE void idDxtEncoder::CompressImageDXT1Fast( const byte *inBuf, byte *outBuf, int width, int height ) { +#ifdef ID_WIN_X86_SSE2_INTRIN 
CompressImageDXT1Fast_SSE2( inBuf, outBuf, width, height ); +#else + CompressImageDXT1Fast_Generic( inBuf, outBuf, width, height ); +#endif } /* @@ -267,7 +271,11 @@ idDxtEncoder::CompressImageDXT1AlphaFast ======================== */ ID_INLINE void idDxtEncoder::CompressImageDXT1AlphaFast( const byte *inBuf, byte *outBuf, int width, int height ) { +#ifdef ID_WIN_X86_SSE2_INTRIN CompressImageDXT1AlphaFast_SSE2( inBuf, outBuf, width, height ); +#else + CompressImageDXT1AlphaFast_Generic( inBuf, outBuf, width, height ); +#endif } /* @@ -276,7 +284,11 @@ idDxtEncoder::CompressImageDXT5Fast ======================== */ ID_INLINE void idDxtEncoder::CompressImageDXT5Fast( const byte *inBuf, byte *outBuf, int width, int height ) { +#ifdef ID_WIN_X86_SSE2_INTRIN CompressImageDXT5Fast_SSE2( inBuf, outBuf, width, height ); +#else + CompressImageDXT5Fast_Generic( inBuf, outBuf, width, height ); +#endif } /* @@ -294,7 +306,11 @@ idDxtEncoder::CompressYCoCgDXT5Fast ======================== */ ID_INLINE void idDxtEncoder::CompressYCoCgDXT5Fast( const byte *inBuf, byte *outBuf, int width, int height ) { +#ifdef ID_WIN_X86_SSE2_INTRIN CompressYCoCgDXT5Fast_SSE2( inBuf, outBuf, width, height ); +#else + CompressYCoCgDXT5Fast_Generic( inBuf, outBuf, width, height ); +#endif } /* @@ -312,7 +328,11 @@ idDxtEncoder::CompressNormalMapDXT5Fast ======================== */ ID_INLINE void idDxtEncoder::CompressNormalMapDXT5Fast( const byte *inBuf, byte *outBuf, int width, int height ) { +#ifdef ID_WIN_X86_SSE2_INTRIN CompressNormalMapDXT5Fast_SSE2( inBuf, outBuf, width, height ); +#else + CompressNormalMapDXT5Fast_Generic( inBuf, outBuf, width, height ); +#endif } /* diff --git a/neo/renderer/DXT/DXTEncoder.cpp b/neo/renderer/DXT/DXTEncoder.cpp index 3336008e..411edc3f 100644 --- a/neo/renderer/DXT/DXTEncoder.cpp +++ b/neo/renderer/DXT/DXTEncoder.cpp @@ -52,6 +52,7 @@ idDxtEncoder::NV4XHardwareBugFix ======================== */ void idDxtEncoder::NV4XHardwareBugFix( byte *minColor, byte 
*maxColor ) const { +#ifdef ID_WIN_X86_ASM int minq = ( ( minColor[0] << 16 ) | ( minColor[1] << 8 ) | minColor[2] ) & 0x00F8FCF8; int maxq = ( ( maxColor[0] << 16 ) | ( maxColor[1] << 8 ) | maxColor[2] ) & 0x00F8FCF8; int mask = -( minq > maxq ) & 0x00FFFFFF; @@ -62,6 +63,13 @@ void idDxtEncoder::NV4XHardwareBugFix( byte *minColor, byte *maxColor ) const { min ^= max; *(int *)minColor = min; *(int *)maxColor = max; +#else + if ( ColorTo565( minColor ) > ColorTo565( maxColor ) ) { + SwapValues( minColor[0], maxColor[0] ); + SwapValues( minColor[1], maxColor[1] ); + SwapValues( minColor[2], maxColor[2] ); + } +#endif } /* @@ -950,6 +958,7 @@ int idDxtEncoder::GetMinMaxNormalYHQ( const byte *colorBlock, byte *minColor, by return bestError; } +#if defined( ID_WIN_X86_ASM ) ALIGN16( static float SIMD_SSE2_float_scale[4] ) = { 2.0f / 255.0f, 2.0f / 255.0f, 2.0f / 255.0f, 2.0f / 255.0f }; ALIGN16( static float SIMD_SSE2_float_descale[4] ) = { 255.0f / 2.0f, 255.0f / 2.0f, 255.0f / 2.0f, 255.0f / 2.0f }; ALIGN16( static float SIMD_SSE2_float_zero[4] ) = { 0.0f, 0.0f, 0.0f, 0.0f }; @@ -961,6 +970,7 @@ ALIGN16( static float SIMD_SP_rsqrt_c1[4] ) = { -0.5f, -0.5f, -0.5f, -0.5f }; ALIGN16( static dword SIMD_SSE2_dword_maskFirstThree[4] ) = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 }; ALIGN16( static dword SIMD_SSE2_dword_maskWords[4] ) = { 0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x00000000 }; #define R_SHUFFLE_PS( x, y, z, w ) (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 )) +#endif /* ======================== @@ -968,6 +978,7 @@ NormalDistanceDXT1 ======================== */ int NormalDistanceDXT1( const int *vector, const int *normalized ) { +#if defined( ID_WIN_X86_ASM ) int result; __asm { mov esi, vector @@ -1007,6 +1018,24 @@ int NormalDistanceDXT1( const int *vector, const int *normalized ) { movd result, xmm0 } return result; +#else + float floatNormal[3]; + byte intNormal[4]; + floatNormal[0] = vector[0] * ( 2.0f / 255.0f ) - 1.0f; + 
floatNormal[1] = vector[1] * ( 2.0f / 255.0f ) - 1.0f; + floatNormal[2] = vector[2] * ( 2.0f / 255.0f ) - 1.0f; + float rcplen = idMath::InvSqrt( floatNormal[0] * floatNormal[0] + floatNormal[1] * floatNormal[1] + floatNormal[2] * floatNormal[2] ); + floatNormal[0] *= rcplen; + floatNormal[1] *= rcplen; + floatNormal[2] *= rcplen; + intNormal[0] = idMath::Ftob( ( floatNormal[0] + 1.0f ) * ( 255.0f / 2.0f ) + 0.5f ); + intNormal[1] = idMath::Ftob( ( floatNormal[1] + 1.0f ) * ( 255.0f / 2.0f ) + 0.5f ); + intNormal[2] = idMath::Ftob( ( floatNormal[2] + 1.0f ) * ( 255.0f / 2.0f ) + 0.5f ); + int result = ( ( intNormal[ 0 ] - normalized[ 0 ] ) * ( intNormal[ 0 ] - normalized[ 0 ] ) ) + + ( ( intNormal[ 1 ] - normalized[ 1 ] ) * ( intNormal[ 1 ] - normalized[ 1 ] ) ) + + ( ( intNormal[ 2 ] - normalized[ 2 ] ) * ( intNormal[ 2 ] - normalized[ 2 ] ) ); + return result; +#endif } /* @@ -1015,6 +1044,7 @@ NormalDistanceDXT5 ======================== */ int NormalDistanceDXT5( const int *vector, const int *normalized ) { +#if defined( ID_WIN_X86_ASM ) int result; __asm { mov esi, vector @@ -1064,6 +1094,33 @@ int NormalDistanceDXT5( const int *vector, const int *normalized ) { movd result, xmm0 } return result; +#else +#if 0 // object-space + const int c0 = 0; + const int c1 = 1; + const int c2 = 3; +#else + const int c0 = 1; + const int c1 = 2; + const int c2 = 3; +#endif + float floatNormal[3]; + byte intNormal[4]; + floatNormal[0] = vector[c0] / 255.0f * 2.0f - 1.0f; + floatNormal[1] = vector[c1] / 255.0f * 2.0f - 1.0f; + floatNormal[2] = vector[c2] / 255.0f * 2.0f - 1.0f; + float rcplen = idMath::InvSqrt( floatNormal[0] * floatNormal[0] + floatNormal[1] * floatNormal[1] + floatNormal[2] * floatNormal[2] ); + floatNormal[0] *= rcplen; + floatNormal[1] *= rcplen; + floatNormal[2] *= rcplen; + intNormal[c0] = idMath::Ftob( ( floatNormal[0] + 1.0f ) / 2.0f * 255.0f + 0.5f ); + intNormal[c1] = idMath::Ftob( ( floatNormal[1] + 1.0f ) / 2.0f * 255.0f + 0.5f ); + intNormal[c2] = 
idMath::Ftob( ( floatNormal[2] + 1.0f ) / 2.0f * 255.0f + 0.5f ); + int result = ( ( intNormal[ c0 ] - normalized[ c0 ] ) * ( intNormal[ c0 ] - normalized[ c0 ] ) ) + + ( ( intNormal[ c1 ] - normalized[ c1 ] ) * ( intNormal[ c1 ] - normalized[ c1 ] ) ) + + ( ( intNormal[ c2 ] - normalized[ c2 ] ) * ( intNormal[ c2 ] - normalized[ c2 ] ) ); + return result; +#endif } /* diff --git a/neo/renderer/DXT/DXTEncoder_SSE2.cpp b/neo/renderer/DXT/DXTEncoder_SSE2.cpp index e7f73657..1d09a4b2 100644 --- a/neo/renderer/DXT/DXTEncoder_SSE2.cpp +++ b/neo/renderer/DXT/DXTEncoder_SSE2.cpp @@ -34,6 +34,7 @@ Contains the DxtEncoder implementation for SSE2. #include "DXTCodec_local.h" #include "DXTCodec.h" +#if defined( ID_WIN_X86_SSE2_INTRIN ) || ( ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) ) ) //#define TEST_COMPRESSION #ifdef TEST_COMPRESSION @@ -142,10 +143,30 @@ paramO: colorBlock - 4*4 output tile, 4 bytes per pixel ======================== */ ID_INLINE void idDxtEncoder::ExtractBlock_SSE2( const byte * inPtr, int width, byte * colorBlock ) const { +#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) ) + __asm { + mov esi, inPtr + mov edi, colorBlock + mov eax, width + shl eax, 2 + movdqa xmm0, xmmword ptr [esi] + movdqa xmmword ptr [edi+ 0], xmm0 + movdqa xmm1, xmmword ptr [esi+eax] // + 4 * width + movdqa xmmword ptr [edi+16], xmm1 + movdqa xmm2, xmmword ptr [esi+eax*2] // + 8 * width + add esi, eax + movdqa xmmword ptr [edi+32], xmm2 + movdqa xmm3, xmmword ptr [esi+eax*2] // + 12 * width + movdqa xmmword ptr [edi+48], xmm3 + } +#elif defined ( ID_WIN_X86_SSE2_INTRIN ) *((__m128i *)(&colorBlock[ 0])) = _mm_load_si128( (__m128i *)( inPtr + width * 4 * 0 ) ); *((__m128i *)(&colorBlock[16])) = _mm_load_si128( (__m128i *)( inPtr + width * 4 * 1 ) ); *((__m128i *)(&colorBlock[32])) = _mm_load_si128( (__m128i *)( inPtr + width * 4 * 2 ) ); *((__m128i *)(&colorBlock[48])) = _mm_load_si128( (__m128i *)( inPtr + width * 4 * 3 ) ); +#else + assert( false ); 
+#endif } /* @@ -160,6 +181,31 @@ paramO: maxColor - Max 4 byte output color ======================== */ ID_INLINE void idDxtEncoder::GetMinMaxBBox_SSE2( const byte * colorBlock, byte * minColor, byte * maxColor ) const { +#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) ) + __asm { + mov eax, colorBlock + mov esi, minColor + mov edi, maxColor + movdqa xmm0, xmmword ptr [eax+ 0] + movdqa xmm1, xmmword ptr [eax+ 0] + pminub xmm0, xmmword ptr [eax+16] + pmaxub xmm1, xmmword ptr [eax+16] + pminub xmm0, xmmword ptr [eax+32] + pmaxub xmm1, xmmword ptr [eax+32] + pminub xmm0, xmmword ptr [eax+48] + pmaxub xmm1, xmmword ptr [eax+48] + pshufd xmm3, xmm0, R_SHUFFLE_D( 2, 3, 2, 3 ) + pshufd xmm4, xmm1, R_SHUFFLE_D( 2, 3, 2, 3 ) + pminub xmm0, xmm3 + pmaxub xmm1, xmm4 + pshuflw xmm6, xmm0, R_SHUFFLE_D( 2, 3, 2, 3 ) + pshuflw xmm7, xmm1, R_SHUFFLE_D( 2, 3, 2, 3 ) + pminub xmm0, xmm6 + pmaxub xmm1, xmm7 + movd dword ptr [esi], xmm0 + movd dword ptr [edi], xmm1 + } +#elif defined ( ID_WIN_X86_SSE2_INTRIN ) __m128i block0 = *((__m128i *)(&colorBlock[ 0])); __m128i block1 = *((__m128i *)(&colorBlock[16])); __m128i block2 = *((__m128i *)(&colorBlock[32])); @@ -187,6 +233,9 @@ ID_INLINE void idDxtEncoder::GetMinMaxBBox_SSE2( const byte * colorBlock, byte * *((int *)maxColor) = _mm_cvtsi128_si32( max6 ); *((int *)minColor) = _mm_cvtsi128_si32( min6 ); +#else + assert( false ); +#endif } /* @@ -195,6 +244,25 @@ idDxtEncoder::InsetColorsBBox_SSE2 ======================== */ ID_INLINE void idDxtEncoder::InsetColorsBBox_SSE2( byte * minColor, byte * maxColor ) const { +#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) ) + __asm { + mov esi, minColor + mov edi, maxColor + movd xmm0, dword ptr [esi] + movd xmm1, dword ptr [edi] + punpcklbw xmm0, SIMD_SSE2_byte_0 + punpcklbw xmm1, SIMD_SSE2_byte_0 + movdqa xmm2, xmm1 + psubw xmm2, xmm0 + pmulhw xmm2, SIMD_SSE2_word_insetShift + paddw xmm0, xmm2 + psubw xmm1, xmm2 + packuswb xmm0, xmm0 + packuswb xmm1, xmm1 + movd 
dword ptr [esi], xmm0 + movd dword ptr [edi], xmm1 + } +#elif defined ( ID_WIN_X86_SSE2_INTRIN ) __m128i min = _mm_cvtsi32_si128( *(int *)minColor ); __m128i max = _mm_cvtsi32_si128( *(int *)maxColor ); @@ -213,6 +281,9 @@ ID_INLINE void idDxtEncoder::InsetColorsBBox_SSE2( byte * minColor, byte * maxCo *((int *)minColor) = _mm_cvtsi128_si32( xmm0 ); *((int *)maxColor) = _mm_cvtsi128_si32( xmm1 ); +#else + assert( false ); +#endif } /* @@ -226,6 +297,165 @@ return: 4 byte color index block ======================== */ void idDxtEncoder::EmitColorIndices_SSE2( const byte * colorBlock, const byte * minColor_, const byte * maxColor_ ) { +#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) ) + ALIGN16( byte color0[16] ); + ALIGN16( byte color1[16] ); + ALIGN16( byte color2[16] ); + ALIGN16( byte color3[16] ); + ALIGN16( byte result[16] ); + byte *outPtr = outData; + + __asm { + mov esi, maxColor_ + mov edi, minColor_ + pxor xmm7, xmm7 + movdqa result, xmm7 + + movd xmm0, dword ptr [esi] + pand xmm0, SIMD_SSE2_byte_colorMask + punpcklbw xmm0, xmm7 + pshuflw xmm4, xmm0, R_SHUFFLE_D( 0, 3, 2, 3 ) + pshuflw xmm5, xmm0, R_SHUFFLE_D( 3, 1, 3, 3 ) + psrlw xmm4, 5 + psrlw xmm5, 6 + por xmm0, xmm4 + por xmm0, xmm5 + + movd xmm1, dword ptr [edi] + pand xmm1, SIMD_SSE2_byte_colorMask + punpcklbw xmm1, xmm7 + pshuflw xmm4, xmm1, R_SHUFFLE_D( 0, 3, 2, 3 ) + pshuflw xmm5, xmm1, R_SHUFFLE_D( 3, 1, 3, 3 ) + psrlw xmm4, 5 + psrlw xmm5, 6 + por xmm1, xmm4 + por xmm1, xmm5 + + movdqa xmm2, xmm0 + packuswb xmm2, xmm7 + pshufd xmm2, xmm2, R_SHUFFLE_D( 0, 1, 0, 1 ) + movdqa color0, xmm2 + + movdqa xmm6, xmm0 + paddw xmm6, xmm0 + paddw xmm6, xmm1 + pmulhw xmm6, SIMD_SSE2_word_div_by_3 // * ( ( 1 << 16 ) / 3 + 1 ) ) >> 16 + packuswb xmm6, xmm7 + pshufd xmm6, xmm6, R_SHUFFLE_D( 0, 1, 0, 1 ) + movdqa color2, xmm6 + + movdqa xmm3, xmm1 + packuswb xmm3, xmm7 + pshufd xmm3, xmm3, R_SHUFFLE_D( 0, 1, 0, 1 ) + movdqa color1, xmm3 + + paddw xmm1, xmm1 + paddw xmm0, xmm1 + pmulhw xmm0, 
SIMD_SSE2_word_div_by_3 // * ( ( 1 << 16 ) / 3 + 1 ) ) >> 16 + packuswb xmm0, xmm7 + pshufd xmm0, xmm0, R_SHUFFLE_D( 0, 1, 0, 1 ) + movdqa color3, xmm0 + + mov eax, 32 + mov esi, colorBlock + + loop1: // iterates 2 times + movq xmm3, qword ptr [esi+eax+0] + pshufd xmm3, xmm3, R_SHUFFLE_D( 0, 2, 1, 3 ) // punpckldq xmm4, SIMD_SSE2_dword_0 + movq xmm5, qword ptr [esi+eax+8] + pshufd xmm5, xmm5, R_SHUFFLE_D( 0, 2, 1, 3 ) // punpckldq xmm5, SIMD_SSE2_dword_0 + + movdqa xmm0, xmm3 + movdqa xmm6, xmm5 + psadbw xmm0, color0 + psadbw xmm6, color0 + packssdw xmm0, xmm6 + movdqa xmm1, xmm3 + movdqa xmm6, xmm5 + psadbw xmm1, color1 + psadbw xmm6, color1 + packssdw xmm1, xmm6 + movdqa xmm2, xmm3 + movdqa xmm6, xmm5 + psadbw xmm2, color2 + psadbw xmm6, color2 + packssdw xmm2, xmm6 + psadbw xmm3, color3 + psadbw xmm5, color3 + packssdw xmm3, xmm5 + + movq xmm4, qword ptr [esi+eax+16] + pshufd xmm4, xmm4, R_SHUFFLE_D( 0, 2, 1, 3 ) + movq xmm5, qword ptr [esi+eax+24] + pshufd xmm5, xmm5, R_SHUFFLE_D( 0, 2, 1, 3 ) + + movdqa xmm6, xmm4 + movdqa xmm7, xmm5 + psadbw xmm6, color0 + psadbw xmm7, color0 + packssdw xmm6, xmm7 + packssdw xmm0, xmm6 // d1 + movdqa xmm6, xmm4 + movdqa xmm7, xmm5 + psadbw xmm6, color1 + psadbw xmm7, color1 + packssdw xmm6, xmm7 + packssdw xmm1, xmm6 // d1 + movdqa xmm6, xmm4 + movdqa xmm7, xmm5 + psadbw xmm6, color2 + psadbw xmm7, color2 + packssdw xmm6, xmm7 + packssdw xmm2, xmm6 // d2 + psadbw xmm4, color3 + psadbw xmm5, color3 + packssdw xmm4, xmm5 + packssdw xmm3, xmm4 // d3 + + movdqa xmm7, result + pslld xmm7, 16 + + movdqa xmm4, xmm0 + movdqa xmm5, xmm1 + pcmpgtw xmm0, xmm3 // b0 + pcmpgtw xmm1, xmm2 // b1 + pcmpgtw xmm4, xmm2 // b2 + pcmpgtw xmm5, xmm3 // b3 + pcmpgtw xmm2, xmm3 // b4 + pand xmm4, xmm1 // x0 + pand xmm5, xmm0 // x1 + pand xmm2, xmm0 // x2 + por xmm4, xmm5 + pand xmm2, SIMD_SSE2_word_1 + pand xmm4, SIMD_SSE2_word_2 + por xmm2, xmm4 + + pshufd xmm5, xmm2, R_SHUFFLE_D( 2, 3, 0, 1 ) + punpcklwd xmm2, SIMD_SSE2_word_0 + punpcklwd xmm5, 
SIMD_SSE2_word_0 + pslld xmm5, 8 + por xmm7, xmm5 + por xmm7, xmm2 + movdqa result, xmm7 + + sub eax, 32 + jge loop1 + + mov esi, outPtr + pshufd xmm4, xmm7, R_SHUFFLE_D( 1, 2, 3, 0 ) + pshufd xmm5, xmm7, R_SHUFFLE_D( 2, 3, 0, 1 ) + pshufd xmm6, xmm7, R_SHUFFLE_D( 3, 0, 1, 2 ) + pslld xmm4, 2 + pslld xmm5, 4 + pslld xmm6, 6 + por xmm7, xmm4 + por xmm7, xmm5 + por xmm7, xmm6 + movd dword ptr [esi], xmm7 + } + + outData += 4; +#elif defined ( ID_WIN_X86_SSE2_INTRIN ) __m128c zero = SIMD_SSE2_zero; __m128c result = SIMD_SSE2_zero; __m128c color0, color1, color2, color3; @@ -359,6 +589,9 @@ void idDxtEncoder::EmitColorIndices_SSE2( const byte * colorBlock, const byte * unsigned int out = _mm_cvtsi128_si32( temp7 ); EmitUInt( out ); +#else + assert( false ); +#endif } /* @@ -372,6 +605,162 @@ return: 4 byte color index block ======================== */ void idDxtEncoder::EmitColorAlphaIndices_SSE2( const byte *colorBlock, const byte *minColor_, const byte *maxColor_ ) { +#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) ) + ALIGN16( byte color0[16] ); + ALIGN16( byte color1[16] ); + ALIGN16( byte color2[16] ); + ALIGN16( byte color3[16] ); + ALIGN16( byte result[16] ); + byte *outPtr = outData; + + __asm { + mov esi, maxColor_ + mov edi, minColor_ + pxor xmm7, xmm7 + movdqa result, xmm7 + + movd xmm0, dword ptr [esi] + pand xmm0, SIMD_SSE2_byte_colorMask + punpcklbw xmm0, xmm7 + pshuflw xmm4, xmm0, R_SHUFFLE_D( 0, 3, 2, 3 ) + pshuflw xmm5, xmm0, R_SHUFFLE_D( 3, 1, 3, 3 ) + psrlw xmm4, 5 + psrlw xmm5, 6 + por xmm0, xmm4 + por xmm0, xmm5 + + movd xmm1, dword ptr [edi] + pand xmm1, SIMD_SSE2_byte_colorMask + punpcklbw xmm1, xmm7 + pshuflw xmm4, xmm1, R_SHUFFLE_D( 0, 3, 2, 3 ) + pshuflw xmm5, xmm1, R_SHUFFLE_D( 3, 1, 3, 3 ) + psrlw xmm4, 5 + psrlw xmm5, 6 + por xmm1, xmm4 + por xmm1, xmm5 + + movdqa xmm2, xmm0 + packuswb xmm2, xmm7 + pshufd xmm2, xmm2, R_SHUFFLE_D( 0, 1, 0, 1 ) + movdqa color0, xmm2 + + movdqa xmm6, xmm0 + paddw xmm6, xmm1 + psrlw xmm6, 1 + 
packuswb xmm6, xmm7 + pshufd xmm6, xmm6, R_SHUFFLE_D( 0, 1, 0, 1 ) + movdqa color2, xmm6 + + movdqa xmm3, xmm1 + packuswb xmm3, xmm7 + pshufd xmm3, xmm3, R_SHUFFLE_D( 0, 1, 0, 1 ) + movdqa color1, xmm3 + + movdqa color3, xmm7 + + mov eax, 32 + mov esi, colorBlock + + loop1: // iterates 2 times + movq xmm3, qword ptr [esi+eax+0] + pshufd xmm3, xmm3, R_SHUFFLE_D( 0, 2, 1, 3 ) + movq xmm5, qword ptr [esi+eax+8] + pshufd xmm5, xmm5, R_SHUFFLE_D( 0, 2, 1, 3 ) + + movdqa xmm0, xmm3 + movdqa xmm6, xmm5 + psadbw xmm0, color0 + psadbw xmm6, color0 + packssdw xmm0, xmm6 + movdqa xmm1, xmm3 + movdqa xmm6, xmm5 + psadbw xmm1, color1 + psadbw xmm6, color1 + packssdw xmm1, xmm6 + movdqa xmm2, xmm3 + movdqa xmm6, xmm5 + psadbw xmm2, color2 + psadbw xmm6, color2 + packssdw xmm2, xmm6 + + shufps xmm3, xmm5, R_SHUFFLE_D( 0, 2, 0, 2 ) + psrld xmm3, 24 + packssdw xmm3, xmm3 + + movq xmm4, qword ptr [esi+eax+16] + pshufd xmm4, xmm4, R_SHUFFLE_D( 0, 2, 1, 3 ) + movq xmm5, qword ptr [esi+eax+24] + pshufd xmm5, xmm5, R_SHUFFLE_D( 0, 2, 1, 3 ) + + movdqa xmm6, xmm4 + movdqa xmm7, xmm5 + psadbw xmm6, color0 + psadbw xmm7, color0 + packssdw xmm6, xmm7 + packssdw xmm0, xmm6 // d1 + movdqa xmm6, xmm4 + movdqa xmm7, xmm5 + psadbw xmm6, color1 + psadbw xmm7, color1 + packssdw xmm6, xmm7 + packssdw xmm1, xmm6 // d1 + movdqa xmm6, xmm4 + movdqa xmm7, xmm5 + psadbw xmm6, color2 + psadbw xmm7, color2 + packssdw xmm6, xmm7 + packssdw xmm2, xmm6 // d2 + + shufps xmm4, xmm5, R_SHUFFLE_D( 0, 2, 0, 2 ) + psrld xmm4, 24 + packssdw xmm4, xmm4 + + punpcklqdq xmm3, xmm4 // c3 + + movdqa xmm7, result + pslld xmm7, 16 + + movdqa xmm4, xmm2 + pcmpgtw xmm2, xmm0 // b0 + pcmpgtw xmm4, xmm1 // b1 + pcmpgtw xmm1, xmm0 // b2 + pmaxsw xmm3, SIMD_SSE2_word_127 // b3 + pcmpeqw xmm3, SIMD_SSE2_word_127 + + pand xmm2, xmm4 + por xmm2, xmm3 // b0 & b1 | b3 + pxor xmm1, xmm4 + por xmm1, xmm3 // b2 ^ b1 | b3 + pand xmm2, SIMD_SSE2_word_2 + pand xmm1, SIMD_SSE2_word_1 + por xmm2, xmm1 + + pshufd xmm5, xmm2, R_SHUFFLE_D( 2, 
3, 0, 1 ) + punpcklwd xmm2, SIMD_SSE2_word_0 + punpcklwd xmm5, SIMD_SSE2_word_0 + pslld xmm5, 8 + por xmm7, xmm5 + por xmm7, xmm2 + movdqa result, xmm7 + + sub eax, 32 + jge loop1 + + mov esi, outPtr + pshufd xmm4, xmm7, R_SHUFFLE_D( 1, 2, 3, 0 ) + pshufd xmm5, xmm7, R_SHUFFLE_D( 2, 3, 0, 1 ) + pshufd xmm6, xmm7, R_SHUFFLE_D( 3, 0, 1, 2 ) + pslld xmm4, 2 + pslld xmm5, 4 + pslld xmm6, 6 + por xmm7, xmm4 + por xmm7, xmm5 + por xmm7, xmm6 + movd dword ptr [esi], xmm7 + } + + outData += 4; +#elif defined ( ID_WIN_X86_SSE2_INTRIN ) __m128c zero = SIMD_SSE2_zero; __m128c result = SIMD_SSE2_zero; __m128c color0, color1, color2; @@ -508,6 +897,9 @@ void idDxtEncoder::EmitColorAlphaIndices_SSE2( const byte *colorBlock, const byt unsigned int out = _mm_cvtsi128_si32( temp7 ); EmitUInt( out ); +#else + assert( false ); +#endif } /* @@ -521,6 +913,147 @@ return: 4 byte color index block ======================== */ void idDxtEncoder::EmitCoCgIndices_SSE2( const byte *colorBlock, const byte *minColor_, const byte *maxColor_ ) { +#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) ) + ALIGN16( byte color0[16] ); + ALIGN16( byte color1[16] ); + ALIGN16( byte color2[16] ); + ALIGN16( byte color3[16] ); + ALIGN16( byte result[16] ); + byte *outPtr = outData; + + __asm { + mov esi, maxColor_ + mov edi, minColor_ + pxor xmm7, xmm7 + movdqa result, xmm7 + + movd xmm0, dword ptr [esi] + pand xmm0, SIMD_SSE2_byte_colorMask2 + pshufd xmm0, xmm0, R_SHUFFLE_D( 0, 1, 0, 1 ) + movdqa color0, xmm0 + + movd xmm1, dword ptr [edi] + pand xmm1, SIMD_SSE2_byte_colorMask2 + pshufd xmm1, xmm1, R_SHUFFLE_D( 0, 1, 0, 1 ) + movdqa color1, xmm1 + + punpcklbw xmm0, xmm7 + punpcklbw xmm1, xmm7 + + movdqa xmm6, xmm1 + paddw xmm1, xmm0 + paddw xmm0, xmm1 + pmulhw xmm0, SIMD_SSE2_word_div_by_3 // * ( ( 1 << 16 ) / 3 + 1 ) ) >> 16 + packuswb xmm0, xmm7 + pshufd xmm0, xmm0, R_SHUFFLE_D( 0, 1, 0, 1 ) + movdqa color2, xmm0 + + paddw xmm1, xmm6 + pmulhw xmm1, SIMD_SSE2_word_div_by_3 // * ( ( 1 << 16 ) / 3 
+ 1 ) ) >> 16 + packuswb xmm1, xmm7 + pshufd xmm1, xmm1, R_SHUFFLE_D( 0, 1, 0, 1 ) + movdqa color3, xmm1 + + mov eax, 32 + mov esi, colorBlock + + loop1: // iterates 2 times + movq xmm3, qword ptr [esi+eax+0] + pshufd xmm3, xmm3, R_SHUFFLE_D( 0, 2, 1, 3 ) // punpckldq xmm4, SIMD_SSE2_dword_0 + movq xmm5, qword ptr [esi+eax+8] + pshufd xmm5, xmm5, R_SHUFFLE_D( 0, 2, 1, 3 ) // punpckldq xmm5, SIMD_SSE2_dword_0 + + movdqa xmm0, xmm3 + movdqa xmm6, xmm5 + psadbw xmm0, color0 + psadbw xmm6, color0 + packssdw xmm0, xmm6 + movdqa xmm1, xmm3 + movdqa xmm6, xmm5 + psadbw xmm1, color1 + psadbw xmm6, color1 + packssdw xmm1, xmm6 + movdqa xmm2, xmm3 + movdqa xmm6, xmm5 + psadbw xmm2, color2 + psadbw xmm6, color2 + packssdw xmm2, xmm6 + psadbw xmm3, color3 + psadbw xmm5, color3 + packssdw xmm3, xmm5 + + movq xmm4, qword ptr [esi+eax+16] + pshufd xmm4, xmm4, R_SHUFFLE_D( 0, 2, 1, 3 ) + movq xmm5, qword ptr [esi+eax+24] + pshufd xmm5, xmm5, R_SHUFFLE_D( 0, 2, 1, 3 ) + + movdqa xmm6, xmm4 + movdqa xmm7, xmm5 + psadbw xmm6, color0 + psadbw xmm7, color0 + packssdw xmm6, xmm7 + packssdw xmm0, xmm6 // d1 + movdqa xmm6, xmm4 + movdqa xmm7, xmm5 + psadbw xmm6, color1 + psadbw xmm7, color1 + packssdw xmm6, xmm7 + packssdw xmm1, xmm6 // d1 + movdqa xmm6, xmm4 + movdqa xmm7, xmm5 + psadbw xmm6, color2 + psadbw xmm7, color2 + packssdw xmm6, xmm7 + packssdw xmm2, xmm6 // d2 + psadbw xmm4, color3 + psadbw xmm5, color3 + packssdw xmm4, xmm5 + packssdw xmm3, xmm4 // d3 + + movdqa xmm7, result + pslld xmm7, 16 + + movdqa xmm4, xmm0 + movdqa xmm5, xmm1 + pcmpgtw xmm0, xmm3 // b0 + pcmpgtw xmm1, xmm2 // b1 + pcmpgtw xmm4, xmm2 // b2 + pcmpgtw xmm5, xmm3 // b3 + pcmpgtw xmm2, xmm3 // b4 + pand xmm4, xmm1 // x0 + pand xmm5, xmm0 // x1 + pand xmm2, xmm0 // x2 + por xmm4, xmm5 + pand xmm2, SIMD_SSE2_word_1 + pand xmm4, SIMD_SSE2_word_2 + por xmm2, xmm4 + + pshufd xmm5, xmm2, R_SHUFFLE_D( 2, 3, 0, 1 ) + punpcklwd xmm2, SIMD_SSE2_word_0 + punpcklwd xmm5, SIMD_SSE2_word_0 + pslld xmm5, 8 + por xmm7, xmm5 
+ por xmm7, xmm2 + movdqa result, xmm7 + + sub eax, 32 + jge loop1 + + mov esi, outPtr + pshufd xmm4, xmm7, R_SHUFFLE_D( 1, 2, 3, 0 ) + pshufd xmm5, xmm7, R_SHUFFLE_D( 2, 3, 0, 1 ) + pshufd xmm6, xmm7, R_SHUFFLE_D( 3, 0, 1, 2 ) + pslld xmm4, 2 + pslld xmm5, 4 + pslld xmm6, 6 + por xmm7, xmm4 + por xmm7, xmm5 + por xmm7, xmm6 + movd dword ptr [esi], xmm7 + } + + outData += 4; +#elif defined ( ID_WIN_X86_SSE2_INTRIN ) __m128c zero = SIMD_SSE2_zero; __m128c result = SIMD_SSE2_zero; __m128c color0, color1, color2, color3; @@ -640,6 +1173,9 @@ void idDxtEncoder::EmitCoCgIndices_SSE2( const byte *colorBlock, const byte *min unsigned int out = _mm_cvtsi128_si32( temp7 ); EmitUInt( out ); +#else + assert( false ); +#endif } /* @@ -652,6 +1188,144 @@ paramO: maxAlpha - Max alpha found ======================== */ void idDxtEncoder::EmitAlphaIndices_SSE2( const byte *block, const int minAlpha_, const int maxAlpha_ ) { +#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) ) + assert( maxAlpha_ >= minAlpha_ ); + + byte *outPtr = outData; + + __asm { + mov esi, block + movdqa xmm0, xmmword ptr [esi+ 0] + movdqa xmm5, xmmword ptr [esi+16] + movdqa xmm6, xmmword ptr [esi+32] + movdqa xmm4, xmmword ptr [esi+48] + + psrld xmm0, 24 + psrld xmm5, 24 + psrld xmm6, 24 + psrld xmm4, 24 + + packuswb xmm0, xmm5 + packuswb xmm6, xmm4 + + //--------------------- + + // ab0 = ( 7 * maxAlpha + 7 * minAlpha + ALPHA_RANGE ) / 14 + // ab3 = ( 9 * maxAlpha + 5 * minAlpha + ALPHA_RANGE ) / 14 + // ab2 = ( 11 * maxAlpha + 3 * minAlpha + ALPHA_RANGE ) / 14 + // ab1 = ( 13 * maxAlpha + 1 * minAlpha + ALPHA_RANGE ) / 14 + + // ab4 = ( 7 * maxAlpha + 7 * minAlpha + ALPHA_RANGE ) / 14 + // ab5 = ( 5 * maxAlpha + 9 * minAlpha + ALPHA_RANGE ) / 14 + // ab6 = ( 3 * maxAlpha + 11 * minAlpha + ALPHA_RANGE ) / 14 + // ab7 = ( 1 * maxAlpha + 13 * minAlpha + ALPHA_RANGE ) / 14 + + movd xmm5, maxAlpha_ + pshuflw xmm5, xmm5, R_SHUFFLE_D( 0, 0, 0, 0 ) + pshufd xmm5, xmm5, R_SHUFFLE_D( 0, 0, 0, 0 ) + movdqa 
xmm7, xmm5 + + movd xmm2, minAlpha_ + pshuflw xmm2, xmm2, R_SHUFFLE_D( 0, 0, 0, 0 ) + pshufd xmm2, xmm2, R_SHUFFLE_D( 0, 0, 0, 0 ) + movdqa xmm3, xmm2 + + pmullw xmm5, SIMD_SSE2_word_scale_7_9_11_13 + pmullw xmm7, SIMD_SSE2_word_scale_7_5_3_1 + pmullw xmm2, SIMD_SSE2_word_scale_7_5_3_1 + pmullw xmm3, SIMD_SSE2_word_scale_7_9_11_13 + + paddw xmm5, xmm2 + paddw xmm7, xmm3 + + paddw xmm5, SIMD_SSE2_word_7 + paddw xmm7, SIMD_SSE2_word_7 + + pmulhw xmm5, SIMD_SSE2_word_div_by_14 // * ( ( 1 << 16 ) / 14 + 1 ) ) >> 16 + pmulhw xmm7, SIMD_SSE2_word_div_by_14 // * ( ( 1 << 16 ) / 14 + 1 ) ) >> 16 + + pshufd xmm1, xmm5, R_SHUFFLE_D( 3, 3, 3, 3 ) + pshufd xmm2, xmm5, R_SHUFFLE_D( 2, 2, 2, 2 ) + pshufd xmm3, xmm5, R_SHUFFLE_D( 1, 1, 1, 1 ) + packuswb xmm1, xmm1 // ab1 + packuswb xmm2, xmm2 // ab2 + packuswb xmm3, xmm3 // ab3 + + packuswb xmm0, xmm6 // alpha block + + pshufd xmm4, xmm7, R_SHUFFLE_D( 0, 0, 0, 0 ) + pshufd xmm5, xmm7, R_SHUFFLE_D( 1, 1, 1, 1 ) + pshufd xmm6, xmm7, R_SHUFFLE_D( 2, 2, 2, 2 ) + pshufd xmm7, xmm7, R_SHUFFLE_D( 3, 3, 3, 3 ) + packuswb xmm4, xmm4 // ab4 + packuswb xmm5, xmm5 // ab5 + packuswb xmm6, xmm6 // ab6 + packuswb xmm7, xmm7 // ab7 + + pmaxub xmm1, xmm0 + pmaxub xmm2, xmm0 + pmaxub xmm3, xmm0 + pcmpeqb xmm1, xmm0 + pcmpeqb xmm2, xmm0 + pcmpeqb xmm3, xmm0 + pmaxub xmm4, xmm0 + pmaxub xmm5, xmm0 + pmaxub xmm6, xmm0 + pmaxub xmm7, xmm0 + pcmpeqb xmm4, xmm0 + pcmpeqb xmm5, xmm0 + pcmpeqb xmm6, xmm0 + pcmpeqb xmm7, xmm0 + movdqa xmm0, SIMD_SSE2_byte_8 + paddsb xmm0, xmm1 + paddsb xmm2, xmm3 + paddsb xmm4, xmm5 + paddsb xmm6, xmm7 + paddsb xmm0, xmm2 + paddsb xmm4, xmm6 + paddsb xmm0, xmm4 + pand xmm0, SIMD_SSE2_byte_7 + movdqa xmm1, SIMD_SSE2_byte_2 + pcmpgtb xmm1, xmm0 + pand xmm1, SIMD_SSE2_byte_1 + pxor xmm0, xmm1 + movdqa xmm1, xmm0 + movdqa xmm2, xmm0 + movdqa xmm3, xmm0 + movdqa xmm4, xmm0 + movdqa xmm5, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + psrlq xmm1, 8- 3 + psrlq xmm2, 16- 6 + psrlq xmm3, 24- 9 + psrlq xmm4, 32-12 + psrlq xmm5, 
40-15 + psrlq xmm6, 48-18 + psrlq xmm7, 56-21 + pand xmm0, SIMD_SSE2_dword_alpha_bit_mask0 + pand xmm1, SIMD_SSE2_dword_alpha_bit_mask1 + pand xmm2, SIMD_SSE2_dword_alpha_bit_mask2 + pand xmm3, SIMD_SSE2_dword_alpha_bit_mask3 + pand xmm4, SIMD_SSE2_dword_alpha_bit_mask4 + pand xmm5, SIMD_SSE2_dword_alpha_bit_mask5 + pand xmm6, SIMD_SSE2_dword_alpha_bit_mask6 + pand xmm7, SIMD_SSE2_dword_alpha_bit_mask7 + por xmm0, xmm1 + por xmm2, xmm3 + por xmm4, xmm5 + por xmm6, xmm7 + por xmm0, xmm2 + por xmm4, xmm6 + por xmm0, xmm4 + mov esi, outPtr + movd [esi+0], xmm0 + pshufd xmm1, xmm0, R_SHUFFLE_D( 2, 3, 0, 1 ) + movd [esi+3], xmm1 + } + + outData += 6; +#elif defined ( ID_WIN_X86_SSE2_INTRIN ) __m128i block0 = *((__m128i *)(&block[ 0])); __m128i block1 = *((__m128i *)(&block[16])); __m128i block2 = *((__m128i *)(&block[32])); @@ -777,6 +1451,9 @@ void idDxtEncoder::EmitAlphaIndices_SSE2( const byte *block, const int minAlpha_ out = _mm_cvtsi128_si32( temp1 ); EmitUInt( out ); outData--; +#else + assert( false ); +#endif } /* @@ -785,6 +1462,151 @@ idDxtEncoder::EmitAlphaIndices_SSE2 ======================== */ void idDxtEncoder::EmitAlphaIndices_SSE2( const byte *block, const int channelBitOffset, const int minAlpha_, const int maxAlpha_ ) { +#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) ) + assert( maxAlpha_ >= minAlpha_ ); + + byte *outPtr = outData; + + __asm { + movd xmm7, channelBitOffset + + mov esi, block + movdqa xmm0, xmmword ptr [esi+ 0] + movdqa xmm5, xmmword ptr [esi+16] + movdqa xmm6, xmmword ptr [esi+32] + movdqa xmm4, xmmword ptr [esi+48] + + psrld xmm0, xmm7 + psrld xmm5, xmm7 + psrld xmm6, xmm7 + psrld xmm4, xmm7 + + pand xmm0, SIMD_SSE2_dword_byte_mask + pand xmm5, SIMD_SSE2_dword_byte_mask + pand xmm6, SIMD_SSE2_dword_byte_mask + pand xmm4, SIMD_SSE2_dword_byte_mask + + packuswb xmm0, xmm5 + packuswb xmm6, xmm4 + + //--------------------- + + // ab0 = ( 7 * maxAlpha + 7 * minAlpha + ALPHA_RANGE ) / 14 + // ab3 = ( 9 * maxAlpha + 5 * 
minAlpha + ALPHA_RANGE ) / 14 + // ab2 = ( 11 * maxAlpha + 3 * minAlpha + ALPHA_RANGE ) / 14 + // ab1 = ( 13 * maxAlpha + 1 * minAlpha + ALPHA_RANGE ) / 14 + + // ab4 = ( 7 * maxAlpha + 7 * minAlpha + ALPHA_RANGE ) / 14 + // ab5 = ( 5 * maxAlpha + 9 * minAlpha + ALPHA_RANGE ) / 14 + // ab6 = ( 3 * maxAlpha + 11 * minAlpha + ALPHA_RANGE ) / 14 + // ab7 = ( 1 * maxAlpha + 13 * minAlpha + ALPHA_RANGE ) / 14 + + movd xmm5, maxAlpha_ + pshuflw xmm5, xmm5, R_SHUFFLE_D( 0, 0, 0, 0 ) + pshufd xmm5, xmm5, R_SHUFFLE_D( 0, 0, 0, 0 ) + movdqa xmm7, xmm5 + + movd xmm2, minAlpha_ + pshuflw xmm2, xmm2, R_SHUFFLE_D( 0, 0, 0, 0 ) + pshufd xmm2, xmm2, R_SHUFFLE_D( 0, 0, 0, 0 ) + movdqa xmm3, xmm2 + + pmullw xmm5, SIMD_SSE2_word_scale_7_9_11_13 + pmullw xmm7, SIMD_SSE2_word_scale_7_5_3_1 + pmullw xmm2, SIMD_SSE2_word_scale_7_5_3_1 + pmullw xmm3, SIMD_SSE2_word_scale_7_9_11_13 + + paddw xmm5, xmm2 + paddw xmm7, xmm3 + + paddw xmm5, SIMD_SSE2_word_7 + paddw xmm7, SIMD_SSE2_word_7 + + pmulhw xmm5, SIMD_SSE2_word_div_by_14 // * ( ( 1 << 16 ) / 14 + 1 ) ) >> 16 + pmulhw xmm7, SIMD_SSE2_word_div_by_14 // * ( ( 1 << 16 ) / 14 + 1 ) ) >> 16 + + pshufd xmm1, xmm5, R_SHUFFLE_D( 3, 3, 3, 3 ) + pshufd xmm2, xmm5, R_SHUFFLE_D( 2, 2, 2, 2 ) + pshufd xmm3, xmm5, R_SHUFFLE_D( 1, 1, 1, 1 ) + packuswb xmm1, xmm1 // ab1 + packuswb xmm2, xmm2 // ab2 + packuswb xmm3, xmm3 // ab3 + + packuswb xmm0, xmm6 // alpha block + + pshufd xmm4, xmm7, R_SHUFFLE_D( 0, 0, 0, 0 ) + pshufd xmm5, xmm7, R_SHUFFLE_D( 1, 1, 1, 1 ) + pshufd xmm6, xmm7, R_SHUFFLE_D( 2, 2, 2, 2 ) + pshufd xmm7, xmm7, R_SHUFFLE_D( 3, 3, 3, 3 ) + packuswb xmm4, xmm4 // ab4 + packuswb xmm5, xmm5 // ab5 + packuswb xmm6, xmm6 // ab6 + packuswb xmm7, xmm7 // ab7 + + pmaxub xmm1, xmm0 + pmaxub xmm2, xmm0 + pmaxub xmm3, xmm0 + pcmpeqb xmm1, xmm0 + pcmpeqb xmm2, xmm0 + pcmpeqb xmm3, xmm0 + pmaxub xmm4, xmm0 + pmaxub xmm5, xmm0 + pmaxub xmm6, xmm0 + pmaxub xmm7, xmm0 + pcmpeqb xmm4, xmm0 + pcmpeqb xmm5, xmm0 + pcmpeqb xmm6, xmm0 + pcmpeqb xmm7, xmm0 + 
movdqa xmm0, SIMD_SSE2_byte_8 + paddsb xmm0, xmm1 + paddsb xmm2, xmm3 + paddsb xmm4, xmm5 + paddsb xmm6, xmm7 + paddsb xmm0, xmm2 + paddsb xmm4, xmm6 + paddsb xmm0, xmm4 + pand xmm0, SIMD_SSE2_byte_7 + movdqa xmm1, SIMD_SSE2_byte_2 + pcmpgtb xmm1, xmm0 + pand xmm1, SIMD_SSE2_byte_1 + pxor xmm0, xmm1 + movdqa xmm1, xmm0 + movdqa xmm2, xmm0 + movdqa xmm3, xmm0 + movdqa xmm4, xmm0 + movdqa xmm5, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + psrlq xmm1, 8- 3 + psrlq xmm2, 16- 6 + psrlq xmm3, 24- 9 + psrlq xmm4, 32-12 + psrlq xmm5, 40-15 + psrlq xmm6, 48-18 + psrlq xmm7, 56-21 + pand xmm0, SIMD_SSE2_dword_alpha_bit_mask0 + pand xmm1, SIMD_SSE2_dword_alpha_bit_mask1 + pand xmm2, SIMD_SSE2_dword_alpha_bit_mask2 + pand xmm3, SIMD_SSE2_dword_alpha_bit_mask3 + pand xmm4, SIMD_SSE2_dword_alpha_bit_mask4 + pand xmm5, SIMD_SSE2_dword_alpha_bit_mask5 + pand xmm6, SIMD_SSE2_dword_alpha_bit_mask6 + pand xmm7, SIMD_SSE2_dword_alpha_bit_mask7 + por xmm0, xmm1 + por xmm2, xmm3 + por xmm4, xmm5 + por xmm6, xmm7 + por xmm0, xmm2 + por xmm4, xmm6 + por xmm0, xmm4 + mov esi, outPtr + movd [esi+0], xmm0 + pshufd xmm1, xmm0, R_SHUFFLE_D( 2, 3, 0, 1 ) + movd [esi+3], xmm1 + } + + outData += 6; +#elif defined ( ID_WIN_X86_SSE2_INTRIN ) __m128i block0 = *((__m128i *)(&block[ 0])); __m128i block1 = *((__m128i *)(&block[16])); __m128i block2 = *((__m128i *)(&block[32])); @@ -917,6 +1739,9 @@ void idDxtEncoder::EmitAlphaIndices_SSE2( const byte *block, const int channelBi out = _mm_cvtsi128_si32( temp1 ); EmitUInt( out ); outData--; +#else + assert( false ); +#endif } /* @@ -1102,6 +1927,108 @@ idDxtEncoder::ScaleYCoCg_SSE2 ======================== */ ID_INLINE void idDxtEncoder::ScaleYCoCg_SSE2( byte *colorBlock, byte *minColor, byte *maxColor ) const { +#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) ) + __asm { + mov esi, colorBlock + mov edx, minColor + mov ecx, maxColor + + movd xmm0, dword ptr [edx] + movd xmm1, dword ptr [ecx] + + punpcklbw xmm0, SIMD_SSE2_byte_0 + punpcklbw 
xmm1, SIMD_SSE2_byte_0 + + movdqa xmm6, SIMD_SSE2_word_center_128 + movdqa xmm7, SIMD_SSE2_word_center_128 + + psubw xmm6, xmm0 + psubw xmm7, xmm1 + + psubw xmm0, SIMD_SSE2_word_center_128 + psubw xmm1, SIMD_SSE2_word_center_128 + + pmaxsw xmm6, xmm0 + pmaxsw xmm7, xmm1 + + pmaxsw xmm6, xmm7 + pshuflw xmm7, xmm6, R_SHUFFLE_D( 1, 0, 1, 0 ) + pmaxsw xmm6, xmm7 + pshufd xmm6, xmm6, R_SHUFFLE_D( 0, 0, 0, 0 ) + + movdqa xmm7, xmm6 + pcmpgtw xmm6, SIMD_SSE2_word_63 // mask0 + pcmpgtw xmm7, SIMD_SSE2_word_31 // mask1 + + pandn xmm7, SIMD_SSE2_byte_2 + por xmm7, SIMD_SSE2_byte_1 + pandn xmm6, xmm7 + movdqa xmm3, xmm6 + movdqa xmm7, xmm6 + pxor xmm7, SIMD_SSE2_byte_not + por xmm7, SIMD_SSE2_byte_scale_mask0 // 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0xFF, 0x00, 0x00 + paddw xmm6, SIMD_SSE2_byte_1 + pand xmm6, SIMD_SSE2_byte_scale_mask1 // 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0x00, 0xFF + por xmm6, SIMD_SSE2_byte_scale_mask2 // 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00 + + movd xmm4, dword ptr [edx] + movd xmm5, dword ptr [ecx] + + pand xmm4, SIMD_SSE2_byte_scale_mask3 // 0x00, 0x00, 0x00, 0x00, 0xFF, 0x00, 0xFF, 0xFF + pand xmm5, SIMD_SSE2_byte_scale_mask3 + + pslld xmm3, 3 + pand xmm3, SIMD_SSE2_byte_scale_mask4 // 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00 + + por xmm4, xmm3 + por xmm5, xmm3 + + paddb xmm4, SIMD_SSE2_byte_minus_128_0 + paddb xmm5, SIMD_SSE2_byte_minus_128_0 + + pmullw xmm4, xmm6 + pmullw xmm5, xmm6 + + pand xmm4, xmm7 + pand xmm5, xmm7 + + psubb xmm4, SIMD_SSE2_byte_minus_128_0 + psubb xmm5, SIMD_SSE2_byte_minus_128_0 + + movd dword ptr [edx], xmm4 + movd dword ptr [ecx], xmm5 + + movdqa xmm0, xmmword ptr [esi+ 0*4] + movdqa xmm1, xmmword ptr [esi+ 4*4] + movdqa xmm2, xmmword ptr [esi+ 8*4] + movdqa xmm3, xmmword ptr [esi+12*4] + + paddb xmm0, SIMD_SSE2_byte_minus_128_0 + paddb xmm1, SIMD_SSE2_byte_minus_128_0 + paddb xmm2, SIMD_SSE2_byte_minus_128_0 + paddb xmm3, SIMD_SSE2_byte_minus_128_0 + + pmullw xmm0, xmm6 + pmullw xmm1, xmm6 + pmullw xmm2, xmm6 + 
pmullw xmm3, xmm6 + + pand xmm0, xmm7 + pand xmm1, xmm7 + pand xmm2, xmm7 + pand xmm3, xmm7 + + psubb xmm0, SIMD_SSE2_byte_minus_128_0 + psubb xmm1, SIMD_SSE2_byte_minus_128_0 + psubb xmm2, SIMD_SSE2_byte_minus_128_0 + psubb xmm3, SIMD_SSE2_byte_minus_128_0 + + movdqa xmmword ptr [esi+ 0*4], xmm0 + movdqa xmmword ptr [esi+ 4*4], xmm1 + movdqa xmmword ptr [esi+ 8*4], xmm2 + movdqa xmmword ptr [esi+12*4], xmm3 + } +#elif defined ( ID_WIN_X86_SSE2_INTRIN ) __m128i block0 = *((__m128i *)(&colorBlock[ 0])); __m128i block1 = *((__m128i *)(&colorBlock[16])); __m128i block2 = *((__m128i *)(&colorBlock[32])); @@ -1189,6 +2116,9 @@ ID_INLINE void idDxtEncoder::ScaleYCoCg_SSE2( byte *colorBlock, byte *minColor, *((__m128i *)(&colorBlock[16])) = _mm_sub_epi8( temp1, (const __m128i &)SIMD_SSE2_byte_minus_128_0 ); *((__m128i *)(&colorBlock[32])) = _mm_sub_epi8( temp2, (const __m128i &)SIMD_SSE2_byte_minus_128_0 ); *((__m128i *)(&colorBlock[48])) = _mm_sub_epi8( temp3, (const __m128i &)SIMD_SSE2_byte_minus_128_0 ); +#else + assert( false ); +#endif } /* @@ -1197,6 +2127,40 @@ idDxtEncoder::InsetYCoCgBBox_SSE2 ======================== */ ID_INLINE void idDxtEncoder::InsetYCoCgBBox_SSE2( byte *minColor, byte *maxColor ) const { +#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) ) + __asm { + mov esi, minColor + mov edi, maxColor + movd xmm0, dword ptr [esi] + movd xmm1, dword ptr [edi] + punpcklbw xmm0, SIMD_SSE2_byte_0 + punpcklbw xmm1, SIMD_SSE2_byte_0 + movdqa xmm2, xmm1 + psubw xmm2, xmm0 + psubw xmm2, SIMD_SSE2_word_insetYCoCgRound + pand xmm2, SIMD_SSE2_word_insetYCoCgMask + pmullw xmm0, SIMD_SSE2_word_insetYCoCgShiftUp + pmullw xmm1, SIMD_SSE2_word_insetYCoCgShiftUp + paddw xmm0, xmm2 + psubw xmm1, xmm2 + pmulhw xmm0, SIMD_SSE2_word_insetYCoCgShiftDown + pmulhw xmm1, SIMD_SSE2_word_insetYCoCgShiftDown + pmaxsw xmm0, SIMD_SSE2_word_0 + pmaxsw xmm1, SIMD_SSE2_word_0 + pand xmm0, SIMD_SSE2_word_insetYCoCgQuantMask + pand xmm1, SIMD_SSE2_word_insetYCoCgQuantMask + 
movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + pmulhw xmm2, SIMD_SSE2_word_insetYCoCgRep + pmulhw xmm3, SIMD_SSE2_word_insetYCoCgRep + por xmm0, xmm2 + por xmm1, xmm3 + packuswb xmm0, xmm0 + packuswb xmm1, xmm1 + movd dword ptr [esi], xmm0 + movd dword ptr [edi], xmm1 + } +#elif defined ( ID_WIN_X86_SSE2_INTRIN ) __m128c temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; temp0 = _mm_cvtsi32_si128( *(int *)minColor ); @@ -1227,6 +2191,9 @@ ID_INLINE void idDxtEncoder::InsetYCoCgBBox_SSE2( byte *minColor, byte *maxColor *(int *)minColor = _mm_cvtsi128_si32( temp0 ); *(int *)maxColor = _mm_cvtsi128_si32( temp1 ); +#else + assert( false ); +#endif } /* @@ -1240,6 +2207,80 @@ return: diagonal to use ======================== */ ID_INLINE void idDxtEncoder::SelectYCoCgDiagonal_SSE2( const byte *colorBlock, byte *minColor, byte *maxColor ) const { +#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) ) + __asm { + mov esi, colorBlock + mov edx, minColor + mov ecx, maxColor + + movdqa xmm0, xmmword ptr [esi+ 0] + movdqa xmm1, xmmword ptr [esi+16] + movdqa xmm2, xmmword ptr [esi+32] + movdqa xmm3, xmmword ptr [esi+48] + + pand xmm0, SIMD_SSE2_dword_word_mask + pand xmm1, SIMD_SSE2_dword_word_mask + pand xmm2, SIMD_SSE2_dword_word_mask + pand xmm3, SIMD_SSE2_dword_word_mask + + pslldq xmm1, 2 + pslldq xmm3, 2 + por xmm0, xmm1 + por xmm2, xmm3 + + movd xmm1, dword ptr [edx] // minColor + movd xmm3, dword ptr [ecx] // maxColor + + movdqa xmm6, xmm1 + movdqa xmm7, xmm3 + + pavgb xmm1, xmm3 + pshuflw xmm1, xmm1, R_SHUFFLE_D( 0, 0, 0, 0 ) + pshufd xmm1, xmm1, R_SHUFFLE_D( 0, 0, 0, 0 ) + movdqa xmm3, xmm1 + + pmaxub xmm1, xmm0 + pmaxub xmm3, xmm2 + pcmpeqb xmm1, xmm0 + pcmpeqb xmm3, xmm2 + + movdqa xmm0, xmm1 + movdqa xmm2, xmm3 + psrldq xmm0, 1 + psrldq xmm2, 1 + + pxor xmm0, xmm1 + pxor xmm2, xmm3 + pand xmm0, SIMD_SSE2_word_1 + pand xmm2, SIMD_SSE2_word_1 + + paddw xmm0, xmm2 + psadbw xmm0, SIMD_SSE2_byte_0 + pshufd xmm1, xmm0, R_SHUFFLE_D( 2, 3, 0, 1 ) + +#ifdef 
NVIDIA_7X_HARDWARE_BUG_FIX + paddw xmm1, xmm0 // side + pcmpgtw xmm1, SIMD_SSE2_word_8 // mask = -( side > 8 ) + pand xmm1, SIMD_SSE2_byte_diagonalMask + movdqa xmm0, xmm6 + pcmpeqb xmm0, xmm7 // mask &= -( minColor[0] != maxColor[0] ) + pslldq xmm0, 1 + pandn xmm0, xmm1 +#else + paddw xmm0, xmm1 // side + pcmpgtw xmm0, SIMD_SSE2_word_8 // mask = -( side > 8 ) + pand xmm0, SIMD_SSE2_byte_diagonalMask +#endif + + pxor xmm6, xmm7 + pand xmm0, xmm6 + pxor xmm7, xmm0 + pxor xmm6, xmm7 + + movd dword ptr [edx], xmm6 + movd dword ptr [ecx], xmm7 + } +#elif defined ( ID_WIN_X86_SSE2_INTRIN ) __m128i block0 = *((__m128i *)(&colorBlock[ 0])); __m128i block1 = *((__m128i *)(&colorBlock[16])); __m128i block2 = *((__m128i *)(&colorBlock[32])); @@ -1300,6 +2341,9 @@ ID_INLINE void idDxtEncoder::SelectYCoCgDiagonal_SSE2( const byte *colorBlock, b *(int *)minColor = _mm_cvtsi128_si32( temp6 ); *(int *)maxColor = _mm_cvtsi128_si32( temp7 ); +#else + assert( false ); +#endif } /* @@ -1376,6 +2420,113 @@ paramO: maxGreen - Maximal normal Y found ======================== */ void idDxtEncoder::EmitGreenIndices_SSE2( const byte *block, const int channelBitOffset, const int minGreen, const int maxGreen ) { +#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) ) + assert( maxGreen >= minGreen ); + + byte *outPtr = outData; + + __asm { + movd xmm7, channelBitOffset + + mov esi, block + movdqa xmm0, xmmword ptr [esi+ 0] + movdqa xmm5, xmmword ptr [esi+16] + movdqa xmm6, xmmword ptr [esi+32] + movdqa xmm4, xmmword ptr [esi+48] + + psrld xmm0, xmm7 + psrld xmm5, xmm7 + psrld xmm6, xmm7 + psrld xmm4, xmm7 + + pand xmm0, SIMD_SSE2_dword_byte_mask + pand xmm5, SIMD_SSE2_dword_byte_mask + pand xmm6, SIMD_SSE2_dword_byte_mask + pand xmm4, SIMD_SSE2_dword_byte_mask + + packuswb xmm0, xmm5 + packuswb xmm6, xmm4 + + //--------------------- + + movd xmm2, maxGreen + pshuflw xmm2, xmm2, R_SHUFFLE_D( 0, 0, 0, 0 ) + + movd xmm3, minGreen + pshuflw xmm3, xmm3, R_SHUFFLE_D( 0, 0, 0, 0 ) + + pmullw 
xmm2, SIMD_SSE2_word_scale_5_3_1 + pmullw xmm3, SIMD_SSE2_word_scale_1_3_5 + paddw xmm2, SIMD_SSE2_word_3 + paddw xmm3, xmm2 + pmulhw xmm3, SIMD_SSE2_word_div_by_6 + + pshuflw xmm1, xmm3, R_SHUFFLE_D( 0, 0, 0, 0 ) + pshuflw xmm2, xmm3, R_SHUFFLE_D( 1, 1, 1, 1 ) + pshuflw xmm3, xmm3, R_SHUFFLE_D( 2, 2, 2, 2 ) + + pshufd xmm1, xmm1, R_SHUFFLE_D( 0, 0, 0, 0 ) + pshufd xmm2, xmm2, R_SHUFFLE_D( 0, 0, 0, 0 ) + pshufd xmm3, xmm3, R_SHUFFLE_D( 0, 0, 0, 0 ) + + packuswb xmm1, xmm1 + packuswb xmm2, xmm2 + packuswb xmm3, xmm3 + + packuswb xmm0, xmm6 + + pmaxub xmm1, xmm0 + pmaxub xmm2, xmm0 + pmaxub xmm3, xmm0 + pcmpeqb xmm1, xmm0 + pcmpeqb xmm2, xmm0 + pcmpeqb xmm3, xmm0 + movdqa xmm0, SIMD_SSE2_byte_4 + paddsb xmm0, xmm1 + paddsb xmm2, xmm3 + paddsb xmm0, xmm2 + pand xmm0, SIMD_SSE2_byte_3 + movdqa xmm4, SIMD_SSE2_byte_2 + pcmpgtb xmm4, xmm0 + pand xmm4, SIMD_SSE2_byte_1 + pxor xmm0, xmm4 + movdqa xmm4, xmm0 + movdqa xmm5, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + psrlq xmm4, 8- 2 + psrlq xmm5, 16- 4 + psrlq xmm6, 24- 6 + psrlq xmm7, 32- 8 + pand xmm4, SIMD_SSE2_dword_color_bit_mask1 + pand xmm5, SIMD_SSE2_dword_color_bit_mask2 + pand xmm6, SIMD_SSE2_dword_color_bit_mask3 + pand xmm7, SIMD_SSE2_dword_color_bit_mask4 + por xmm5, xmm4 + por xmm7, xmm6 + por xmm7, xmm5 + movdqa xmm4, xmm0 + movdqa xmm5, xmm0 + movdqa xmm6, xmm0 + psrlq xmm4, 40-10 + psrlq xmm5, 48-12 + psrlq xmm6, 56-14 + pand xmm0, SIMD_SSE2_dword_color_bit_mask0 + pand xmm4, SIMD_SSE2_dword_color_bit_mask5 + pand xmm5, SIMD_SSE2_dword_color_bit_mask6 + pand xmm6, SIMD_SSE2_dword_color_bit_mask7 + por xmm4, xmm5 + por xmm0, xmm6 + por xmm7, xmm4 + por xmm7, xmm0 + mov esi, outPtr + pshufd xmm7, xmm7, R_SHUFFLE_D( 0, 2, 1, 3 ) + pshuflw xmm7, xmm7, R_SHUFFLE_D( 0, 2, 1, 3 ) + movd [esi], xmm7 + } + + outData += 4; +#elif defined ( ID_WIN_X86_SSE2_INTRIN ) __m128i block0 = *((__m128i *)(&block[ 0])); __m128i block1 = *((__m128i *)(&block[16])); __m128i block2 = *((__m128i *)(&block[32])); @@ -1472,6 
+2623,9 @@ void idDxtEncoder::EmitGreenIndices_SSE2( const byte *block, const int channelBi int result = _mm_cvtsi128_si32( temp7 ); EmitUInt( result ); +#else + assert( false ); +#endif } /* @@ -1480,6 +2634,46 @@ idDxtEncoder::InsetNormalsBBoxDXT5_SSE2 ======================== */ void idDxtEncoder::InsetNormalsBBoxDXT5_SSE2( byte *minNormal, byte *maxNormal ) const { +#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) ) + __asm { + mov esi, minNormal + mov edi, maxNormal + movd xmm0, dword ptr [esi] // xmm0 = minNormal + movd xmm1, dword ptr [edi] // xmm1 = maxNormal + punpcklbw xmm0, SIMD_SSE2_byte_0 + punpcklbw xmm1, SIMD_SSE2_byte_0 + movdqa xmm2, xmm1 + psubw xmm2, xmm0 + psubw xmm2, SIMD_SSE2_word_insetNormalDXT5Round + pand xmm2, SIMD_SSE2_word_insetNormalDXT5Mask // xmm2 = inset (1 & 3) + + pmullw xmm0, SIMD_SSE2_word_insetNormalDXT5ShiftUp + pmullw xmm1, SIMD_SSE2_word_insetNormalDXT5ShiftUp + paddw xmm0, xmm2 + psubw xmm1, xmm2 + pmulhw xmm0, SIMD_SSE2_word_insetNormalDXT5ShiftDown // xmm0 = mini + pmulhw xmm1, SIMD_SSE2_word_insetNormalDXT5ShiftDown // xmm1 = maxi + + // mini and maxi must be >= 0 and <= 255 + pmaxsw xmm0, SIMD_SSE2_word_0 + pmaxsw xmm1, SIMD_SSE2_word_0 + pminsw xmm0, SIMD_SSE2_word_255 + pminsw xmm1, SIMD_SSE2_word_255 + + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + pand xmm0, SIMD_SSE2_word_insetNormalDXT5QuantMask + pand xmm1, SIMD_SSE2_word_insetNormalDXT5QuantMask + pmulhw xmm2, SIMD_SSE2_word_insetNormalDXT5Rep + pmulhw xmm3, SIMD_SSE2_word_insetNormalDXT5Rep + por xmm0, xmm2 + por xmm1, xmm3 + packuswb xmm0, xmm0 + packuswb xmm1, xmm1 + movd dword ptr [esi], xmm0 + movd dword ptr [edi], xmm1 + } +#elif defined ( ID_WIN_X86_SSE2_INTRIN ) __m128i temp0, temp1, temp2, temp3; temp0 = _mm_cvtsi32_si128( *(int *)minNormal ); @@ -1516,6 +2710,9 @@ void idDxtEncoder::InsetNormalsBBoxDXT5_SSE2( byte *minNormal, byte *maxNormal ) *(int *)minNormal = _mm_cvtsi128_si32( temp0 ); *(int *)maxNormal = _mm_cvtsi128_si32( temp1 ); +#else + 
assert( false ); +#endif } /* @@ -1578,3 +2775,4 @@ void idDxtEncoder::CompressNormalMapDXT5Fast_SSE2( const byte *inBuf, byte *outB #endif } +#endif diff --git a/neo/renderer/GLMatrix.cpp b/neo/renderer/GLMatrix.cpp index 9c188437..7408b2c9 100644 --- a/neo/renderer/GLMatrix.cpp +++ b/neo/renderer/GLMatrix.cpp @@ -72,6 +72,7 @@ R_MatrixMultiply ========================== */ void R_MatrixMultiply( const float a[16], const float b[16], float out[16] ) { +#ifdef ID_WIN_X86_SSE2_INTRIN __m128 a0 = _mm_loadu_ps( a + 0*4 ); __m128 a1 = _mm_loadu_ps( a + 1*4 ); @@ -108,6 +109,41 @@ void R_MatrixMultiply( const float a[16], const float b[16], float out[16] ) { _mm_storeu_ps( out + 2*4, t2 ); _mm_storeu_ps( out + 3*4, t3 ); +#else + + /* + for ( int i = 0; i < 4; i++ ) { + for ( int j = 0; j < 4; j++ ) { + out[ i * 4 + j ] = + a[ i * 4 + 0 ] * b[ 0 * 4 + j ] + + a[ i * 4 + 1 ] * b[ 1 * 4 + j ] + + a[ i * 4 + 2 ] * b[ 2 * 4 + j ] + + a[ i * 4 + 3 ] * b[ 3 * 4 + j ]; + } + } + */ + + out[0*4+0] = a[0*4+0]*b[0*4+0] + a[0*4+1]*b[1*4+0] + a[0*4+2]*b[2*4+0] + a[0*4+3]*b[3*4+0]; + out[0*4+1] = a[0*4+0]*b[0*4+1] + a[0*4+1]*b[1*4+1] + a[0*4+2]*b[2*4+1] + a[0*4+3]*b[3*4+1]; + out[0*4+2] = a[0*4+0]*b[0*4+2] + a[0*4+1]*b[1*4+2] + a[0*4+2]*b[2*4+2] + a[0*4+3]*b[3*4+2]; + out[0*4+3] = a[0*4+0]*b[0*4+3] + a[0*4+1]*b[1*4+3] + a[0*4+2]*b[2*4+3] + a[0*4+3]*b[3*4+3]; + + out[1*4+0] = a[1*4+0]*b[0*4+0] + a[1*4+1]*b[1*4+0] + a[1*4+2]*b[2*4+0] + a[1*4+3]*b[3*4+0]; + out[1*4+1] = a[1*4+0]*b[0*4+1] + a[1*4+1]*b[1*4+1] + a[1*4+2]*b[2*4+1] + a[1*4+3]*b[3*4+1]; + out[1*4+2] = a[1*4+0]*b[0*4+2] + a[1*4+1]*b[1*4+2] + a[1*4+2]*b[2*4+2] + a[1*4+3]*b[3*4+2]; + out[1*4+3] = a[1*4+0]*b[0*4+3] + a[1*4+1]*b[1*4+3] + a[1*4+2]*b[2*4+3] + a[1*4+3]*b[3*4+3]; + + out[2*4+0] = a[2*4+0]*b[0*4+0] + a[2*4+1]*b[1*4+0] + a[2*4+2]*b[2*4+0] + a[2*4+3]*b[3*4+0]; + out[2*4+1] = a[2*4+0]*b[0*4+1] + a[2*4+1]*b[1*4+1] + a[2*4+2]*b[2*4+1] + a[2*4+3]*b[3*4+1]; + out[2*4+2] = a[2*4+0]*b[0*4+2] + a[2*4+1]*b[1*4+2] + 
a[2*4+2]*b[2*4+2] + a[2*4+3]*b[3*4+2]; + out[2*4+3] = a[2*4+0]*b[0*4+3] + a[2*4+1]*b[1*4+3] + a[2*4+2]*b[2*4+3] + a[2*4+3]*b[3*4+3]; + + out[3*4+0] = a[3*4+0]*b[0*4+0] + a[3*4+1]*b[1*4+0] + a[3*4+2]*b[2*4+0] + a[3*4+3]*b[3*4+0]; + out[3*4+1] = a[3*4+0]*b[0*4+1] + a[3*4+1]*b[1*4+1] + a[3*4+2]*b[2*4+1] + a[3*4+3]*b[3*4+1]; + out[3*4+2] = a[3*4+0]*b[0*4+2] + a[3*4+1]*b[1*4+2] + a[3*4+2]*b[2*4+2] + a[3*4+3]*b[3*4+2]; + out[3*4+3] = a[3*4+0]*b[0*4+3] + a[3*4+1]*b[1*4+3] + a[3*4+2]*b[2*4+3] + a[3*4+3]*b[3*4+3]; + +#endif } /* diff --git a/neo/renderer/ModelDecal.cpp b/neo/renderer/ModelDecal.cpp index 596522dd..560819aa 100644 --- a/neo/renderer/ModelDecal.cpp +++ b/neo/renderer/ModelDecal.cpp @@ -274,6 +274,7 @@ static void R_DecalPointCullStatic( byte * cullBits, const idPlane * planes, con assert_16_byte_aligned( cullBits ); assert_16_byte_aligned( verts ); +#ifdef ID_WIN_X86_SSE2_INTRIN idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts ); @@ -376,6 +377,37 @@ static void R_DecalPointCullStatic( byte * cullBits, const idPlane * planes, con } } +#else + + idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 1 > vertsODS( verts, numVerts ); + + for ( int i = 0; i < numVerts; ) { + + const int nextNumVerts = vertsODS.FetchNextBatch() - 1; + + for ( ; i <= nextNumVerts; i++ ) { + const idVec3 & v = vertsODS[i].xyz; + + const float d0 = planes[0].Distance( v ); + const float d1 = planes[1].Distance( v ); + const float d2 = planes[2].Distance( v ); + const float d3 = planes[3].Distance( v ); + const float d4 = planes[4].Distance( v ); + const float d5 = planes[5].Distance( v ); + + byte bits; + bits = IEEE_FLT_SIGNBITNOTSET( d0 ) << 0; + bits |= IEEE_FLT_SIGNBITNOTSET( d1 ) << 1; + bits |= IEEE_FLT_SIGNBITNOTSET( d2 ) << 2; + bits |= IEEE_FLT_SIGNBITNOTSET( d3 ) << 3; + bits |= IEEE_FLT_SIGNBITNOTSET( d4 ) << 4; + bits |= IEEE_FLT_SIGNBITNOTSET( d5 ) << 5; + + cullBits[i] = bits; + } + } + +#endif } /* @@ -573,6 +605,7 @@ static void 
R_CopyDecalSurface( idDrawVert * verts, int numVerts, triIndex_t * i assert( ( ( decal->numIndexes * sizeof( triIndex_t ) ) & 15 ) == 0 ); assert_16_byte_aligned( fadeColor ); +#ifdef ID_WIN_X86_SSE2_INTRIN const __m128i vector_int_num_verts = _mm_shuffle_epi32( _mm_cvtsi32_si128( numVerts ), 0 ); const __m128i vector_short_num_verts = _mm_packs_epi32( vector_int_num_verts, vector_int_num_verts ); @@ -612,6 +645,25 @@ static void R_CopyDecalSurface( idDrawVert * verts, int numVerts, triIndex_t * i _mm_sfence(); +#else + + // copy vertices and apply depth/time based fading + for ( int i = 0; i < decal->numVerts; i++ ) { + // NOTE: bad out-of-order write-combined write, SIMD code does the right thing + verts[numVerts + i] = decal->verts[i]; + for ( int j = 0; j < 4; j++ ) { + verts[numVerts + i].color[j] = idMath::Ftob( fadeColor[j] * decal->vertDepthFade[i] ); + } + } + + // copy indices + assert( ( decal->numIndexes & 1 ) == 0 ); + for ( int i = 0; i < decal->numIndexes; i += 2 ) { + assert( decal->indexes[i + 0] < decal->numVerts && decal->indexes[i + 1] < decal->numVerts ); + WriteIndexPair( &indexes[numIndexes + i], numVerts + decal->indexes[i + 0], numVerts + decal->indexes[i + 1] ); + } + +#endif } /* diff --git a/neo/renderer/ModelOverlay.cpp b/neo/renderer/ModelOverlay.cpp index fa2b1962..da4f369d 100644 --- a/neo/renderer/ModelOverlay.cpp +++ b/neo/renderer/ModelOverlay.cpp @@ -102,6 +102,7 @@ static void R_OverlayPointCullStatic( byte * cullBits, halfFloat_t * texCoordS, assert_16_byte_aligned( texCoordT ); assert_16_byte_aligned( verts ); +#ifdef ID_WIN_X86_SSE2_INTRIN idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts ); @@ -176,6 +177,39 @@ static void R_OverlayPointCullStatic( byte * cullBits, halfFloat_t * texCoordS, } } +#else + + idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 1 > vertsODS( verts, numVerts ); + + for ( int i = 0; i < numVerts; ) { + + const int nextNumVerts = vertsODS.FetchNextBatch() - 1; + + for ( ; i <= 
nextNumVerts; i++ ) { + const idVec3 & v = vertsODS[i].xyz; + + const float d0 = planes[0].Distance( v ); + const float d1 = planes[1].Distance( v ); + const float d2 = 1.0f - d0; + const float d3 = 1.0f - d1; + + halfFloat_t s = Scalar_FastF32toF16( d0 ); + halfFloat_t t = Scalar_FastF32toF16( d1 ); + + texCoordS[i] = s; + texCoordT[i] = t; + + byte bits; + bits = IEEE_FLT_SIGNBITSET( d0 ) << 0; + bits |= IEEE_FLT_SIGNBITSET( d1 ) << 1; + bits |= IEEE_FLT_SIGNBITSET( d2 ) << 2; + bits |= IEEE_FLT_SIGNBITSET( d3 ) << 3; + + cullBits[i] = bits; + } + } + +#endif } /* @@ -189,6 +223,7 @@ static void R_OverlayPointCullSkinned( byte * cullBits, halfFloat_t * texCoordS, assert_16_byte_aligned( texCoordT ); assert_16_byte_aligned( verts ); +#ifdef ID_WIN_X86_SSE2_INTRIN idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts ); @@ -263,6 +298,39 @@ static void R_OverlayPointCullSkinned( byte * cullBits, halfFloat_t * texCoordS, } } +#else + + idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 1 > vertsODS( verts, numVerts ); + + for ( int i = 0; i < numVerts; ) { + + const int nextNumVerts = vertsODS.FetchNextBatch() - 1; + + for ( ; i <= nextNumVerts; i++ ) { + const idVec3 transformed = Scalar_LoadSkinnedDrawVertPosition( vertsODS[i], joints ); + + const float d0 = planes[0].Distance( transformed ); + const float d1 = planes[1].Distance( transformed ); + const float d2 = 1.0f - d0; + const float d3 = 1.0f - d1; + + halfFloat_t s = Scalar_FastF32toF16( d0 ); + halfFloat_t t = Scalar_FastF32toF16( d1 ); + + texCoordS[i] = s; + texCoordT[i] = t; + + byte bits; + bits = IEEE_FLT_SIGNBITSET( d0 ) << 0; + bits |= IEEE_FLT_SIGNBITSET( d1 ) << 1; + bits |= IEEE_FLT_SIGNBITSET( d2 ) << 2; + bits |= IEEE_FLT_SIGNBITSET( d3 ) << 3; + + cullBits[i] = bits; + } + } + +#endif } /* @@ -446,6 +514,7 @@ static void R_CopyOverlaySurface( idDrawVert * verts, int numVerts, triIndex_t * assert( ( ( overlay->numVerts * sizeof( idDrawVert ) ) & 15 ) == 0 ); assert( ( ( 
overlay->numIndexes * sizeof( triIndex_t ) ) & 15 ) == 0 ); +#ifdef ID_WIN_X86_SSE2_INTRIN const __m128i vector_int_clear_last = _mm_set_epi32( 0, -1, -1, -1 ); const __m128i vector_int_num_verts = _mm_shuffle_epi32( _mm_cvtsi32_si128( numVerts ), 0 ); @@ -482,6 +551,25 @@ static void R_CopyOverlaySurface( idDrawVert * verts, int numVerts, triIndex_t * _mm_sfence(); +#else + + // copy vertices + for ( int i = 0; i < overlay->numVerts; i++ ) { + const overlayVertex_t &overlayVert = overlay->verts[i]; + + // NOTE: bad out-of-order write-combined write, SIMD code does the right thing + verts[numVerts + i] = sourceVerts[overlayVert.vertexNum]; + verts[numVerts + i].st[0] = overlayVert.st[0]; + verts[numVerts + i].st[1] = overlayVert.st[1]; + } + + // copy indexes + for ( int i = 0; i < overlay->numIndexes; i += 2 ) { + assert( overlay->indexes[i + 0] < overlay->numVerts && overlay->indexes[i + 1] < overlay->numVerts ); + WriteIndexPair( &indexes[numIndexes + i], numVerts + overlay->indexes[i + 0], numVerts + overlay->indexes[i + 1] ); + } + +#endif } /* diff --git a/neo/renderer/Model_md5.cpp b/neo/renderer/Model_md5.cpp index 30865b41..e2c7f223 100644 --- a/neo/renderer/Model_md5.cpp +++ b/neo/renderer/Model_md5.cpp @@ -32,10 +32,12 @@ If you have questions concerning this license or the applicable additional terms #include "tr_local.h" #include "Model_local.h" +#ifdef ID_WIN_X86_SSE2_INTRIN static const __m128 vector_float_posInfinity = { idMath::INFINITY, idMath::INFINITY, idMath::INFINITY, idMath::INFINITY }; static const __m128 vector_float_negInfinity = { -idMath::INFINITY, -idMath::INFINITY, -idMath::INFINITY, -idMath::INFINITY }; +#endif static const char *MD5_SnapshotName = "_MD5_Snapshot_"; @@ -501,6 +503,7 @@ idMD5Mesh::CalculateBounds ==================== */ void idMD5Mesh::CalculateBounds( const idJointMat * entJoints, idBounds & bounds ) const { +#ifdef ID_WIN_X86_SSE2_INTRIN __m128 minX = vector_float_posInfinity; __m128 minY = vector_float_posInfinity; 
@@ -534,6 +537,16 @@ void idMD5Mesh::CalculateBounds( const idJointMat * entJoints, idBounds & bounds _mm_store_ss( bounds.ToFloatPtr() + 4, _mm_splat_ps( maxY, 3 ) ); _mm_store_ss( bounds.ToFloatPtr() + 5, _mm_splat_ps( maxZ, 3 ) ); +#else + + bounds.Clear(); + for ( int i = 0; i < numMeshJoints; i++ ) { + const idJointMat & joint = entJoints[meshJoints[i]]; + bounds.AddPoint( joint.GetTranslation() ); + } + bounds.ExpandSelf( maxJointVertDist ); + +#endif } /* @@ -1085,6 +1098,7 @@ static void TransformJoints( idJointMat *__restrict outJoints, const int numJoin assert_16_byte_aligned( inFloats1 ); assert_16_byte_aligned( inFloats2 ); +#ifdef ID_WIN_X86_SSE2_INTRIN const __m128 mask_keep_last = __m128c( _mm_set_epi32( 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 ) ); @@ -1160,6 +1174,13 @@ static void TransformJoints( idJointMat *__restrict outJoints, const int numJoin _mm_store_ps( outFloats + 1 * 12 + 8, ri1 ); } +#else + + for ( int i = 0; i < numJoints; i++ ) { + idJointMat::Multiply( outJoints[i], inJoints1[i], inJoints2[i] ); + } + +#endif } /* diff --git a/neo/renderer/jobs/ShadowShared.cpp b/neo/renderer/jobs/ShadowShared.cpp index 1e9f082a..25009a9d 100644 --- a/neo/renderer/jobs/ShadowShared.cpp +++ b/neo/renderer/jobs/ShadowShared.cpp @@ -87,6 +87,7 @@ static void R_ShadowVolumeCullBits( byte *cullBits, byte &totalOr, const float r assert_16_byte_aligned( cullBits ); assert_16_byte_aligned( verts ); +#ifdef ID_WIN_X86_SSE2_INTRIN idODSStreamedArray< idShadowVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts ); @@ -208,6 +209,54 @@ static void R_ShadowVolumeCullBits( byte *cullBits, byte &totalOr, const float r totalOr = (byte) _mm_cvtsi128_si32( vecTotalOrByte ); +#else + + idODSStreamedArray< idShadowVert, 16, SBT_DOUBLE, 1 > vertsODS( verts, numVerts ); + + byte tOr = 0; + for ( int i = 0; i < numVerts; ) { + + const int nextNumVerts = vertsODS.FetchNextBatch() - 1; + + for ( ; i <= nextNumVerts; i++ ) { + const idVec3 & v = 
vertsODS[i].xyzw.ToVec3(); + + const float d0 = planes[0].Distance( v ); + const float d1 = planes[1].Distance( v ); + const float d2 = planes[2].Distance( v ); + const float d3 = planes[3].Distance( v ); + + const float t0 = d0 + radius; + const float t1 = d1 + radius; + const float t2 = d2 + radius; + const float t3 = d3 + radius; + + const float s0 = d0 - radius; + const float s1 = d1 - radius; + const float s2 = d2 - radius; + const float s3 = d3 - radius; + + byte bits; + bits = IEEE_FLT_SIGNBITSET( t0 ) << 0; + bits |= IEEE_FLT_SIGNBITSET( t1 ) << 1; + bits |= IEEE_FLT_SIGNBITSET( t2 ) << 2; + bits |= IEEE_FLT_SIGNBITSET( t3 ) << 3; + + bits |= IEEE_FLT_SIGNBITSET( s0 ) << 4; + bits |= IEEE_FLT_SIGNBITSET( s1 ) << 5; + bits |= IEEE_FLT_SIGNBITSET( s2 ) << 6; + bits |= IEEE_FLT_SIGNBITSET( s3 ) << 7; + + bits ^= 0x0F; // flip lower four bits + + tOr |= bits; + cullBits[i] = bits; + } + } + + totalOr = tOr; + +#endif } /* diff --git a/neo/renderer/jobs/dynamicshadowvolume/DynamicShadowVolume.cpp b/neo/renderer/jobs/dynamicshadowvolume/DynamicShadowVolume.cpp index 241a6aad..0f443929 100644 --- a/neo/renderer/jobs/dynamicshadowvolume/DynamicShadowVolume.cpp +++ b/neo/renderer/jobs/dynamicshadowvolume/DynamicShadowVolume.cpp @@ -31,6 +31,7 @@ If you have questions concerning this license or the applicable additional terms #include "../../../idlib/sys/sys_intrinsics.h" #include "../../../idlib/geometry/DrawVert_intrinsics.h" +#ifdef ID_WIN_X86_SSE2_INTRIN static const __m128i vector_int_neg_one = _mm_set_epi32( -1, -1, -1, -1 ); @@ -126,6 +127,69 @@ static __forceinline __m128i TriangleCulled_SSE2( const __m128 & vert0X, const _ return _mm_castps_si128( _mm_cmpeq_ps( b0, zero ) ); } +#else + +/* +===================== +TriangleFacing + +Returns 255 if the triangle is facing the light origin, otherwise returns 0. 
+===================== +*/ +static byte TriangleFacing_Generic( const idVec3 & v1, const idVec3 & v2, const idVec3 & v3, const idVec3 & lightOrigin ) { + const float sx = v2.x - v1.x; + const float sy = v2.y - v1.y; + const float sz = v2.z - v1.z; + + const float tx = v3.x - v1.x; + const float ty = v3.y - v1.y; + const float tz = v3.z - v1.z; + + const float normalX = ty * sz - tz * sy; + const float normalY = tz * sx - tx * sz; + const float normalZ = tx * sy - ty * sx; + const float normalW = normalX * v1.x + normalY * v1.y + normalZ * v1.z; + + const float d = lightOrigin.x * normalX + lightOrigin.y * normalY + lightOrigin.z * normalZ - normalW; + return ( d > 0.0f ) ? 255 : 0; +} + +/* +===================== +TriangleCulled + +Returns 255 if the triangle is culled to the light projection matrix, otherwise returns 0. +The clip space of the 'lightProject' is assumed to be in the range [0, 1]. +===================== +*/ +static byte TriangleCulled_Generic( const idVec3 & v1, const idVec3 & v2, const idVec3 & v3, const idRenderMatrix & lightProject ) { + // transform the triangle + idVec4 c[3]; + for ( int i = 0; i < 4; i++ ) { + c[0][i] = v1[0] * lightProject[i][0] + v1[1] * lightProject[i][1] + v1[2] * lightProject[i][2] + lightProject[i][3]; + c[1][i] = v2[0] * lightProject[i][0] + v2[1] * lightProject[i][1] + v2[2] * lightProject[i][2] + lightProject[i][3]; + c[2][i] = v3[0] * lightProject[i][0] + v3[1] * lightProject[i][1] + v3[2] * lightProject[i][2] + lightProject[i][3]; + } + + // calculate the culled bits + int bits = 0; + for ( int i = 0; i < 3; i++ ) { + const float minW = 0.0f; + const float maxW = c[i][3]; + + if ( c[i][0] > minW ) { bits |= ( 1 << 0 ); } + if ( c[i][0] < maxW ) { bits |= ( 1 << 1 ); } + if ( c[i][1] > minW ) { bits |= ( 1 << 2 ); } + if ( c[i][1] < maxW ) { bits |= ( 1 << 3 ); } + if ( c[i][2] > minW ) { bits |= ( 1 << 4 ); } + if ( c[i][2] < maxW ) { bits |= ( 1 << 5 ); } + } + + // if any bits weren't set, the triangle is 
completely off one side of the frustum + return ( bits != 63 ) ? 255 : 0; +} + +#endif /* ===================== @@ -155,6 +219,7 @@ static int CalculateTriangleFacingCulledStatic( byte * __restrict facing, byte * const idVec3 lineDir = lineDelta * lineLengthRcp; const float lineLength = lineLengthSqr * lineLengthRcp; +#ifdef ID_WIN_X86_SSE2_INTRIN idODSStreamedIndexedArray< idDrawVert, triIndex_t, 32, SBT_QUAD, 4 * 3 > indexedVertsODS( verts, numVerts, indexes, numIndexes ); @@ -261,6 +326,55 @@ static int CalculateTriangleFacingCulledStatic( byte * __restrict facing, byte * return _mm_cvtsi128_si32( numFrontFacing ); +#else + + idODSStreamedIndexedArray< idDrawVert, triIndex_t, 32, SBT_QUAD, 1 > indexedVertsODS( verts, numVerts, indexes, numIndexes ); + + const byte cullShadowTrianglesToLightMask = cullShadowTrianglesToLight ? 255 : 0; + + int numFrontFacing = 0; + + for ( int i = 0, j = 0; i < numIndexes; ) { + + const int batchStart = i; + const int batchEnd = indexedVertsODS.FetchNextBatch(); + const int indexStart = j; + + for ( ; i <= batchEnd - 3; i += 3, j++ ) { + const idVec3 & v1 = indexedVertsODS[i + 0].xyz; + const idVec3 & v2 = indexedVertsODS[i + 1].xyz; + const idVec3 & v3 = indexedVertsODS[i + 2].xyz; + + const byte triangleCulled = TriangleCulled_Generic( v1, v2, v3, lightProject ); + + byte triangleFacing = TriangleFacing_Generic( v1, v2, v3, lightOrigin ); + + // optionally make triangles that are outside the light frustum facing so they do not contribute to the shadow volume + triangleFacing |= ( triangleCulled & cullShadowTrianglesToLightMask ); + + culled[j] = triangleCulled; + facing[j] = triangleFacing; + + // count the number of facing triangles + numFrontFacing += ( triangleFacing & 1 ); + } + + if ( insideShadowVolume != NULL ) { + for ( int k = batchStart, n = indexStart; k <= batchEnd - 3; k += 3, n++ ) { + if ( !facing[n] ) { + if ( R_LineIntersectsTriangleExpandedWithSphere( lineStart, lineEnd, lineDir, lineLength, radius, 
indexedVertsODS[k + 2].xyz, indexedVertsODS[k + 1].xyz, indexedVertsODS[k + 0].xyz ) ) { + *insideShadowVolume = true; + insideShadowVolume = NULL; + break; + } + } + } + } + } + + return numFrontFacing; + +#endif } /* @@ -291,6 +405,7 @@ static int CalculateTriangleFacingCulledSkinned( byte * __restrict facing, byte const idVec3 lineDir = lineDelta * lineLengthRcp; const float lineLength = lineLengthSqr * lineLengthRcp; +#ifdef ID_WIN_X86_SSE2_INTRIN idODSStreamedArray< idDrawVert, 32, SBT_DOUBLE, 1 > vertsODS( verts, numVerts ); @@ -428,6 +543,74 @@ static int CalculateTriangleFacingCulledSkinned( byte * __restrict facing, byte return _mm_cvtsi128_si32( numFrontFacing ); +#else + + idODSStreamedArray< idDrawVert, 32, SBT_DOUBLE, 1 > vertsODS( verts, numVerts ); + + for ( int i = 0; i < numVerts; ) { + + const int nextNumVerts = vertsODS.FetchNextBatch() - 1; + + for ( ; i <= nextNumVerts; i++ ) { + tempVerts[i].ToVec3() = Scalar_LoadSkinnedDrawVertPosition( vertsODS[i], joints ); + tempVerts[i].w = 1.0f; + } + } + + idODSStreamedArray< triIndex_t, 256, SBT_QUAD, 1 > indexesODS( indexes, numIndexes ); + + const byte cullShadowTrianglesToLightMask = cullShadowTrianglesToLight ? 
255 : 0; + + int numFrontFacing = 0; + + for ( int i = 0, j = 0; i < numIndexes; ) { + + const int batchStart = i; + const int batchEnd = indexesODS.FetchNextBatch(); + const int indexStart = j; + + for ( ; i <= batchEnd - 3; i += 3, j++ ) { + const int i0 = indexesODS[i + 0]; + const int i1 = indexesODS[i + 1]; + const int i2 = indexesODS[i + 2]; + + const idVec3 & v1 = tempVerts[i0].ToVec3(); + const idVec3 & v2 = tempVerts[i1].ToVec3(); + const idVec3 & v3 = tempVerts[i2].ToVec3(); + + const byte triangleCulled = TriangleCulled_Generic( v1, v2, v3, lightProject ); + + byte triangleFacing = TriangleFacing_Generic( v1, v2, v3, lightOrigin ); + + // optionally make triangles that are outside the light frustum facing so they do not contribute to the shadow volume + triangleFacing |= ( triangleCulled & cullShadowTrianglesToLightMask ); + + culled[j] = triangleCulled; + facing[j] = triangleFacing; + + // count the number of facing triangles + numFrontFacing += ( triangleFacing & 1 ); + } + + if ( insideShadowVolume != NULL ) { + for ( int k = batchStart, n = indexStart; k <= batchEnd - 3; k += 3, n++ ) { + if ( !facing[n] ) { + const int i0 = indexesODS[k + 0]; + const int i1 = indexesODS[k + 1]; + const int i2 = indexesODS[k + 2]; + if ( R_LineIntersectsTriangleExpandedWithSphere( lineStart, lineEnd, lineDir, lineLength, radius, tempVerts[i2].ToVec3(), tempVerts[i1].ToVec3(), tempVerts[i0].ToVec3() ) ) { + *insideShadowVolume = true; + insideShadowVolume = NULL; + break; + } + } + } + } + } + + return numFrontFacing; + +#endif } /* @@ -440,6 +623,7 @@ static void StreamOut( void * dst, const void * src, int numBytes ) { assert_16_byte_aligned( dst ); assert_16_byte_aligned( src ); +#ifdef ID_WIN_X86_SSE2_INTRIN int i = 0; for ( ; i + 128 <= numBytes; i += 128 ) { __m128i d0 = _mm_load_si128( (const __m128i *)( (byte *)src + i + 0*16 ) ); @@ -463,6 +647,9 @@ static void StreamOut( void * dst, const void * src, int numBytes ) { __m128i d = _mm_load_si128( (__m128i *)( 
(byte *)src + i ) ); _mm_stream_si128( (__m128i *)( (byte *)dst + i ), d ); } +#else + memcpy( dst, src, numBytes ); +#endif } /* @@ -671,7 +858,9 @@ static void R_CreateShadowVolumeTriangles( triIndex_t *__restrict shadowIndices, numShadowIndexesTotal = numShadowIndices; +#if defined( ID_WIN_X86_SSE2_INTRIN ) _mm_sfence(); +#endif #else // NOTE: this code will not work on the SPU because it tries to write directly to the destination @@ -844,7 +1033,9 @@ void R_CreateLightTriangles( triIndex_t * __restrict lightIndices, triIndex_t * numLightIndicesTotal = numLightIndices; +#if defined( ID_WIN_X86_SSE2_INTRIN ) _mm_sfence(); +#endif #else // NOTE: this code will not work on the SPU because it tries to write directly to the destination diff --git a/neo/renderer/tr_trace.cpp b/neo/renderer/tr_trace.cpp index a9d789aa..fe7595de 100644 --- a/neo/renderer/tr_trace.cpp +++ b/neo/renderer/tr_trace.cpp @@ -43,6 +43,7 @@ static void R_TracePointCullStatic( byte *cullBits, byte &totalOr, const float r assert_16_byte_aligned( cullBits ); assert_16_byte_aligned( verts ); +#ifdef ID_WIN_X86_SSE2_INTRIN idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts ); @@ -164,6 +165,54 @@ static void R_TracePointCullStatic( byte *cullBits, byte &totalOr, const float r totalOr = (byte) _mm_cvtsi128_si32( vecTotalOrByte ); +#else + + idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 1 > vertsODS( verts, numVerts ); + + byte tOr = 0; + for ( int i = 0; i < numVerts; ) { + + const int nextNumVerts = vertsODS.FetchNextBatch() - 1; + + for ( ; i <= nextNumVerts; i++ ) { + const idVec3 & v = vertsODS[i].xyz; + + const float d0 = planes[0].Distance( v ); + const float d1 = planes[1].Distance( v ); + const float d2 = planes[2].Distance( v ); + const float d3 = planes[3].Distance( v ); + + const float t0 = d0 + radius; + const float t1 = d1 + radius; + const float t2 = d2 + radius; + const float t3 = d3 + radius; + + const float s0 = d0 - radius; + const float s1 = d1 - 
radius; + const float s2 = d2 - radius; + const float s3 = d3 - radius; + + byte bits; + bits = IEEE_FLT_SIGNBITSET( t0 ) << 0; + bits |= IEEE_FLT_SIGNBITSET( t1 ) << 1; + bits |= IEEE_FLT_SIGNBITSET( t2 ) << 2; + bits |= IEEE_FLT_SIGNBITSET( t3 ) << 3; + + bits |= IEEE_FLT_SIGNBITSET( s0 ) << 4; + bits |= IEEE_FLT_SIGNBITSET( s1 ) << 5; + bits |= IEEE_FLT_SIGNBITSET( s2 ) << 6; + bits |= IEEE_FLT_SIGNBITSET( s3 ) << 7; + + bits ^= 0x0F; // flip lower four bits + + tOr |= bits; + cullBits[i] = bits; + } + } + + totalOr = tOr; + +#endif } /* @@ -175,6 +224,7 @@ static void R_TracePointCullSkinned( byte *cullBits, byte &totalOr, const float assert_16_byte_aligned( cullBits ); assert_16_byte_aligned( verts ); +#ifdef ID_WIN_X86_SSE2_INTRIN idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts ); @@ -296,6 +346,54 @@ static void R_TracePointCullSkinned( byte *cullBits, byte &totalOr, const float totalOr = (byte) _mm_cvtsi128_si32( vecTotalOrByte ); +#else + + idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 1 > vertsODS( verts, numVerts ); + + byte tOr = 0; + for ( int i = 0; i < numVerts; ) { + + const int nextNumVerts = vertsODS.FetchNextBatch() - 1; + + for ( ; i <= nextNumVerts; i++ ) { + const idVec3 v = Scalar_LoadSkinnedDrawVertPosition( vertsODS[i], joints ); + + const float d0 = planes[0].Distance( v ); + const float d1 = planes[1].Distance( v ); + const float d2 = planes[2].Distance( v ); + const float d3 = planes[3].Distance( v ); + + const float t0 = d0 + radius; + const float t1 = d1 + radius; + const float t2 = d2 + radius; + const float t3 = d3 + radius; + + const float s0 = d0 - radius; + const float s1 = d1 - radius; + const float s2 = d2 - radius; + const float s3 = d3 - radius; + + byte bits; + bits = IEEE_FLT_SIGNBITSET( t0 ) << 0; + bits |= IEEE_FLT_SIGNBITSET( t1 ) << 1; + bits |= IEEE_FLT_SIGNBITSET( t2 ) << 2; + bits |= IEEE_FLT_SIGNBITSET( t3 ) << 3; + + bits |= IEEE_FLT_SIGNBITSET( s0 ) << 4; + bits |= 
IEEE_FLT_SIGNBITSET( s1 ) << 5; + bits |= IEEE_FLT_SIGNBITSET( s2 ) << 6; + bits |= IEEE_FLT_SIGNBITSET( s3 ) << 7; + + bits ^= 0x0F; // flip lower four bits + + tOr |= bits; + cullBits[i] = bits; + } + } + + totalOr = tOr; + +#endif } /*