/*
===========================================================================

Doom 3 BFG Edition GPL Source Code
Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company.
Copyright (C) 2012 Robert Beckebans

This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code").

Doom 3 BFG Edition Source Code is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Doom 3 BFG Edition Source Code is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Doom 3 BFG Edition Source Code. If not, see <http://www.gnu.org/licenses/>.

In addition, the Doom 3 BFG Edition Source Code is also subject to certain additional terms.
You should have received a copy of these additional terms immediately following the terms
and conditions of the GNU General Public License which accompanied the Doom 3 BFG Edition
Source Code. If not, please request a copy in writing from id Software at the address below.

If you have questions concerning this license or the applicable additional terms, you may
contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.

===========================================================================
*/

#pragma hdrstop
#include "precompiled.h"

// this file is full of intentional case fall throughs
//lint -e616

// the code is correct, it can't be a NULL pointer
//lint -e613

static idCVar lcp_showFailures( "lcp_showFailures", "0", CVAR_BOOL, "show LCP solver failures" );

const float LCP_BOUND_EPSILON			= 1e-5f;
const float LCP_ACCEL_EPSILON			= 1e-5f;
const float LCP_DELTA_ACCEL_EPSILON		= 1e-9f;
const float LCP_DELTA_FORCE_EPSILON		= 1e-9f;

#define IGNORE_UNSATISFIABLE_VARIABLES

#if defined(USE_INTRINSICS)
	#define LCP_SIMD
#endif

#if defined(LCP_SIMD)

ALIGN16( const __m128 SIMD_SP_zero ) = { 0.0f, 0.0f, 0.0f, 0.0f };
ALIGN16( const __m128 SIMD_SP_one ) = { 1.0f, 1.0f, 1.0f, 1.0f };
ALIGN16( const __m128 SIMD_SP_two ) = { 2.0f, 2.0f, 2.0f, 2.0f };
ALIGN16( const __m128 SIMD_SP_tiny ) = { 1e-10f, 1e-10f, 1e-10f, 1e-10f };
ALIGN16( const __m128 SIMD_SP_infinity ) = { idMath::INFINITY, idMath::INFINITY, idMath::INFINITY, idMath::INFINITY };
ALIGN16( const __m128 SIMD_SP_LCP_DELTA_ACCEL_EPSILON ) = { LCP_DELTA_ACCEL_EPSILON, LCP_DELTA_ACCEL_EPSILON, LCP_DELTA_ACCEL_EPSILON, LCP_DELTA_ACCEL_EPSILON };
ALIGN16( const __m128 SIMD_SP_LCP_DELTA_FORCE_EPSILON ) = { LCP_DELTA_FORCE_EPSILON, LCP_DELTA_FORCE_EPSILON, LCP_DELTA_FORCE_EPSILON, LCP_DELTA_FORCE_EPSILON };
ALIGN16( const __m128 SIMD_SP_LCP_BOUND_EPSILON ) = { LCP_BOUND_EPSILON, LCP_BOUND_EPSILON, LCP_BOUND_EPSILON, LCP_BOUND_EPSILON };
ALIGN16( const __m128 SIMD_SP_neg_LCP_BOUND_EPSILON ) = { -LCP_BOUND_EPSILON, -LCP_BOUND_EPSILON, -LCP_BOUND_EPSILON, -LCP_BOUND_EPSILON };

ALIGN16( const unsigned int SIMD_SP_signBit[4] ) = { IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK };
// DG: cast IEEE_FLT_SIGN_MASK to uint before inverting it, as uint and not long values are expected here
#define INV_IEEE_FLT_SIGN_MASK (~((unsigned int)IEEE_FLT_SIGN_MASK))
ALIGN16( const unsigned int SIMD_SP_absMask[4] ) = { INV_IEEE_FLT_SIGN_MASK, INV_IEEE_FLT_SIGN_MASK, INV_IEEE_FLT_SIGN_MASK, INV_IEEE_FLT_SIGN_MASK };
// DG end
ALIGN16( const unsigned int SIMD_SP_indexedStartMask[4][4] ) = { { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, { 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, { 0, 0, 0xFFFFFFFF, 0xFFFFFFFF }, { 0, 0, 0, 0xFFFFFFFF } };
ALIGN16( const unsigned int SIMD_SP_indexedEndMask[4][4] ) = { { 0, 0, 0, 0 }, { 0xFFFFFFFF, 0, 0, 0 }, { 0xFFFFFFFF, 0xFFFFFFFF, 0, 0 }, { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0 } };
ALIGN16( const unsigned int SIMD_SP_clearLast1[4] ) = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0 };
ALIGN16( const unsigned int SIMD_SP_clearLast2[4] ) = { 0xFFFFFFFF, 0xFFFFFFFF, 0, 0 };
ALIGN16( const unsigned int SIMD_SP_clearLast3[4] ) = { 0xFFFFFFFF, 0, 0, 0 };
ALIGN16( const unsigned int SIMD_DW_zero[4] ) = { 0, 0, 0, 0 };
ALIGN16( const unsigned int SIMD_DW_one[4] ) = { 1, 1, 1, 1 };
ALIGN16( const unsigned int SIMD_DW_four[4] ) = { 4, 4, 4, 4 };
ALIGN16( const unsigned int SIMD_DW_index[4] ) = { 0, 1, 2, 3 };
ALIGN16( const int SIMD_DW_not3[4] ) = { ~3, ~3, ~3, ~3 };

#endif // #if defined(LCP_SIMD)

/*
========================
Multiply_SIMD

dst[i] = src0[i] * src1[i];

Assumes the source and destination have the same memory alignment.
========================
*/
static void Multiply_SIMD( float* dst, const float* src0, const float* src1, const int count )
{
	int i = 0;

	// RB: changed unsigned int to uintptr_t
	for( ; ( ( uintptr_t )dst & 0xF ) != 0 && i < count; i++ )
	// RB end
	{
		dst[i] = src0[i] * src1[i];
	}

#if defined(LCP_SIMD)
	for( ; i + 4 <= count; i += 4 )
	{
		assert_16_byte_aligned( &dst[i] );
		assert_16_byte_aligned( &src0[i] );
		assert_16_byte_aligned( &src1[i] );

		__m128 s0 = _mm_load_ps( src0 + i );
		__m128 s1 = _mm_load_ps( src1 + i );
		s0 = _mm_mul_ps( s0, s1 );
		_mm_store_ps( dst + i, s0 );
	}
#else
	for( ; i + 4 <= count; i += 4 )
	{
		assert_16_byte_aligned( &dst[i] );
		assert_16_byte_aligned( &src0[i] );
		assert_16_byte_aligned( &src1[i] );

		dst[i + 0] = src0[i + 0] * src1[i + 0];
		dst[i + 1] = src0[i + 1] * src1[i + 1];
		dst[i + 2] = src0[i + 2] * src1[i + 2];
		dst[i + 3] = src0[i + 3] * src1[i + 3];
	}
#endif

	for( ; i < count; i++ )
	{
		dst[i] = src0[i] * src1[i];
	}
}

/*
========================
MultiplyAdd_SIMD

dst[i] += constant * src[i];

Assumes the source and destination have the same memory alignment.
========================
*/
static void MultiplyAdd_SIMD( float* dst, const float constant, const float* src, const int count )
{
	int i = 0;

	// RB: changed unsigned int to uintptr_t
	for( ; ( ( uintptr_t )dst & 0xF ) != 0 && i < count; i++ )
	// RB end
	{
		dst[i] += constant * src[i];
	}

#if defined(LCP_SIMD)
	__m128 c = _mm_load1_ps( & constant );
	for( ; i + 4 <= count; i += 4 )
	{
		assert_16_byte_aligned( &dst[i] );
		assert_16_byte_aligned( &src[i] );

		__m128 s = _mm_load_ps( src + i );
		__m128 d = _mm_load_ps( dst + i );
		s = _mm_add_ps( _mm_mul_ps( s, c ), d );
		_mm_store_ps( dst + i, s );
	}
#else
	for( ; i + 4 <= count; i += 4 )
	{
		assert_16_byte_aligned( &src[i] );
		assert_16_byte_aligned( &dst[i] );

		dst[i + 0] += constant * src[i + 0];
		dst[i + 1] += constant * src[i + 1];
		dst[i + 2] += constant * src[i + 2];
		dst[i + 3] += constant * src[i + 3];
	}
#endif

	for( ; i < count; i++ )
	{
		dst[i] += constant * src[i];
	}
}

/*
========================
DotProduct_SIMD

dot = src0[0] * src1[0] + src0[1] * src1[1] + src0[2] * src1[2] + ...
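
A scalar reference of what this routine computes (illustrative sketch only):

	float dot = 0.0f;
	for( int i = 0; i < count; i++ )
	{
		dot += src0[i] * src1[i];
	}

The SSE path keeps four partial sums in one register and reduces them with two
shuffle/add steps; SIMD_SP_indexedEndMask[count & 3] zeroes the lanes beyond
'count' so the tail is handled without a scalar loop. Both sources are assumed
to be 16 byte aligned with enough padding for the final masked load.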
======================== */ static float DotProduct_SIMD( const float* src0, const float* src1, const int count ) { assert_16_byte_aligned( src0 ); assert_16_byte_aligned( src1 ); #if defined(LCP_SIMD) __m128 sum = ( __m128& ) SIMD_SP_zero; int i = 0; for( ; i < count - 3; i += 4 ) { __m128 s0 = _mm_load_ps( src0 + i ); __m128 s1 = _mm_load_ps( src1 + i ); sum = _mm_add_ps( _mm_mul_ps( s0, s1 ), sum ); } __m128 mask = _mm_load_ps( ( float* ) &SIMD_SP_indexedEndMask[count & 3] ); __m128 s0 = _mm_and_ps( _mm_load_ps( src0 + i ), mask ); __m128 s1 = _mm_and_ps( _mm_load_ps( src1 + i ), mask ); sum = _mm_add_ps( _mm_mul_ps( s0, s1 ), sum ); sum = _mm_add_ps( sum, _mm_shuffle_ps( sum, sum, _MM_SHUFFLE( 1, 0, 3, 2 ) ) ); sum = _mm_add_ps( sum, _mm_shuffle_ps( sum, sum, _MM_SHUFFLE( 2, 3, 0, 1 ) ) ); float dot; _mm_store_ss( & dot, sum ); return dot; #else // RB: the old loop caused completely broken rigid body physics and NaN errors #if 1 float s0 = 0.0f; float s1 = 0.0f; float s2 = 0.0f; float s3 = 0.0f; int i = 0; for( ; i + 4 <= count; i += 4 ) { s0 += src0[i + 0] * src1[i + 0]; s1 += src0[i + 1] * src1[i + 1]; s2 += src0[i + 2] * src1[i + 2]; s3 += src0[i + 3] * src1[i + 3]; } switch( count - i ) { NODEFAULT; case 4: s3 += src0[i + 3] * src1[i + 3]; case 3: s2 += src0[i + 2] * src1[i + 2]; case 2: s1 += src0[i + 1] * src1[i + 1]; case 1: s0 += src0[i + 0] * src1[i + 0]; case 0: break; } return s0 + s1 + s2 + s3; #else float s = 0; for( int i = 0; i < count; i++ ) { s += src0[i] * src1[i]; } return s; #endif // RB end #endif } /* ======================== LowerTriangularSolve_SIMD Solves x in Lx = b for the n * n sub-matrix of L. * if skip > 0 the first skip elements of x are assumed to be valid already * L has to be a lower triangular matrix with (implicit) ones on the diagonal * x == b is allowed ======================== */ static void LowerTriangularSolve_SIMD( const idMatX& L, float* x, const float* b, const int n, int skip ) { if( skip >= n ) { return; } const float* lptr = L.ToFloatPtr(); int nc = L.GetNumColumns(); assert( ( nc & 3 ) == 0 ); // unrolled cases for n < 8 if( n < 8 ) { #define NSKIP( n, s ) ((n<<3)|(s&7)) switch( NSKIP( n, skip ) ) { case NSKIP( 1, 0 ): x[0] = b[0]; return; case NSKIP( 2, 0 ): x[0] = b[0]; case NSKIP( 2, 1 ): x[1] = b[1] - lptr[1 * nc + 0] * x[0]; return; case NSKIP( 3, 0 ): x[0] = b[0]; case NSKIP( 3, 1 ): x[1] = b[1] - lptr[1 * nc + 0] * x[0]; case NSKIP( 3, 2 ): x[2] = b[2] - lptr[2 * nc + 0] * x[0] - lptr[2 * nc + 1] * x[1]; return; case NSKIP( 4, 0 ): x[0] = b[0]; case NSKIP( 4, 1 ): x[1] = b[1] - lptr[1 * nc + 0] * x[0]; case NSKIP( 4, 2 ): x[2] = b[2] - lptr[2 * nc + 0] * x[0] - lptr[2 * nc + 1] * x[1]; case NSKIP( 4, 3 ): x[3] = b[3] - lptr[3 * nc + 0] * x[0] - lptr[3 * nc + 1] * x[1] - lptr[3 * nc + 2] * x[2]; return; case NSKIP( 5, 0 ): x[0] = b[0]; case NSKIP( 5, 1 ): x[1] = b[1] - lptr[1 * nc + 0] * x[0]; case NSKIP( 5, 2 ): x[2] = b[2] - lptr[2 * nc + 0] * x[0] - lptr[2 * nc + 1] * x[1]; case NSKIP( 5, 3 ): x[3] = b[3] - lptr[3 * nc + 0] * x[0] - lptr[3 * nc + 1] * x[1] - lptr[3 * nc + 2] * x[2]; case NSKIP( 5, 4 ): x[4] = b[4] - lptr[4 * nc + 0] * x[0] - lptr[4 * nc + 1] * x[1] - lptr[4 * nc + 2] * x[2] - lptr[4 * nc + 3] * x[3]; return; case NSKIP( 6, 0 ): x[0] = b[0]; case NSKIP( 6, 1 ): x[1] = b[1] - lptr[1 * nc + 0] * x[0]; case NSKIP( 6, 2 ): x[2] = b[2] - lptr[2 * nc + 0] * x[0] - lptr[2 * nc + 1] * x[1]; case NSKIP( 6, 3 ): x[3] = b[3] - lptr[3 * nc + 0] * x[0] - lptr[3 * nc + 1] * x[1] - lptr[3 * nc + 2] * x[2]; case NSKIP( 6, 4 ): 
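			// note: the NSKIP cases above and below intentionally fall through; starting
			// at the encoded skip row, each row is forward substituted from the rows
			// already solved: x[i] = b[i] - L[i][0] * x[0] - ... - L[i][i-1] * x[i-1]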
x[4] = b[4] - lptr[4 * nc + 0] * x[0] - lptr[4 * nc + 1] * x[1] - lptr[4 * nc + 2] * x[2] - lptr[4 * nc + 3] * x[3]; case NSKIP( 6, 5 ): x[5] = b[5] - lptr[5 * nc + 0] * x[0] - lptr[5 * nc + 1] * x[1] - lptr[5 * nc + 2] * x[2] - lptr[5 * nc + 3] * x[3] - lptr[5 * nc + 4] * x[4]; return; case NSKIP( 7, 0 ): x[0] = b[0]; case NSKIP( 7, 1 ): x[1] = b[1] - lptr[1 * nc + 0] * x[0]; case NSKIP( 7, 2 ): x[2] = b[2] - lptr[2 * nc + 0] * x[0] - lptr[2 * nc + 1] * x[1]; case NSKIP( 7, 3 ): x[3] = b[3] - lptr[3 * nc + 0] * x[0] - lptr[3 * nc + 1] * x[1] - lptr[3 * nc + 2] * x[2]; case NSKIP( 7, 4 ): x[4] = b[4] - lptr[4 * nc + 0] * x[0] - lptr[4 * nc + 1] * x[1] - lptr[4 * nc + 2] * x[2] - lptr[4 * nc + 3] * x[3]; case NSKIP( 7, 5 ): x[5] = b[5] - lptr[5 * nc + 0] * x[0] - lptr[5 * nc + 1] * x[1] - lptr[5 * nc + 2] * x[2] - lptr[5 * nc + 3] * x[3] - lptr[5 * nc + 4] * x[4]; case NSKIP( 7, 6 ): x[6] = b[6] - lptr[6 * nc + 0] * x[0] - lptr[6 * nc + 1] * x[1] - lptr[6 * nc + 2] * x[2] - lptr[6 * nc + 3] * x[3] - lptr[6 * nc + 4] * x[4] - lptr[6 * nc + 5] * x[5]; return; } #undef NSKIP return; } // process first 4 rows switch( skip ) { case 0: x[0] = b[0]; case 1: x[1] = b[1] - lptr[1 * nc + 0] * x[0]; case 2: x[2] = b[2] - lptr[2 * nc + 0] * x[0] - lptr[2 * nc + 1] * x[1]; case 3: x[3] = b[3] - lptr[3 * nc + 0] * x[0] - lptr[3 * nc + 1] * x[1] - lptr[3 * nc + 2] * x[2]; skip = 4; } lptr = L[skip]; int i = skip; #if defined(LCP_SIMD) // work up to a multiple of 4 rows for( ; ( i & 3 ) != 0 && i < n; i++ ) { __m128 sum = _mm_load_ss( & b[i] ); int j = 0; for( ; j < i - 3; j += 4 ) { __m128 s0 = _mm_load_ps( lptr + j ); __m128 s1 = _mm_load_ps( x + j ); sum = _mm_sub_ps( sum, _mm_mul_ps( s0, s1 ) ); } __m128 mask = _mm_load_ps( ( float* ) & SIMD_SP_indexedEndMask[i & 3] ); __m128 s0 = _mm_and_ps( _mm_load_ps( lptr + j ), mask ); __m128 s1 = _mm_and_ps( _mm_load_ps( x + j ), mask ); sum = _mm_sub_ps( sum, _mm_mul_ps( s0, s1 ) ); sum = _mm_add_ps( sum, _mm_shuffle_ps( sum, sum, _MM_SHUFFLE( 1, 0, 3, 2 ) ) ); sum = _mm_add_ps( sum, _mm_shuffle_ps( sum, sum, _MM_SHUFFLE( 2, 3, 0, 1 ) ) ); _mm_store_ss( & x[i], sum ); lptr += nc; } for( ; i + 3 < n; i += 4 ) { const float* lptr0 = &lptr[0 * nc]; const float* lptr1 = &lptr[1 * nc]; const float* lptr2 = &lptr[2 * nc]; const float* lptr3 = &lptr[3 * nc]; assert_16_byte_aligned( lptr0 ); assert_16_byte_aligned( lptr1 ); assert_16_byte_aligned( lptr2 ); assert_16_byte_aligned( lptr3 ); __m128 va = _mm_load_ss( & b[i + 0] ); __m128 vb = _mm_load_ss( & b[i + 1] ); __m128 vc = _mm_load_ss( & b[i + 2] ); __m128 vd = _mm_load_ss( & b[i + 3] ); __m128 x0 = _mm_load_ps( & x[0] ); va = _mm_sub_ps( va, _mm_mul_ps( x0, _mm_load_ps( lptr0 + 0 ) ) ); vb = _mm_sub_ps( vb, _mm_mul_ps( x0, _mm_load_ps( lptr1 + 0 ) ) ); vc = _mm_sub_ps( vc, _mm_mul_ps( x0, _mm_load_ps( lptr2 + 0 ) ) ); vd = _mm_sub_ps( vd, _mm_mul_ps( x0, _mm_load_ps( lptr3 + 0 ) ) ); for( int j = 4; j < i; j += 4 ) { __m128 xj = _mm_load_ps( &x[j] ); va = _mm_sub_ps( va, _mm_mul_ps( xj, _mm_load_ps( lptr0 + j ) ) ); vb = _mm_sub_ps( vb, _mm_mul_ps( xj, _mm_load_ps( lptr1 + j ) ) ); vc = _mm_sub_ps( vc, _mm_mul_ps( xj, _mm_load_ps( lptr2 + j ) ) ); vd = _mm_sub_ps( vd, _mm_mul_ps( xj, _mm_load_ps( lptr3 + j ) ) ); } vb = _mm_sub_ps( vb, _mm_mul_ps( va, _mm_load1_ps( lptr1 + i + 0 ) ) ); vc = _mm_sub_ps( vc, _mm_mul_ps( va, _mm_load1_ps( lptr2 + i + 0 ) ) ); vc = _mm_sub_ps( vc, _mm_mul_ps( vb, _mm_load1_ps( lptr2 + i + 1 ) ) ); vd = _mm_sub_ps( vd, _mm_mul_ps( va, _mm_load1_ps( lptr3 + i + 0 ) ) ); vd = _mm_sub_ps( 
vd, _mm_mul_ps( vb, _mm_load1_ps( lptr3 + i + 1 ) ) ); vd = _mm_sub_ps( vd, _mm_mul_ps( vc, _mm_load1_ps( lptr3 + i + 2 ) ) ); __m128 ta = _mm_unpacklo_ps( va, vc ); // x0, z0, x1, z1 __m128 tb = _mm_unpackhi_ps( va, vc ); // x2, z2, x3, z3 __m128 tc = _mm_unpacklo_ps( vb, vd ); // y0, w0, y1, w1 __m128 td = _mm_unpackhi_ps( vb, vd ); // y2, w2, y3, w3 va = _mm_unpacklo_ps( ta, tc ); // x0, y0, z0, w0 vb = _mm_unpackhi_ps( ta, tc ); // x1, y1, z1, w1 vc = _mm_unpacklo_ps( tb, td ); // x2, y2, z2, w2 vd = _mm_unpackhi_ps( tb, td ); // x3, y3, z3, w3 va = _mm_add_ps( va, vb ); vc = _mm_add_ps( vc, vd ); va = _mm_add_ps( va, vc ); _mm_store_ps( & x[i], va ); lptr += 4 * nc; } // go through any remaining rows for( ; i < n; i++ ) { __m128 sum = _mm_load_ss( & b[i] ); int j = 0; for( ; j < i - 3; j += 4 ) { __m128 s0 = _mm_load_ps( lptr + j ); __m128 s1 = _mm_load_ps( x + j ); sum = _mm_sub_ps( sum, _mm_mul_ps( s0, s1 ) ); } __m128 mask = _mm_load_ps( ( float* ) & SIMD_SP_indexedEndMask[i & 3] ); __m128 s0 = _mm_and_ps( _mm_load_ps( lptr + j ), mask ); __m128 s1 = _mm_and_ps( _mm_load_ps( x + j ), mask ); sum = _mm_sub_ps( sum, _mm_mul_ps( s0, s1 ) ); sum = _mm_add_ps( sum, _mm_shuffle_ps( sum, sum, _MM_SHUFFLE( 1, 0, 3, 2 ) ) ); sum = _mm_add_ps( sum, _mm_shuffle_ps( sum, sum, _MM_SHUFFLE( 2, 3, 0, 1 ) ) ); _mm_store_ss( & x[i], sum ); lptr += nc; } #else // work up to a multiple of 4 rows for( ; ( i & 3 ) != 0 && i < n; i++ ) { float sum = b[i]; for( int j = 0; j < i; j++ ) { sum -= lptr[j] * x[j]; } x[i] = sum; lptr += nc; } assert_16_byte_aligned( x ); for( ; i + 3 < n; i += 4 ) { const float* lptr0 = &lptr[0 * nc]; const float* lptr1 = &lptr[1 * nc]; const float* lptr2 = &lptr[2 * nc]; const float* lptr3 = &lptr[3 * nc]; assert_16_byte_aligned( lptr0 ); assert_16_byte_aligned( lptr1 ); assert_16_byte_aligned( lptr2 ); assert_16_byte_aligned( lptr3 ); float a0 = - lptr0[0] * x[0] + b[i + 0]; float a1 = - lptr0[1] * x[1]; float a2 = - lptr0[2] * x[2]; float a3 = - lptr0[3] * x[3]; float b0 = - lptr1[0] * x[0] + b[i + 1]; float b1 = - lptr1[1] * x[1]; float b2 = - lptr1[2] * x[2]; float b3 = - lptr1[3] * x[3]; float c0 = - lptr2[0] * x[0] + b[i + 2]; float c1 = - lptr2[1] * x[1]; float c2 = - lptr2[2] * x[2]; float c3 = - lptr2[3] * x[3]; float d0 = - lptr3[0] * x[0] + b[i + 3]; float d1 = - lptr3[1] * x[1]; float d2 = - lptr3[2] * x[2]; float d3 = - lptr3[3] * x[3]; for( int j = 4; j < i; j += 4 ) { a0 -= lptr0[j + 0] * x[j + 0]; a1 -= lptr0[j + 1] * x[j + 1]; a2 -= lptr0[j + 2] * x[j + 2]; a3 -= lptr0[j + 3] * x[j + 3]; b0 -= lptr1[j + 0] * x[j + 0]; b1 -= lptr1[j + 1] * x[j + 1]; b2 -= lptr1[j + 2] * x[j + 2]; b3 -= lptr1[j + 3] * x[j + 3]; c0 -= lptr2[j + 0] * x[j + 0]; c1 -= lptr2[j + 1] * x[j + 1]; c2 -= lptr2[j + 2] * x[j + 2]; c3 -= lptr2[j + 3] * x[j + 3]; d0 -= lptr3[j + 0] * x[j + 0]; d1 -= lptr3[j + 1] * x[j + 1]; d2 -= lptr3[j + 2] * x[j + 2]; d3 -= lptr3[j + 3] * x[j + 3]; } b0 -= lptr1[i + 0] * a0; b1 -= lptr1[i + 0] * a1; b2 -= lptr1[i + 0] * a2; b3 -= lptr1[i + 0] * a3; c0 -= lptr2[i + 0] * a0; c1 -= lptr2[i + 0] * a1; c2 -= lptr2[i + 0] * a2; c3 -= lptr2[i + 0] * a3; c0 -= lptr2[i + 1] * b0; c1 -= lptr2[i + 1] * b1; c2 -= lptr2[i + 1] * b2; c3 -= lptr2[i + 1] * b3; d0 -= lptr3[i + 0] * a0; d1 -= lptr3[i + 0] * a1; d2 -= lptr3[i + 0] * a2; d3 -= lptr3[i + 0] * a3; d0 -= lptr3[i + 1] * b0; d1 -= lptr3[i + 1] * b1; d2 -= lptr3[i + 1] * b2; d3 -= lptr3[i + 1] * b3; d0 -= lptr3[i + 2] * c0; d1 -= lptr3[i + 2] * c1; d2 -= lptr3[i + 2] * c2; d3 -= lptr3[i + 2] * c3; x[i + 0] = a0 + 
a1 + a2 + a3; x[i + 1] = b0 + b1 + b2 + b3; x[i + 2] = c0 + c1 + c2 + c3; x[i + 3] = d0 + d1 + d2 + d3; lptr += 4 * nc; } // go through any remaining rows for( ; i < n; i++ ) { float sum = b[i]; for( int j = 0; j < i; j++ ) { sum -= lptr[j] * x[j]; } x[i] = sum; lptr += nc; } #endif } /* ======================== LowerTriangularSolveTranspose_SIMD Solves x in L'x = b for the n * n sub-matrix of L. * L has to be a lower triangular matrix with (implicit) ones on the diagonal * x == b is allowed ======================== */ static void LowerTriangularSolveTranspose_SIMD( const idMatX& L, float* x, const float* b, const int n ) { int nc = L.GetNumColumns(); assert( ( nc & 3 ) == 0 ); int m = n; int r = n & 3; if( ( m & 3 ) != 0 ) { const float* lptr = L.ToFloatPtr() + m * nc + m; if( ( m & 3 ) == 1 ) { x[m - 1] = b[m - 1]; m -= 1; } else if( ( m & 3 ) == 2 ) { x[m - 1] = b[m - 1]; x[m - 2] = b[m - 2] - lptr[-1 * nc - 2] * x[m - 1]; m -= 2; } else { x[m - 1] = b[m - 1]; x[m - 2] = b[m - 2] - lptr[-1 * nc - 2] * x[m - 1]; x[m - 3] = b[m - 3] - lptr[-1 * nc - 3] * x[m - 1] - lptr[-2 * nc - 3] * x[m - 2]; m -= 3; } } const float* lptr = L.ToFloatPtr() + m * nc + m - 4; float* xptr = x + m; #if defined(LCP_SIMD) // process 4 rows at a time for( int i = m; i >= 4; i -= 4 ) { assert_16_byte_aligned( b ); assert_16_byte_aligned( xptr ); assert_16_byte_aligned( lptr ); __m128 s0 = _mm_load_ps( &b[i - 4] ); __m128 s1 = ( __m128& )SIMD_SP_zero; __m128 s2 = ( __m128& )SIMD_SP_zero; __m128 s3 = ( __m128& )SIMD_SP_zero; // process 4x4 blocks const float* xptr2 = xptr; // x + i; const float* lptr2 = lptr; // ptr = L[i] + i - 4; for( int j = i; j < m; j += 4 ) { __m128 xj = _mm_load_ps( xptr2 ); s0 = _mm_sub_ps( s0, _mm_mul_ps( _mm_splat_ps( xj, 0 ), _mm_load_ps( lptr2 + 0 * nc ) ) ); s1 = _mm_sub_ps( s1, _mm_mul_ps( _mm_splat_ps( xj, 1 ), _mm_load_ps( lptr2 + 1 * nc ) ) ); s2 = _mm_sub_ps( s2, _mm_mul_ps( _mm_splat_ps( xj, 2 ), _mm_load_ps( lptr2 + 2 * nc ) ) ); s3 = _mm_sub_ps( s3, _mm_mul_ps( _mm_splat_ps( xj, 3 ), _mm_load_ps( lptr2 + 3 * nc ) ) ); lptr2 += 4 * nc; xptr2 += 4; } for( int j = 0; j < r; j++ ) { s0 = _mm_sub_ps( s0, _mm_mul_ps( _mm_load_ps( lptr2 ), _mm_load1_ps( &xptr2[j] ) ) ); lptr2 += nc; } s0 = _mm_add_ps( s0, s1 ); s2 = _mm_add_ps( s2, s3 ); s0 = _mm_add_ps( s0, s2 ); // process left over of the 4 rows lptr -= 4 * nc; __m128 t0 = _mm_and_ps( _mm_load_ps( lptr + 3 * nc ), ( __m128& )SIMD_SP_clearLast1 ); __m128 t1 = _mm_and_ps( _mm_load_ps( lptr + 2 * nc ), ( __m128& )SIMD_SP_clearLast2 ); __m128 t2 = _mm_load_ss( lptr + 1 * nc ); s0 = _mm_sub_ps( s0, _mm_mul_ps( t0, _mm_splat_ps( s0, 3 ) ) ); s0 = _mm_sub_ps( s0, _mm_mul_ps( t1, _mm_splat_ps( s0, 2 ) ) ); s0 = _mm_sub_ps( s0, _mm_mul_ps( t2, _mm_splat_ps( s0, 1 ) ) ); // store result _mm_store_ps( &xptr[-4], s0 ); // update pointers for next four rows lptr -= 4; xptr -= 4; } #else // process 4 rows at a time for( int i = m; i >= 4; i -= 4 ) { assert_16_byte_aligned( b ); assert_16_byte_aligned( xptr ); assert_16_byte_aligned( lptr ); float s0 = b[i - 4]; float s1 = b[i - 3]; float s2 = b[i - 2]; float s3 = b[i - 1]; // process 4x4 blocks const float* xptr2 = xptr; // x + i; const float* lptr2 = lptr; // ptr = L[i] + i - 4; for( int j = i; j < m; j += 4 ) { float t0 = xptr2[0]; s0 -= lptr2[0] * t0; s1 -= lptr2[1] * t0; s2 -= lptr2[2] * t0; s3 -= lptr2[3] * t0; lptr2 += nc; float t1 = xptr2[1]; s0 -= lptr2[0] * t1; s1 -= lptr2[1] * t1; s2 -= lptr2[2] * t1; s3 -= lptr2[3] * t1; lptr2 += nc; float t2 = xptr2[2]; s0 -= lptr2[0] * t2; s1 -= 
lptr2[1] * t2; s2 -= lptr2[2] * t2; s3 -= lptr2[3] * t2; lptr2 += nc; float t3 = xptr2[3]; s0 -= lptr2[0] * t3; s1 -= lptr2[1] * t3; s2 -= lptr2[2] * t3; s3 -= lptr2[3] * t3; lptr2 += nc; xptr2 += 4; } for( int j = 0; j < r; j++ ) { float t = xptr2[j]; s0 -= lptr2[0] * t; s1 -= lptr2[1] * t; s2 -= lptr2[2] * t; s3 -= lptr2[3] * t; lptr2 += nc; } // process left over of the 4 rows lptr -= nc; s0 -= lptr[0] * s3; s1 -= lptr[1] * s3; s2 -= lptr[2] * s3; lptr -= nc; s0 -= lptr[0] * s2; s1 -= lptr[1] * s2; lptr -= nc; s0 -= lptr[0] * s1; lptr -= nc; // store result xptr[-4] = s0; xptr[-3] = s1; xptr[-2] = s2; xptr[-1] = s3; // update pointers for next four rows lptr -= 4; xptr -= 4; } #endif } /* ======================== UpperTriangularSolve_SIMD Solves x in Ux = b for the n * n sub-matrix of U. * U has to be a upper triangular matrix * invDiag is the reciprical of the diagonal of the upper triangular matrix. * x == b is allowed ======================== */ static void UpperTriangularSolve_SIMD( const idMatX& U, const float* invDiag, float* x, const float* b, const int n ) { for( int i = n - 1; i >= 0; i-- ) { float sum = b[i]; const float* uptr = U[i]; for( int j = i + 1; j < n; j++ ) { sum -= uptr[j] * x[j]; } x[i] = sum * invDiag[i]; } } /* ======================== LU_Factor_SIMD In-place factorization LU of the n * n sub-matrix of mat. The reciprocal of the diagonal elements of U are stored in invDiag. No pivoting is used. ======================== */ static bool LU_Factor_SIMD( idMatX& mat, idVecX& invDiag, const int n ) { for( int i = 0; i < n; i++ ) { float d1 = mat[i][i]; if( fabs( d1 ) < idMath::FLT_SMALLEST_NON_DENORMAL ) { return false; } invDiag[i] = d1 = 1.0f / d1; float* ptr1 = mat[i]; for( int j = i + 1; j < n; j++ ) { float* ptr2 = mat[j]; float d2 = ptr2[i] * d1; ptr2[i] = d2; int k; for( k = n - 1; k > i + 15; k -= 16 ) { ptr2[k - 0] -= d2 * ptr1[k - 0]; ptr2[k - 1] -= d2 * ptr1[k - 1]; ptr2[k - 2] -= d2 * ptr1[k - 2]; ptr2[k - 3] -= d2 * ptr1[k - 3]; ptr2[k - 4] -= d2 * ptr1[k - 4]; ptr2[k - 5] -= d2 * ptr1[k - 5]; ptr2[k - 6] -= d2 * ptr1[k - 6]; ptr2[k - 7] -= d2 * ptr1[k - 7]; ptr2[k - 8] -= d2 * ptr1[k - 8]; ptr2[k - 9] -= d2 * ptr1[k - 9]; ptr2[k - 10] -= d2 * ptr1[k - 10]; ptr2[k - 11] -= d2 * ptr1[k - 11]; ptr2[k - 12] -= d2 * ptr1[k - 12]; ptr2[k - 13] -= d2 * ptr1[k - 13]; ptr2[k - 14] -= d2 * ptr1[k - 14]; ptr2[k - 15] -= d2 * ptr1[k - 15]; } switch( k - i ) { NODEFAULT; case 15: ptr2[k - 14] -= d2 * ptr1[k - 14]; case 14: ptr2[k - 13] -= d2 * ptr1[k - 13]; case 13: ptr2[k - 12] -= d2 * ptr1[k - 12]; case 12: ptr2[k - 11] -= d2 * ptr1[k - 11]; case 11: ptr2[k - 10] -= d2 * ptr1[k - 10]; case 10: ptr2[k - 9] -= d2 * ptr1[k - 9]; case 9: ptr2[k - 8] -= d2 * ptr1[k - 8]; case 8: ptr2[k - 7] -= d2 * ptr1[k - 7]; case 7: ptr2[k - 6] -= d2 * ptr1[k - 6]; case 6: ptr2[k - 5] -= d2 * ptr1[k - 5]; case 5: ptr2[k - 4] -= d2 * ptr1[k - 4]; case 4: ptr2[k - 3] -= d2 * ptr1[k - 3]; case 3: ptr2[k - 2] -= d2 * ptr1[k - 2]; case 2: ptr2[k - 1] -= d2 * ptr1[k - 1]; case 1: ptr2[k - 0] -= d2 * ptr1[k - 0]; case 0: break; } } } return true; } /* ======================== LDLT_Factor_SIMD In-place factorization LDL' of the n * n sub-matrix of mat. The reciprocal of the diagonal elements are stored in invDiag. NOTE: The number of columns of mat must be a multiple of 4. 
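
In scalar form the factorization computes, for each row i (illustrative sketch,
using L for the unit lower triangular factor and D for the diagonal):

	v[k]    = D[k] * L[i][k]                                        ( k < i )
	D[i]    = mat[i][i] - sum( L[i][k] * v[k], k < i )
	L[j][i] = ( mat[j][i] - sum( L[j][k] * v[k], k < i ) ) / D[i]   ( j > i )

The strictly lower triangular part of mat is overwritten with L, the diagonal
with D, and invDiag receives 1 / D[i]. The first four rows are handled
explicitly so the unrolled four-wide paths can assume i >= 4.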
======================== */ static bool LDLT_Factor_SIMD( idMatX& mat, idVecX& invDiag, const int n ) { float s0, s1, s2, d; float* v = ( float* ) _alloca16( ( ( n + 3 ) & ~3 ) * sizeof( float ) ); float* diag = ( float* ) _alloca16( ( ( n + 3 ) & ~3 ) * sizeof( float ) ); float* invDiagPtr = invDiag.ToFloatPtr(); int nc = mat.GetNumColumns(); assert( ( nc & 3 ) == 0 ); if( n <= 0 ) { return true; } float* mptr = mat[0]; float sum = mptr[0]; if( fabs( sum ) < idMath::FLT_SMALLEST_NON_DENORMAL ) { return false; } diag[0] = sum; invDiagPtr[0] = d = 1.0f / sum; if( n <= 1 ) { return true; } mptr = mat[0]; for( int j = 1; j < n; j++ ) { mptr[j * nc + 0] = ( mptr[j * nc + 0] ) * d; } mptr = mat[1]; v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0]; sum = mptr[1] - s0; if( fabs( sum ) < idMath::FLT_SMALLEST_NON_DENORMAL ) { return false; } mat[1][1] = sum; diag[1] = sum; invDiagPtr[1] = d = 1.0f / sum; if( n <= 2 ) { return true; } mptr = mat[0]; for( int j = 2; j < n; j++ ) { mptr[j * nc + 1] = ( mptr[j * nc + 1] - v[0] * mptr[j * nc + 0] ) * d; } mptr = mat[2]; v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0]; v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1]; sum = mptr[2] - s0 - s1; if( fabs( sum ) < idMath::FLT_SMALLEST_NON_DENORMAL ) { return false; } mat[2][2] = sum; diag[2] = sum; invDiagPtr[2] = d = 1.0f / sum; if( n <= 3 ) { return true; } mptr = mat[0]; for( int j = 3; j < n; j++ ) { mptr[j * nc + 2] = ( mptr[j * nc + 2] - v[0] * mptr[j * nc + 0] - v[1] * mptr[j * nc + 1] ) * d; } mptr = mat[3]; v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0]; v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1]; v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2]; sum = mptr[3] - s0 - s1 - s2; if( fabs( sum ) < idMath::FLT_SMALLEST_NON_DENORMAL ) { return false; } mat[3][3] = sum; diag[3] = sum; invDiagPtr[3] = d = 1.0f / sum; if( n <= 4 ) { return true; } mptr = mat[0]; for( int j = 4; j < n; j++ ) { mptr[j * nc + 3] = ( mptr[j * nc + 3] - v[0] * mptr[j * nc + 0] - v[1] * mptr[j * nc + 1] - v[2] * mptr[j * nc + 2] ) * d; } #if defined(LCP_SIMD) __m128 vzero = _mm_setzero_ps(); for( int i = 4; i < n; i += 4 ) { _mm_store_ps( diag + i, vzero ); } for( int i = 4; i < n; i++ ) { mptr = mat[i]; assert_16_byte_aligned( v ); assert_16_byte_aligned( mptr ); assert_16_byte_aligned( diag ); __m128 m0 = _mm_load_ps( mptr + 0 ); __m128 d0 = _mm_load_ps( diag + 0 ); __m128 v0 = _mm_mul_ps( d0, m0 ); __m128 t0 = _mm_load_ss( mptr + i ); t0 = _mm_sub_ps( t0, _mm_mul_ps( m0, v0 ) ); _mm_store_ps( v + 0, v0 ); int k = 4; for( ; k < i - 3; k += 4 ) { m0 = _mm_load_ps( mptr + k ); d0 = _mm_load_ps( diag + k ); v0 = _mm_mul_ps( d0, m0 ); t0 = _mm_sub_ps( t0, _mm_mul_ps( m0, v0 ) ); _mm_store_ps( v + k, v0 ); } __m128 mask = ( __m128& ) SIMD_SP_indexedEndMask[i & 3]; m0 = _mm_and_ps( _mm_load_ps( mptr + k ), mask ); d0 = _mm_load_ps( diag + k ); v0 = _mm_mul_ps( d0, m0 ); t0 = _mm_sub_ps( t0, _mm_mul_ps( m0, v0 ) ); _mm_store_ps( v + k, v0 ); t0 = _mm_add_ps( t0, _mm_shuffle_ps( t0, t0, _MM_SHUFFLE( 1, 0, 3, 2 ) ) ); t0 = _mm_add_ps( t0, _mm_shuffle_ps( t0, t0, _MM_SHUFFLE( 2, 3, 0, 1 ) ) ); __m128 tiny = _mm_and_ps( _mm_cmpeq_ps( t0, SIMD_SP_zero ), SIMD_SP_tiny ); t0 = _mm_or_ps( t0, tiny ); _mm_store_ss( mptr + i, t0 ); _mm_store_ss( diag + i, t0 ); __m128 d = _mm_rcp32_ps( t0 ); _mm_store_ss( invDiagPtr + i, d ); if( i + 1 >= n ) { return true; } int j = i + 1; for( ; j < n - 3; j += 4 ) { float* ra = mat[j + 0]; float* rb = mat[j + 1]; float* rc = mat[j + 2]; float* rd = mat[j + 3]; assert_16_byte_aligned( v ); assert_16_byte_aligned( ra ); 
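			// the block below updates four rows j..j+3 of column i at once:
			// mat[r][i] = ( mat[r][i] - dot( mat[r][0..i-1], v[0..i-1] ) ) * invDiag[i]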
assert_16_byte_aligned( rb ); assert_16_byte_aligned( rc ); assert_16_byte_aligned( rd ); __m128 va = _mm_load_ss( ra + i ); __m128 vb = _mm_load_ss( rb + i ); __m128 vc = _mm_load_ss( rc + i ); __m128 vd = _mm_load_ss( rd + i ); __m128 v0 = _mm_load_ps( v + 0 ); va = _mm_sub_ps( va, _mm_mul_ps( _mm_load_ps( ra + 0 ), v0 ) ); vb = _mm_sub_ps( vb, _mm_mul_ps( _mm_load_ps( rb + 0 ), v0 ) ); vc = _mm_sub_ps( vc, _mm_mul_ps( _mm_load_ps( rc + 0 ), v0 ) ); vd = _mm_sub_ps( vd, _mm_mul_ps( _mm_load_ps( rd + 0 ), v0 ) ); int k = 4; for( ; k < i - 3; k += 4 ) { v0 = _mm_load_ps( v + k ); va = _mm_sub_ps( va, _mm_mul_ps( _mm_load_ps( ra + k ), v0 ) ); vb = _mm_sub_ps( vb, _mm_mul_ps( _mm_load_ps( rb + k ), v0 ) ); vc = _mm_sub_ps( vc, _mm_mul_ps( _mm_load_ps( rc + k ), v0 ) ); vd = _mm_sub_ps( vd, _mm_mul_ps( _mm_load_ps( rd + k ), v0 ) ); } v0 = _mm_load_ps( v + k ); va = _mm_sub_ps( va, _mm_mul_ps( _mm_and_ps( _mm_load_ps( ra + k ), mask ), v0 ) ); vb = _mm_sub_ps( vb, _mm_mul_ps( _mm_and_ps( _mm_load_ps( rb + k ), mask ), v0 ) ); vc = _mm_sub_ps( vc, _mm_mul_ps( _mm_and_ps( _mm_load_ps( rc + k ), mask ), v0 ) ); vd = _mm_sub_ps( vd, _mm_mul_ps( _mm_and_ps( _mm_load_ps( rd + k ), mask ), v0 ) ); __m128 ta = _mm_unpacklo_ps( va, vc ); // x0, z0, x1, z1 __m128 tb = _mm_unpackhi_ps( va, vc ); // x2, z2, x3, z3 __m128 tc = _mm_unpacklo_ps( vb, vd ); // y0, w0, y1, w1 __m128 td = _mm_unpackhi_ps( vb, vd ); // y2, w2, y3, w3 va = _mm_unpacklo_ps( ta, tc ); // x0, y0, z0, w0 vb = _mm_unpackhi_ps( ta, tc ); // x1, y1, z1, w1 vc = _mm_unpacklo_ps( tb, td ); // x2, y2, z2, w2 vd = _mm_unpackhi_ps( tb, td ); // x3, y3, z3, w3 va = _mm_add_ps( va, vb ); vc = _mm_add_ps( vc, vd ); va = _mm_add_ps( va, vc ); va = _mm_mul_ps( va, d ); _mm_store_ss( ra + i, _mm_splat_ps( va, 0 ) ); _mm_store_ss( rb + i, _mm_splat_ps( va, 1 ) ); _mm_store_ss( rc + i, _mm_splat_ps( va, 2 ) ); _mm_store_ss( rd + i, _mm_splat_ps( va, 3 ) ); } for( ; j < n; j++ ) { float* mptr = mat[j]; assert_16_byte_aligned( v ); assert_16_byte_aligned( mptr ); __m128 va = _mm_load_ss( mptr + i ); __m128 v0 = _mm_load_ps( v + 0 ); va = _mm_sub_ps( va, _mm_mul_ps( _mm_load_ps( mptr + 0 ), v0 ) ); int k = 4; for( ; k < i - 3; k += 4 ) { v0 = _mm_load_ps( v + k ); va = _mm_sub_ps( va, _mm_mul_ps( _mm_load_ps( mptr + k ), v0 ) ); } v0 = _mm_load_ps( v + k ); va = _mm_sub_ps( va, _mm_mul_ps( _mm_and_ps( _mm_load_ps( mptr + k ), mask ), v0 ) ); va = _mm_add_ps( va, _mm_shuffle_ps( va, va, _MM_SHUFFLE( 1, 0, 3, 2 ) ) ); va = _mm_add_ps( va, _mm_shuffle_ps( va, va, _MM_SHUFFLE( 2, 3, 0, 1 ) ) ); va = _mm_mul_ps( va, d ); _mm_store_ss( mptr + i, va ); } } return true; #else for( int i = 4; i < n; i += 4 ) { diag[i + 0] = 0.0f; diag[i + 1] = 0.0f; diag[i + 2] = 0.0f; diag[i + 3] = 0.0f; } for( int i = 4; i < n; i++ ) { mptr = mat[i]; assert_16_byte_aligned( v ); assert_16_byte_aligned( mptr ); assert_16_byte_aligned( diag ); v[0] = diag[0] * mptr[0]; v[1] = diag[1] * mptr[1]; v[2] = diag[2] * mptr[2]; v[3] = diag[3] * mptr[3]; float t0 = - mptr[0] * v[0] + mptr[i]; float t1 = - mptr[1] * v[1]; float t2 = - mptr[2] * v[2]; float t3 = - mptr[3] * v[3]; int k = 4; for( ; k < i - 3; k += 4 ) { v[k + 0] = diag[k + 0] * mptr[k + 0]; v[k + 1] = diag[k + 1] * mptr[k + 1]; v[k + 2] = diag[k + 2] * mptr[k + 2]; v[k + 3] = diag[k + 3] * mptr[k + 3]; t0 -= mptr[k + 0] * v[k + 0]; t1 -= mptr[k + 1] * v[k + 1]; t2 -= mptr[k + 2] * v[k + 2]; t3 -= mptr[k + 3] * v[k + 3]; } float m0 = ( i - k > 0 ) ? mptr[k + 0] : 0.0f; float m1 = ( i - k > 1 ) ? 
mptr[k + 1] : 0.0f; float m2 = ( i - k > 2 ) ? mptr[k + 2] : 0.0f; float m3 = ( i - k > 3 ) ? mptr[k + 3] : 0.0f; v[k + 0] = diag[k + 0] * m0; v[k + 1] = diag[k + 1] * m1; v[k + 2] = diag[k + 2] * m2; v[k + 3] = diag[k + 3] * m3; t0 -= m0 * v[k + 0]; t1 -= m1 * v[k + 1]; t2 -= m2 * v[k + 2]; t3 -= m3 * v[k + 3]; sum = t0 + t1 + t2 + t3; if( fabs( sum ) < idMath::FLT_SMALLEST_NON_DENORMAL ) { return false; } mat[i][i] = sum; diag[i] = sum; invDiagPtr[i] = d = 1.0f / sum; if( i + 1 >= n ) { return true; } int j = i + 1; for( ; j < n - 3; j += 4 ) { float* ra = mat[j + 0]; float* rb = mat[j + 1]; float* rc = mat[j + 2]; float* rd = mat[j + 3]; assert_16_byte_aligned( v ); assert_16_byte_aligned( ra ); assert_16_byte_aligned( rb ); assert_16_byte_aligned( rc ); assert_16_byte_aligned( rd ); float a0 = - ra[0] * v[0] + ra[i]; float a1 = - ra[1] * v[1]; float a2 = - ra[2] * v[2]; float a3 = - ra[3] * v[3]; float b0 = - rb[0] * v[0] + rb[i]; float b1 = - rb[1] * v[1]; float b2 = - rb[2] * v[2]; float b3 = - rb[3] * v[3]; float c0 = - rc[0] * v[0] + rc[i]; float c1 = - rc[1] * v[1]; float c2 = - rc[2] * v[2]; float c3 = - rc[3] * v[3]; float d0 = - rd[0] * v[0] + rd[i]; float d1 = - rd[1] * v[1]; float d2 = - rd[2] * v[2]; float d3 = - rd[3] * v[3]; int k = 4; for( ; k < i - 3; k += 4 ) { a0 -= ra[k + 0] * v[k + 0]; a1 -= ra[k + 1] * v[k + 1]; a2 -= ra[k + 2] * v[k + 2]; a3 -= ra[k + 3] * v[k + 3]; b0 -= rb[k + 0] * v[k + 0]; b1 -= rb[k + 1] * v[k + 1]; b2 -= rb[k + 2] * v[k + 2]; b3 -= rb[k + 3] * v[k + 3]; c0 -= rc[k + 0] * v[k + 0]; c1 -= rc[k + 1] * v[k + 1]; c2 -= rc[k + 2] * v[k + 2]; c3 -= rc[k + 3] * v[k + 3]; d0 -= rd[k + 0] * v[k + 0]; d1 -= rd[k + 1] * v[k + 1]; d2 -= rd[k + 2] * v[k + 2]; d3 -= rd[k + 3] * v[k + 3]; } float ra0 = ( i - k > 0 ) ? ra[k + 0] : 0.0f; float ra1 = ( i - k > 1 ) ? ra[k + 1] : 0.0f; float ra2 = ( i - k > 2 ) ? ra[k + 2] : 0.0f; float ra3 = ( i - k > 3 ) ? ra[k + 3] : 0.0f; float rb0 = ( i - k > 0 ) ? rb[k + 0] : 0.0f; float rb1 = ( i - k > 1 ) ? rb[k + 1] : 0.0f; float rb2 = ( i - k > 2 ) ? rb[k + 2] : 0.0f; float rb3 = ( i - k > 3 ) ? rb[k + 3] : 0.0f; float rc0 = ( i - k > 0 ) ? rc[k + 0] : 0.0f; float rc1 = ( i - k > 1 ) ? rc[k + 1] : 0.0f; float rc2 = ( i - k > 2 ) ? rc[k + 2] : 0.0f; float rc3 = ( i - k > 3 ) ? rc[k + 3] : 0.0f; float rd0 = ( i - k > 0 ) ? rd[k + 0] : 0.0f; float rd1 = ( i - k > 1 ) ? rd[k + 1] : 0.0f; float rd2 = ( i - k > 2 ) ? rd[k + 2] : 0.0f; float rd3 = ( i - k > 3 ) ? rd[k + 3] : 0.0f; a0 -= ra0 * v[k + 0]; a1 -= ra1 * v[k + 1]; a2 -= ra2 * v[k + 2]; a3 -= ra3 * v[k + 3]; b0 -= rb0 * v[k + 0]; b1 -= rb1 * v[k + 1]; b2 -= rb2 * v[k + 2]; b3 -= rb3 * v[k + 3]; c0 -= rc0 * v[k + 0]; c1 -= rc1 * v[k + 1]; c2 -= rc2 * v[k + 2]; c3 -= rc3 * v[k + 3]; d0 -= rd0 * v[k + 0]; d1 -= rd1 * v[k + 1]; d2 -= rd2 * v[k + 2]; d3 -= rd3 * v[k + 3]; ra[i] = ( a0 + a1 + a2 + a3 ) * d; rb[i] = ( b0 + b1 + b2 + b3 ) * d; rc[i] = ( c0 + c1 + c2 + c3 ) * d; rd[i] = ( d0 + d1 + d2 + d3 ) * d; } for( ; j < n; j++ ) { mptr = mat[j]; assert_16_byte_aligned( v ); assert_16_byte_aligned( mptr ); float a0 = - mptr[0] * v[0] + mptr[i]; float a1 = - mptr[1] * v[1]; float a2 = - mptr[2] * v[2]; float a3 = - mptr[3] * v[3]; int k = 4; for( ; k < i - 3; k += 4 ) { a0 -= mptr[k + 0] * v[k + 0]; a1 -= mptr[k + 1] * v[k + 1]; a2 -= mptr[k + 2] * v[k + 2]; a3 -= mptr[k + 3] * v[k + 3]; } float m0 = ( i - k > 0 ) ? mptr[k + 0] : 0.0f; float m1 = ( i - k > 1 ) ? mptr[k + 1] : 0.0f; float m2 = ( i - k > 2 ) ? mptr[k + 2] : 0.0f; float m3 = ( i - k > 3 ) ? 
mptr[k + 3] : 0.0f; a0 -= m0 * v[k + 0]; a1 -= m1 * v[k + 1]; a2 -= m2 * v[k + 2]; a3 -= m3 * v[k + 3]; mptr[i] = ( a0 + a1 + a2 + a3 ) * d; } } return true; #endif } /* ======================== GetMaxStep_SIMD ======================== */ static void GetMaxStep_SIMD( const float* f, const float* a, const float* delta_f, const float* delta_a, const float* lo, const float* hi, const int* side, int numUnbounded, int numClamped, int d, float dir, float& maxStep, int& limit, int& limitSide ) { #if defined(LCP_SIMD) __m128 vMaxStep; __m128i vLimit; __m128i vLimitSide; // default to a full step for the current variable { __m128 vNegAccel = _mm_xor_ps( _mm_load1_ps( a + d ), ( __m128& ) SIMD_SP_signBit ); __m128 vDeltaAccel = _mm_load1_ps( delta_a + d ); __m128 vM0 = _mm_cmpgt_ps( _mm_and_ps( vDeltaAccel, ( __m128& ) SIMD_SP_absMask ), SIMD_SP_LCP_DELTA_ACCEL_EPSILON ); __m128 vStep = _mm_div32_ps( vNegAccel, _mm_sel_ps( SIMD_SP_one, vDeltaAccel, vM0 ) ); vMaxStep = _mm_sel_ps( SIMD_SP_zero, vStep, vM0 ); vLimit = _mm_shuffle_epi32( _mm_cvtsi32_si128( d ), 0 ); vLimitSide = ( __m128i& ) SIMD_DW_zero; } // test the current variable { __m128 vDeltaForce = _mm_load1_ps( & dir ); __m128 vSign = _mm_cmplt_ps( vDeltaForce, SIMD_SP_zero ); __m128 vForceLimit = _mm_sel_ps( _mm_load1_ps( hi + d ), _mm_load1_ps( lo + d ), vSign ); __m128 vStep = _mm_div32_ps( _mm_sub_ps( vForceLimit, _mm_load1_ps( f + d ) ), vDeltaForce ); __m128i vSetSide = _mm_or_si128( __m128c( vSign ), ( __m128i& ) SIMD_DW_one ); __m128 vM0 = _mm_cmpgt_ps( _mm_and_ps( vDeltaForce, ( __m128& ) SIMD_SP_absMask ), SIMD_SP_LCP_DELTA_FORCE_EPSILON ); __m128 vM1 = _mm_cmpneq_ps( _mm_and_ps( vForceLimit, ( __m128& ) SIMD_SP_absMask ), SIMD_SP_infinity ); __m128 vM2 = _mm_cmplt_ps( vStep, vMaxStep ); __m128 vM3 = _mm_and_ps( _mm_and_ps( vM0, vM1 ), vM2 ); vMaxStep = _mm_sel_ps( vMaxStep, vStep, vM3 ); vLimitSide = _mm_sel_si128( vLimitSide, vSetSide, __m128c( vM3 ) ); } // test the clamped bounded variables { __m128 mask = ( __m128& ) SIMD_SP_indexedStartMask[numUnbounded & 3]; __m128i index = _mm_add_epi32( _mm_and_si128( _mm_shuffle_epi32( _mm_cvtsi32_si128( numUnbounded ), 0 ), ( __m128i& ) SIMD_DW_not3 ), ( __m128i& ) SIMD_DW_index ); int i = numUnbounded & ~3; for( ; i < numClamped - 3; i += 4 ) { __m128 vDeltaForce = _mm_and_ps( _mm_load_ps( delta_f + i ), mask ); __m128 vSign = _mm_cmplt_ps( vDeltaForce, SIMD_SP_zero ); __m128 vForceLimit = _mm_sel_ps( _mm_load_ps( hi + i ), _mm_load_ps( lo + i ), vSign ); __m128 vM0 = _mm_cmpgt_ps( _mm_and_ps( vDeltaForce, ( __m128& ) SIMD_SP_absMask ), SIMD_SP_LCP_DELTA_FORCE_EPSILON ); __m128 vStep = _mm_div32_ps( _mm_sub_ps( vForceLimit, _mm_load_ps( f + i ) ), _mm_sel_ps( SIMD_SP_one, vDeltaForce, vM0 ) ); __m128i vSetSide = _mm_or_si128( __m128c( vSign ), ( __m128i& ) SIMD_DW_one ); __m128 vM1 = _mm_cmpneq_ps( _mm_and_ps( vForceLimit, ( __m128& ) SIMD_SP_absMask ), SIMD_SP_infinity ); __m128 vM2 = _mm_cmplt_ps( vStep, vMaxStep ); __m128 vM3 = _mm_and_ps( _mm_and_ps( vM0, vM1 ), vM2 ); vMaxStep = _mm_sel_ps( vMaxStep, vStep, vM3 ); vLimit = _mm_sel_si128( vLimit, index, vM3 ); vLimitSide = _mm_sel_si128( vLimitSide, vSetSide, __m128c( vM3 ) ); mask = ( __m128& ) SIMD_SP_indexedStartMask[0]; index = _mm_add_epi32( index, ( __m128i& ) SIMD_DW_four ); } __m128 vDeltaForce = _mm_and_ps( _mm_load_ps( delta_f + i ), _mm_and_ps( mask, ( __m128& ) SIMD_SP_indexedEndMask[numClamped & 3] ) ); __m128 vSign = _mm_cmplt_ps( vDeltaForce, SIMD_SP_zero ); __m128 vForceLimit = _mm_sel_ps( _mm_load_ps( hi + i ), 
_mm_load_ps( lo + i ), vSign ); __m128 vM0 = _mm_cmpgt_ps( _mm_and_ps( vDeltaForce, ( __m128& ) SIMD_SP_absMask ), SIMD_SP_LCP_DELTA_FORCE_EPSILON ); __m128 vStep = _mm_div32_ps( _mm_sub_ps( vForceLimit, _mm_load_ps( f + i ) ), _mm_sel_ps( SIMD_SP_one, vDeltaForce, vM0 ) ); __m128i vSetSide = _mm_or_si128( __m128c( vSign ), ( __m128i& ) SIMD_DW_one ); __m128 vM1 = _mm_cmpneq_ps( _mm_and_ps( vForceLimit, ( __m128& ) SIMD_SP_absMask ), SIMD_SP_infinity ); __m128 vM2 = _mm_cmplt_ps( vStep, vMaxStep ); __m128 vM3 = _mm_and_ps( _mm_and_ps( vM0, vM1 ), vM2 ); vMaxStep = _mm_sel_ps( vMaxStep, vStep, vM3 ); vLimit = _mm_sel_si128( vLimit, index, vM3 ); vLimitSide = _mm_sel_si128( vLimitSide, vSetSide, __m128c( vM3 ) ); } // test the not clamped bounded variables { __m128 mask = ( __m128& ) SIMD_SP_indexedStartMask[numClamped & 3]; __m128i index = _mm_add_epi32( _mm_and_si128( _mm_shuffle_epi32( _mm_cvtsi32_si128( numClamped ), 0 ), ( __m128i& ) SIMD_DW_not3 ), ( __m128i& ) SIMD_DW_index ); int i = numClamped & ~3; for( ; i < d - 3; i += 4 ) { __m128 vNegAccel = _mm_xor_ps( _mm_load_ps( a + i ), ( __m128& ) SIMD_SP_signBit ); __m128 vDeltaAccel = _mm_and_ps( _mm_load_ps( delta_a + i ), mask ); __m128 vSide = _mm_cvtepi32_ps( _mm_load_si128( ( __m128i* )( side + i ) ) ); __m128 vM0 = _mm_cmpgt_ps( _mm_mul_ps( vSide, vDeltaAccel ), SIMD_SP_LCP_DELTA_ACCEL_EPSILON ); __m128 vStep = _mm_div32_ps( vNegAccel, _mm_sel_ps( SIMD_SP_one, vDeltaAccel, vM0 ) ); __m128 vM1 = _mm_or_ps( _mm_cmplt_ps( _mm_load_ps( lo + i ), SIMD_SP_neg_LCP_BOUND_EPSILON ), _mm_cmpgt_ps( _mm_load_ps( hi + i ), SIMD_SP_LCP_BOUND_EPSILON ) ); __m128 vM2 = _mm_cmplt_ps( vStep, vMaxStep ); __m128 vM3 = _mm_and_ps( _mm_and_ps( vM0, vM1 ), vM2 ); vMaxStep = _mm_sel_ps( vMaxStep, vStep, vM3 ); vLimit = _mm_sel_si128( vLimit, index, vM3 ); vLimitSide = _mm_sel_si128( vLimitSide, ( __m128i& ) SIMD_DW_zero, __m128c( vM3 ) ); mask = ( __m128& ) SIMD_SP_indexedStartMask[0]; index = _mm_add_epi32( index, ( __m128i& ) SIMD_DW_four ); } __m128 vNegAccel = _mm_xor_ps( _mm_load_ps( a + i ), ( __m128& ) SIMD_SP_signBit ); __m128 vDeltaAccel = _mm_and_ps( _mm_load_ps( delta_a + i ), _mm_and_ps( mask, ( __m128& ) SIMD_SP_indexedEndMask[d & 3] ) ); __m128 vSide = _mm_cvtepi32_ps( _mm_load_si128( ( __m128i* )( side + i ) ) ); __m128 vM0 = _mm_cmpgt_ps( _mm_mul_ps( vSide, vDeltaAccel ), SIMD_SP_LCP_DELTA_ACCEL_EPSILON ); __m128 vStep = _mm_div32_ps( vNegAccel, _mm_sel_ps( SIMD_SP_one, vDeltaAccel, vM0 ) ); __m128 vM1 = _mm_or_ps( _mm_cmplt_ps( _mm_load_ps( lo + i ), SIMD_SP_neg_LCP_BOUND_EPSILON ), _mm_cmpgt_ps( _mm_load_ps( hi + i ), SIMD_SP_LCP_BOUND_EPSILON ) ); __m128 vM2 = _mm_cmplt_ps( vStep, vMaxStep ); __m128 vM3 = _mm_and_ps( _mm_and_ps( vM0, vM1 ), vM2 ); vMaxStep = _mm_sel_ps( vMaxStep, vStep, vM3 ); vLimit = _mm_sel_si128( vLimit, index, vM3 ); vLimitSide = _mm_sel_si128( vLimitSide, ( __m128i& ) SIMD_DW_zero, __m128c( vM3 ) ); } { __m128 tMaxStep = _mm_shuffle_ps( vMaxStep, vMaxStep, _MM_SHUFFLE( 1, 0, 3, 2 ) ); __m128i tLimit = _mm_shuffle_epi32( vLimit, _MM_SHUFFLE( 1, 0, 3, 2 ) ); __m128i tLimitSide = _mm_shuffle_epi32( vLimitSide, _MM_SHUFFLE( 1, 0, 3, 2 ) ); __m128c mask = _mm_cmplt_ps( tMaxStep, vMaxStep ); vMaxStep = _mm_min_ps( vMaxStep, tMaxStep ); vLimit = _mm_sel_si128( vLimit, tLimit, mask ); vLimitSide = _mm_sel_si128( vLimitSide, tLimitSide, mask ); } { __m128 tMaxStep = _mm_shuffle_ps( vMaxStep, vMaxStep, _MM_SHUFFLE( 2, 3, 0, 1 ) ); __m128i tLimit = _mm_shuffle_epi32( vLimit, _MM_SHUFFLE( 2, 3, 0, 1 ) ); __m128i tLimitSide = 
_mm_shuffle_epi32( vLimitSide, _MM_SHUFFLE( 2, 3, 0, 1 ) ); __m128c mask = _mm_cmplt_ps( tMaxStep, vMaxStep ); vMaxStep = _mm_min_ps( vMaxStep, tMaxStep ); vLimit = _mm_sel_si128( vLimit, tLimit, mask ); vLimitSide = _mm_sel_si128( vLimitSide, tLimitSide, mask ); } _mm_store_ss( & maxStep, vMaxStep ); limit = _mm_cvtsi128_si32( vLimit ); limitSide = _mm_cvtsi128_si32( vLimitSide ); #else // default to a full step for the current variable { float negAccel = -a[d]; float deltaAccel = delta_a[d]; int m0 = ( fabs( deltaAccel ) > LCP_DELTA_ACCEL_EPSILON ); float step = negAccel / ( m0 ? deltaAccel : 1.0f ); maxStep = m0 ? step : 0.0f; limit = d; limitSide = 0; } // test the current variable { float deltaForce = dir; float forceLimit = ( deltaForce < 0.0f ) ? lo[d] : hi[d]; float step = ( forceLimit - f[d] ) / deltaForce; int setSide = ( deltaForce < 0.0f ) ? -1 : 1; int m0 = ( fabs( deltaForce ) > LCP_DELTA_FORCE_EPSILON ); int m1 = ( fabs( forceLimit ) != idMath::INFINITY ); int m2 = ( step < maxStep ); int m3 = ( m0 & m1 & m2 ); maxStep = m3 ? step : maxStep; limit = m3 ? d : limit; limitSide = m3 ? setSide : limitSide; } // test the clamped bounded variables for( int i = numUnbounded; i < numClamped; i++ ) { float deltaForce = delta_f[i]; float forceLimit = ( deltaForce < 0.0f ) ? lo[i] : hi[i]; int m0 = ( fabs( deltaForce ) > LCP_DELTA_FORCE_EPSILON ); float step = ( forceLimit - f[i] ) / ( m0 ? deltaForce : 1.0f ); int setSide = ( deltaForce < 0.0f ) ? -1 : 1; int m1 = ( fabs( forceLimit ) != idMath::INFINITY ); int m2 = ( step < maxStep ); int m3 = ( m0 & m1 & m2 ); maxStep = m3 ? step : maxStep; limit = m3 ? i : limit; limitSide = m3 ? setSide : limitSide; } // test the not clamped bounded variables for( int i = numClamped; i < d; i++ ) { float negAccel = -a[i]; float deltaAccel = delta_a[i]; int m0 = ( side[i] * deltaAccel > LCP_DELTA_ACCEL_EPSILON ); float step = negAccel / ( m0 ? deltaAccel : 1.0f ); int m1 = ( lo[i] < -LCP_BOUND_EPSILON || hi[i] > LCP_BOUND_EPSILON ); int m2 = ( step < maxStep ); int m3 = ( m0 & m1 & m2 ); maxStep = m3 ? step : maxStep; limit = m3 ? i : limit; limitSide = m3 ? 
0 : limitSide; } #endif } /* ================================================================================================ SIMD test code ================================================================================================ */ //#define ENABLE_TEST_CODE #ifdef ENABLE_TEST_CODE #define TEST_TRIANGULAR_SOLVE_SIMD_EPSILON 0.1f #define TEST_TRIANGULAR_SOLVE_SIZE 50 #define TEST_FACTOR_SIMD_EPSILON 0.1f #define TEST_FACTOR_SOLVE_SIZE 50 #define NUM_TESTS 50 /* ======================== PrintClocks ======================== */ static void PrintClocks( const char* string, int dataCount, int64 clocks, int64 otherClocks = 0 ) { idLib::Printf( string ); for( int i = idStr::LengthWithoutColors( string ); i < 48; i++ ) { idLib::Printf( " " ); } if( clocks && otherClocks ) { int p = 0; if( clocks <= otherClocks ) { p = idMath::Ftoi( ( float )( otherClocks - clocks ) * 100.0f / ( float ) otherClocks ); } else { p = - idMath::Ftoi( ( float )( clocks - otherClocks ) * 100.0f / ( float ) clocks ); } idLib::Printf( "c = %4d, clcks = %5lld, %d%%\n", dataCount, clocks, p ); } else { idLib::Printf( "c = %4d, clcks = %5lld\n", dataCount, clocks ); } } /* ======================== DotProduct_Test ======================== */ static void DotProduct_Test() { ALIGN16( float fsrc0[TEST_TRIANGULAR_SOLVE_SIZE + 1]; ) ALIGN16( float fsrc1[TEST_TRIANGULAR_SOLVE_SIZE + 1]; ) idRandom srnd( 13 ); for( int i = 0; i < TEST_TRIANGULAR_SOLVE_SIZE; i++ ) { fsrc0[i] = srnd.CRandomFloat() * 10.0f; fsrc1[i] = srnd.CRandomFloat() * 10.0f; } idTimer timer; for( int i = 0; i < TEST_TRIANGULAR_SOLVE_SIZE; i++ ) { float dot1 = DotProduct_Generic( fsrc0, fsrc1, i ); int64 clocksGeneric = 0xFFFFFFFFFFFF; for( int j = 0; j < NUM_TESTS; j++ ) { fsrc1[TEST_TRIANGULAR_SOLVE_SIZE] = j; timer.Clear(); timer.Start(); dot1 = DotProduct_Generic( fsrc0, fsrc1, i ); timer.Stop(); clocksGeneric = Min( clocksGeneric, timer.ClockTicks() ); } PrintClocks( va( "DotProduct_Generic %d", i ), 1, clocksGeneric ); float dot2 = DotProduct_SIMD( fsrc0, fsrc1, i ); int64 clocksSIMD = 0xFFFFFFFFFFFF; for( int j = 0; j < NUM_TESTS; j++ ) { fsrc1[TEST_TRIANGULAR_SOLVE_SIZE] = j; timer.Clear(); timer.Start(); dot2 = DotProduct_SIMD( fsrc0, fsrc1, i ); timer.Stop(); clocksSIMD = Min( clocksSIMD, timer.ClockTicks() ); } const char* result = idMath::Fabs( dot1 - dot2 ) < 1e-4f ? 
"ok" : S_COLOR_RED"X"; PrintClocks( va( "DotProduct_SIMD %d %s", i, result ), 1, clocksSIMD, clocksGeneric ); } } /* ======================== LowerTriangularSolve_Test ======================== */ static void LowerTriangularSolve_Test() { idMatX L; idVecX x, b, tst; int paddedSize = ( TEST_TRIANGULAR_SOLVE_SIZE + 3 ) & ~3; L.Random( paddedSize, paddedSize, 0, -1.0f, 1.0f ); x.SetSize( paddedSize ); b.Random( paddedSize, 0, -1.0f, 1.0f ); idTimer timer; const int skip = 0; for( int i = 1; i < TEST_TRIANGULAR_SOLVE_SIZE; i++ ) { x.Zero( i ); LowerTriangularSolve_Generic( L, x.ToFloatPtr(), b.ToFloatPtr(), i, skip ); int64 clocksGeneric = 0xFFFFFFFFFFFF; for( int j = 0; j < NUM_TESTS; j++ ) { timer.Clear(); timer.Start(); LowerTriangularSolve_Generic( L, x.ToFloatPtr(), b.ToFloatPtr(), i, skip ); timer.Stop(); clocksGeneric = Min( clocksGeneric, timer.ClockTicks() ); } tst = x; x.Zero(); PrintClocks( va( "LowerTriangularSolve_Generic %dx%d", i, i ), 1, clocksGeneric ); LowerTriangularSolve_SIMD( L, x.ToFloatPtr(), b.ToFloatPtr(), i, skip ); int64 clocksSIMD = 0xFFFFFFFFFFFF; for( int j = 0; j < NUM_TESTS; j++ ) { timer.Clear(); timer.Start(); LowerTriangularSolve_SIMD( L, x.ToFloatPtr(), b.ToFloatPtr(), i, skip ); timer.Stop(); clocksSIMD = Min( clocksSIMD, timer.ClockTicks() ); } const char* result = x.Compare( tst, TEST_TRIANGULAR_SOLVE_SIMD_EPSILON ) ? "ok" : S_COLOR_RED"X"; PrintClocks( va( "LowerTriangularSolve_SIMD %dx%d %s", i, i, result ), 1, clocksSIMD, clocksGeneric ); } } /* ======================== LowerTriangularSolveTranspose_Test ======================== */ static void LowerTriangularSolveTranspose_Test() { idMatX L; idVecX x, b, tst; int paddedSize = ( TEST_TRIANGULAR_SOLVE_SIZE + 3 ) & ~3; L.Random( paddedSize, paddedSize, 0, -1.0f, 1.0f ); x.SetSize( paddedSize ); b.Random( paddedSize, 0, -1.0f, 1.0f ); idTimer timer; for( int i = 1; i < TEST_TRIANGULAR_SOLVE_SIZE; i++ ) { x.Zero( i ); LowerTriangularSolveTranspose_Generic( L, x.ToFloatPtr(), b.ToFloatPtr(), i ); int64 clocksGeneric = 0xFFFFFFFFFFFF; for( int j = 0; j < NUM_TESTS; j++ ) { timer.Clear(); timer.Start(); LowerTriangularSolveTranspose_Generic( L, x.ToFloatPtr(), b.ToFloatPtr(), i ); timer.Stop(); clocksGeneric = Min( clocksGeneric, timer.ClockTicks() ); } tst = x; x.Zero(); PrintClocks( va( "LowerTriangularSolveTranspose_Generic %dx%d", i, i ), 1, clocksGeneric ); LowerTriangularSolveTranspose_SIMD( L, x.ToFloatPtr(), b.ToFloatPtr(), i ); int64 clocksSIMD = 0xFFFFFFFFFFFF; for( int j = 0; j < NUM_TESTS; j++ ) { timer.Clear(); timer.Start(); LowerTriangularSolveTranspose_SIMD( L, x.ToFloatPtr(), b.ToFloatPtr(), i ); timer.Stop(); clocksSIMD = Min( clocksSIMD, timer.ClockTicks() ); } const char* result = x.Compare( tst, TEST_TRIANGULAR_SOLVE_SIMD_EPSILON ) ? 
"ok" : S_COLOR_RED"X"; PrintClocks( va( "LowerTriangularSolveTranspose_SIMD %dx%d %s", i, i, result ), 1, clocksSIMD, clocksGeneric ); } } /* ======================== LDLT_Factor_Test ======================== */ static void LDLT_Factor_Test() { idMatX src, original, mat1, mat2; idVecX invDiag1, invDiag2; int paddedSize = ( TEST_FACTOR_SOLVE_SIZE + 3 ) & ~3; original.SetSize( paddedSize, paddedSize ); src.Random( paddedSize, paddedSize, 0, -1.0f, 1.0f ); src.TransposeMultiply( original, src ); idTimer timer; for( int i = 1; i < TEST_FACTOR_SOLVE_SIZE; i++ ) { int64 clocksGeneric = 0xFFFFFFFFFFFF; for( int j = 0; j < NUM_TESTS; j++ ) { mat1 = original; invDiag1.Zero( TEST_FACTOR_SOLVE_SIZE ); timer.Clear(); timer.Start(); LDLT_Factor_Generic( mat1, invDiag1, i ); timer.Stop(); clocksGeneric = Min( clocksGeneric, timer.ClockTicks() ); } PrintClocks( va( "LDLT_Factor_Generic %dx%d", i, i ), 1, clocksGeneric ); int64 clocksSIMD = 0xFFFFFFFFFFFF; for( int j = 0; j < NUM_TESTS; j++ ) { mat2 = original; invDiag2.Zero( TEST_FACTOR_SOLVE_SIZE ); timer.Clear(); timer.Start(); LDLT_Factor_SIMD( mat2, invDiag2, i ); timer.Stop(); clocksSIMD = Min( clocksSIMD, timer.ClockTicks() ); } const char* result = mat1.Compare( mat2, TEST_FACTOR_SIMD_EPSILON ) && invDiag1.Compare( invDiag2, TEST_FACTOR_SIMD_EPSILON ) ? "ok" : S_COLOR_RED"X"; PrintClocks( va( "LDLT_Factor_SIMD %dx%d %s", i, i, result ), 1, clocksSIMD, clocksGeneric ); } } #endif #define Multiply Multiply_SIMD #define MultiplyAdd MultiplyAdd_SIMD #define BigDotProduct DotProduct_SIMD #define LowerTriangularSolve LowerTriangularSolve_SIMD #define LowerTriangularSolveTranspose LowerTriangularSolveTranspose_SIMD #define UpperTriangularSolve UpperTriangularSolve_SIMD #define LU_Factor LU_Factor_SIMD #define LDLT_Factor LDLT_Factor_SIMD #define GetMaxStep GetMaxStep_SIMD /* ================================================================================================ idLCP_Square ================================================================================================ */ /* ================================================ idLCP_Square ================================================ */ class idLCP_Square : public idLCP { public: virtual bool Solve( const idMatX& o_m, idVecX& o_x, const idVecX& o_b, const idVecX& o_lo, const idVecX& o_hi, const int* o_boxIndex ); private: idMatX m; // original matrix idVecX b; // right hand side idVecX lo, hi; // low and high bounds idVecX f, a; // force and acceleration idVecX delta_f, delta_a; // delta force and delta acceleration idMatX clamped; // LU factored sub matrix for clamped variables idVecX diagonal; // reciprocal of diagonal of U of the LU factored sub matrix for clamped variables int numUnbounded; // number of unbounded variables int numClamped; // number of clamped variables float** rowPtrs; // pointers to the rows of m int* boxIndex; // box index int* side; // tells if a variable is at the low boundary = -1, high boundary = 1 or inbetween = 0 int* permuted; // index to keep track of the permutation bool padded; // set to true if the rows of the initial matrix are 16 byte padded bool FactorClamped(); void SolveClamped( idVecX& x, const float* b ); void Swap( int i, int j ); void AddClamped( int r ); void RemoveClamped( int r ); void CalcForceDelta( int d, float dir ); void CalcAccelDelta( int d ); void ChangeForce( int d, float step ); void ChangeAccel( int d, float step ); }; /* ======================== idLCP_Square::FactorClamped ======================== */ bool idLCP_Square::FactorClamped() { 
for( int i = 0; i < numClamped; i++ ) { memcpy( clamped[i], rowPtrs[i], numClamped * sizeof( float ) ); } return LU_Factor( clamped, diagonal, numClamped ); } /* ======================== idLCP_Square::SolveClamped ======================== */ void idLCP_Square::SolveClamped( idVecX& x, const float* b ) { // solve L LowerTriangularSolve( clamped, x.ToFloatPtr(), b, numClamped, 0 ); // solve U UpperTriangularSolve( clamped, diagonal.ToFloatPtr(), x.ToFloatPtr(), x.ToFloatPtr(), numClamped ); } /* ======================== idLCP_Square::Swap ======================== */ void idLCP_Square::Swap( int i, int j ) { if( i == j ) { return; } SwapValues( rowPtrs[i], rowPtrs[j] ); m.SwapColumns( i, j ); b.SwapElements( i, j ); lo.SwapElements( i, j ); hi.SwapElements( i, j ); a.SwapElements( i, j ); f.SwapElements( i, j ); if( boxIndex != NULL ) { SwapValues( boxIndex[i], boxIndex[j] ); } SwapValues( side[i], side[j] ); SwapValues( permuted[i], permuted[j] ); } /* ======================== idLCP_Square::AddClamped ======================== */ void idLCP_Square::AddClamped( int r ) { assert( r >= numClamped ); // add a row at the bottom and a column at the right of the factored // matrix for the clamped variables Swap( numClamped, r ); // add row to L for( int i = 0; i < numClamped; i++ ) { float sum = rowPtrs[numClamped][i]; for( int j = 0; j < i; j++ ) { sum -= clamped[numClamped][j] * clamped[j][i]; } clamped[numClamped][i] = sum * diagonal[i]; } // add column to U for( int i = 0; i <= numClamped; i++ ) { float sum = rowPtrs[i][numClamped]; for( int j = 0; j < i; j++ ) { sum -= clamped[i][j] * clamped[j][numClamped]; } clamped[i][numClamped] = sum; } diagonal[numClamped] = 1.0f / clamped[numClamped][numClamped]; numClamped++; } /* ======================== idLCP_Square::RemoveClamped ======================== */ void idLCP_Square::RemoveClamped( int r ) { if( !verify( r < numClamped ) ) { // complete fail, most likely due to exceptional floating point values return; } numClamped--; // no need to swap and update the factored matrix when the last row and column are removed if( r == numClamped ) { return; } float* y0 = ( float* ) _alloca16( numClamped * sizeof( float ) ); float* z0 = ( float* ) _alloca16( numClamped * sizeof( float ) ); float* y1 = ( float* ) _alloca16( numClamped * sizeof( float ) ); float* z1 = ( float* ) _alloca16( numClamped * sizeof( float ) ); // the row/column need to be subtracted from the factorization for( int i = 0; i < numClamped; i++ ) { y0[i] = -rowPtrs[i][r]; } memset( y1, 0, numClamped * sizeof( float ) ); y1[r] = 1.0f; memset( z0, 0, numClamped * sizeof( float ) ); z0[r] = 1.0f; for( int i = 0; i < numClamped; i++ ) { z1[i] = -rowPtrs[r][i]; } // swap the to be removed row/column with the last row/column Swap( r, numClamped ); // the swapped last row/column need to be added to the factorization for( int i = 0; i < numClamped; i++ ) { y0[i] += rowPtrs[i][r]; } for( int i = 0; i < numClamped; i++ ) { z1[i] += rowPtrs[r][i]; } z1[r] = 0.0f; // update the beginning of the to be updated row and column for( int i = 0; i < r; i++ ) { float p0 = y0[i]; float beta1 = z1[i] * diagonal[i]; clamped[i][r] += p0; for( int j = i + 1; j < numClamped; j++ ) { z1[j] -= beta1 * clamped[i][j]; } for( int j = i + 1; j < numClamped; j++ ) { y0[j] -= p0 * clamped[j][i]; } clamped[r][i] += beta1; } // update the lower right corner starting at r,r for( int i = r; i < numClamped; i++ ) { float diag = clamped[i][i]; float p0 = y0[i]; float p1 = z0[i]; diag += p0 * p1; if( fabs( diag ) < 
idMath::FLT_SMALLEST_NON_DENORMAL ) { idLib::Printf( "idLCP_Square::RemoveClamped: updating factorization failed\n" ); diag = idMath::FLT_SMALLEST_NON_DENORMAL; } float beta0 = p1 / diag; float q0 = y1[i]; float q1 = z1[i]; diag += q0 * q1; if( fabs( diag ) < idMath::FLT_SMALLEST_NON_DENORMAL ) { idLib::Printf( "idLCP_Square::RemoveClamped: updating factorization failed\n" ); diag = idMath::FLT_SMALLEST_NON_DENORMAL; } float d = 1.0f / diag; float beta1 = q1 * d; clamped[i][i] = diag; diagonal[i] = d; for( int j = i + 1; j < numClamped; j++ ) { d = clamped[i][j]; d += p0 * z0[j]; z0[j] -= beta0 * d; d += q0 * z1[j]; z1[j] -= beta1 * d; clamped[i][j] = d; } for( int j = i + 1; j < numClamped; j++ ) { d = clamped[j][i]; y0[j] -= p0 * d; d += beta0 * y0[j]; y1[j] -= q0 * d; d += beta1 * y1[j]; clamped[j][i] = d; } } return; } /* ======================== idLCP_Square::CalcForceDelta Modifies this->delta_f. ======================== */ void idLCP_Square::CalcForceDelta( int d, float dir ) { delta_f[d] = dir; if( numClamped <= 0 ) { return; } // get column d of matrix float* ptr = ( float* ) _alloca16( numClamped * sizeof( float ) ); for( int i = 0; i < numClamped; i++ ) { ptr[i] = rowPtrs[i][d]; } // solve force delta SolveClamped( delta_f, ptr ); // flip force delta based on direction if( dir > 0.0f ) { ptr = delta_f.ToFloatPtr(); for( int i = 0; i < numClamped; i++ ) { ptr[i] = - ptr[i]; } } } /* ======================== idLCP_Square::CalcAccelDelta Modifies this->delta_a and uses this->delta_f. ======================== */ ID_INLINE void idLCP_Square::CalcAccelDelta( int d ) { // only the not clamped variables, including the current variable, can have a change in acceleration for( int j = numClamped; j <= d; j++ ) { // only the clamped variables and the current variable have a force delta unequal zero float dot = BigDotProduct( rowPtrs[j], delta_f.ToFloatPtr(), numClamped ); delta_a[j] = dot + rowPtrs[j][d] * delta_f[d]; } } /* ======================== idLCP_Square::ChangeForce Modifies this->f and uses this->delta_f. ======================== */ ID_INLINE void idLCP_Square::ChangeForce( int d, float step ) { // only the clamped variables and current variable have a force delta unequal zero MultiplyAdd( f.ToFloatPtr(), step, delta_f.ToFloatPtr(), numClamped ); f[d] += step * delta_f[d]; } /* ======================== idLCP_Square::ChangeAccel Modifies this->a and uses this->delta_a. 
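Equivalent scalar form (illustrative): a[j] += step * delta_a[j] for j = numClamped .. d.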
======================== */ ID_INLINE void idLCP_Square::ChangeAccel( int d, float step ) { // only the not clamped variables, including the current variable, can have an acceleration unequal zero MultiplyAdd( a.ToFloatPtr() + numClamped, step, delta_a.ToFloatPtr() + numClamped, d - numClamped + 1 ); } /* ======================== idLCP_Square::Solve ======================== */ bool idLCP_Square::Solve( const idMatX& o_m, idVecX& o_x, const idVecX& o_b, const idVecX& o_lo, const idVecX& o_hi, const int* o_boxIndex ) { // true when the matrix rows are 16 byte padded padded = ( ( o_m.GetNumRows() + 3 )&~3 ) == o_m.GetNumColumns(); assert( padded || o_m.GetNumRows() == o_m.GetNumColumns() ); assert( o_x.GetSize() == o_m.GetNumRows() ); assert( o_b.GetSize() == o_m.GetNumRows() ); assert( o_lo.GetSize() == o_m.GetNumRows() ); assert( o_hi.GetSize() == o_m.GetNumRows() ); // allocate memory for permuted input f.SetData( o_m.GetNumRows(), VECX_ALLOCA( o_m.GetNumRows() ) ); a.SetData( o_b.GetSize(), VECX_ALLOCA( o_b.GetSize() ) ); b.SetData( o_b.GetSize(), VECX_ALLOCA( o_b.GetSize() ) ); lo.SetData( o_lo.GetSize(), VECX_ALLOCA( o_lo.GetSize() ) ); hi.SetData( o_hi.GetSize(), VECX_ALLOCA( o_hi.GetSize() ) ); if( o_boxIndex != NULL ) { boxIndex = ( int* )_alloca16( o_x.GetSize() * sizeof( int ) ); memcpy( boxIndex, o_boxIndex, o_x.GetSize() * sizeof( int ) ); } else { boxIndex = NULL; } // we override the const on o_m here but on exit the matrix is unchanged m.SetData( o_m.GetNumRows(), o_m.GetNumColumns(), const_cast< float* >( o_m[0] ) ); f.Zero(); a.Zero(); b = o_b; lo = o_lo; hi = o_hi; // pointers to the rows of m rowPtrs = ( float** ) _alloca16( m.GetNumRows() * sizeof( float* ) ); for( int i = 0; i < m.GetNumRows(); i++ ) { rowPtrs[i] = m[i]; } // tells if a variable is at the low boundary, high boundary or inbetween side = ( int* ) _alloca16( m.GetNumRows() * sizeof( int ) ); // index to keep track of the permutation permuted = ( int* ) _alloca16( m.GetNumRows() * sizeof( int ) ); for( int i = 0; i < m.GetNumRows(); i++ ) { permuted[i] = i; } // permute input so all unbounded variables come first numUnbounded = 0; for( int i = 0; i < m.GetNumRows(); i++ ) { if( lo[i] == -idMath::INFINITY && hi[i] == idMath::INFINITY ) { if( numUnbounded != i ) { Swap( numUnbounded, i ); } numUnbounded++; } } // permute input so all variables using the boxIndex come last int boxStartIndex = m.GetNumRows(); if( boxIndex ) { for( int i = m.GetNumRows() - 1; i >= numUnbounded; i-- ) { if( boxIndex[i] >= 0 ) { boxStartIndex--; if( boxStartIndex != i ) { Swap( boxStartIndex, i ); } } } } // sub matrix for factorization clamped.SetData( m.GetNumRows(), m.GetNumColumns(), MATX_ALLOCA( m.GetNumRows() * m.GetNumColumns() ) ); diagonal.SetData( m.GetNumRows(), VECX_ALLOCA( m.GetNumRows() ) ); // all unbounded variables are clamped numClamped = numUnbounded; // if there are unbounded variables if( numUnbounded ) { // factor and solve for unbounded variables if( !FactorClamped() ) { idLib::Printf( "idLCP_Square::Solve: unbounded factorization failed\n" ); return false; } SolveClamped( f, b.ToFloatPtr() ); // if there are no bounded variables we are done if( numUnbounded == m.GetNumRows() ) { o_x = f; // the vector is not permuted return true; } } int numIgnored = 0; // allocate for delta force and delta acceleration delta_f.SetData( m.GetNumRows(), VECX_ALLOCA( m.GetNumRows() ) ); delta_a.SetData( m.GetNumRows(), VECX_ALLOCA( m.GetNumRows() ) ); // solve for bounded variables idStr failed; for( int i = numUnbounded; i <
m.GetNumRows(); i++ ) { // once we hit the box start index we can initialize the low and high boundaries of the variables using the box index if( i == boxStartIndex ) { for( int j = 0; j < boxStartIndex; j++ ) { o_x[permuted[j]] = f[j]; } for( int j = boxStartIndex; j < m.GetNumRows(); j++ ) { float s = o_x[boxIndex[j]]; if( lo[j] != -idMath::INFINITY ) { lo[j] = - idMath::Fabs( lo[j] * s ); } if( hi[j] != idMath::INFINITY ) { hi[j] = idMath::Fabs( hi[j] * s ); } } } // calculate acceleration for current variable float dot = BigDotProduct( rowPtrs[i], f.ToFloatPtr(), i ); a[i] = dot - b[i]; // if already at the low boundary if( lo[i] >= -LCP_BOUND_EPSILON && a[i] >= -LCP_ACCEL_EPSILON ) { side[i] = -1; continue; } // if already at the high boundary if( hi[i] <= LCP_BOUND_EPSILON && a[i] <= LCP_ACCEL_EPSILON ) { side[i] = 1; continue; } // if inside the clamped region if( idMath::Fabs( a[i] ) <= LCP_ACCEL_EPSILON ) { side[i] = 0; AddClamped( i ); continue; } // drive the current variable into a valid region int n = 0; for( ; n < maxIterations; n++ ) { // direction to move float dir = ( a[i] <= 0.0f ) ? 1.0f : -1.0f; // calculate force delta CalcForceDelta( i, dir ); // calculate acceleration delta: delta_a = m * delta_f; CalcAccelDelta( i ); float maxStep; int limit; int limitSide; // maximum step we can take GetMaxStep( f.ToFloatPtr(), a.ToFloatPtr(), delta_f.ToFloatPtr(), delta_a.ToFloatPtr(), lo.ToFloatPtr(), hi.ToFloatPtr(), side, numUnbounded, numClamped, i, dir, maxStep, limit, limitSide ); if( maxStep <= 0.0f ) { #ifdef IGNORE_UNSATISFIABLE_VARIABLES // ignore the current variable completely lo[i] = hi[i] = 0.0f; f[i] = 0.0f; side[i] = -1; numIgnored++; #else failed.Format( "invalid step size %.4f", maxStep ); for( int j = i; j < m.GetNumRows(); j++ ) { f[j] = 0.0f; } numIgnored = m.GetNumRows() - i; #endif break; } // change force ChangeForce( i, maxStep ); // change acceleration ChangeAccel( i, maxStep ); // clamp/unclamp the variable that limited this step side[limit] = limitSide; if( limitSide == 0 ) { a[limit] = 0.0f; AddClamped( limit ); } else if( limitSide == -1 ) { f[limit] = lo[limit]; if( limit != i ) { RemoveClamped( limit ); } } else if( limitSide == 1 ) { f[limit] = hi[limit]; if( limit != i ) { RemoveClamped( limit ); } } // if the current variable limited the step we can continue with the next variable if( limit == i ) { break; } } if( n >= maxIterations ) { failed.Format( "max iterations %d", maxIterations ); break; } if( failed.Length() ) { break; } } #ifdef IGNORE_UNSATISFIABLE_VARIABLES if( numIgnored ) { if( lcp_showFailures.GetBool() ) { idLib::Printf( "idLCP_Square::Solve: %d of %d bounded variables ignored\n", numIgnored, m.GetNumRows() - numUnbounded ); } } #endif // if failed clear remaining forces if( failed.Length() ) { if( lcp_showFailures.GetBool() ) { idLib::Printf( "idLCP_Square::Solve: %s (%d of %d bounded variables ignored)\n", failed.c_str(), numIgnored, m.GetNumRows() - numUnbounded ); } } #if defined(_DEBUG) && 0 if( failed.Length() ) { // test whether or not the solution satisfies the complementarity conditions for( int i = 0; i < m.GetNumRows(); i++ ) { a[i] = -b[i]; for( int j = 0; j < m.GetNumRows(); j++ ) { a[i] += rowPtrs[i][j] * f[j]; } if( f[i] == lo[i] ) { if( lo[i] != hi[i] && a[i] < -LCP_ACCEL_EPSILON ) { int bah1 = 1; } } else if( f[i] == hi[i] ) { if( lo[i] != hi[i] && a[i] > LCP_ACCEL_EPSILON ) { int bah2 = 1; } } else if( f[i] < lo[i] || f[i] > hi[i] || idMath::Fabs( a[i] ) > 1.0f ) { int bah3 = 1; } } } #endif // unpermute result 
for( int i = 0; i < f.GetSize(); i++ ) { o_x[permuted[i]] = f[i]; } return true; } /* ================================================================================================ idLCP_Symmetric ================================================================================================ */ /* ================================================ idLCP_Symmetric ================================================ */ class idLCP_Symmetric : public idLCP { public: virtual bool Solve( const idMatX& o_m, idVecX& o_x, const idVecX& o_b, const idVecX& o_lo, const idVecX& o_hi, const int* o_boxIndex ); private: idMatX m; // original matrix idVecX b; // right hand side idVecX lo, hi; // low and high bounds idVecX f, a; // force and acceleration idVecX delta_f, delta_a; // delta force and delta acceleration idMatX clamped; // LDLt factored sub matrix for clamped variables idVecX diagonal; // reciprocal of diagonal of LDLt factored sub matrix for clamped variables idVecX solveCache1; // intermediate result cached in SolveClamped idVecX solveCache2; // " int numUnbounded; // number of unbounded variables int numClamped; // number of clamped variables int clampedChangeStart; // lowest row/column changed in the clamped matrix during an iteration float** rowPtrs; // pointers to the rows of m int* boxIndex; // box index int* side; // tells if a variable is at the low boundary = -1, high boundary = 1 or inbetween = 0 int* permuted; // index to keep track of the permutation bool padded; // set to true if the rows of the initial matrix are 16 byte padded bool FactorClamped(); void SolveClamped( idVecX& x, const float* b ); void Swap( int i, int j ); void AddClamped( int r, bool useSolveCache ); void RemoveClamped( int r ); void CalcForceDelta( int d, float dir ); void CalcAccelDelta( int d ); void ChangeForce( int d, float step ); void ChangeAccel( int d, float step ); }; /* ======================== idLCP_Symmetric::FactorClamped ======================== */ bool idLCP_Symmetric::FactorClamped() { clampedChangeStart = 0; for( int i = 0; i < numClamped; i++ ) { memcpy( clamped[i], rowPtrs[i], numClamped * sizeof( float ) ); } return LDLT_Factor( clamped, diagonal, numClamped ); } /* ======================== idLCP_Symmetric::SolveClamped ======================== */ void idLCP_Symmetric::SolveClamped( idVecX& x, const float* b ) { // solve L LowerTriangularSolve( clamped, solveCache1.ToFloatPtr(), b, numClamped, clampedChangeStart ); // scale with D Multiply( solveCache2.ToFloatPtr(), solveCache1.ToFloatPtr(), diagonal.ToFloatPtr(), numClamped ); // solve Lt LowerTriangularSolveTranspose( clamped, x.ToFloatPtr(), solveCache2.ToFloatPtr(), numClamped ); clampedChangeStart = numClamped; } /* ======================== idLCP_Symmetric::Swap ======================== */ void idLCP_Symmetric::Swap( int i, int j ) { if( i == j ) { return; } SwapValues( rowPtrs[i], rowPtrs[j] ); m.SwapColumns( i, j ); b.SwapElements( i, j ); lo.SwapElements( i, j ); hi.SwapElements( i, j ); a.SwapElements( i, j ); f.SwapElements( i, j ); if( boxIndex != NULL ) { SwapValues( boxIndex[i], boxIndex[j] ); } SwapValues( side[i], side[j] ); SwapValues( permuted[i], permuted[j] ); } /* ======================== idLCP_Symmetric::AddClamped ======================== */ void idLCP_Symmetric::AddClamped( int r, bool useSolveCache ) { assert( r >= numClamped ); if( numClamped < clampedChangeStart ) { clampedChangeStart = numClamped; } // add a row at the bottom and a column at the right of the factored // matrix for the clamped variables Swap( 
numClamped, r ); // solve for v in L * v = rowPtr[numClamped] float dot; if( useSolveCache ) { // the lower triangular solve was cached in SolveClamped called by CalcForceDelta memcpy( clamped[numClamped], solveCache2.ToFloatPtr(), numClamped * sizeof( float ) ); // calculate row dot product dot = BigDotProduct( solveCache2.ToFloatPtr(), solveCache1.ToFloatPtr(), numClamped ); } else { float* v = ( float* ) _alloca16( numClamped * sizeof( float ) ); LowerTriangularSolve( clamped, v, rowPtrs[numClamped], numClamped, 0 ); // add bottom row to L Multiply( clamped[numClamped], v, diagonal.ToFloatPtr(), numClamped ); // calculate row dot product dot = BigDotProduct( clamped[numClamped], v, numClamped ); } // update diagonal[numClamped] float d = rowPtrs[numClamped][numClamped] - dot; if( fabs( d ) < idMath::FLT_SMALLEST_NON_DENORMAL ) { idLib::Printf( "idLCP_Symmetric::AddClamped: updating factorization failed\n" ); d = idMath::FLT_SMALLEST_NON_DENORMAL; } clamped[numClamped][numClamped] = d; diagonal[numClamped] = 1.0f / d; numClamped++; } /* ======================== idLCP_Symmetric::RemoveClamped ======================== */ void idLCP_Symmetric::RemoveClamped( int r ) { if( !verify( r < numClamped ) ) { // complete fail, most likely due to exceptional floating point values return; } if( r < clampedChangeStart ) { clampedChangeStart = r; } numClamped--; // no need to swap and update the factored matrix when the last row and column are removed if( r == numClamped ) { return; } // swap the to be removed row/column with the last row/column Swap( r, numClamped ); // update the factored matrix float* addSub = ( float* ) _alloca16( numClamped * sizeof( float ) ); if( r == 0 ) { if( numClamped == 1 ) { float diag = rowPtrs[0][0]; if( fabs( diag ) < idMath::FLT_SMALLEST_NON_DENORMAL ) { idLib::Printf( "idLCP_Symmetric::RemoveClamped: updating factorization failed\n" ); diag = idMath::FLT_SMALLEST_NON_DENORMAL; } clamped[0][0] = diag; diagonal[0] = 1.0f / diag; return; } // calculate the row/column to be added to the lower right sub matrix starting at (r, r) float* original = rowPtrs[numClamped]; float* ptr = rowPtrs[r]; addSub[0] = ptr[0] - original[numClamped]; for( int i = 1; i < numClamped; i++ ) { addSub[i] = ptr[i] - original[i]; } } else { float* v = ( float* ) _alloca16( numClamped * sizeof( float ) ); // solve for v in L * v = rowPtr[r] LowerTriangularSolve( clamped, v, rowPtrs[r], r, 0 ); // update removed row Multiply( clamped[r], v, diagonal.ToFloatPtr(), r ); // if the last row/column of the matrix is updated if( r == numClamped - 1 ) { // only calculate new diagonal float dot = BigDotProduct( clamped[r], v, r ); float diag = rowPtrs[r][r] - dot; if( fabs( diag ) < idMath::FLT_SMALLEST_NON_DENORMAL ) { idLib::Printf( "idLCP_Symmetric::RemoveClamped: updating factorization failed\n" ); diag = idMath::FLT_SMALLEST_NON_DENORMAL; } clamped[r][r] = diag; diagonal[r] = 1.0f / diag; return; } // calculate the row/column to be added to the lower right sub matrix starting at (r, r) for( int i = 0; i < r; i++ ) { v[i] = clamped[r][i] * clamped[i][i]; } for( int i = r; i < numClamped; i++ ) { float sum; if( i == r ) { sum = clamped[r][r]; } else { sum = clamped[r][r] * clamped[i][r]; } float* ptr = clamped[i]; for( int j = 0; j < r; j++ ) { sum += ptr[j] * v[j]; } addSub[i] = rowPtrs[r][i] - sum; } } // add row/column to the lower right sub matrix starting at (r, r) float* v1 = ( float* ) _alloca16( numClamped * sizeof( float ) ); float* v2 = ( float* ) _alloca16( numClamped * sizeof( float ) ); 
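/*
	The code below repairs the LDL' factorization in place with a simultaneous
	rank-one update and rank-one downdate. Writing w for addSub and e_r for the
	unit vector of index r, the vectors v1 and v2 constructed next satisfy

		v1 * v1' - v2 * v2' = e_r * w' + w * e_r' - w[r] * e_r * e_r'

	which is exactly the symmetric change of adding w to row r and column r of the
	lower right sub matrix, with the diagonal entry counted once. The identity can
	be checked by expanding the products: entries with both indices greater than r
	vanish because v1[i] == v2[i] there, the mixed ( r, i ) entries reduce to w[i],
	and the diagonal entry reduces to w[r] through ( a + 1 )^2 - ( a - 1 )^2 == 4 * a
	with a = 0.5f * w[r] and the common factor SQRT_1OVER2 in both vectors.
	Applying the +v1 * v1' update and the -v2 * v2' downdate together lets the
	factorization be corrected in a single pass instead of being rebuilt.
*/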
float diag = idMath::SQRT_1OVER2; v1[r] = ( 0.5f * addSub[r] + 1.0f ) * diag; v2[r] = ( 0.5f * addSub[r] - 1.0f ) * diag; for( int i = r + 1; i < numClamped; i++ ) { v1[i] = v2[i] = addSub[i] * diag; } float alpha1 = 1.0f; float alpha2 = -1.0f; // simultaneous update/downdate of the sub matrix starting at (r, r) int n = clamped.GetNumColumns(); for( int i = r; i < numClamped; i++ ) { diag = clamped[i][i]; float p1 = v1[i]; float newDiag = diag + alpha1 * p1 * p1; if( fabs( newDiag ) < idMath::FLT_SMALLEST_NON_DENORMAL ) { idLib::Printf( "idLCP_Symmetric::RemoveClamped: updating factorization failed\n" ); newDiag = idMath::FLT_SMALLEST_NON_DENORMAL; } alpha1 /= newDiag; float beta1 = p1 * alpha1; alpha1 *= diag; diag = newDiag; float p2 = v2[i]; newDiag = diag + alpha2 * p2 * p2; if( fabs( newDiag ) < idMath::FLT_SMALLEST_NON_DENORMAL ) { idLib::Printf( "idLCP_Symmetric::RemoveClamped: updating factorization failed\n" ); newDiag = idMath::FLT_SMALLEST_NON_DENORMAL; } clamped[i][i] = newDiag; float invNewDiag = 1.0f / newDiag; diagonal[i] = invNewDiag; alpha2 *= invNewDiag; float beta2 = p2 * alpha2; alpha2 *= diag; // update column below diagonal (i,i) float* ptr = clamped.ToFloatPtr() + i; int j; for( j = i + 1; j < numClamped - 1; j += 2 ) { float sum0 = ptr[( j + 0 ) * n]; float sum1 = ptr[( j + 1 ) * n]; v1[j + 0] -= p1 * sum0; v1[j + 1] -= p1 * sum1; sum0 += beta1 * v1[j + 0]; sum1 += beta1 * v1[j + 1]; v2[j + 0] -= p2 * sum0; v2[j + 1] -= p2 * sum1; sum0 += beta2 * v2[j + 0]; sum1 += beta2 * v2[j + 1]; ptr[( j + 0 )*n] = sum0; ptr[( j + 1 )*n] = sum1; } for( ; j < numClamped; j++ ) { float sum = ptr[j * n]; v1[j] -= p1 * sum; sum += beta1 * v1[j]; v2[j] -= p2 * sum; sum += beta2 * v2[j]; ptr[j * n] = sum; } } } /* ======================== idLCP_Symmetric::CalcForceDelta Modifies this->delta_f. ======================== */ ID_INLINE void idLCP_Symmetric::CalcForceDelta( int d, float dir ) { delta_f[d] = dir; if( numClamped == 0 ) { return; } // solve force delta SolveClamped( delta_f, rowPtrs[d] ); // flip force delta based on direction if( dir > 0.0f ) { float* ptr = delta_f.ToFloatPtr(); for( int i = 0; i < numClamped; i++ ) { ptr[i] = - ptr[i]; } } } /* ======================== idLCP_Symmetric::CalcAccelDelta Modifies this->delta_a and uses this->delta_f. ======================== */ ID_INLINE void idLCP_Symmetric::CalcAccelDelta( int d ) { // only the not clamped variables, including the current variable, can have a change in acceleration for( int j = numClamped; j <= d; j++ ) { // only the clamped variables and the current variable have a force delta unequal zero float dot = BigDotProduct( rowPtrs[j], delta_f.ToFloatPtr(), numClamped ); delta_a[j] = dot + rowPtrs[j][d] * delta_f[d]; } } /* ======================== idLCP_Symmetric::ChangeForce Modifies this->f and uses this->delta_f. ======================== */ ID_INLINE void idLCP_Symmetric::ChangeForce( int d, float step ) { // only the clamped variables and current variable have a force delta unequal zero MultiplyAdd( f.ToFloatPtr(), step, delta_f.ToFloatPtr(), numClamped ); f[d] += step * delta_f[d]; } /* ======================== idLCP_Symmetric::ChangeAccel Modifies this->a and uses this->delta_a. 
======================== */ ID_INLINE void idLCP_Symmetric::ChangeAccel( int d, float step ) { // only the not clamped variables, including the current variable, can have an acceleration unequal zero MultiplyAdd( a.ToFloatPtr() + numClamped, step, delta_a.ToFloatPtr() + numClamped, d - numClamped + 1 ); } /* ======================== idLCP_Symmetric::Solve ======================== */ bool idLCP_Symmetric::Solve( const idMatX& o_m, idVecX& o_x, const idVecX& o_b, const idVecX& o_lo, const idVecX& o_hi, const int* o_boxIndex ) { // true when the matrix rows are 16 byte padded padded = ( ( o_m.GetNumRows() + 3 )&~3 ) == o_m.GetNumColumns(); assert( padded || o_m.GetNumRows() == o_m.GetNumColumns() ); assert( o_x.GetSize() == o_m.GetNumRows() ); assert( o_b.GetSize() == o_m.GetNumRows() ); assert( o_lo.GetSize() == o_m.GetNumRows() ); assert( o_hi.GetSize() == o_m.GetNumRows() ); // allocate memory for permuted input f.SetData( o_m.GetNumRows(), VECX_ALLOCA( o_m.GetNumRows() ) ); a.SetData( o_b.GetSize(), VECX_ALLOCA( o_b.GetSize() ) ); b.SetData( o_b.GetSize(), VECX_ALLOCA( o_b.GetSize() ) ); lo.SetData( o_lo.GetSize(), VECX_ALLOCA( o_lo.GetSize() ) ); hi.SetData( o_hi.GetSize(), VECX_ALLOCA( o_hi.GetSize() ) ); if( o_boxIndex != NULL ) { boxIndex = ( int* )_alloca16( o_x.GetSize() * sizeof( int ) ); memcpy( boxIndex, o_boxIndex, o_x.GetSize() * sizeof( int ) ); } else { boxIndex = NULL; } // we override the const on o_m here but on exit the matrix is unchanged m.SetData( o_m.GetNumRows(), o_m.GetNumColumns(), const_cast< float* >( o_m[0] ) ); f.Zero(); a.Zero(); b = o_b; lo = o_lo; hi = o_hi; // pointers to the rows of m rowPtrs = ( float** ) _alloca16( m.GetNumRows() * sizeof( float* ) ); for( int i = 0; i < m.GetNumRows(); i++ ) { rowPtrs[i] = m[i]; } // tells if a variable is at the low boundary, high boundary or inbetween side = ( int* ) _alloca16( m.GetNumRows() * sizeof( int ) ); // index to keep track of the permutation permuted = ( int* ) _alloca16( m.GetNumRows() * sizeof( int ) ); for( int i = 0; i < m.GetNumRows(); i++ ) { permuted[i] = i; } // permute input so all unbounded variables come first numUnbounded = 0; for( int i = 0; i < m.GetNumRows(); i++ ) { if( lo[i] == -idMath::INFINITY && hi[i] == idMath::INFINITY ) { if( numUnbounded != i ) { Swap( numUnbounded, i ); } numUnbounded++; } } // permute input so all variables using the boxIndex come last int boxStartIndex = m.GetNumRows(); if( boxIndex != NULL ) { for( int i = m.GetNumRows() - 1; i >= numUnbounded; i-- ) { if( boxIndex[i] >= 0 ) { boxStartIndex--; if( boxStartIndex != i ) { Swap( boxStartIndex, i ); } } } } // sub matrix for factorization clamped.SetDataCacheLines( m.GetNumRows(), m.GetNumColumns(), MATX_ALLOCA_CACHE_LINES( m.GetNumRows() * m.GetNumColumns() ), true ); diagonal.SetData( m.GetNumRows(), VECX_ALLOCA( m.GetNumRows() ) ); solveCache1.SetData( m.GetNumRows(), VECX_ALLOCA( m.GetNumRows() ) ); solveCache2.SetData( m.GetNumRows(), VECX_ALLOCA( m.GetNumRows() ) ); // all unbounded variables are clamped numClamped = numUnbounded; // if there are unbounded variables if( numUnbounded ) { // factor and solve for unbounded variables if( !FactorClamped() ) { idLib::Printf( "idLCP_Symmetric::Solve: unbounded factorization failed\n" ); return false; } SolveClamped( f, b.ToFloatPtr() ); // if there are no bounded variables we are done if( numUnbounded == m.GetNumRows() ) { o_x = f; // the vector is not permuted return true; } } int numIgnored = 0; // allocate for delta force and delta acceleration delta_f.SetData( 
m.GetNumRows(), VECX_ALLOCA( m.GetNumRows() ) ); delta_a.SetData( m.GetNumRows(), VECX_ALLOCA( m.GetNumRows() ) ); // solve for bounded variables idStr failed; for( int i = numUnbounded; i < m.GetNumRows(); i++ ) { clampedChangeStart = 0; // once we hit the box start index we can initialize the low and high boundaries of the variables using the box index if( i == boxStartIndex ) { for( int j = 0; j < boxStartIndex; j++ ) { o_x[permuted[j]] = f[j]; } for( int j = boxStartIndex; j < m.GetNumRows(); j++ ) { float s = o_x[boxIndex[j]]; if( lo[j] != -idMath::INFINITY ) { lo[j] = - idMath::Fabs( lo[j] * s ); } if( hi[j] != idMath::INFINITY ) { hi[j] = idMath::Fabs( hi[j] * s ); } } } // calculate acceleration for current variable float dot = BigDotProduct( rowPtrs[i], f.ToFloatPtr(), i ); a[i] = dot - b[i]; // if already at the low boundary if( lo[i] >= -LCP_BOUND_EPSILON && a[i] >= -LCP_ACCEL_EPSILON ) { side[i] = -1; continue; } // if already at the high boundary if( hi[i] <= LCP_BOUND_EPSILON && a[i] <= LCP_ACCEL_EPSILON ) { side[i] = 1; continue; } // if inside the clamped region if( idMath::Fabs( a[i] ) <= LCP_ACCEL_EPSILON ) { side[i] = 0; AddClamped( i, false ); continue; } // drive the current variable into a valid region int n = 0; for( ; n < maxIterations; n++ ) { // direction to move float dir = ( a[i] <= 0.0f ) ? 1.0f : -1.0f; // calculate force delta CalcForceDelta( i, dir ); // calculate acceleration delta: delta_a = m * delta_f; CalcAccelDelta( i ); float maxStep; int limit; int limitSide; // maximum step we can take GetMaxStep( f.ToFloatPtr(), a.ToFloatPtr(), delta_f.ToFloatPtr(), delta_a.ToFloatPtr(), lo.ToFloatPtr(), hi.ToFloatPtr(), side, numUnbounded, numClamped, i, dir, maxStep, limit, limitSide ); if( maxStep <= 0.0f ) { #ifdef IGNORE_UNSATISFIABLE_VARIABLES // ignore the current variable completely lo[i] = hi[i] = 0.0f; f[i] = 0.0f; side[i] = -1; numIgnored++; #else failed.Format( "invalid step size %.4f", maxStep ); for( int j = i; j < m.GetNumRows(); j++ ) { f[j] = 0.0f; } numIgnored = m.GetNumRows() - i; #endif break; } // change force ChangeForce( i, maxStep ); // change acceleration ChangeAccel( i, maxStep ); // clamp/unclamp the variable that limited this step side[limit] = limitSide; if( limitSide == 0 ) { a[limit] = 0.0f; AddClamped( limit, ( limit == i ) ); } else if( limitSide == -1 ) { f[limit] = lo[limit]; if( limit != i ) { RemoveClamped( limit ); } } else if( limitSide == 1 ) { f[limit] = hi[limit]; if( limit != i ) { RemoveClamped( limit ); } } // if the current variable limited the step we can continue with the next variable if( limit == i ) { break; } } if( n >= maxIterations ) { failed.Format( "max iterations %d", maxIterations ); break; } if( failed.Length() ) { break; } } #ifdef IGNORE_UNSATISFIABLE_VARIABLES if( numIgnored ) { if( lcp_showFailures.GetBool() ) { idLib::Printf( "idLCP_Symmetric::Solve: %d of %d bounded variables ignored\n", numIgnored, m.GetNumRows() - numUnbounded ); } } #endif // if failed clear remaining forces if( failed.Length() ) { if( lcp_showFailures.GetBool() ) { idLib::Printf( "idLCP_Symmetric::Solve: %s (%d of %d bounded variables ignored)\n", failed.c_str(), numIgnored, m.GetNumRows() - numUnbounded ); } } #if defined(_DEBUG) && 0 if( failed.Length() ) { // test whether or not the solution satisfies the complementarity conditions for( int i = 0; i < m.GetNumRows(); i++ ) { a[i] = -b[i]; for( int j = 0; j < m.GetNumRows(); j++ ) { a[i] += rowPtrs[i][j] * f[j]; } if( f[i] == lo[i] ) { if( lo[i] != hi[i] && a[i] < -LCP_ACCEL_EPSILON
) { int bah1 = 1; } } else if( f[i] == hi[i] ) { if( lo[i] != hi[i] && a[i] > LCP_ACCEL_EPSILON ) { int bah2 = 1; } } else if( f[i] < lo[i] || f[i] > hi[i] || idMath::Fabs( a[i] ) > 1.0f ) { int bah3 = 1; } } } #endif // unpermute result for( int i = 0; i < f.GetSize(); i++ ) { o_x[permuted[i]] = f[i]; } return true; } /* ================================================================================================ idLCP ================================================================================================ */ /* ======================== idLCP::AllocSquare ======================== */ idLCP* idLCP::AllocSquare() { idLCP* lcp = new idLCP_Square; lcp->SetMaxIterations( 32 ); return lcp; } /* ======================== idLCP::AllocSymmetric ======================== */ idLCP* idLCP::AllocSymmetric() { idLCP* lcp = new idLCP_Symmetric; lcp->SetMaxIterations( 32 ); return lcp; } /* ======================== idLCP::~idLCP ======================== */ idLCP::~idLCP() { } /* ======================== idLCP::SetMaxIterations ======================== */ void idLCP::SetMaxIterations( int max ) { maxIterations = max; } /* ======================== idLCP::GetMaxIterations ======================== */ int idLCP::GetMaxIterations() { return maxIterations; } /* ======================== idLCP::Test_f ======================== */ void idLCP::Test_f( const idCmdArgs& args ) { #ifdef ENABLE_TEST_CODE DotProduct_Test(); LowerTriangularSolve_Test(); LowerTriangularSolveTranspose_Test(); LDLT_Factor_Test(); #endif }
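/*
	A minimal usage sketch for the boxed LCP interface defined above. It is kept
	inside #if 0 and is never compiled; the helper name LCP_UsageSketch and the
	numeric values are illustrative only, and the sketch assumes nothing beyond
	the interfaces visible in this translation unit (idLCP::AllocSymmetric,
	idLCP::Solve, idMatX/idVecX::SetData, VECX_ALLOCA, MATX_ALLOCA).
*/
#if 0
static void LCP_UsageSketch()
{
	// Solve finds x with lo <= x <= hi such that a = A * x - b is zero for
	// variables strictly between their bounds, non-negative at the lower bound
	// and non-positive at the upper bound. A must be symmetric for the LDLt
	// based solver allocated here.
	idLCP* lcp = idLCP::AllocSymmetric();

	idMatX A;
	idVecX x, b, lo, hi;

	A.SetData( 2, 2, MATX_ALLOCA( 2 * 2 ) );
	x.SetData( 2, VECX_ALLOCA( 2 ) );
	b.SetData( 2, VECX_ALLOCA( 2 ) );
	lo.SetData( 2, VECX_ALLOCA( 2 ) );
	hi.SetData( 2, VECX_ALLOCA( 2 ) );

	// a tiny symmetric positive definite system with made up values
	A[0][0] = 2.0f; A[0][1] = 1.0f;
	A[1][0] = 1.0f; A[1][1] = 2.0f;
	b[0] = 1.0f;
	b[1] = -1.0f;

	// the first variable is unbounded, the second is boxed to [ 0, 1 ]
	lo[0] = -idMath::INFINITY;
	hi[0] = idMath::INFINITY;
	lo[1] = 0.0f;
	hi[1] = 1.0f;

	if( !lcp->Solve( A, x, b, lo, hi, NULL ) )
	{
		idLib::Printf( "LCP_UsageSketch: solve failed\n" );
	}

	delete lcp;
}
#endif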