doom3-bfg/neo/idlib/geometry/DrawVert_intrinsics.h

204 lines
9.5 KiB
C

/*
===========================================================================
Doom 3 BFG Edition GPL Source Code
Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company.
This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code").
Doom 3 BFG Edition Source Code is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Doom 3 BFG Edition Source Code is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Doom 3 BFG Edition Source Code. If not, see <http://www.gnu.org/licenses/>.
In addition, the Doom 3 BFG Edition Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 BFG Edition Source Code. If not, please request a copy in writing from id Software at the address below.
If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
===========================================================================
*/
#ifndef __DRAWVERT_INTRINSICS_H__
#define __DRAWVERT_INTRINSICS_H__
// Bit masks that isolate the sign, exponent and mantissa fields of a 32-bit float bit pattern.
static const __m128i vector_int_f32_sign_mask = _mm_set1_epi32( 1U << IEEE_FLT_SIGN_BIT );
static const __m128i vector_int_f32_exponent_mask = _mm_set1_epi32( ( ( 1U << IEEE_FLT_EXPONENT_BITS ) - 1 ) << IEEE_FLT_MANTISSA_BITS );
static const __m128i vector_int_f32_mantissa_mask = _mm_set1_epi32( ( 1U << IEEE_FLT_MANTISSA_BITS ) - 1 );
// Clamp limits for the half-float exponent and mantissa fields (already shifted into half-float bit positions).
// FastF32toF16 substitutes the min values on underflow and the max values on overflow.
static const __m128i vector_int_f16_min_exponent = _mm_set1_epi32( 0 );
static const __m128i vector_int_f16_max_exponent = _mm_set1_epi32( ( 30 << IEEE_FLT16_MANTISSA_BITS ) );
static const __m128i vector_int_f16_min_mantissa = _mm_set1_epi32( 0 );
static const __m128i vector_int_f16_max_mantissa = _mm_set1_epi32( ( ( 1 << IEEE_FLT16_MANTISSA_BITS ) - 1 ) );
// Difference between the float and half-float exponent biases, positioned in the half-float exponent field.
static const __m128i vector_int_f32_to_f16_exponent_bias = _mm_set1_epi32( ( IEEE_FLT_EXPONENT_BIAS - IEEE_FLT16_EXPONENT_BIAS ) << IEEE_FLT16_MANTISSA_BITS );
// Right-shift amounts that move each float field down to its half-float bit position.
static const int f32_to_f16_sign_shift = IEEE_FLT_SIGN_BIT - IEEE_FLT16_SIGN_BIT;
static const int f32_to_f16_exponent_shift = IEEE_FLT_MANTISSA_BITS - IEEE_FLT16_MANTISSA_BITS;
static const int f32_to_f16_mantissa_shift = IEEE_FLT_MANTISSA_BITS - IEEE_FLT16_MANTISSA_BITS;
// General purpose integer constants.
static const __m128i vector_int_zero = _mm_setzero_si128();
static const __m128i vector_int_one = _mm_set_epi32( 1, 1, 1, 1 );
// All bits set in the three low elements, none in the highest one; AND-ing with it zeroes the last component.
// NOTE(review): __m128c is defined elsewhere; presumably a bitwise __m128i -> __m128 cast -- confirm.
static const __m128 vector_float_mask_clear_last = __m128c( _mm_set_epi32( 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF ) );
// ( 0, 0, 0, 1 ); OR-ed into a vector whose last component was cleared to set it to 1.0.
static const __m128 vector_float_last_one = { 0.0f, 0.0f, 0.0f, 1.0f };
// Scales values in the 0..255 byte range down to 0..1.
static const __m128 vector_float_1_over_255 = { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f };
// Used by LoadSkinnedDrawVertPosition as the fourth row of its transpose so that the four-way sum yields w = 1.0.
static const __m128 vector_float_1_over_4 = { 1.0f / 4.0f, 1.0f / 4.0f, 1.0f / 4.0f, 1.0f / 4.0f };
/*
====================
FastF32toF16

Converts four 32-bit float bit patterns into half-float bit patterns by
re-packing the sign, exponent and mantissa fields.

- The mantissa is truncated, not rounded.
- If the re-biased exponent is negative the result is a signed zero.
- If the re-biased exponent exceeds vector_int_f16_max_exponent the result is
  clamped to the maximum exponent / maximum mantissa with the sign preserved.

The four 16-bit results occupy the low 64 bits of the return value and are
repeated in the high 64 bits.
====================
*/
ID_INLINE_EXTERN __m128i FastF32toF16( __m128i f32_bits )
{
	// move each field of the float down to its half-float bit position
	__m128i f16_sign = _mm_srli_epi32( _mm_and_si128( f32_bits, vector_int_f32_sign_mask ), f32_to_f16_sign_shift );
	__m128i f16_exponent = _mm_srli_epi32( _mm_and_si128( f32_bits, vector_int_f32_exponent_mask ), f32_to_f16_exponent_shift );
	__m128i f16_mantissa = _mm_srli_epi32( _mm_and_si128( f32_bits, vector_int_f32_mantissa_mask ), f32_to_f16_mantissa_shift );

	// re-bias the exponent; it may now be out of the half-float range
	f16_exponent = _mm_sub_epi32( f16_exponent, vector_int_f32_to_f16_exponent_bias );

	const __m128i underflow = _mm_cmplt_epi32( f16_exponent, vector_int_f16_min_exponent );
	const __m128i overflow = _mm_cmpgt_epi32( f16_exponent, vector_int_f16_max_exponent );

	// clamp out-of-range lanes, same selection the scalar version does with ?:
	f16_exponent = _mm_sel_si128( f16_exponent, vector_int_f16_min_exponent, underflow );
	f16_exponent = _mm_sel_si128( f16_exponent, vector_int_f16_max_exponent, overflow );
	f16_mantissa = _mm_sel_si128( f16_mantissa, vector_int_f16_min_mantissa, underflow );
	f16_mantissa = _mm_sel_si128( f16_mantissa, vector_int_f16_max_mantissa, overflow );

	__m128i flt16 = _mm_or_si128( _mm_or_si128( f16_sign, f16_exponent ), f16_mantissa );

	// _mm_packs_epi32 saturates to the signed 16-bit range, so a lane with the
	// half-float sign bit set (0x8000..0xFFFF) would be squashed to 0x7FFF.
	// Sign-extend bit 15 through the upper half first so the pack keeps the
	// low 16 bits unchanged for negative values as well.
	flt16 = _mm_srai_epi32( _mm_slli_epi32( flt16, 16 ), 16 );

	return _mm_packs_epi32( flt16, flt16 );
}
/*
====================
Scalar_FastF32toF16

Scalar conversion of one float to a half-float bit pattern.
The mantissa is truncated, a negative re-biased exponent produces a signed
zero, and an exponent above the maximum is clamped to the maximum exponent
with a full mantissa.
====================
*/
ID_INLINE_EXTERN halfFloat_t Scalar_FastF32toF16( float f32 )
{
	// masks selecting the fields of the 32-bit source
	const int srcSignMask = 1U << IEEE_FLT_SIGN_BIT;
	const int srcExponentMask = ( ( 1U << IEEE_FLT_EXPONENT_BITS ) - 1 ) << IEEE_FLT_MANTISSA_BITS;
	const int srcMantissaMask = ( 1U << IEEE_FLT_MANTISSA_BITS ) - 1;

	// shifts that bring each source field down to its half-float position
	const int signShift = IEEE_FLT_SIGN_BIT - IEEE_FLT16_SIGN_BIT;
	const int exponentShift = IEEE_FLT_MANTISSA_BITS - IEEE_FLT16_MANTISSA_BITS;
	const int mantissaShift = IEEE_FLT_MANTISSA_BITS - IEEE_FLT16_MANTISSA_BITS;
	const int exponentRebias = ( IEEE_FLT_EXPONENT_BIAS - IEEE_FLT16_EXPONENT_BIAS ) << IEEE_FLT16_MANTISSA_BITS;

	// clamp limits, expressed in half-float bit positions
	const int exponentFloor = 0;
	const int exponentCeiling = ( 30 << IEEE_FLT16_MANTISSA_BITS );
	const int mantissaEmpty = 0;
	const int mantissaFull = ( ( 1 << IEEE_FLT16_MANTISSA_BITS ) - 1 );

	// reinterpret the float as its raw bit pattern
	const int rawBits = *( unsigned int* )&f32;

	const int halfSign = ( ( unsigned int )( rawBits & srcSignMask ) >> signShift );
	int halfExponent = ( ( unsigned int )( rawBits & srcExponentMask ) >> exponentShift );
	int halfMantissa = ( ( unsigned int )( rawBits & srcMantissaMask ) >> mantissaShift );

	halfExponent -= exponentRebias;

	// the two conditions are mutually exclusive, so a single if / else covers both clamps
	if( halfExponent < exponentFloor )
	{
		halfExponent = exponentFloor;
		halfMantissa = mantissaEmpty;
	}
	else if( halfExponent > exponentCeiling )
	{
		halfExponent = exponentCeiling;
		halfMantissa = mantissaFull;
	}

	return ( halfFloat_t )( halfSign | halfExponent | halfMantissa );
}
/*
====================
LoadSkinnedDrawVertPosition

Blends the joint matrices selected by base.color[0..3] using the weights
base.color2[0..3] / 255 and transforms base.xyz by the blended matrix.

Returns ( x, y, z, 1 ) of the skinned position.

Uses aligned loads, so the joint matrices and base.xyz must be 16-byte aligned.
====================
*/
ID_INLINE_EXTERN __m128 LoadSkinnedDrawVertPosition( const idDrawVert& base, const idJointMat* joints )
{
	// float data of the four joints referenced by the color bytes
	const float* jointA = joints[base.color[0]].ToFloatPtr();
	const float* jointB = joints[base.color[1]].ToFloatPtr();
	const float* jointC = joints[base.color[2]].ToFloatPtr();
	const float* jointD = joints[base.color[3]].ToFloatPtr();

	// widen the four color2 bytes to 32-bit ints, convert to float and scale by 1/255
	const __m128i packedBytes = _mm_cvtsi32_si128( *( const unsigned int* )base.color2 );
	const __m128i widened16 = _mm_unpacklo_epi8( packedBytes, vector_int_zero );
	const __m128i widened32 = _mm_unpacklo_epi16( widened16, vector_int_zero );
	const __m128 blend = _mm_mul_ps( _mm_cvtepi32_ps( widened32 ), vector_float_1_over_255 );

	// one broadcast weight per joint
	const __m128 weightA = _mm_splat_ps( blend, 0 );
	const __m128 weightB = _mm_splat_ps( blend, 1 );
	const __m128 weightC = _mm_splat_ps( blend, 2 );
	const __m128 weightD = _mm_splat_ps( blend, 3 );

	// accumulate the three rows of the weighted matrix, joint by joint
	__m128 rowX = _mm_mul_ps( _mm_load_ps( jointA + 0 * 4 ), weightA );
	__m128 rowY = _mm_mul_ps( _mm_load_ps( jointA + 1 * 4 ), weightA );
	__m128 rowZ = _mm_mul_ps( _mm_load_ps( jointA + 2 * 4 ), weightA );

	rowX = _mm_madd_ps( _mm_load_ps( jointB + 0 * 4 ), weightB, rowX );
	rowY = _mm_madd_ps( _mm_load_ps( jointB + 1 * 4 ), weightB, rowY );
	rowZ = _mm_madd_ps( _mm_load_ps( jointB + 2 * 4 ), weightB, rowZ );

	rowX = _mm_madd_ps( _mm_load_ps( jointC + 0 * 4 ), weightC, rowX );
	rowY = _mm_madd_ps( _mm_load_ps( jointC + 1 * 4 ), weightC, rowY );
	rowZ = _mm_madd_ps( _mm_load_ps( jointC + 2 * 4 ), weightC, rowZ );

	rowX = _mm_madd_ps( _mm_load_ps( jointD + 0 * 4 ), weightD, rowX );
	rowY = _mm_madd_ps( _mm_load_ps( jointD + 1 * 4 ), weightD, rowY );
	rowZ = _mm_madd_ps( _mm_load_ps( jointD + 2 * 4 ), weightD, rowZ );

	// load four floats starting at xyz and force the fourth one to 1.0
	__m128 position = _mm_load_ps( base.xyz.ToFloatPtr() );
	position = _mm_or_ps( _mm_and_ps( position, vector_float_mask_clear_last ), vector_float_last_one );

	// per-component products; the horizontal sum of each is one output coordinate
	const __m128 prodX = _mm_mul_ps( rowX, position );
	const __m128 prodY = _mm_mul_ps( rowY, position );
	const __m128 prodZ = _mm_mul_ps( rowZ, position );
	const __m128 prodW = vector_float_1_over_4;		// four quarters sum to w = 1.0

	// 4x4 transpose so the horizontal sums become three vertical adds
	const __m128 xzLo = _mm_unpacklo_ps( prodX, prodZ );	// x0, z0, x1, z1
	const __m128 xzHi = _mm_unpackhi_ps( prodX, prodZ );	// x2, z2, x3, z3
	const __m128 ywLo = _mm_unpacklo_ps( prodY, prodW );	// y0, w0, y1, w1
	const __m128 ywHi = _mm_unpackhi_ps( prodY, prodW );	// y2, w2, y3, w3

	const __m128 col0 = _mm_unpacklo_ps( xzLo, ywLo );	// x0, y0, z0, w0
	const __m128 col1 = _mm_unpackhi_ps( xzLo, ywLo );	// x1, y1, z1, w1
	const __m128 col2 = _mm_unpacklo_ps( xzHi, ywHi );	// x2, y2, z2, w2
	const __m128 col3 = _mm_unpackhi_ps( xzHi, ywHi );	// x3, y3, z3, w3

	// ( col0 + col1 ) + ( col2 + col3 )
	const __m128 sum01 = _mm_add_ps( col0, col1 );
	const __m128 sum23 = _mm_add_ps( col2, col3 );
	return _mm_add_ps( sum01, sum23 );
}
/*
====================
Scalar_LoadSkinnedDrawVertPosition

Scalar path: blends the joint matrices selected by vert.color[0..3] with the
weights vert.color2[0..3] / 255 and returns vert.xyz transformed by the result.
====================
*/
ID_INLINE_EXTERN idVec3 Scalar_LoadSkinnedDrawVertPosition( const idDrawVert& vert, const idJointMat* joints )
{
	// color2 stores the weights as bytes; scale them down to 0..1
	const float byteToWeight = ( 1.0f / 255.0f );

	// the first joint initializes the blended matrix, the other three are added on top
	idJointMat blended;
	idJointMat::Mul( blended, joints[vert.color[0]], vert.color2[0] * byteToWeight );
	idJointMat::Mad( blended, joints[vert.color[1]], vert.color2[1] * byteToWeight );
	idJointMat::Mad( blended, joints[vert.color[2]], vert.color2[2] * byteToWeight );
	idJointMat::Mad( blended, joints[vert.color[3]], vert.color2[3] * byteToWeight );

	// transform the position as a point ( w = 1 )
	const idVec4 point( vert.xyz.x, vert.xyz.y, vert.xyz.z, 1.0f );
	return blended * point;
}
#endif /* !__DRAWVERT_INTRINSICS_H__ */