/*
===========================================================================

Doom 3 BFG Edition GPL Source Code
Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company.

This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code").

Doom 3 BFG Edition Source Code is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Doom 3 BFG Edition Source Code is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Doom 3 BFG Edition Source Code.  If not, see <http://www.gnu.org/licenses/>.

In addition, the Doom 3 BFG Edition Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 BFG Edition Source Code.  If not, please request a copy in writing from id Software at the address below.

If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.

===========================================================================
*/

#ifndef __DRAWVERT_INTRINSICS_H__
#define __DRAWVERT_INTRINSICS_H__


static const __m128i vector_int_f32_sign_mask					= _mm_set1_epi32( 1U << IEEE_FLT_SIGN_BIT );
static const __m128i vector_int_f32_exponent_mask				= _mm_set1_epi32( ( ( 1U << IEEE_FLT_EXPONENT_BITS ) - 1 ) << IEEE_FLT_MANTISSA_BITS );
static const __m128i vector_int_f32_mantissa_mask				= _mm_set1_epi32( ( 1U << IEEE_FLT_MANTISSA_BITS ) - 1 );
static const __m128i vector_int_f16_min_exponent				= _mm_set1_epi32( 0 );
static const __m128i vector_int_f16_max_exponent				= _mm_set1_epi32( ( 30 << IEEE_FLT16_MANTISSA_BITS ) );
static const __m128i vector_int_f16_min_mantissa				= _mm_set1_epi32( 0 );
static const __m128i vector_int_f16_max_mantissa				= _mm_set1_epi32( ( ( 1 << IEEE_FLT16_MANTISSA_BITS ) - 1 ) );
static const __m128i vector_int_f32_to_f16_exponent_bias		= _mm_set1_epi32( ( IEEE_FLT_EXPONENT_BIAS - IEEE_FLT16_EXPONENT_BIAS ) << IEEE_FLT16_MANTISSA_BITS );
static const int f32_to_f16_sign_shift							= IEEE_FLT_SIGN_BIT - IEEE_FLT16_SIGN_BIT;
static const int f32_to_f16_exponent_shift						= IEEE_FLT_MANTISSA_BITS - IEEE_FLT16_MANTISSA_BITS;
static const int f32_to_f16_mantissa_shift						= IEEE_FLT_MANTISSA_BITS - IEEE_FLT16_MANTISSA_BITS;

static const __m128i vector_int_zero							= _mm_setzero_si128();
static const __m128i vector_int_one								= _mm_set_epi32( 1, 1, 1, 1 );

static const __m128 vector_float_mask_clear_last				= __m128c( _mm_set_epi32( 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF ) );
static const __m128 vector_float_last_one						= {   0.0f,	  0.0f,   0.0f,   1.0f };
static const __m128 vector_float_1_over_255						= { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f };
static const __m128 vector_float_1_over_4						= { 1.0f / 4.0f, 1.0f / 4.0f, 1.0f / 4.0f, 1.0f / 4.0f };


/*
====================
FastF32toF16
====================
*/

ID_INLINE_EXTERN __m128i FastF32toF16( __m128i f32_bits )
{
	__m128i f16_sign     = _mm_srli_epi32( _mm_and_si128( f32_bits, vector_int_f32_sign_mask ), f32_to_f16_sign_shift );
	__m128i f16_exponent = _mm_srli_epi32( _mm_and_si128( f32_bits, vector_int_f32_exponent_mask ), f32_to_f16_exponent_shift );
	__m128i f16_mantissa = _mm_srli_epi32( _mm_and_si128( f32_bits, vector_int_f32_mantissa_mask ), f32_to_f16_mantissa_shift );
	
	f16_exponent = _mm_sub_epi32( f16_exponent, vector_int_f32_to_f16_exponent_bias );
	
	const __m128i underflow = _mm_cmplt_epi32( f16_exponent, vector_int_f16_min_exponent );
	const __m128i overflow  = _mm_cmpgt_epi32( f16_exponent, vector_int_f16_max_exponent );
	
	f16_exponent = _mm_sel_si128( f16_exponent, vector_int_f16_min_exponent, underflow );
	f16_exponent = _mm_sel_si128( f16_exponent, vector_int_f16_max_exponent, overflow );
	f16_mantissa = _mm_sel_si128( f16_mantissa, vector_int_f16_min_mantissa, underflow );
	f16_mantissa = _mm_sel_si128( f16_mantissa, vector_int_f16_max_mantissa, overflow );
	
	__m128i flt16 = _mm_or_si128( _mm_or_si128( f16_sign, f16_exponent ), f16_mantissa );
	
	return _mm_packs_epi32( flt16, flt16 );
}


ID_INLINE_EXTERN halfFloat_t Scalar_FastF32toF16( float f32 )
{
	const int f32_sign_mask				= 1U << IEEE_FLT_SIGN_BIT;
	const int f32_exponent_mask			= ( ( 1U << IEEE_FLT_EXPONENT_BITS ) - 1 ) << IEEE_FLT_MANTISSA_BITS;
	const int f32_mantissa_mask			= ( 1U << IEEE_FLT_MANTISSA_BITS ) - 1;
	const int f16_min_exponent			= 0;
	const int f16_max_exponent			= ( 30 << IEEE_FLT16_MANTISSA_BITS );
	const int f16_min_mantissa			= 0;
	const int f16_max_mantissa			= ( ( 1 << IEEE_FLT16_MANTISSA_BITS ) - 1 );
	const int f32_to_f16_sign_shift		= IEEE_FLT_SIGN_BIT - IEEE_FLT16_SIGN_BIT;
	const int f32_to_f16_exponent_shift	= IEEE_FLT_MANTISSA_BITS - IEEE_FLT16_MANTISSA_BITS;
	const int f32_to_f16_mantissa_shift	= IEEE_FLT_MANTISSA_BITS - IEEE_FLT16_MANTISSA_BITS;
	const int f32_to_f16_exponent_bias	= ( IEEE_FLT_EXPONENT_BIAS - IEEE_FLT16_EXPONENT_BIAS ) << IEEE_FLT16_MANTISSA_BITS;
	
	int f32_bits = *( unsigned int* )&f32;
	
	int f16_sign     = ( ( unsigned int )( f32_bits & f32_sign_mask ) >> f32_to_f16_sign_shift );
	int f16_exponent = ( ( unsigned int )( f32_bits & f32_exponent_mask ) >> f32_to_f16_exponent_shift );
	int f16_mantissa = ( ( unsigned int )( f32_bits & f32_mantissa_mask ) >> f32_to_f16_mantissa_shift );
	
	f16_exponent -= f32_to_f16_exponent_bias;
	
	const bool underflow = ( f16_exponent < f16_min_exponent );
	const bool overflow  = ( f16_exponent > f16_max_exponent );
	
	f16_exponent = underflow ? f16_min_exponent : f16_exponent;
	f16_exponent = overflow  ? f16_max_exponent : f16_exponent;
	f16_mantissa = underflow ? f16_min_mantissa : f16_mantissa;
	f16_mantissa = overflow  ? f16_max_mantissa : f16_mantissa;
	
	return ( halfFloat_t )( f16_sign | f16_exponent | f16_mantissa );
}

/*
====================
LoadSkinnedDrawVertPosition
====================
*/

ID_INLINE_EXTERN __m128 LoadSkinnedDrawVertPosition( const idDrawVert& base, const idJointMat* joints )
{
	const idJointMat& j0 = joints[base.color[0]];
	const idJointMat& j1 = joints[base.color[1]];
	const idJointMat& j2 = joints[base.color[2]];
	const idJointMat& j3 = joints[base.color[3]];
	
	__m128i weights_b = _mm_cvtsi32_si128( *( const unsigned int* )base.color2 );
	__m128i weights_s = _mm_unpacklo_epi8( weights_b, vector_int_zero );
	__m128i weights_i = _mm_unpacklo_epi16( weights_s, vector_int_zero );
	
	__m128 weights = _mm_cvtepi32_ps( weights_i );
	weights = _mm_mul_ps( weights, vector_float_1_over_255 );
	
	__m128 w0 = _mm_splat_ps( weights, 0 );
	__m128 w1 = _mm_splat_ps( weights, 1 );
	__m128 w2 = _mm_splat_ps( weights, 2 );
	__m128 w3 = _mm_splat_ps( weights, 3 );
	
	__m128 matX = _mm_mul_ps( _mm_load_ps( j0.ToFloatPtr() + 0 * 4 ), w0 );
	__m128 matY = _mm_mul_ps( _mm_load_ps( j0.ToFloatPtr() + 1 * 4 ), w0 );
	__m128 matZ = _mm_mul_ps( _mm_load_ps( j0.ToFloatPtr() + 2 * 4 ), w0 );
	
	matX = _mm_madd_ps( _mm_load_ps( j1.ToFloatPtr() + 0 * 4 ), w1, matX );
	matY = _mm_madd_ps( _mm_load_ps( j1.ToFloatPtr() + 1 * 4 ), w1, matY );
	matZ = _mm_madd_ps( _mm_load_ps( j1.ToFloatPtr() + 2 * 4 ), w1, matZ );
	
	matX = _mm_madd_ps( _mm_load_ps( j2.ToFloatPtr() + 0 * 4 ), w2, matX );
	matY = _mm_madd_ps( _mm_load_ps( j2.ToFloatPtr() + 1 * 4 ), w2, matY );
	matZ = _mm_madd_ps( _mm_load_ps( j2.ToFloatPtr() + 2 * 4 ), w2, matZ );
	
	matX = _mm_madd_ps( _mm_load_ps( j3.ToFloatPtr() + 0 * 4 ), w3, matX );
	matY = _mm_madd_ps( _mm_load_ps( j3.ToFloatPtr() + 1 * 4 ), w3, matY );
	matZ = _mm_madd_ps( _mm_load_ps( j3.ToFloatPtr() + 2 * 4 ), w3, matZ );
	
	__m128 v = _mm_load_ps( base.xyz.ToFloatPtr() );
	v = _mm_and_ps( v, vector_float_mask_clear_last );
	v = _mm_or_ps( v, vector_float_last_one );
	
	__m128 t0 = _mm_mul_ps( matX, v );
	__m128 t1 = _mm_mul_ps( matY, v );
	__m128 t2 = _mm_mul_ps( matZ, v );
	__m128 t3 = vector_float_1_over_4;
	
	__m128 s0 = _mm_unpacklo_ps( t0, t2 );	// x0, z0, x1, z1
	__m128 s1 = _mm_unpackhi_ps( t0, t2 );	// x2, z2, x3, z3
	__m128 s2 = _mm_unpacklo_ps( t1, t3 );	// y0, w0, y1, w1
	__m128 s3 = _mm_unpackhi_ps( t1, t3 );	// y2, w2, y3, w3
	
	__m128 r0 = _mm_unpacklo_ps( s0, s2 );	// x0, y0, z0, w0
	__m128 r1 = _mm_unpackhi_ps( s0, s2 );	// x1, y1, z1, w1
	__m128 r2 = _mm_unpacklo_ps( s1, s3 );	// x2, y2, z2, w2
	__m128 r3 = _mm_unpackhi_ps( s1, s3 );	// x3, y3, z3, w3
	
	r0 = _mm_add_ps( r0, r1 );
	r2 = _mm_add_ps( r2, r3 );
	r0 = _mm_add_ps( r0, r2 );
	
	return r0;
}


ID_INLINE_EXTERN idVec3 Scalar_LoadSkinnedDrawVertPosition( const idDrawVert& vert, const idJointMat* joints )
{
	const idJointMat& j0 = joints[vert.color[0]];
	const idJointMat& j1 = joints[vert.color[1]];
	const idJointMat& j2 = joints[vert.color[2]];
	const idJointMat& j3 = joints[vert.color[3]];
	
	const float w0 = vert.color2[0] * ( 1.0f / 255.0f );
	const float w1 = vert.color2[1] * ( 1.0f / 255.0f );
	const float w2 = vert.color2[2] * ( 1.0f / 255.0f );
	const float w3 = vert.color2[3] * ( 1.0f / 255.0f );
	
	idJointMat accum;
	idJointMat::Mul( accum, j0, w0 );
	idJointMat::Mad( accum, j1, w1 );
	idJointMat::Mad( accum, j2, w2 );
	idJointMat::Mad( accum, j3, w3 );
	
	return accum * idVec4( vert.xyz.x, vert.xyz.y, vert.xyz.z, 1.0f );
}

#endif /* !__DRAWVERT_INTRINSICS_H__ */