etqw-sdk/source/idlib/math/Simd_Xenon.cpp

// Copyright (C) 2007 Id Software, Inc.
//
#include "../precompiled.h"
#pragma hdrstop
#include "Simd_Generic.h"
#include "Simd_Xenon.h"
//===============================================================
//
// Xenon implementation of idSIMDProcessor
//
//===============================================================
#ifdef _XENON
#if !defined( SHUFFLE_D )
#define SHUFFLE_D( x, y, z, w ) (( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 ))
#endif
static __vector4i vmxi_byte_zero = { 0x00000000, 0x00000000, 0x00000000, 0x00000000 };
static __vector4i vmxi_dword_not = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
static __vector4i vmxi_dword_perm_replacelast = { 0x00010203, 0x04050607, 0x08090A0B, 0x10111213 }; // 00 01 02 10
static __vector4i vmxi_dword_perm_plane_x = { 0x00010203, 0x04050607, 0x10111213, 0x14151617 }; // 00 01 10 11
static __vector4i vmxi_dword_perm_plane_y = { 0x08090A0B, 0x0C0D0E0F, 0x18191A1B, 0x1C1D1E1F }; // 02 03 12 13
static __vector4i vmxi_dword_perm_matrix = { 0x10111213, 0x10111213, 0x10111213, 0x0C0D0E0F }; // 10 10 10 03
static __vector4i vmxi_dword_perm_quat2mat1 = { 0x10111213, 0x04050607, 0x08090A0B, 0x0C0D0E0F }; // 10 01 02 03
static __vector4i vmxi_dword_perm_quat2mat2 = { 0x00010203, 0x04050607, 0x18191A1B, 0x1C1D1E1F }; // 00 01 12 13
static __vector4i vmxi_dword_perm_quat2mat3 = { 0x00010203, 0x04050607, 0x08090A0B, 0x10111213 }; // 00 01 02 10
static __vector4i vmxi_dword_perm_quat2mat4 = { 0x04050607, 0x00010203, 0x0C0D0E0F, 0x14151617 }; // 01 00 03 11
static __vector4i vmxi_dword_perm_quat2mat5 = { 0x08090A0B, 0x0C0D0E0F, 0x00010203, 0x18191A1B }; // 02 03 00 12
static __vector4i vmxi_dword_mask_clear_last = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 };
static __vector4i vmxi_dword_quat2mat_swizzle0 = { 0x0C080400, 0x0C080400, 0x0C080400, 0x0C080400 }; // C 8 4 0
static __vector4i vmxi_dword_quat2mat_swizzle1 = { 0x0004080C, 0x0004080C, 0x0004080C, 0x0004080C }; // 0 4 8 C
static __vector4i vmxi_dword_quat2mat_swizzle2 = { 0x04000C08, 0x04000C08, 0x04000C08, 0x04000C08 }; // 4 0 C 8
static __vector4i vmxi_dword_quat2mat_swizzle3 = { 0x080C0004, 0x080C0004, 0x080C0004, 0x080C0004 }; // 8 C 0 4
static __vector4i vmxi_dword_quat2mat_splat0 = { 0x00000000, 0x01010101, 0x02020202, 0x03030303 }; // 0 1 2 3
static __vector4i vmxi_dword_quat2mat_splat1 = { 0x04040404, 0x05050505, 0x06060606, 0x07070707 }; // 4 5 6 7
static __vector4i vmxi_dword_quat2mat_splat2 = { 0x08080808, 0x09090909, 0x0A0A0A0A, 0x0B0B0B0B }; // 8 9 A B
static __vector4i vmxi_dword_quat2mat_splat3 = { 0x0C0C0C0C, 0x0D0D0D0D, 0x0E0E0E0E, 0x0F0F0F0F }; // C D E F
static __vector4i vmxi_dword_quat2mat_or = { 0x00010203, 0x00010203, 0x00010203, 0x00010203 }; // 0 1 2 3
static __vector4i vmxi_float_quat2mat_xor = { 0x80000000, 0x00000000, 0x00000000, 0x00000000 }; // - + + +
static __vector4i vmxi_dword_overlay_mask0 = { 1 << 0, 1 << 1, 1 << 8, 1 << 9 };
static __vector4i vmxi_dword_overlay_mask1 = { 1 << 16, 1 << 17, 1 << 24, 1 << 25 };
static __vector4i vmxi_dword_overlay_mask2 = { 1 << 2, 1 << 3, 1 << 10, 1 << 11 };
static __vector4i vmxi_dword_overlay_mask3 = { 1 << 18, 1 << 19, 1 << 26, 1 << 27 };
static __vector4i vmxi_dword_overlay_xor = { 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F };
static __vector4i vmxi_dword_overlay_perm = { 0x03020100, 0x03020100, 0x03020100, 0x03020100 };
static __vector4i vmxi_dword_decal_mask0 = { 1 << 0, 1 << 1, 1 << 2, 1 << 3 }; // 0 1 2 3
static __vector4i vmxi_dword_decal_mask1 = { 1 << 8, 1 << 9, 1 << 10, 1 << 11 }; // 8 9 10 11
static __vector4i vmxi_dword_decal_mask2 = { 1 << 16, 1 << 17, 1 << 18, 1 << 19 }; // 16 17 18 19
static __vector4i vmxi_dword_decal_mask3 = { 1 << 24, 1 << 25, 1 << 26, 1 << 27 }; // 24 25 26 27
static __vector4i vmxi_dword_decal_mask4 = { 1 << 4, 1 << 5, 1 << 12, 1 << 13 }; // 4 5 12 13
static __vector4i vmxi_dword_decal_mask5 = { 1 << 20, 1 << 21, 1 << 28, 1 << 29 }; // 20 21 28 29
static __vector4i vmxi_dword_trace_mask0 = { 1 << 0, 1 << 1, 1 << 2, 1 << 3 }; // 0 1 2 3
static __vector4i vmxi_dword_trace_mask1 = { 1 << 8, 1 << 9, 1 << 10, 1 << 11 }; // 8 9 10 11
static __vector4i vmxi_dword_trace_mask2 = { 1 << 16, 1 << 17, 1 << 18, 1 << 19 }; // 16 17 18 19
static __vector4i vmxi_dword_trace_mask3 = { 1 << 24, 1 << 25, 1 << 26, 1 << 27 }; // 24 25 26 27
static __vector4i vmxi_dword_trace_mask4 = { 1 << 4, 1 << 5, 1 << 6, 1 << 7 }; // 4 5 6 7
static __vector4i vmxi_dword_trace_mask5 = { 1 << 12, 1 << 13, 1 << 14, 1 << 15 }; // 12 13 14 15
static __vector4i vmxi_dword_trace_mask6 = { 1 << 20, 1 << 21, 1 << 22, 1 << 23 }; // 20 21 22 23
static __vector4i vmxi_dword_trace_mask7 = { 1 << 28, 1 << 29, 1 << 30, 1 << 31 }; // 28 29 30 31
static __vector4i vmxi_dword_trace_xor = { 0xF0F0F0F0, 0xF0F0F0F0, 0xF0F0F0F0, 0xF0F0F0F0 };
static __vector4i vmxi_dword_facing_mask0 = { 1 << 24, 1 << 24, 1 << 24, 1 << 24 };
static __vector4i vmxi_dword_facing_mask1 = { 1 << 16, 1 << 16, 1 << 16, 1 << 16 };
static __vector4i vmxi_dword_facing_mask2 = { 1 << 8, 1 << 8, 1 << 8, 1 << 8 };
static __vector4i vmxi_dword_facing_mask3 = { 1 << 0, 1 << 0, 1 << 0, 1 << 0 };
static __vector4i vmxi_float_sign_bit = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
static __vector4i vmxi_float_pos_infinity = { 0x7149F2CA, 0x7149F2CA, 0x7149F2CA, 0x7149F2CA };
static __vector4i vmxi_float_neg_infinity = { 0xF149F2CA, 0xF149F2CA, 0xF149F2CA, 0xF149F2CA };
static __vector4 vmx_float_zero = { 0.0f, 0.0f, 0.0f, 0.0f };
static __vector4 vmx_float_one = { 1.0f, 1.0f, 1.0f, 1.0f };
static __vector4 vmx_float_neg_one = { -1.0f, -1.0f, -1.0f, -1.0f };
static __vector4 vmx_float_last_one = { 0.0f, 0.0f, 0.0f, 1.0f };
static __vector4 vmx_float_tiny = { 1e-10f, 1e-10f, 1e-10f, 1e-10f };
static __vector4 vmx_float_PI = { M_PI*0.5f, M_PI*0.5f, M_PI*0.5f, M_PI*0.5f }; // HALF_PI
static __vector4 vmx_float_quat2mat_mad1 = { -1.0f, 1.0f, -1.0f, -1.0f }; // - + - -
static __vector4 vmx_float_quat2mat_mad2 = { -1.0f, -1.0f, -1.0f, 1.0f }; // - - - +
static __vector4 vmx_float_quat2mat_mad3 = { -1.0f, -1.0f, 1.0f, -1.0f }; // - - + -
static __vector4 vmx_float_quat2mat_add = { 1.0f, 0.0f, 0.0f, 0.0f }; // 1 0 0 0
static __vector4 vmx_float_rsqrt_c0 = { -3.0f, -3.0f, -3.0f, -3.0f };
static __vector4 vmx_float_rsqrt_c1 = { -0.5f, -0.5f, -0.5f, -0.5f };
static __vector4 vmx_float_rsqrt_c2 = { -0.25f, -0.25f, -0.25f, -0.25f };
static __vector4 vmx_float_sin_c0 = { -2.39e-08f, -2.39e-08f, -2.39e-08f, -2.39e-08f };
static __vector4 vmx_float_sin_c1 = { 2.7526e-06f, 2.7526e-06f, 2.7526e-06f, 2.7526e-06f };
static __vector4 vmx_float_sin_c2 = { -1.98409e-04f, -1.98409e-04f, -1.98409e-04f, -1.98409e-04f };
static __vector4 vmx_float_sin_c3 = { 8.3333315e-03f, 8.3333315e-03f, 8.3333315e-03f, 8.3333315e-03f };
static __vector4 vmx_float_sin_c4 = { -1.666666664e-01f, -1.666666664e-01f, -1.666666664e-01f, -1.666666664e-01f };
static __vector4 vmx_float_atan_c0 = { 0.0028662257f, 0.0028662257f, 0.0028662257f, 0.0028662257f };
static __vector4 vmx_float_atan_c1 = { -0.0161657367f, -0.0161657367f, -0.0161657367f, -0.0161657367f };
static __vector4 vmx_float_atan_c2 = { 0.0429096138f, 0.0429096138f, 0.0429096138f, 0.0429096138f };
static __vector4 vmx_float_atan_c3 = { -0.0752896400f, -0.0752896400f, -0.0752896400f, -0.0752896400f };
static __vector4 vmx_float_atan_c4 = { 0.1065626393f, 0.1065626393f, 0.1065626393f, 0.1065626393f };
static __vector4 vmx_float_atan_c5 = { -0.1420889944f, -0.1420889944f, -0.1420889944f, -0.1420889944f };
static __vector4 vmx_float_atan_c6 = { 0.1999355085f, 0.1999355085f, 0.1999355085f, 0.1999355085f };
static __vector4 vmx_float_atan_c7 = { -0.3333314528f, -0.3333314528f, -0.3333314528f, -0.3333314528f };
#define vmx_byte_zero __vspltisw( 0 ) //*(__vector4 *)&vmxi_byte_zero
#define vmx_dword_not __vspltisw( -1 ) // *(__vector4 *)&vmxi_dword_not
#define vmx_dword_perm_replacelast *(__vector4 *)&vmxi_dword_perm_replacelast
#define vmx_dword_perm_plane_x *(__vector4 *)&vmxi_dword_perm_plane_x
#define vmx_dword_perm_plane_y *(__vector4 *)&vmxi_dword_perm_plane_y
#define vmx_dword_perm_matrix *(__vector4 *)&vmxi_dword_perm_matrix
#define vmx_dword_perm_quat2mat1 *(__vector4 *)&vmxi_dword_perm_quat2mat1
#define vmx_dword_perm_quat2mat2 *(__vector4 *)&vmxi_dword_perm_quat2mat2
#define vmx_dword_perm_quat2mat3 *(__vector4 *)&vmxi_dword_perm_quat2mat3
#define vmx_dword_perm_quat2mat4 *(__vector4 *)&vmxi_dword_perm_quat2mat4
#define vmx_dword_perm_quat2mat5 *(__vector4 *)&vmxi_dword_perm_quat2mat5
#define vmx_dword_mask_clear_last *(__vector4 *)&vmxi_dword_mask_clear_last
#define vmx_dword_quat2mat_swizzle0 *(__vector4 *)&vmxi_dword_quat2mat_swizzle0
#define vmx_dword_quat2mat_swizzle1 *(__vector4 *)&vmxi_dword_quat2mat_swizzle1
#define vmx_dword_quat2mat_swizzle2 *(__vector4 *)&vmxi_dword_quat2mat_swizzle2
#define vmx_dword_quat2mat_swizzle3 *(__vector4 *)&vmxi_dword_quat2mat_swizzle3
#define vmx_dword_quat2mat_splat0 *(__vector4 *)&vmxi_dword_quat2mat_splat0
#define vmx_dword_quat2mat_splat1 *(__vector4 *)&vmxi_dword_quat2mat_splat1
#define vmx_dword_quat2mat_splat2 *(__vector4 *)&vmxi_dword_quat2mat_splat2
#define vmx_dword_quat2mat_splat3 *(__vector4 *)&vmxi_dword_quat2mat_splat3
#define vmx_dword_quat2mat_or *(__vector4 *)&vmxi_dword_quat2mat_or
#define vmx_float_quat2mat_xor *(__vector4 *)&vmxi_float_quat2mat_xor
#define vmx_dword_overlay_mask0 *(__vector4 *)&vmxi_dword_overlay_mask0
#define vmx_dword_overlay_mask1 *(__vector4 *)&vmxi_dword_overlay_mask1
#define vmx_dword_overlay_mask2 *(__vector4 *)&vmxi_dword_overlay_mask2
#define vmx_dword_overlay_mask3 *(__vector4 *)&vmxi_dword_overlay_mask3
#define vmx_dword_overlay_xor *(__vector4 *)&vmxi_dword_overlay_xor
#define vmx_dword_overlay_perm *(__vector4 *)&vmxi_dword_overlay_perm
#define vmx_dword_decal_mask0 *(__vector4 *)&vmxi_dword_decal_mask0
#define vmx_dword_decal_mask1 *(__vector4 *)&vmxi_dword_decal_mask1
#define vmx_dword_decal_mask2 *(__vector4 *)&vmxi_dword_decal_mask2
#define vmx_dword_decal_mask3 *(__vector4 *)&vmxi_dword_decal_mask3
#define vmx_dword_decal_mask4 *(__vector4 *)&vmxi_dword_decal_mask4
#define vmx_dword_decal_mask5 *(__vector4 *)&vmxi_dword_decal_mask5
#define vmx_dword_trace_mask0 *(__vector4 *)&vmxi_dword_trace_mask0
#define vmx_dword_trace_mask1 *(__vector4 *)&vmxi_dword_trace_mask1
#define vmx_dword_trace_mask2 *(__vector4 *)&vmxi_dword_trace_mask2
#define vmx_dword_trace_mask3 *(__vector4 *)&vmxi_dword_trace_mask3
#define vmx_dword_trace_mask4 *(__vector4 *)&vmxi_dword_trace_mask4
#define vmx_dword_trace_mask5 *(__vector4 *)&vmxi_dword_trace_mask5
#define vmx_dword_trace_mask6 *(__vector4 *)&vmxi_dword_trace_mask6
#define vmx_dword_trace_mask7 *(__vector4 *)&vmxi_dword_trace_mask7
#define vmx_dword_trace_xor *(__vector4 *)&vmxi_dword_trace_xor
#define vmx_dword_facing_mask0 *(__vector4 *)&vmxi_dword_facing_mask0
#define vmx_dword_facing_mask1 *(__vector4 *)&vmxi_dword_facing_mask1
#define vmx_dword_facing_mask2 *(__vector4 *)&vmxi_dword_facing_mask2
#define vmx_dword_facing_mask3 *(__vector4 *)&vmxi_dword_facing_mask3
#define vmx_float_sign_bit *(__vector4 *)&vmxi_float_sign_bit
#define vmx_float_pos_infinity *(__vector4 *)&vmxi_float_pos_infinity
#define vmx_float_neg_infinity *(__vector4 *)&vmxi_float_neg_infinity
/*
============
idSIMD_Xenon::GetName
============
*/
const char *idSIMD_Xenon::GetName( void ) const {
return "Xenon";
}
/*
============
Xenon_ReciprocalSqrt
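Approximates 1 / sqrt( x ): the hardware __frsqrte estimate is refined with one Newton-Raphson iteration, 0.5f * r * ( 3.0f - x * r * r ).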
============
*/
ID_INLINE float Xenon_ReciprocalSqrt( float x ) {
float r = __frsqrte( x );
return ( x * r * r + -3.0f ) * ( r * -0.5f );
}
/*
============
Xenon_SinZeroHalfPI
The angle must be between zero and half PI.
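Approximates sin( a ) with a polynomial in a * a ( Horner form ), scaled by a.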
============
*/
ID_INLINE float Xenon_SinZeroHalfPI( float a ) {
float s, t;
assert( a >= 0.0f && a <= idMath::HALF_PI );
s = a * a;
t = -2.39e-08f;
t *= s;
t += 2.7526e-06f;
t *= s;
t += -1.98409e-04f;
t *= s;
t += 8.3333315e-03f;
t *= s;
t += -1.666666664e-01f;
t *= s;
t += 1.0f;
t *= a;
return t;
}
/*
============
Xenon_ATanPositive
Both 'x' and 'y' must be positive.
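Approximates atan( y / x ). The __fsel calls select between evaluating the polynomial for y / x or for -x / y ( plus HALF_PI ), so the polynomial argument always lies in [ -1, 1 ].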
============
*/
ID_INLINE float Xenon_ATanPositive( float y, float x ) {
float a, b, c, d, s, t;
assert( y >= 0.0f && x >= 0.0f );
a = y - x;
b = __fsel( a, -x, y );
c = __fsel( a, y, x );
d = __fsel( a, idMath::HALF_PI, 0.0f );
a = b / c;
s = a * a;
t = 0.0028662257f;
t *= s;
t += -0.0161657367f;
t *= s;
t += 0.0429096138f;
t *= s;
t += -0.0752896400f;
t *= s;
t += 0.1065626393f;
t *= s;
t += -0.1420889944f;
t *= s;
t += 0.1999355085f;
t *= s;
t += -0.3333314528f;
t *= s;
t += 1.0f;
t *= a;
t += d;
return t;
}
/*
============
idSIMD_Xenon::MinMax
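Computes the bounding box of 'count' vertex positions in 'src'. The main loop handles eight vertices per iteration with independent min/max accumulators that are reduced after the loop; the fourth lane of each load ( which contains whatever follows xyz in idDrawVert ) is masked to zero.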
============
*/
void VPCALL idSIMD_Xenon::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *__restrict src, const int count ) {
const idDrawVert *__restrict end = src + count;
assert_16_byte_aligned( src );
assert_sizeof_16_byte_multiple( idDrawVert );
__vector4 mask = vmx_dword_mask_clear_last;
__vector4 min0 = vmx_float_pos_infinity;
__vector4 min1 = vmx_float_pos_infinity;
__vector4 min2 = vmx_float_pos_infinity;
__vector4 min3 = vmx_float_pos_infinity;
__vector4 min4 = vmx_float_pos_infinity;
__vector4 min5 = vmx_float_pos_infinity;
__vector4 min6 = vmx_float_pos_infinity;
__vector4 min7 = vmx_float_pos_infinity;
__vector4 max0 = vmx_float_neg_infinity;
__vector4 max1 = vmx_float_neg_infinity;
__vector4 max2 = vmx_float_neg_infinity;
__vector4 max3 = vmx_float_neg_infinity;
__vector4 max4 = vmx_float_neg_infinity;
__vector4 max5 = vmx_float_neg_infinity;
__vector4 max6 = vmx_float_neg_infinity;
__vector4 max7 = vmx_float_neg_infinity;
for ( ; src + 7 < end; src += 8 ) {
__vector4 v0 = *(__vector4 *)&src[0].xyz;
__vector4 v1 = *(__vector4 *)&src[1].xyz;
__vector4 v2 = *(__vector4 *)&src[2].xyz;
__vector4 v3 = *(__vector4 *)&src[3].xyz;
__vector4 v4 = *(__vector4 *)&src[4].xyz;
__vector4 v5 = *(__vector4 *)&src[5].xyz;
__vector4 v6 = *(__vector4 *)&src[6].xyz;
__vector4 v7 = *(__vector4 *)&src[7].xyz;
__vector4 m0 = __vand( v0, mask );
__vector4 m1 = __vand( v1, mask );
__vector4 m2 = __vand( v2, mask );
__vector4 m3 = __vand( v3, mask );
__vector4 m4 = __vand( v4, mask );
__vector4 m5 = __vand( v5, mask );
__vector4 m6 = __vand( v6, mask );
__vector4 m7 = __vand( v7, mask );
min0 = __vminfp( min0, m0 );
min1 = __vminfp( min1, m1 );
min2 = __vminfp( min2, m2 );
min3 = __vminfp( min3, m3 );
min4 = __vminfp( min4, m4 );
min5 = __vminfp( min5, m5 );
min6 = __vminfp( min6, m6 );
min7 = __vminfp( min7, m7 );
max0 = __vmaxfp( max0, m0 );
max1 = __vmaxfp( max1, m1 );
max2 = __vmaxfp( max2, m2 );
max3 = __vmaxfp( max3, m3 );
max4 = __vmaxfp( max4, m4 );
max5 = __vmaxfp( max5, m5 );
max6 = __vmaxfp( max6, m6 );
max7 = __vmaxfp( max7, m7 );
}
for ( ; src < end; src++ ) {
__vector4 v0 = *(__vector4 *)&src[0].xyz;
__vector4 m0 = __vand( v0, mask );
min0 = __vminfp( min0, m0 );
max0 = __vmaxfp( max0, m0 );
}
min0 = __vminfp( min0, min1 );
max0 = __vmaxfp( max0, max1 );
min2 = __vminfp( min2, min3 );
max2 = __vmaxfp( max2, max3 );
min4 = __vminfp( min4, min5 );
max4 = __vmaxfp( max4, max5 );
min6 = __vminfp( min6, min7 );
max6 = __vmaxfp( max6, max7 );
min0 = __vminfp( min0, min2 );
max0 = __vmaxfp( max0, max2 );
min4 = __vminfp( min4, min6 );
max4 = __vmaxfp( max4, max6 );
min0 = __vminfp( min0, min4 );
max0 = __vmaxfp( max0, max4 );
min[0] = min0.x;
min[1] = min0.y;
min[2] = min0.z;
max[0] = max0.x;
max[1] = max0.y;
max[2] = max0.z;
}
/*
============
idSIMD_Xenon::MinMax
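Same as the unindexed MinMax, but gathers the vertex positions through the 'indexes' array.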
============
*/
void VPCALL idSIMD_Xenon::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *__restrict src, const int *indexes, const int count ) {
const int *__restrict end = indexes + count;
assert_16_byte_aligned( src );
assert_sizeof_16_byte_multiple( idDrawVert );
__vector4 mask = vmx_dword_mask_clear_last;
__vector4 min0 = vmx_float_pos_infinity;
__vector4 min1 = vmx_float_pos_infinity;
__vector4 min2 = vmx_float_pos_infinity;
__vector4 min3 = vmx_float_pos_infinity;
__vector4 min4 = vmx_float_pos_infinity;
__vector4 min5 = vmx_float_pos_infinity;
__vector4 min6 = vmx_float_pos_infinity;
__vector4 min7 = vmx_float_pos_infinity;
__vector4 max0 = vmx_float_neg_infinity;
__vector4 max1 = vmx_float_neg_infinity;
__vector4 max2 = vmx_float_neg_infinity;
__vector4 max3 = vmx_float_neg_infinity;
__vector4 max4 = vmx_float_neg_infinity;
__vector4 max5 = vmx_float_neg_infinity;
__vector4 max6 = vmx_float_neg_infinity;
__vector4 max7 = vmx_float_neg_infinity;
for ( ; indexes + 7 < end; indexes += 8 ) {
__vector4 v0 = *(__vector4 *)&src[indexes[0]].xyz;
__vector4 v1 = *(__vector4 *)&src[indexes[1]].xyz;
__vector4 v2 = *(__vector4 *)&src[indexes[2]].xyz;
__vector4 v3 = *(__vector4 *)&src[indexes[3]].xyz;
__vector4 v4 = *(__vector4 *)&src[indexes[4]].xyz;
__vector4 v5 = *(__vector4 *)&src[indexes[5]].xyz;
__vector4 v6 = *(__vector4 *)&src[indexes[6]].xyz;
__vector4 v7 = *(__vector4 *)&src[indexes[7]].xyz;
__vector4 m0 = __vand( v0, mask );
__vector4 m1 = __vand( v1, mask );
__vector4 m2 = __vand( v2, mask );
__vector4 m3 = __vand( v3, mask );
__vector4 m4 = __vand( v4, mask );
__vector4 m5 = __vand( v5, mask );
__vector4 m6 = __vand( v6, mask );
__vector4 m7 = __vand( v7, mask );
min0 = __vminfp( min0, m0 );
min1 = __vminfp( min1, m1 );
min2 = __vminfp( min2, m2 );
min3 = __vminfp( min3, m3 );
min4 = __vminfp( min4, m4 );
min5 = __vminfp( min5, m5 );
min6 = __vminfp( min6, m6 );
min7 = __vminfp( min7, m7 );
max0 = __vmaxfp( max0, m0 );
max1 = __vmaxfp( max1, m1 );
max2 = __vmaxfp( max2, m2 );
max3 = __vmaxfp( max3, m3 );
max4 = __vmaxfp( max4, m4 );
max5 = __vmaxfp( max5, m5 );
max6 = __vmaxfp( max6, m6 );
max7 = __vmaxfp( max7, m7 );
}
for ( ; indexes < end; indexes++ ) {
__vector4 v0 = *(__vector4 *)&src[indexes[0]].xyz;
__vector4 m0 = __vand( v0, mask );
min0 = __vminfp( min0, m0 );
max0 = __vmaxfp( max0, m0 );
}
min0 = __vminfp( min0, min1 );
max0 = __vmaxfp( max0, max1 );
min2 = __vminfp( min2, min3 );
max2 = __vmaxfp( max2, max3 );
min4 = __vminfp( min4, min5 );
max4 = __vmaxfp( max4, max5 );
min6 = __vminfp( min6, min7 );
max6 = __vmaxfp( max6, max7 );
min0 = __vminfp( min0, min2 );
max0 = __vmaxfp( max0, max2 );
min4 = __vminfp( min4, min6 );
max4 = __vmaxfp( max4, max6 );
min0 = __vminfp( min0, min4 );
max0 = __vmaxfp( max0, max4 );
min[0] = min0.x;
min[1] = min0.y;
min[2] = min0.z;
max[0] = max0.x;
max[1] = max0.y;
max[2] = max0.z;
}
/*
============
idSIMD_Xenon::BlendJoints
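Blends the indexed joints towards 'blendJoints' by 'lerp': translations are linearly interpolated and quaternions are spherically interpolated ( slerp ). The vector paths transpose four quaternions at a time into SoA form and use polynomial approximations for atan and sin; the scalar loop handles the remaining joints.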
============
*/
void VPCALL idSIMD_Xenon::BlendJoints( idJointQuat *__restrict joints, const idJointQuat *__restrict blendJoints, const float lerp, const int *__restrict index, const int numJoints ) {
int i;
assert_16_byte_aligned( joints );
assert_16_byte_aligned( blendJoints );
assert_16_byte_aligned( JOINTQUAT_Q_OFFSET );
assert_16_byte_aligned( JOINTQUAT_T_OFFSET );
assert_sizeof_16_byte_multiple( idJointQuat );
if ( lerp <= 0.0f ) {
return;
} else if ( lerp >= 1.0f ) {
for ( i = 0; i < numJoints; i++ ) {
int j = index[i];
joints[j] = blendJoints[j];
}
return;
}
__vector4 vlerp = { lerp, lerp, lerp, lerp };
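// eight joints per iteration: translations are lerped directly, quaternions are transposed to SoA form ( four per vector ) and slerped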
for ( i = 0; i < numJoints - 7; i += 8 ) {
int n0 = index[i+0];
int n1 = index[i+1];
int n2 = index[i+2];
int n3 = index[i+3];
int n4 = index[i+4];
int n5 = index[i+5];
int n6 = index[i+6];
int n7 = index[i+7];
__vector4 jqa_0 = *(__vector4 *)&joints[n0].q;
__vector4 jqb_0 = *(__vector4 *)&joints[n1].q;
__vector4 jqc_0 = *(__vector4 *)&joints[n2].q;
__vector4 jqd_0 = *(__vector4 *)&joints[n3].q;
__vector4 jta_0 = *(__vector4 *)&joints[n0].t;
__vector4 jtb_0 = *(__vector4 *)&joints[n1].t;
__vector4 jtc_0 = *(__vector4 *)&joints[n2].t;
__vector4 jtd_0 = *(__vector4 *)&joints[n3].t;
__vector4 bqa_0 = *(__vector4 *)&blendJoints[n0].q;
__vector4 bqb_0 = *(__vector4 *)&blendJoints[n1].q;
__vector4 bqc_0 = *(__vector4 *)&blendJoints[n2].q;
__vector4 bqd_0 = *(__vector4 *)&blendJoints[n3].q;
__vector4 bta_0 = *(__vector4 *)&blendJoints[n0].t;
__vector4 btb_0 = *(__vector4 *)&blendJoints[n1].t;
__vector4 btc_0 = *(__vector4 *)&blendJoints[n2].t;
__vector4 btd_0 = *(__vector4 *)&blendJoints[n3].t;
__vector4 jqa_1 = *(__vector4 *)&joints[n4].q;
__vector4 jqb_1 = *(__vector4 *)&joints[n5].q;
__vector4 jqc_1 = *(__vector4 *)&joints[n6].q;
__vector4 jqd_1 = *(__vector4 *)&joints[n7].q;
__vector4 jta_1 = *(__vector4 *)&joints[n4].t;
__vector4 jtb_1 = *(__vector4 *)&joints[n5].t;
__vector4 jtc_1 = *(__vector4 *)&joints[n6].t;
__vector4 jtd_1 = *(__vector4 *)&joints[n7].t;
__vector4 bqa_1 = *(__vector4 *)&blendJoints[n4].q;
__vector4 bqb_1 = *(__vector4 *)&blendJoints[n5].q;
__vector4 bqc_1 = *(__vector4 *)&blendJoints[n6].q;
__vector4 bqd_1 = *(__vector4 *)&blendJoints[n7].q;
__vector4 bta_1 = *(__vector4 *)&blendJoints[n4].t;
__vector4 btb_1 = *(__vector4 *)&blendJoints[n5].t;
__vector4 btc_1 = *(__vector4 *)&blendJoints[n6].t;
__vector4 btd_1 = *(__vector4 *)&blendJoints[n7].t;
bta_0 = __vsubfp( bta_0, jta_0 );
btb_0 = __vsubfp( btb_0, jtb_0 );
btc_0 = __vsubfp( btc_0, jtc_0 );
btd_0 = __vsubfp( btd_0, jtd_0 );
bta_1 = __vsubfp( bta_1, jta_1 );
btb_1 = __vsubfp( btb_1, jtb_1 );
btc_1 = __vsubfp( btc_1, jtc_1 );
btd_1 = __vsubfp( btd_1, jtd_1 );
jta_0 = __vmaddfp( vlerp, bta_0, jta_0 );
jtb_0 = __vmaddfp( vlerp, btb_0, jtb_0 );
jtc_0 = __vmaddfp( vlerp, btc_0, jtc_0 );
jtd_0 = __vmaddfp( vlerp, btd_0, jtd_0 );
jta_1 = __vmaddfp( vlerp, bta_1, jta_1 );
jtb_1 = __vmaddfp( vlerp, btb_1, jtb_1 );
jtc_1 = __vmaddfp( vlerp, btc_1, jtc_1 );
jtd_1 = __vmaddfp( vlerp, btd_1, jtd_1 );
__stvx( jta_0, &joints[n0].t, 0 );
__stvx( jtb_0, &joints[n1].t, 0 );
__stvx( jtc_0, &joints[n2].t, 0 );
__stvx( jtd_0, &joints[n3].t, 0 );
__stvx( jta_1, &joints[n4].t, 0 );
__stvx( jtb_1, &joints[n5].t, 0 );
__stvx( jtc_1, &joints[n6].t, 0 );
__stvx( jtd_1, &joints[n7].t, 0 );
__vector4 jqr_0 = __vmrghw( jqa_0, jqb_0 );
__vector4 jqs_0 = __vmrghw( jqc_0, jqd_0 );
__vector4 jqt_0 = __vmrglw( jqa_0, jqb_0 );
__vector4 jqu_0 = __vmrglw( jqc_0, jqd_0 );
__vector4 jqr_1 = __vmrghw( jqa_1, jqb_1 );
__vector4 jqs_1 = __vmrghw( jqc_1, jqd_1 );
__vector4 jqt_1 = __vmrglw( jqa_1, jqb_1 );
__vector4 jqu_1 = __vmrglw( jqc_1, jqd_1 );
__vector4 bqr_0 = __vmrghw( bqa_0, bqb_0 );
__vector4 bqs_0 = __vmrghw( bqc_0, bqd_0 );
__vector4 bqt_0 = __vmrglw( bqa_0, bqb_0 );
__vector4 bqu_0 = __vmrglw( bqc_0, bqd_0 );
__vector4 bqr_1 = __vmrghw( bqa_1, bqb_1 );
__vector4 bqs_1 = __vmrghw( bqc_1, bqd_1 );
__vector4 bqt_1 = __vmrglw( bqa_1, bqb_1 );
__vector4 bqu_1 = __vmrglw( bqc_1, bqd_1 );
__vector4 jqx_0 = __vperm( jqr_0, jqs_0, vmx_dword_perm_plane_x );
__vector4 jqy_0 = __vperm( jqr_0, jqs_0, vmx_dword_perm_plane_y );
__vector4 jqz_0 = __vperm( jqt_0, jqu_0, vmx_dword_perm_plane_x );
__vector4 jqw_0 = __vperm( jqt_0, jqu_0, vmx_dword_perm_plane_y );
__vector4 jqx_1 = __vperm( jqr_1, jqs_1, vmx_dword_perm_plane_x );
__vector4 jqy_1 = __vperm( jqr_1, jqs_1, vmx_dword_perm_plane_y );
__vector4 jqz_1 = __vperm( jqt_1, jqu_1, vmx_dword_perm_plane_x );
__vector4 jqw_1 = __vperm( jqt_1, jqu_1, vmx_dword_perm_plane_y );
__vector4 bqx_0 = __vperm( bqr_0, bqs_0, vmx_dword_perm_plane_x );
__vector4 bqy_0 = __vperm( bqr_0, bqs_0, vmx_dword_perm_plane_y );
__vector4 bqz_0 = __vperm( bqt_0, bqu_0, vmx_dword_perm_plane_x );
__vector4 bqw_0 = __vperm( bqt_0, bqu_0, vmx_dword_perm_plane_y );
__vector4 bqx_1 = __vperm( bqr_1, bqs_1, vmx_dword_perm_plane_x );
__vector4 bqy_1 = __vperm( bqr_1, bqs_1, vmx_dword_perm_plane_y );
__vector4 bqz_1 = __vperm( bqt_1, bqu_1, vmx_dword_perm_plane_x );
__vector4 bqw_1 = __vperm( bqt_1, bqu_1, vmx_dword_perm_plane_y );
__vector4 cosoma_0 = __vmulfp( jqx_0, bqx_0 );
__vector4 cosomb_0 = __vmulfp( jqy_0, bqy_0 );
__vector4 cosomc_0 = __vmulfp( jqz_0, bqz_0 );
__vector4 cosomd_0 = __vmulfp( jqw_0, bqw_0 );
__vector4 cosoma_1 = __vmulfp( jqx_1, bqx_1 );
__vector4 cosomb_1 = __vmulfp( jqy_1, bqy_1 );
__vector4 cosomc_1 = __vmulfp( jqz_1, bqz_1 );
__vector4 cosomd_1 = __vmulfp( jqw_1, bqw_1 );
__vector4 cosome_0 = __vaddfp( cosoma_0, cosomb_0 );
__vector4 cosomf_0 = __vaddfp( cosomc_0, cosomd_0 );
__vector4 cosomg_0 = __vaddfp( cosome_0, cosomf_0 );
__vector4 cosome_1 = __vaddfp( cosoma_1, cosomb_1 );
__vector4 cosomf_1 = __vaddfp( cosomc_1, cosomd_1 );
__vector4 cosomg_1 = __vaddfp( cosome_1, cosomf_1 );
__vector4 sign_0 = __vand( cosomg_0, vmx_float_sign_bit );
__vector4 cosom_0 = __vandc( cosomg_0, vmx_float_sign_bit );
__vector4 ss_0 = __vnmsubfp( cosom_0, cosom_0, vmx_float_one );
ss_0 = __vmaxfp( ss_0, vmx_float_tiny );
__vector4 sign_1 = __vand( cosomg_1, vmx_float_sign_bit );
__vector4 cosom_1 = __vandc( cosomg_1, vmx_float_sign_bit );
__vector4 ss_1 = __vnmsubfp( cosom_1, cosom_1, vmx_float_one );
ss_1 = __vmaxfp( ss_1, vmx_float_tiny );
__vector4 rs_0 = __vrsqrtefp( ss_0 );
__vector4 sq_0 = __vmulfp( rs_0, rs_0 );
__vector4 sh_0 = __vmulfp( rs_0, vmx_float_rsqrt_c1 );
__vector4 sx_0 = __vmaddfp( ss_0, sq_0, vmx_float_rsqrt_c0 );
__vector4 sinom_0 = __vmulfp( sh_0, sx_0 ); // sinom = 1 / sqrt( ss )
ss_0 = __vmulfp( ss_0, sinom_0 );
__vector4 rs_1 = __vrsqrtefp( ss_1 );
__vector4 sq_1 = __vmulfp( rs_1, rs_1 );
__vector4 sh_1 = __vmulfp( rs_1, vmx_float_rsqrt_c1 );
__vector4 sx_1 = __vmaddfp( ss_1, sq_1, vmx_float_rsqrt_c0 );
__vector4 sinom_1 = __vmulfp( sh_1, sx_1 ); // sinom = 1 / sqrt( ss )
ss_1 = __vmulfp( ss_1, sinom_1 );
__vector4 min_0 = __vminfp( ss_0, cosom_0 );
__vector4 max_0 = __vmaxfp( ss_0, cosom_0 );
__vector4 mask_0 = __vcmpeqfp( min_0, cosom_0 );
__vector4 masksign_0 = __vand( mask_0, vmx_float_sign_bit );
__vector4 maskPI_0 = __vand( mask_0, vmx_float_PI );
__vector4 min_1 = __vminfp( ss_1, cosom_1 );
__vector4 max_1 = __vmaxfp( ss_1, cosom_1 );
__vector4 mask_1 = __vcmpeqfp( min_1, cosom_1 );
__vector4 masksign_1 = __vand( mask_1, vmx_float_sign_bit );
__vector4 maskPI_1 = __vand( mask_1, vmx_float_PI );
__vector4 rcpa_0 = __vrefp( max_0 );
__vector4 rcpb_0 = __vmulfp( max_0, rcpa_0 );
__vector4 rcpd_0 = __vaddfp( rcpa_0, rcpa_0 );
__vector4 rcp_0 = __vnmsubfp( rcpb_0, rcpa_0, rcpd_0 ); // 1 / y or 1 / x
__vector4 ata_0 = __vmulfp( min_0, rcp_0 ); // x / y or y / x
__vector4 rcpa_1 = __vrefp( max_1 );
__vector4 rcpb_1 = __vmulfp( max_1, rcpa_1 );
__vector4 rcpd_1 = __vaddfp( rcpa_1, rcpa_1 );
__vector4 rcp_1 = __vnmsubfp( rcpb_1, rcpa_1, rcpd_1 ); // 1 / y or 1 / x
__vector4 ata_1 = __vmulfp( min_1, rcp_1 ); // x / y or y / x
__vector4 atb_0 = __vxor( ata_0, masksign_0 ); // -x / y or y / x
__vector4 atc_0 = __vmulfp( atb_0, atb_0 );
__vector4 atd_0 = __vmaddfp( atc_0, vmx_float_atan_c0, vmx_float_atan_c1 );
__vector4 atb_1 = __vxor( ata_1, masksign_1 ); // -x / y or y / x
__vector4 atc_1 = __vmulfp( atb_1, atb_1 );
__vector4 atd_1 = __vmaddfp( atc_1, vmx_float_atan_c0, vmx_float_atan_c1 );
atd_0 = __vmaddfp( atd_0, atc_0, vmx_float_atan_c2 );
atd_0 = __vmaddfp( atd_0, atc_0, vmx_float_atan_c3 );
atd_0 = __vmaddfp( atd_0, atc_0, vmx_float_atan_c4 );
atd_0 = __vmaddfp( atd_0, atc_0, vmx_float_atan_c5 );
atd_0 = __vmaddfp( atd_0, atc_0, vmx_float_atan_c6 );
atd_0 = __vmaddfp( atd_0, atc_0, vmx_float_atan_c7 );
atd_0 = __vmaddfp( atd_0, atc_0, vmx_float_one );
atd_1 = __vmaddfp( atd_1, atc_1, vmx_float_atan_c2 );
atd_1 = __vmaddfp( atd_1, atc_1, vmx_float_atan_c3 );
atd_1 = __vmaddfp( atd_1, atc_1, vmx_float_atan_c4 );
atd_1 = __vmaddfp( atd_1, atc_1, vmx_float_atan_c5 );
atd_1 = __vmaddfp( atd_1, atc_1, vmx_float_atan_c6 );
atd_1 = __vmaddfp( atd_1, atc_1, vmx_float_atan_c7 );
atd_1 = __vmaddfp( atd_1, atc_1, vmx_float_one );
__vector4 omega_a_0 = __vmaddfp( atd_0, atb_0, maskPI_0 );
__vector4 omega_b_0 = __vmulfp( vlerp, omega_a_0 );
omega_a_0 = __vsubfp( omega_a_0, omega_b_0 );
__vector4 omega_a_1 = __vmaddfp( atd_1, atb_1, maskPI_1 );
__vector4 omega_b_1 = __vmulfp( vlerp, omega_a_1 );
omega_a_1 = __vsubfp( omega_a_1, omega_b_1 );
__vector4 sinsa_0 = __vmulfp( omega_a_0, omega_a_0 );
__vector4 sinsb_0 = __vmulfp( omega_b_0, omega_b_0 );
__vector4 sina_0 = __vmaddfp( sinsa_0, vmx_float_sin_c0, vmx_float_sin_c1 );
__vector4 sinb_0 = __vmaddfp( sinsb_0, vmx_float_sin_c0, vmx_float_sin_c1 );
sina_0 = __vmaddfp( sina_0, sinsa_0, vmx_float_sin_c2 );
sinb_0 = __vmaddfp( sinb_0, sinsb_0, vmx_float_sin_c2 );
sina_0 = __vmaddfp( sina_0, sinsa_0, vmx_float_sin_c3 );
sinb_0 = __vmaddfp( sinb_0, sinsb_0, vmx_float_sin_c3 );
sina_0 = __vmaddfp( sina_0, sinsa_0, vmx_float_sin_c4 );
sinb_0 = __vmaddfp( sinb_0, sinsb_0, vmx_float_sin_c4 );
sina_0 = __vmaddfp( sina_0, sinsa_0, vmx_float_one );
sinb_0 = __vmaddfp( sinb_0, sinsb_0, vmx_float_one );
sina_0 = __vmulfp( sina_0, omega_a_0 );
sinb_0 = __vmulfp( sinb_0, omega_b_0 );
__vector4 scalea_0 = __vmulfp( sina_0, sinom_0 );
__vector4 scaleb_0 = __vmulfp( sinb_0, sinom_0 );
__vector4 sinsa_1 = __vmulfp( omega_a_1, omega_a_1 );
__vector4 sinsb_1 = __vmulfp( omega_b_1, omega_b_1 );
__vector4 sina_1 = __vmaddfp( sinsa_1, vmx_float_sin_c0, vmx_float_sin_c1 );
__vector4 sinb_1 = __vmaddfp( sinsb_1, vmx_float_sin_c0, vmx_float_sin_c1 );
sina_1 = __vmaddfp( sina_1, sinsa_1, vmx_float_sin_c2 );
sinb_1 = __vmaddfp( sinb_1, sinsb_1, vmx_float_sin_c2 );
sina_1 = __vmaddfp( sina_1, sinsa_1, vmx_float_sin_c3 );
sinb_1 = __vmaddfp( sinb_1, sinsb_1, vmx_float_sin_c3 );
sina_1 = __vmaddfp( sina_1, sinsa_1, vmx_float_sin_c4 );
sinb_1 = __vmaddfp( sinb_1, sinsb_1, vmx_float_sin_c4 );
sina_1 = __vmaddfp( sina_1, sinsa_1, vmx_float_one );
sinb_1 = __vmaddfp( sinb_1, sinsb_1, vmx_float_one );
sina_1 = __vmulfp( sina_1, omega_a_1 );
sinb_1 = __vmulfp( sinb_1, omega_b_1 );
__vector4 scalea_1 = __vmulfp( sina_1, sinom_1 );
__vector4 scaleb_1 = __vmulfp( sinb_1, sinom_1 );
scaleb_0 = __vxor( scaleb_0, sign_0 );
scaleb_1 = __vxor( scaleb_1, sign_1 );
jqx_0 = __vmulfp( jqx_0, scalea_0 );
jqy_0 = __vmulfp( jqy_0, scalea_0 );
jqz_0 = __vmulfp( jqz_0, scalea_0 );
jqw_0 = __vmulfp( jqw_0, scalea_0 );
jqx_1 = __vmulfp( jqx_1, scalea_1 );
jqy_1 = __vmulfp( jqy_1, scalea_1 );
jqz_1 = __vmulfp( jqz_1, scalea_1 );
jqw_1 = __vmulfp( jqw_1, scalea_1 );
jqx_0 = __vmaddfp( bqx_0, scaleb_0, jqx_0 );
jqy_0 = __vmaddfp( bqy_0, scaleb_0, jqy_0 );
jqz_0 = __vmaddfp( bqz_0, scaleb_0, jqz_0 );
jqw_0 = __vmaddfp( bqw_0, scaleb_0, jqw_0 );
jqx_1 = __vmaddfp( bqx_1, scaleb_1, jqx_1 );
jqy_1 = __vmaddfp( bqy_1, scaleb_1, jqy_1 );
jqz_1 = __vmaddfp( bqz_1, scaleb_1, jqz_1 );
jqw_1 = __vmaddfp( bqw_1, scaleb_1, jqw_1 );
__vector4 tp0_0 = __vmrghw( jqx_0, jqy_0 );
__vector4 tp1_0 = __vmrghw( jqz_0, jqw_0 );
__vector4 tp2_0 = __vmrglw( jqx_0, jqy_0 );
__vector4 tp3_0 = __vmrglw( jqz_0, jqw_0 );
__vector4 tp0_1 = __vmrghw( jqx_1, jqy_1 );
__vector4 tp1_1 = __vmrghw( jqz_1, jqw_1 );
__vector4 tp2_1 = __vmrglw( jqx_1, jqy_1 );
__vector4 tp3_1 = __vmrglw( jqz_1, jqw_1 );
__vector4 p0_0 = __vperm( tp0_0, tp1_0, vmx_dword_perm_plane_x );
__vector4 p1_0 = __vperm( tp0_0, tp1_0, vmx_dword_perm_plane_y );
__vector4 p2_0 = __vperm( tp2_0, tp3_0, vmx_dword_perm_plane_x );
__vector4 p3_0 = __vperm( tp2_0, tp3_0, vmx_dword_perm_plane_y );
__vector4 p0_1 = __vperm( tp0_1, tp1_1, vmx_dword_perm_plane_x );
__vector4 p1_1 = __vperm( tp0_1, tp1_1, vmx_dword_perm_plane_y );
__vector4 p2_1 = __vperm( tp2_1, tp3_1, vmx_dword_perm_plane_x );
__vector4 p3_1 = __vperm( tp2_1, tp3_1, vmx_dword_perm_plane_y );
__stvx( p0_0, &joints[n0].q, 0 );
__stvx( p1_0, &joints[n1].q, 0 );
__stvx( p2_0, &joints[n2].q, 0 );
__stvx( p3_0, &joints[n3].q, 0 );
__stvx( p0_1, &joints[n4].q, 0 );
__stvx( p1_1, &joints[n5].q, 0 );
__stvx( p2_1, &joints[n6].q, 0 );
__stvx( p3_1, &joints[n7].q, 0 );
}
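// same blend, four joints per iteration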
for ( ; i < numJoints - 3; i += 4 ) {
int n0 = index[i+0];
int n1 = index[i+1];
int n2 = index[i+2];
int n3 = index[i+3];
__vector4 jqa_0 = *(__vector4 *)&joints[n0].q;
__vector4 jqb_0 = *(__vector4 *)&joints[n1].q;
__vector4 jqc_0 = *(__vector4 *)&joints[n2].q;
__vector4 jqd_0 = *(__vector4 *)&joints[n3].q;
__vector4 jta_0 = *(__vector4 *)&joints[n0].t;
__vector4 jtb_0 = *(__vector4 *)&joints[n1].t;
__vector4 jtc_0 = *(__vector4 *)&joints[n2].t;
__vector4 jtd_0 = *(__vector4 *)&joints[n3].t;
__vector4 bqa_0 = *(__vector4 *)&blendJoints[n0].q;
__vector4 bqb_0 = *(__vector4 *)&blendJoints[n1].q;
__vector4 bqc_0 = *(__vector4 *)&blendJoints[n2].q;
__vector4 bqd_0 = *(__vector4 *)&blendJoints[n3].q;
__vector4 bta_0 = *(__vector4 *)&blendJoints[n0].t;
__vector4 btb_0 = *(__vector4 *)&blendJoints[n1].t;
__vector4 btc_0 = *(__vector4 *)&blendJoints[n2].t;
__vector4 btd_0 = *(__vector4 *)&blendJoints[n3].t;
bta_0 = __vsubfp( bta_0, jta_0 );
btb_0 = __vsubfp( btb_0, jtb_0 );
btc_0 = __vsubfp( btc_0, jtc_0 );
btd_0 = __vsubfp( btd_0, jtd_0 );
jta_0 = __vmaddfp( vlerp, bta_0, jta_0 );
jtb_0 = __vmaddfp( vlerp, btb_0, jtb_0 );
jtc_0 = __vmaddfp( vlerp, btc_0, jtc_0 );
jtd_0 = __vmaddfp( vlerp, btd_0, jtd_0 );
__stvx( jta_0, &joints[n0].t, 0 );
__stvx( jtb_0, &joints[n1].t, 0 );
__stvx( jtc_0, &joints[n2].t, 0 );
__stvx( jtd_0, &joints[n3].t, 0 );
__vector4 jqr_0 = __vmrghw( jqa_0, jqb_0 );
__vector4 jqs_0 = __vmrghw( jqc_0, jqd_0 );
__vector4 jqt_0 = __vmrglw( jqa_0, jqb_0 );
__vector4 jqu_0 = __vmrglw( jqc_0, jqd_0 );
__vector4 bqr_0 = __vmrghw( bqa_0, bqb_0 );
__vector4 bqs_0 = __vmrghw( bqc_0, bqd_0 );
__vector4 bqt_0 = __vmrglw( bqa_0, bqb_0 );
__vector4 bqu_0 = __vmrglw( bqc_0, bqd_0 );
__vector4 jqx_0 = __vperm( jqr_0, jqs_0, vmx_dword_perm_plane_x );
__vector4 jqy_0 = __vperm( jqr_0, jqs_0, vmx_dword_perm_plane_y );
__vector4 jqz_0 = __vperm( jqt_0, jqu_0, vmx_dword_perm_plane_x );
__vector4 jqw_0 = __vperm( jqt_0, jqu_0, vmx_dword_perm_plane_y );
__vector4 bqx_0 = __vperm( bqr_0, bqs_0, vmx_dword_perm_plane_x );
__vector4 bqy_0 = __vperm( bqr_0, bqs_0, vmx_dword_perm_plane_y );
__vector4 bqz_0 = __vperm( bqt_0, bqu_0, vmx_dword_perm_plane_x );
__vector4 bqw_0 = __vperm( bqt_0, bqu_0, vmx_dword_perm_plane_y );
__vector4 cosoma_0 = __vmulfp( jqx_0, bqx_0 );
__vector4 cosomb_0 = __vmulfp( jqy_0, bqy_0 );
__vector4 cosomc_0 = __vmulfp( jqz_0, bqz_0 );
__vector4 cosomd_0 = __vmulfp( jqw_0, bqw_0 );
__vector4 cosome_0 = __vaddfp( cosoma_0, cosomb_0 );
__vector4 cosomf_0 = __vaddfp( cosomc_0, cosomd_0 );
__vector4 cosomg_0 = __vaddfp( cosome_0, cosomf_0 );
__vector4 sign_0 = __vand( cosomg_0, vmx_float_sign_bit );
__vector4 cosom_0 = __vandc( cosomg_0, vmx_float_sign_bit );
__vector4 ss_0 = __vnmsubfp( cosom_0, cosom_0, vmx_float_one );
ss_0 = __vmaxfp( ss_0, vmx_float_tiny );
__vector4 rs_0 = __vrsqrtefp( ss_0 );
__vector4 sq_0 = __vmulfp( rs_0, rs_0 );
__vector4 sh_0 = __vmulfp( rs_0, vmx_float_rsqrt_c1 );
__vector4 sx_0 = __vmaddfp( ss_0, sq_0, vmx_float_rsqrt_c0 );
__vector4 sinom_0 = __vmulfp( sh_0, sx_0 ); // sinom = 1 / sqrt( ss )
ss_0 = __vmulfp( ss_0, sinom_0 );
__vector4 min_0 = __vminfp( ss_0, cosom_0 );
__vector4 max_0 = __vmaxfp( ss_0, cosom_0 );
__vector4 mask_0 = __vcmpeqfp( min_0, cosom_0 );
__vector4 masksign_0 = __vand( mask_0, vmx_float_sign_bit );
__vector4 maskPI_0 = __vand( mask_0, vmx_float_PI );
__vector4 rcpa_0 = __vrefp( max_0 );
__vector4 rcpb_0 = __vmulfp( max_0, rcpa_0 );
__vector4 rcpd_0 = __vaddfp( rcpa_0, rcpa_0 );
__vector4 rcp_0 = __vnmsubfp( rcpb_0, rcpa_0, rcpd_0 ); // 1 / y or 1 / x
__vector4 ata_0 = __vmulfp( min_0, rcp_0 ); // x / y or y / x
__vector4 atb_0 = __vxor( ata_0, masksign_0 ); // -x / y or y / x
__vector4 atc_0 = __vmulfp( atb_0, atb_0 );
__vector4 atd_0 = __vmaddfp( atc_0, vmx_float_atan_c0, vmx_float_atan_c1 );
atd_0 = __vmaddfp( atd_0, atc_0, vmx_float_atan_c2 );
atd_0 = __vmaddfp( atd_0, atc_0, vmx_float_atan_c3 );
atd_0 = __vmaddfp( atd_0, atc_0, vmx_float_atan_c4 );
atd_0 = __vmaddfp( atd_0, atc_0, vmx_float_atan_c5 );
atd_0 = __vmaddfp( atd_0, atc_0, vmx_float_atan_c6 );
atd_0 = __vmaddfp( atd_0, atc_0, vmx_float_atan_c7 );
atd_0 = __vmaddfp( atd_0, atc_0, vmx_float_one );
__vector4 omega_a_0 = __vmaddfp( atd_0, atb_0, maskPI_0 );
__vector4 omega_b_0 = __vmulfp( vlerp, omega_a_0 );
omega_a_0 = __vsubfp( omega_a_0, omega_b_0 );
__vector4 sinsa_0 = __vmulfp( omega_a_0, omega_a_0 );
__vector4 sinsb_0 = __vmulfp( omega_b_0, omega_b_0 );
__vector4 sina_0 = __vmaddfp( sinsa_0, vmx_float_sin_c0, vmx_float_sin_c1 );
__vector4 sinb_0 = __vmaddfp( sinsb_0, vmx_float_sin_c0, vmx_float_sin_c1 );
sina_0 = __vmaddfp( sina_0, sinsa_0, vmx_float_sin_c2 );
sinb_0 = __vmaddfp( sinb_0, sinsb_0, vmx_float_sin_c2 );
sina_0 = __vmaddfp( sina_0, sinsa_0, vmx_float_sin_c3 );
sinb_0 = __vmaddfp( sinb_0, sinsb_0, vmx_float_sin_c3 );
sina_0 = __vmaddfp( sina_0, sinsa_0, vmx_float_sin_c4 );
sinb_0 = __vmaddfp( sinb_0, sinsb_0, vmx_float_sin_c4 );
sina_0 = __vmaddfp( sina_0, sinsa_0, vmx_float_one );
sinb_0 = __vmaddfp( sinb_0, sinsb_0, vmx_float_one );
sina_0 = __vmulfp( sina_0, omega_a_0 );
sinb_0 = __vmulfp( sinb_0, omega_b_0 );
__vector4 scalea_0 = __vmulfp( sina_0, sinom_0 );
__vector4 scaleb_0 = __vmulfp( sinb_0, sinom_0 );
scaleb_0 = __vxor( scaleb_0, sign_0 );
jqx_0 = __vmulfp( jqx_0, scalea_0 );
jqy_0 = __vmulfp( jqy_0, scalea_0 );
jqz_0 = __vmulfp( jqz_0, scalea_0 );
jqw_0 = __vmulfp( jqw_0, scalea_0 );
jqx_0 = __vmaddfp( bqx_0, scaleb_0, jqx_0 );
jqy_0 = __vmaddfp( bqy_0, scaleb_0, jqy_0 );
jqz_0 = __vmaddfp( bqz_0, scaleb_0, jqz_0 );
jqw_0 = __vmaddfp( bqw_0, scaleb_0, jqw_0 );
__vector4 tp0_0 = __vmrghw( jqx_0, jqy_0 );
__vector4 tp1_0 = __vmrghw( jqz_0, jqw_0 );
__vector4 tp2_0 = __vmrglw( jqx_0, jqy_0 );
__vector4 tp3_0 = __vmrglw( jqz_0, jqw_0 );
__vector4 p0_0 = __vperm( tp0_0, tp1_0, vmx_dword_perm_plane_x );
__vector4 p1_0 = __vperm( tp0_0, tp1_0, vmx_dword_perm_plane_y );
__vector4 p2_0 = __vperm( tp2_0, tp3_0, vmx_dword_perm_plane_x );
__vector4 p3_0 = __vperm( tp2_0, tp3_0, vmx_dword_perm_plane_y );
__stvx( p0_0, &joints[n0].q, 0 );
__stvx( p1_0, &joints[n1].q, 0 );
__stvx( p2_0, &joints[n2].q, 0 );
__stvx( p3_0, &joints[n3].q, 0 );
}
for ( ; i < numJoints; i++ ) {
int n = index[i];
idVec3 &jointVert = joints[n].t;
const idVec3 &blendVert = blendJoints[n].t;
jointVert[0] += lerp * ( blendVert[0] - jointVert[0] );
jointVert[1] += lerp * ( blendVert[1] - jointVert[1] );
jointVert[2] += lerp * ( blendVert[2] - jointVert[2] );
idQuat &jointQuat = joints[n].q;
const idQuat &blendQuat = blendJoints[n].q;
float cosom;
float sinom;
float omega;
float scale0;
float scale1;
float signBit;
cosom = jointQuat.x * blendQuat.x + jointQuat.y * blendQuat.y + jointQuat.z * blendQuat.z + jointQuat.w * blendQuat.w;
signBit = __fsel( cosom, 1.0f, -1.0f );
cosom = __fabs( cosom );
scale0 = 1.0f - cosom * cosom;
scale0 = __fsel( -scale0, 1e-10f, scale0 );
sinom = Xenon_ReciprocalSqrt( scale0 );
omega = Xenon_ATanPositive( scale0 * sinom, cosom );
scale0 = Xenon_SinZeroHalfPI( ( 1.0f - lerp ) * omega ) * sinom;
scale1 = Xenon_SinZeroHalfPI( lerp * omega ) * sinom;
scale1 *= signBit;
jointQuat.x = scale0 * jointQuat.x + scale1 * blendQuat.x;
jointQuat.y = scale0 * jointQuat.y + scale1 * blendQuat.y;
jointQuat.z = scale0 * jointQuat.z + scale1 * blendQuat.z;
jointQuat.w = scale0 * jointQuat.w + scale1 * blendQuat.w;
}
}
/*
============
idSIMD_Xenon::BlendJointsFast
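Like BlendJoints, but the quaternions are linearly interpolated towards the sign-corrected blend quaternion and renormalized ( nlerp ) instead of spherically interpolated. scaledLerp = lerp / ( 1 - lerp ) so that q + scaledLerp * blendQ is proportional to ( 1 - lerp ) * q + lerp * blendQ before normalization.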
============
*/
void VPCALL idSIMD_Xenon::BlendJointsFast( idJointQuat *__restrict joints, const idJointQuat *__restrict blendJoints, const float lerp, const int *index, const int numJoints ) {
int i;
assert_16_byte_aligned( joints );
assert_16_byte_aligned( blendJoints );
assert_16_byte_aligned( JOINTQUAT_Q_OFFSET );
assert_16_byte_aligned( JOINTQUAT_T_OFFSET );
assert_sizeof_16_byte_multiple( idJointQuat );
if ( lerp <= 0.0f ) {
return;
} else if ( lerp >= 1.0f ) {
for ( i = 0; i < numJoints; i++ ) {
int j = index[i];
joints[j] = blendJoints[j];
}
return;
}
float scaledLerp = lerp / ( 1.0f - lerp );
__vector4 vlerp = { lerp, lerp, lerp, lerp };
__vector4 vscaledLerp = { scaledLerp, scaledLerp, scaledLerp, scaledLerp };
for ( i = 0; i < numJoints - 7; i += 8 ) {
int n0 = index[i+0];
int n1 = index[i+1];
int n2 = index[i+2];
int n3 = index[i+3];
int n4 = index[i+4];
int n5 = index[i+5];
int n6 = index[i+6];
int n7 = index[i+7];
__vector4 jqa_0 = *(__vector4 *)&joints[n0].q;
__vector4 jqb_0 = *(__vector4 *)&joints[n1].q;
__vector4 jqc_0 = *(__vector4 *)&joints[n2].q;
__vector4 jqd_0 = *(__vector4 *)&joints[n3].q;
__vector4 jta_0 = *(__vector4 *)&joints[n0].t;
__vector4 jtb_0 = *(__vector4 *)&joints[n1].t;
__vector4 jtc_0 = *(__vector4 *)&joints[n2].t;
__vector4 jtd_0 = *(__vector4 *)&joints[n3].t;
__vector4 bqa_0 = *(__vector4 *)&blendJoints[n0].q;
__vector4 bqb_0 = *(__vector4 *)&blendJoints[n1].q;
__vector4 bqc_0 = *(__vector4 *)&blendJoints[n2].q;
__vector4 bqd_0 = *(__vector4 *)&blendJoints[n3].q;
__vector4 bta_0 = *(__vector4 *)&blendJoints[n0].t;
__vector4 btb_0 = *(__vector4 *)&blendJoints[n1].t;
__vector4 btc_0 = *(__vector4 *)&blendJoints[n2].t;
__vector4 btd_0 = *(__vector4 *)&blendJoints[n3].t;
__vector4 jqa_1 = *(__vector4 *)&joints[n4].q;
__vector4 jqb_1 = *(__vector4 *)&joints[n5].q;
__vector4 jqc_1 = *(__vector4 *)&joints[n6].q;
__vector4 jqd_1 = *(__vector4 *)&joints[n7].q;
__vector4 jta_1 = *(__vector4 *)&joints[n4].t;
__vector4 jtb_1 = *(__vector4 *)&joints[n5].t;
__vector4 jtc_1 = *(__vector4 *)&joints[n6].t;
__vector4 jtd_1 = *(__vector4 *)&joints[n7].t;
__vector4 bqa_1 = *(__vector4 *)&blendJoints[n4].q;
__vector4 bqb_1 = *(__vector4 *)&blendJoints[n5].q;
__vector4 bqc_1 = *(__vector4 *)&blendJoints[n6].q;
__vector4 bqd_1 = *(__vector4 *)&blendJoints[n7].q;
__vector4 bta_1 = *(__vector4 *)&blendJoints[n4].t;
__vector4 btb_1 = *(__vector4 *)&blendJoints[n5].t;
__vector4 btc_1 = *(__vector4 *)&blendJoints[n6].t;
__vector4 btd_1 = *(__vector4 *)&blendJoints[n7].t;
bta_0 = __vsubfp( bta_0, jta_0 );
btb_0 = __vsubfp( btb_0, jtb_0 );
btc_0 = __vsubfp( btc_0, jtc_0 );
btd_0 = __vsubfp( btd_0, jtd_0 );
bta_1 = __vsubfp( bta_1, jta_1 );
btb_1 = __vsubfp( btb_1, jtb_1 );
btc_1 = __vsubfp( btc_1, jtc_1 );
btd_1 = __vsubfp( btd_1, jtd_1 );
jta_0 = __vmaddfp( vlerp, bta_0, jta_0 );
jtb_0 = __vmaddfp( vlerp, btb_0, jtb_0 );
jtc_0 = __vmaddfp( vlerp, btc_0, jtc_0 );
jtd_0 = __vmaddfp( vlerp, btd_0, jtd_0 );
jta_1 = __vmaddfp( vlerp, bta_1, jta_1 );
jtb_1 = __vmaddfp( vlerp, btb_1, jtb_1 );
jtc_1 = __vmaddfp( vlerp, btc_1, jtc_1 );
jtd_1 = __vmaddfp( vlerp, btd_1, jtd_1 );
__stvx( jta_0, &joints[n0].t, 0 );
__stvx( jtb_0, &joints[n1].t, 0 );
__stvx( jtc_0, &joints[n2].t, 0 );
__stvx( jtd_0, &joints[n3].t, 0 );
__stvx( jta_1, &joints[n4].t, 0 );
__stvx( jtb_1, &joints[n5].t, 0 );
__stvx( jtc_1, &joints[n6].t, 0 );
__stvx( jtd_1, &joints[n7].t, 0 );
__vector4 jqr_0 = __vmrghw( jqa_0, jqb_0 );
__vector4 jqs_0 = __vmrghw( jqc_0, jqd_0 );
__vector4 jqt_0 = __vmrglw( jqa_0, jqb_0 );
__vector4 jqu_0 = __vmrglw( jqc_0, jqd_0 );
__vector4 jqr_1 = __vmrghw( jqa_1, jqb_1 );
__vector4 jqs_1 = __vmrghw( jqc_1, jqd_1 );
__vector4 jqt_1 = __vmrglw( jqa_1, jqb_1 );
__vector4 jqu_1 = __vmrglw( jqc_1, jqd_1 );
__vector4 bqr_0 = __vmrghw( bqa_0, bqb_0 );
__vector4 bqs_0 = __vmrghw( bqc_0, bqd_0 );
__vector4 bqt_0 = __vmrglw( bqa_0, bqb_0 );
__vector4 bqu_0 = __vmrglw( bqc_0, bqd_0 );
__vector4 bqr_1 = __vmrghw( bqa_1, bqb_1 );
__vector4 bqs_1 = __vmrghw( bqc_1, bqd_1 );
__vector4 bqt_1 = __vmrglw( bqa_1, bqb_1 );
__vector4 bqu_1 = __vmrglw( bqc_1, bqd_1 );
__vector4 jqx_0 = __vperm( jqr_0, jqs_0, vmx_dword_perm_plane_x );
__vector4 jqy_0 = __vperm( jqr_0, jqs_0, vmx_dword_perm_plane_y );
__vector4 jqz_0 = __vperm( jqt_0, jqu_0, vmx_dword_perm_plane_x );
__vector4 jqw_0 = __vperm( jqt_0, jqu_0, vmx_dword_perm_plane_y );
__vector4 jqx_1 = __vperm( jqr_1, jqs_1, vmx_dword_perm_plane_x );
__vector4 jqy_1 = __vperm( jqr_1, jqs_1, vmx_dword_perm_plane_y );
__vector4 jqz_1 = __vperm( jqt_1, jqu_1, vmx_dword_perm_plane_x );
__vector4 jqw_1 = __vperm( jqt_1, jqu_1, vmx_dword_perm_plane_y );
__vector4 bqx_0 = __vperm( bqr_0, bqs_0, vmx_dword_perm_plane_x );
__vector4 bqy_0 = __vperm( bqr_0, bqs_0, vmx_dword_perm_plane_y );
__vector4 bqz_0 = __vperm( bqt_0, bqu_0, vmx_dword_perm_plane_x );
__vector4 bqw_0 = __vperm( bqt_0, bqu_0, vmx_dword_perm_plane_y );
__vector4 bqx_1 = __vperm( bqr_1, bqs_1, vmx_dword_perm_plane_x );
__vector4 bqy_1 = __vperm( bqr_1, bqs_1, vmx_dword_perm_plane_y );
__vector4 bqz_1 = __vperm( bqt_1, bqu_1, vmx_dword_perm_plane_x );
__vector4 bqw_1 = __vperm( bqt_1, bqu_1, vmx_dword_perm_plane_y );
__vector4 cosoma_0 = __vmulfp( jqx_0, bqx_0 );
__vector4 cosomb_0 = __vmulfp( jqy_0, bqy_0 );
__vector4 cosomc_0 = __vmulfp( jqz_0, bqz_0 );
__vector4 cosomd_0 = __vmulfp( jqw_0, bqw_0 );
__vector4 cosoma_1 = __vmulfp( jqx_1, bqx_1 );
__vector4 cosomb_1 = __vmulfp( jqy_1, bqy_1 );
__vector4 cosomc_1 = __vmulfp( jqz_1, bqz_1 );
__vector4 cosomd_1 = __vmulfp( jqw_1, bqw_1 );
__vector4 cosome_0 = __vaddfp( cosoma_0, cosomb_0 );
__vector4 cosomf_0 = __vaddfp( cosomc_0, cosomd_0 );
__vector4 cosom_0 = __vaddfp( cosome_0, cosomf_0 );
__vector4 cosome_1 = __vaddfp( cosoma_1, cosomb_1 );
__vector4 cosomf_1 = __vaddfp( cosomc_1, cosomd_1 );
__vector4 cosom_1 = __vaddfp( cosome_1, cosomf_1 );
__vector4 sign_0 = __vand( cosom_0, vmx_float_sign_bit );
__vector4 scale_0 = __vxor( vscaledLerp, sign_0 );
__vector4 sign_1 = __vand( cosom_1, vmx_float_sign_bit );
__vector4 scale_1 = __vxor( vscaledLerp, sign_1 );
jqx_0 = __vmaddfp( scale_0, bqx_0, jqx_0 );
jqy_0 = __vmaddfp( scale_0, bqy_0, jqy_0 );
jqz_0 = __vmaddfp( scale_0, bqz_0, jqz_0 );
jqw_0 = __vmaddfp( scale_0, bqw_0, jqw_0 );
jqx_1 = __vmaddfp( scale_1, bqx_1, jqx_1 );
jqy_1 = __vmaddfp( scale_1, bqy_1, jqy_1 );
jqz_1 = __vmaddfp( scale_1, bqz_1, jqz_1 );
jqw_1 = __vmaddfp( scale_1, bqw_1, jqw_1 );
__vector4 da_0 = __vmulfp( jqx_0, jqx_0 );
__vector4 db_0 = __vmulfp( jqy_0, jqy_0 );
__vector4 dc_0 = __vmulfp( jqz_0, jqz_0 );
__vector4 dd_0 = __vmulfp( jqw_0, jqw_0 );
__vector4 da_1 = __vmulfp( jqx_1, jqx_1 );
__vector4 db_1 = __vmulfp( jqy_1, jqy_1 );
__vector4 dc_1 = __vmulfp( jqz_1, jqz_1 );
__vector4 dd_1 = __vmulfp( jqw_1, jqw_1 );
__vector4 de_0 = __vaddfp( da_0, db_0 );
__vector4 df_0 = __vaddfp( dc_0, dd_0 );
__vector4 d_0 = __vaddfp( de_0, df_0 );
__vector4 de_1 = __vaddfp( da_1, db_1 );
__vector4 df_1 = __vaddfp( dc_1, dd_1 );
__vector4 d_1 = __vaddfp( de_1, df_1 );
__vector4 rs_0 = __vrsqrtefp( d_0 );
__vector4 sq_0 = __vmulfp( rs_0, rs_0 );
__vector4 sh_0 = __vmulfp( rs_0, vmx_float_rsqrt_c1 );
__vector4 sx_0 = __vmaddfp( d_0, sq_0, vmx_float_rsqrt_c0 );
__vector4 s_0 = __vmulfp( sh_0, sx_0 );
__vector4 rs_1 = __vrsqrtefp( d_1 );
__vector4 sq_1 = __vmulfp( rs_1, rs_1 );
__vector4 sh_1 = __vmulfp( rs_1, vmx_float_rsqrt_c1 );
__vector4 sx_1 = __vmaddfp( d_1, sq_1, vmx_float_rsqrt_c0 );
__vector4 s_1 = __vmulfp( sh_1, sx_1 );
jqx_0 = __vmulfp( jqx_0, s_0 );
jqy_0 = __vmulfp( jqy_0, s_0 );
jqz_0 = __vmulfp( jqz_0, s_0 );
jqw_0 = __vmulfp( jqw_0, s_0 );
jqx_1 = __vmulfp( jqx_1, s_1 );
jqy_1 = __vmulfp( jqy_1, s_1 );
jqz_1 = __vmulfp( jqz_1, s_1 );
jqw_1 = __vmulfp( jqw_1, s_1 );
__vector4 tp0_0 = __vmrghw( jqx_0, jqy_0 );
__vector4 tp1_0 = __vmrghw( jqz_0, jqw_0 );
__vector4 tp2_0 = __vmrglw( jqx_0, jqy_0 );
__vector4 tp3_0 = __vmrglw( jqz_0, jqw_0 );
__vector4 tp0_1 = __vmrghw( jqx_1, jqy_1 );
__vector4 tp1_1 = __vmrghw( jqz_1, jqw_1 );
__vector4 tp2_1 = __vmrglw( jqx_1, jqy_1 );
__vector4 tp3_1 = __vmrglw( jqz_1, jqw_1 );
__vector4 p0_0 = __vperm( tp0_0, tp1_0, vmx_dword_perm_plane_x );
__vector4 p1_0 = __vperm( tp0_0, tp1_0, vmx_dword_perm_plane_y );
__vector4 p2_0 = __vperm( tp2_0, tp3_0, vmx_dword_perm_plane_x );
__vector4 p3_0 = __vperm( tp2_0, tp3_0, vmx_dword_perm_plane_y );
__vector4 p0_1 = __vperm( tp0_1, tp1_1, vmx_dword_perm_plane_x );
__vector4 p1_1 = __vperm( tp0_1, tp1_1, vmx_dword_perm_plane_y );
__vector4 p2_1 = __vperm( tp2_1, tp3_1, vmx_dword_perm_plane_x );
__vector4 p3_1 = __vperm( tp2_1, tp3_1, vmx_dword_perm_plane_y );
__stvx( p0_0, &joints[n0].q, 0 );
__stvx( p1_0, &joints[n1].q, 0 );
__stvx( p2_0, &joints[n2].q, 0 );
__stvx( p3_0, &joints[n3].q, 0 );
__stvx( p0_1, &joints[n4].q, 0 );
__stvx( p1_1, &joints[n5].q, 0 );
__stvx( p2_1, &joints[n6].q, 0 );
__stvx( p3_1, &joints[n7].q, 0 );
}
for ( ; i < numJoints - 3; i += 4 ) {
int n0 = index[i+0];
int n1 = index[i+1];
int n2 = index[i+2];
int n3 = index[i+3];
__vector4 jqa_0 = *(__vector4 *)&joints[n0].q;
__vector4 jqb_0 = *(__vector4 *)&joints[n1].q;
__vector4 jqc_0 = *(__vector4 *)&joints[n2].q;
__vector4 jqd_0 = *(__vector4 *)&joints[n3].q;
__vector4 jta_0 = *(__vector4 *)&joints[n0].t;
__vector4 jtb_0 = *(__vector4 *)&joints[n1].t;
__vector4 jtc_0 = *(__vector4 *)&joints[n2].t;
__vector4 jtd_0 = *(__vector4 *)&joints[n3].t;
__vector4 bqa_0 = *(__vector4 *)&blendJoints[n0].q;
__vector4 bqb_0 = *(__vector4 *)&blendJoints[n1].q;
__vector4 bqc_0 = *(__vector4 *)&blendJoints[n2].q;
__vector4 bqd_0 = *(__vector4 *)&blendJoints[n3].q;
__vector4 bta_0 = *(__vector4 *)&blendJoints[n0].t;
__vector4 btb_0 = *(__vector4 *)&blendJoints[n1].t;
__vector4 btc_0 = *(__vector4 *)&blendJoints[n2].t;
__vector4 btd_0 = *(__vector4 *)&blendJoints[n3].t;
bta_0 = __vsubfp( bta_0, jta_0 );
btb_0 = __vsubfp( btb_0, jtb_0 );
btc_0 = __vsubfp( btc_0, jtc_0 );
btd_0 = __vsubfp( btd_0, jtd_0 );
jta_0 = __vmaddfp( vlerp, bta_0, jta_0 );
jtb_0 = __vmaddfp( vlerp, btb_0, jtb_0 );
jtc_0 = __vmaddfp( vlerp, btc_0, jtc_0 );
jtd_0 = __vmaddfp( vlerp, btd_0, jtd_0 );
__stvx( jta_0, &joints[n0].t, 0 );
__stvx( jtb_0, &joints[n1].t, 0 );
__stvx( jtc_0, &joints[n2].t, 0 );
__stvx( jtd_0, &joints[n3].t, 0 );
__vector4 jqr_0 = __vmrghw( jqa_0, jqb_0 );
__vector4 jqs_0 = __vmrghw( jqc_0, jqd_0 );
__vector4 jqt_0 = __vmrglw( jqa_0, jqb_0 );
__vector4 jqu_0 = __vmrglw( jqc_0, jqd_0 );
__vector4 bqr_0 = __vmrghw( bqa_0, bqb_0 );
__vector4 bqs_0 = __vmrghw( bqc_0, bqd_0 );
__vector4 bqt_0 = __vmrglw( bqa_0, bqb_0 );
__vector4 bqu_0 = __vmrglw( bqc_0, bqd_0 );
__vector4 jqx_0 = __vperm( jqr_0, jqs_0, vmx_dword_perm_plane_x );
__vector4 jqy_0 = __vperm( jqr_0, jqs_0, vmx_dword_perm_plane_y );
__vector4 jqz_0 = __vperm( jqt_0, jqu_0, vmx_dword_perm_plane_x );
__vector4 jqw_0 = __vperm( jqt_0, jqu_0, vmx_dword_perm_plane_y );
__vector4 bqx_0 = __vperm( bqr_0, bqs_0, vmx_dword_perm_plane_x );
__vector4 bqy_0 = __vperm( bqr_0, bqs_0, vmx_dword_perm_plane_y );
__vector4 bqz_0 = __vperm( bqt_0, bqu_0, vmx_dword_perm_plane_x );
__vector4 bqw_0 = __vperm( bqt_0, bqu_0, vmx_dword_perm_plane_y );
__vector4 cosoma_0 = __vmulfp( jqx_0, bqx_0 );
__vector4 cosomb_0 = __vmulfp( jqy_0, bqy_0 );
__vector4 cosomc_0 = __vmulfp( jqz_0, bqz_0 );
__vector4 cosomd_0 = __vmulfp( jqw_0, bqw_0 );
__vector4 cosome_0 = __vaddfp( cosoma_0, cosomb_0 );
__vector4 cosomf_0 = __vaddfp( cosomc_0, cosomd_0 );
__vector4 cosom_0 = __vaddfp( cosome_0, cosomf_0 );
__vector4 sign_0 = __vand( cosom_0, vmx_float_sign_bit );
__vector4 scale_0 = __vxor( vscaledLerp, sign_0 );
jqx_0 = __vmaddfp( scale_0, bqx_0, jqx_0 );
jqy_0 = __vmaddfp( scale_0, bqy_0, jqy_0 );
jqz_0 = __vmaddfp( scale_0, bqz_0, jqz_0 );
jqw_0 = __vmaddfp( scale_0, bqw_0, jqw_0 );
__vector4 da_0 = __vmulfp( jqx_0, jqx_0 );
__vector4 db_0 = __vmulfp( jqy_0, jqy_0 );
__vector4 dc_0 = __vmulfp( jqz_0, jqz_0 );
__vector4 dd_0 = __vmulfp( jqw_0, jqw_0 );
__vector4 de_0 = __vaddfp( da_0, db_0 );
__vector4 df_0 = __vaddfp( dc_0, dd_0 );
__vector4 d_0 = __vaddfp( de_0, df_0 );
__vector4 rs_0 = __vrsqrtefp( d_0 );
__vector4 sq_0 = __vmulfp( rs_0, rs_0 );
__vector4 sh_0 = __vmulfp( rs_0, vmx_float_rsqrt_c1 );
__vector4 sx_0 = __vmaddfp( d_0, sq_0, vmx_float_rsqrt_c0 );
__vector4 s_0 = __vmulfp( sh_0, sx_0 );
jqx_0 = __vmulfp( jqx_0, s_0 );
jqy_0 = __vmulfp( jqy_0, s_0 );
jqz_0 = __vmulfp( jqz_0, s_0 );
jqw_0 = __vmulfp( jqw_0, s_0 );
__vector4 tp0_0 = __vmrghw( jqx_0, jqy_0 );
__vector4 tp1_0 = __vmrghw( jqz_0, jqw_0 );
__vector4 tp2_0 = __vmrglw( jqx_0, jqy_0 );
__vector4 tp3_0 = __vmrglw( jqz_0, jqw_0 );
__vector4 p0_0 = __vperm( tp0_0, tp1_0, vmx_dword_perm_plane_x );
__vector4 p1_0 = __vperm( tp0_0, tp1_0, vmx_dword_perm_plane_y );
__vector4 p2_0 = __vperm( tp2_0, tp3_0, vmx_dword_perm_plane_x );
__vector4 p3_0 = __vperm( tp2_0, tp3_0, vmx_dword_perm_plane_y );
__stvx( p0_0, &joints[n0].q, 0 );
__stvx( p1_0, &joints[n1].q, 0 );
__stvx( p2_0, &joints[n2].q, 0 );
__stvx( p3_0, &joints[n3].q, 0 );
}
for ( ; i < numJoints; i++ ) {
int n = index[i];
idVec3 &jointVert = joints[n].t;
const idVec3 &blendVert = blendJoints[n].t;
jointVert[0] += lerp * ( blendVert[0] - jointVert[0] );
jointVert[1] += lerp * ( blendVert[1] - jointVert[1] );
jointVert[2] += lerp * ( blendVert[2] - jointVert[2] );
idQuat &jointQuat = joints[n].q;
const idQuat &blendQuat = blendJoints[n].q;
float cosom, scale, s;
cosom = jointQuat.x * blendQuat.x + jointQuat.y * blendQuat.y + jointQuat.z * blendQuat.z + jointQuat.w * blendQuat.w;
scale = __fsel( cosom, scaledLerp, -scaledLerp );
jointQuat.x += scale * blendQuat.x;
jointQuat.y += scale * blendQuat.y;
jointQuat.z += scale * blendQuat.z;
jointQuat.w += scale * blendQuat.w;
s = jointQuat.x * jointQuat.x + jointQuat.y * jointQuat.y + jointQuat.z * jointQuat.z + jointQuat.w * jointQuat.w;
s = Xenon_ReciprocalSqrt( s );
jointQuat.x *= s;
jointQuat.y *= s;
jointQuat.z *= s;
jointQuat.w *= s;
}
}
/*
============
idSIMD_Xenon::ConvertJointQuatsToJointMats
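Converts an array of joint quaternions plus translations to 3x4 joint matrices. The main loop converts four joints per iteration; the trailing loop converts the remaining joints one at a time.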
============
*/
void VPCALL idSIMD_Xenon::ConvertJointQuatsToJointMats( idJointMat * __restrict jointMats, const idJointQuat * __restrict jointQuats, const int numJoints ) {
idJointMat * __restrict end = jointMats + numJoints;
assert_16_byte_aligned( jointMats );
assert_16_byte_aligned( jointQuats );
for ( ; jointMats + 3 < end; jointMats += 4, jointQuats += 4 ) {
__vector4 q0 = __lvx( jointQuats, 0*32+0 );
__vector4 r0 = __lvx( jointQuats, 0*32+16 );
__vector4 q1 = __lvx( jointQuats, 1*32+0 );
__vector4 r1 = __lvx( jointQuats, 1*32+16 );
__vector4 q2 = __lvx( jointQuats, 2*32+0 );
__vector4 r2 = __lvx( jointQuats, 2*32+16 );
__vector4 q3 = __lvx( jointQuats, 3*32+0 );
__vector4 r3 = __lvx( jointQuats, 3*32+16 );
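// q0-q3 hold the quaternions, r0-r3 the translations of four consecutive joints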
__vector4 d0 = __vaddfp( q0, q0 );
__vector4 d1 = __vaddfp( q1, q1 );
__vector4 d2 = __vaddfp( q2, q2 );
__vector4 d3 = __vaddfp( q3, q3 );
__vector4 sa0 = __vpermwi( q0, SHUFFLE_D( 1, 0, 0, 1 ) );
__vector4 sb0 = __vpermwi( d0, SHUFFLE_D( 1, 1, 2, 2 ) );
__vector4 sc0 = __vpermwi( q0, SHUFFLE_D( 2, 3, 3, 3 ) );
__vector4 sd0 = __vpermwi( d0, SHUFFLE_D( 2, 2, 1, 0 ) );
__vector4 sa1 = __vpermwi( q1, SHUFFLE_D( 1, 0, 0, 1 ) );
__vector4 sb1 = __vpermwi( d1, SHUFFLE_D( 1, 1, 2, 2 ) );
__vector4 sc1 = __vpermwi( q1, SHUFFLE_D( 2, 3, 3, 3 ) );
__vector4 sd1 = __vpermwi( d1, SHUFFLE_D( 2, 2, 1, 0 ) );
__vector4 sa2 = __vpermwi( q2, SHUFFLE_D( 1, 0, 0, 1 ) );
__vector4 sb2 = __vpermwi( d2, SHUFFLE_D( 1, 1, 2, 2 ) );
__vector4 sc2 = __vpermwi( q2, SHUFFLE_D( 2, 3, 3, 3 ) );
__vector4 sd2 = __vpermwi( d2, SHUFFLE_D( 2, 2, 1, 0 ) );
__vector4 sa3 = __vpermwi( q3, SHUFFLE_D( 1, 0, 0, 1 ) );
__vector4 sb3 = __vpermwi( d3, SHUFFLE_D( 1, 1, 2, 2 ) );
__vector4 sc3 = __vpermwi( q3, SHUFFLE_D( 2, 3, 3, 3 ) );
__vector4 sd3 = __vpermwi( d3, SHUFFLE_D( 2, 2, 1, 0 ) );
__vector4 ma0 = __vmulfp( sa0, sb0 ); // yy2, xy2, xz2, yz2
__vector4 mb0 = __vmulfp( sc0, sd0 ); // zz2, wz2, wy2, wx2
__vector4 mc0 = __vmulfp( q0, d0 ); // xx2, yy2, zz2, ww2
__vector4 ma1 = __vmulfp( sa1, sb1 ); // yy2, xy2, xz2, yz2
__vector4 mb1 = __vmulfp( sc1, sd1 ); // zz2, wz2, wy2, wx2
__vector4 mc1 = __vmulfp( q1, d1 ); // xx2, yy2, zz2, ww2
__vector4 ma2 = __vmulfp( sa2, sb2 ); // yy2, xy2, xz2, yz2
__vector4 mb2 = __vmulfp( sc2, sd2 ); // zz2, wz2, wy2, wx2
__vector4 mc2 = __vmulfp( q2, d2 ); // xx2, yy2, zz2, ww2
__vector4 ma3 = __vmulfp( sa3, sb3 ); // yy2, xy2, xz2, yz2
__vector4 mb3 = __vmulfp( sc3, sd3 ); // zz2, wz2, wy2, wx2
__vector4 mc3 = __vmulfp( q3, d3 ); // xx2, yy2, zz2, ww2
__vector4 md0 = __vperm( ma0, mc0, vmx_dword_perm_quat2mat1 ); // xx2, xy2, xz2, yz2 // 10, 01, 02, 03
__vector4 me0 = __vperm( ma0, mb0, vmx_dword_perm_quat2mat2 ); // yy2, xy2, wy2, wx2 // 00, 01, 12, 13
__vector4 md1 = __vperm( ma1, mc1, vmx_dword_perm_quat2mat1 ); // xx2, xy2, xz2, yz2 // 10, 01, 02, 03
__vector4 me1 = __vperm( ma1, mb1, vmx_dword_perm_quat2mat2 ); // yy2, xy2, wy2, wx2 // 00, 01, 12, 13
__vector4 md2 = __vperm( ma2, mc2, vmx_dword_perm_quat2mat1 ); // xx2, xy2, xz2, yz2 // 10, 01, 02, 03
__vector4 me2 = __vperm( ma2, mb2, vmx_dword_perm_quat2mat2 ); // yy2, xy2, wy2, wx2 // 00, 01, 12, 13
__vector4 md3 = __vperm( ma3, mc3, vmx_dword_perm_quat2mat1 ); // xx2, xy2, xz2, yz2 // 10, 01, 02, 03
__vector4 me3 = __vperm( ma3, mb3, vmx_dword_perm_quat2mat2 ); // yy2, xy2, wy2, wx2 // 00, 01, 12, 13
__vector4 mf0 = __vxor( ma0, vmx_float_quat2mat_xor ); // -yy2, xy2, xz2, yz2 // - + + +
__vector4 mg0 = __vxor( md0, vmx_float_quat2mat_xor ); // -xx2, xy2, xz2, yz2 // - + + +
__vector4 mf1 = __vxor( ma1, vmx_float_quat2mat_xor ); // -yy2, xy2, xz2, yz2 // - + + +
__vector4 mg1 = __vxor( md1, vmx_float_quat2mat_xor ); // -xx2, xy2, xz2, yz2 // - + + +
__vector4 mf2 = __vxor( ma2, vmx_float_quat2mat_xor ); // -yy2, xy2, xz2, yz2 // - + + +
__vector4 mg2 = __vxor( md2, vmx_float_quat2mat_xor ); // -xx2, xy2, xz2, yz2 // - + + +
__vector4 mf3 = __vxor( ma3, vmx_float_quat2mat_xor ); // -yy2, xy2, xz2, yz2 // - + + +
__vector4 mg3 = __vxor( md3, vmx_float_quat2mat_xor ); // -xx2, xy2, xz2, yz2 // - + + +
__vector4 ra0 = __vmaddfp( mb0, vmx_float_quat2mat_mad1, mf0 ); // -yy2 - zz2, xy2 + wz2, xz2 - wy2, // - + - -
__vector4 rb0 = __vmaddfp( mb0, vmx_float_quat2mat_mad2, mg0 ); // -xx2 - zz2, xy2 - wz2, , yz2 + wx2 // - - - +
__vector4 rc0 = __vmaddfp( me0, vmx_float_quat2mat_mad3, mg0 ); // -xx2 - yy2, , xz2 + wy2, yz2 - wx2 // - - + -
__vector4 ra1 = __vmaddfp( mb1, vmx_float_quat2mat_mad1, mf1 ); // -yy2 - zz2, xy2 + wz2, xz2 - wy2, // - + - -
__vector4 rb1 = __vmaddfp( mb1, vmx_float_quat2mat_mad2, mg1 ); // -xx2 - zz2, xy2 - wz2, , yz2 + wx2 // - - - +
__vector4 rc1 = __vmaddfp( me1, vmx_float_quat2mat_mad3, mg1 ); // -xx2 - yy2, , xz2 + wy2, yz2 - wx2 // - - + -
__vector4 ra2 = __vmaddfp( mb2, vmx_float_quat2mat_mad1, mf2 ); // -yy2 - zz2, xy2 + wz2, xz2 - wy2, // - + - -
__vector4 rb2 = __vmaddfp( mb2, vmx_float_quat2mat_mad2, mg2 ); // -xx2 - zz2, xy2 - wz2, , yz2 + wx2 // - - - +
__vector4 rc2 = __vmaddfp( me2, vmx_float_quat2mat_mad3, mg2 ); // -xx2 - yy2, , xz2 + wy2, yz2 - wx2 // - - + -
__vector4 ra3 = __vmaddfp( mb3, vmx_float_quat2mat_mad1, mf3 ); // -yy2 - zz2, xy2 + wz2, xz2 - wy2, // - + - -
__vector4 rb3 = __vmaddfp( mb3, vmx_float_quat2mat_mad2, mg3 ); // -xx2 - zz2, xy2 - wz2, , yz2 + wx2 // - - - +
__vector4 rc3 = __vmaddfp( me3, vmx_float_quat2mat_mad3, mg3 ); // -xx2 - yy2, , xz2 + wy2, yz2 - wx2 // - - + -
__vector4 re0 = __vaddfp( ra0, vmx_float_quat2mat_add ); // 1 0 0 0
__vector4 rf0 = __vaddfp( rb0, vmx_float_quat2mat_add ); // 1 0 0 0
__vector4 rg0 = __vaddfp( rc0, vmx_float_quat2mat_add ); // 1 0 0 0
__vector4 re1 = __vaddfp( ra1, vmx_float_quat2mat_add ); // 1 0 0 0
__vector4 rf1 = __vaddfp( rb1, vmx_float_quat2mat_add ); // 1 0 0 0
__vector4 rg1 = __vaddfp( rc1, vmx_float_quat2mat_add ); // 1 0 0 0
__vector4 re2 = __vaddfp( ra2, vmx_float_quat2mat_add ); // 1 0 0 0
__vector4 rf2 = __vaddfp( rb2, vmx_float_quat2mat_add ); // 1 0 0 0
__vector4 rg2 = __vaddfp( rc2, vmx_float_quat2mat_add ); // 1 0 0 0
__vector4 re3 = __vaddfp( ra3, vmx_float_quat2mat_add ); // 1 0 0 0
__vector4 rf3 = __vaddfp( rb3, vmx_float_quat2mat_add ); // 1 0 0 0
__vector4 rg3 = __vaddfp( rc3, vmx_float_quat2mat_add ); // 1 0 0 0
__vector4 rh0 = __vperm( re0, r0, vmx_dword_perm_quat2mat3 ); // 00 01 02 10
__vector4 ri0 = __vperm( rf0, r0, vmx_dword_perm_quat2mat4 ); // 01 00 03 11
__vector4 rj0 = __vperm( rg0, r0, vmx_dword_perm_quat2mat5 ); // 02 03 00 12
__vector4 rh1 = __vperm( re1, r1, vmx_dword_perm_quat2mat3 ); // 00 01 02 10
__vector4 ri1 = __vperm( rf1, r1, vmx_dword_perm_quat2mat4 ); // 01 00 03 11
__vector4 rj1 = __vperm( rg1, r1, vmx_dword_perm_quat2mat5 ); // 02 03 00 12
__vector4 rh2 = __vperm( re2, r2, vmx_dword_perm_quat2mat3 ); // 00 01 02 10
__vector4 ri2 = __vperm( rf2, r2, vmx_dword_perm_quat2mat4 ); // 01 00 03 11
__vector4 rj2 = __vperm( rg2, r2, vmx_dword_perm_quat2mat5 ); // 02 03 00 12
__vector4 rh3 = __vperm( re3, r3, vmx_dword_perm_quat2mat3 ); // 00 01 02 10
__vector4 ri3 = __vperm( rf3, r3, vmx_dword_perm_quat2mat4 ); // 01 00 03 11
__vector4 rj3 = __vperm( rg3, r3, vmx_dword_perm_quat2mat5 ); // 02 03 00 12
__stvx( rh0, jointMats, 0*48+0 );
__stvx( ri0, jointMats, 0*48+16 );
__stvx( rj0, jointMats, 0*48+32 );
__stvx( rh1, jointMats, 1*48+0 );
__stvx( ri1, jointMats, 1*48+16 );
__stvx( rj1, jointMats, 1*48+32 );
__stvx( rh2, jointMats, 2*48+0 );
__stvx( ri2, jointMats, 2*48+16 );
__stvx( rj2, jointMats, 2*48+32 );
__stvx( rh3, jointMats, 3*48+0 );
__stvx( ri3, jointMats, 3*48+16 );
__stvx( rj3, jointMats, 3*48+32 );
}
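	// Handle the joints left over after the four-wide loop, one at a time.
	// Both loops build the same 3x4 joint matrix; roughly, in scalar form
	// (matching idQuat::ToMat3 and the permute/madd constants used above):
	//   x2 = q.x + q.x;  y2 = q.y + q.y;  z2 = q.z + q.z;
	//   row0 = [ 1 - q.y*y2 - q.z*z2,  q.x*y2 + q.w*z2,      q.x*z2 - q.w*y2,      t.x ]
	//   row1 = [ q.x*y2 - q.w*z2,      1 - q.x*x2 - q.z*z2,  q.y*z2 + q.w*x2,      t.y ]
	//   row2 = [ q.x*z2 + q.w*y2,      q.y*z2 - q.w*x2,      1 - q.x*x2 - q.y*y2,  t.z ]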
for ( ; jointMats < end; jointMats++, jointQuats++ ) {
__vector4 q0 = __lvx( jointQuats, 0 );
__vector4 r0 = __lvx( jointQuats, 16 );
__vector4 d0 = __vaddfp( q0, q0 );
__vector4 sa0 = __vpermwi( q0, SHUFFLE_D( 1, 0, 0, 1 ) );
__vector4 sb0 = __vpermwi( d0, SHUFFLE_D( 1, 1, 2, 2 ) );
__vector4 sc0 = __vpermwi( q0, SHUFFLE_D( 2, 3, 3, 3 ) );
__vector4 sd0 = __vpermwi( d0, SHUFFLE_D( 2, 2, 1, 0 ) );
__vector4 ma0 = __vmulfp( sa0, sb0 ); // yy2, xy2, xz2, yz2
__vector4 mb0 = __vmulfp( sc0, sd0 ); // zz2, wz2, wy2, wx2
__vector4 mc0 = __vmulfp( q0, d0 ); // xx2, yy2, zz2, ww2
__vector4 md0 = __vperm( ma0, mc0, vmx_dword_perm_quat2mat1 ); // xx2, xy2, xz2, yz2 // 10, 01, 02, 03
__vector4 me0 = __vperm( ma0, mb0, vmx_dword_perm_quat2mat2 ); // yy2, xy2, wy2, wx2 // 00, 01, 12, 13
__vector4 mf0 = __vxor( ma0, vmx_float_quat2mat_xor ); // -yy2, xy2, xz2, yz2 // - + + +
__vector4 mg0 = __vxor( md0, vmx_float_quat2mat_xor ); // -xx2, xy2, xz2, yz2 // - + + +
		__vector4 ra0 = __vmaddfp( mb0, vmx_float_quat2mat_mad1, mf0 );			// -yy2 - zz2, xy2 + wz2, xz2 - wy2,				// - + - -
		__vector4 rb0 = __vmaddfp( mb0, vmx_float_quat2mat_mad2, mg0 );			// -xx2 - zz2, xy2 - wz2, , yz2 + wx2				// - - - +
		__vector4 rc0 = __vmaddfp( me0, vmx_float_quat2mat_mad3, mg0 );			// -xx2 - yy2, , xz2 + wy2, yz2 - wx2				// - - + -
__vector4 re0 = __vaddfp( ra0, vmx_float_quat2mat_add ); // 1 0 0 0
__vector4 rf0 = __vaddfp( rb0, vmx_float_quat2mat_add ); // 1 0 0 0
__vector4 rg0 = __vaddfp( rc0, vmx_float_quat2mat_add ); // 1 0 0 0
__vector4 rh0 = __vperm( re0, r0, vmx_dword_perm_quat2mat3 ); // 00 01 02 10
__vector4 ri0 = __vperm( rf0, r0, vmx_dword_perm_quat2mat4 ); // 01 00 03 11
__vector4 rj0 = __vperm( rg0, r0, vmx_dword_perm_quat2mat5 ); // 02 03 00 12
__stvx( rh0, jointMats, 0 );
__stvx( ri0, jointMats, 16 );
__stvx( rj0, jointMats, 32 );
}
}
/*
============
idSIMD_Xenon::ConvertJointMatsToJointQuats
============
*/
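// Converts an array of 3x4 joint matrices back into quaternion + translation pairs.
// The usual four-case matrix-to-quaternion conversion (largest of the trace and the
// three diagonal elements) is made branchless: comparison masks select one case per
// joint, sign bits flip the diagonal terms accordingly, and a per-joint byte permute
// routes the four computed values into the right quaternion components. A four-wide
// transposed SIMD path handles groups of four joints; a scalar tail does the rest.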
void VPCALL idSIMD_Xenon::ConvertJointMatsToJointQuats( idJointQuat * __restrict jointQuats, const idJointMat * __restrict jointMats, const int numJoints ) {
int i;
assert_16_byte_aligned( jointMats );
assert_16_byte_aligned( jointQuats );
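	// the stores below write each quaternion and its translation as two consecutive
	// 16-byte vectors, so idJointQuat must keep t immediately after q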
compile_time_assert( (UINT_PTR)(&((idJointQuat *)0)->t) == (UINT_PTR)(&((idJointQuat *)0)->q) + (UINT_PTR)sizeof( ((idJointQuat *)0)->q ) );
__vector4 zero = vmx_float_zero;
for ( i = 0; i < numJoints - 3; i += 4 ) {
const float *__restrict m = (float *)&jointMats[i];
float *__restrict q = (float *)&jointQuats[i];
__vector4 ma0 = __lvx( m, 0*48+0 );
__vector4 ma1 = __lvx( m, 0*48+16 );
__vector4 ma2 = __lvx( m, 0*48+32 );
__vector4 mb0 = __lvx( m, 1*48+0 );
__vector4 mb1 = __lvx( m, 1*48+16 );
__vector4 mb2 = __lvx( m, 1*48+32 );
__vector4 mc0 = __lvx( m, 2*48+0 );
__vector4 mc1 = __lvx( m, 2*48+16 );
__vector4 mc2 = __lvx( m, 2*48+32 );
__vector4 md0 = __lvx( m, 3*48+0 );
__vector4 md1 = __lvx( m, 3*48+16 );
__vector4 md2 = __lvx( m, 3*48+32 );
__vector4 ta0 = __vmrghw( ma0, mb0 );
__vector4 ta1 = __vmrghw( mc0, md0 );
__vector4 ta2 = __vmrglw( ma0, mb0 );
__vector4 ta3 = __vmrglw( mc0, md0 );
__vector4 tb0 = __vmrghw( ma1, mb1 );
__vector4 tb1 = __vmrghw( mc1, md1 );
__vector4 tb2 = __vmrglw( ma1, mb1 );
__vector4 tb3 = __vmrglw( mc1, md1 );
__vector4 tc0 = __vmrghw( ma2, mb2 );
__vector4 tc1 = __vmrghw( mc2, md2 );
__vector4 tc2 = __vmrglw( ma2, mb2 );
__vector4 tc3 = __vmrglw( mc2, md2 );
__vector4 m00 = __vperm( ta0, ta1, vmx_dword_perm_plane_x );
__vector4 m01 = __vperm( ta0, ta1, vmx_dword_perm_plane_y );
__vector4 m02 = __vperm( ta2, ta3, vmx_dword_perm_plane_x );
__vector4 m03 = __vperm( ta2, ta3, vmx_dword_perm_plane_y );
__vector4 m10 = __vperm( tb0, tb1, vmx_dword_perm_plane_x );
__vector4 m11 = __vperm( tb0, tb1, vmx_dword_perm_plane_y );
__vector4 m12 = __vperm( tb2, tb3, vmx_dword_perm_plane_x );
__vector4 m13 = __vperm( tb2, tb3, vmx_dword_perm_plane_y );
__vector4 m20 = __vperm( tc0, tc1, vmx_dword_perm_plane_x );
__vector4 m21 = __vperm( tc0, tc1, vmx_dword_perm_plane_y );
__vector4 m22 = __vperm( tc2, tc3, vmx_dword_perm_plane_x );
__vector4 m23 = __vperm( tc2, tc3, vmx_dword_perm_plane_y );
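		// branchless case selection: b0 = trace > 0, b1 = m00 is the largest diagonal,
		// b2 = m11 > m22; m0..m3 become all-ones masks for exactly one of the four
		// conversion cases, s0..s2 carry the resulting sign flips and n0..n6 build the
		// per-joint output swizzle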
__vector4 b00 = __vaddfp( m00, m11 );
__vector4 b11 = __vcmpgtfp( m00, m22 );
__vector4 b01 = __vaddfp( b00, m22 );
__vector4 b10 = __vcmpgtfp( m00, m11 );
__vector4 b0 = __vcmpgtfp( b01, zero );
__vector4 b1 = __vand( b10, b11 );
__vector4 b2 = __vcmpgtfp( m11, m22 );
__vector4 m0 = b0;
__vector4 m1 = __vandc( b1, b0 );
__vector4 p1 = __vor( b0, b1 );
__vector4 p2 = __vor( p1, b2 );
__vector4 m2 = __vandc( b2, p1 );
__vector4 m3 = __vxor( p2, vmx_dword_not );
__vector4 i0 = __vor( m2, m3 );
__vector4 i1 = __vor( m1, m3 );
__vector4 i2 = __vor( m1, m2 );
__vector4 s0 = __vand( i0, vmx_float_sign_bit );
__vector4 s1 = __vand( i1, vmx_float_sign_bit );
__vector4 s2 = __vand( i2, vmx_float_sign_bit );
__vector4 n0 = __vand( m0, vmx_dword_quat2mat_swizzle0 );
__vector4 n1 = __vand( m1, vmx_dword_quat2mat_swizzle1 );
__vector4 n2 = __vand( m2, vmx_dword_quat2mat_swizzle2 );
__vector4 n3 = __vand( m3, vmx_dword_quat2mat_swizzle3 );
__vector4 n4 = __vor( n0, n1 );
__vector4 n5 = __vor( n2, n3 );
__vector4 n6 = __vor( n4, n5 );
m00 = __vxor( m00, s0 );
m11 = __vxor( m11, s1 );
m22 = __vxor( m22, s2 );
m21 = __vxor( m21, s0 );
m02 = __vxor( m02, s1 );
m10 = __vxor( m10, s2 );
__vector4 t0 = __vaddfp( m00, m11 );
__vector4 t1 = __vaddfp( m22, vmx_float_one );
__vector4 q0 = __vaddfp( t0, t1 );
__vector4 q1 = __vsubfp( m01, m10 );
__vector4 q2 = __vsubfp( m20, m02 );
__vector4 q3 = __vsubfp( m12, m21 );
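		// __vrsqrtefp only estimates 1/sqrt(q0); the next four ops apply one
		// Newton-Raphson step folded with the final 0.5 factor, assuming
		// vmx_float_rsqrt_c0 / vmx_float_rsqrt_c2 hold the same -3.0 and -0.25
		// constants used literally in the scalar tail below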
__vector4 rs = __vrsqrtefp( q0 );
__vector4 sq = __vmulfp( rs, rs );
__vector4 sh = __vmulfp( rs, vmx_float_rsqrt_c2 );
__vector4 sx = __vmaddfp( q0, sq, vmx_float_rsqrt_c0 );
__vector4 s = __vmulfp( sh, sx );
q0 = __vmulfp( q0, s );
q1 = __vmulfp( q1, s );
q2 = __vmulfp( q2, s );
q3 = __vmulfp( q3, s );
__vector4 tq0 = __vmrghw( q0, q1 );
__vector4 tq1 = __vmrghw( q2, q3 );
__vector4 tq2 = __vmrglw( q0, q1 );
__vector4 tq3 = __vmrglw( q2, q3 );
__vector4 tt0 = __vmrghw( m03, m13 );
__vector4 tt1 = __vmrghw( m23, zero );
__vector4 tt2 = __vmrglw( m03, m13 );
__vector4 tt3 = __vmrglw( m23, zero );
__vector4 sw0 = __vperm( n6, zero, vmx_dword_quat2mat_splat0 );
__vector4 sw1 = __vperm( n6, zero, vmx_dword_quat2mat_splat1 );
__vector4 sw2 = __vperm( n6, zero, vmx_dword_quat2mat_splat2 );
__vector4 sw3 = __vperm( n6, zero, vmx_dword_quat2mat_splat3 );
sw0 = __vor( sw0, vmx_dword_quat2mat_or );
sw1 = __vor( sw1, vmx_dword_quat2mat_or );
sw2 = __vor( sw2, vmx_dword_quat2mat_or );
sw3 = __vor( sw3, vmx_dword_quat2mat_or );
__vector4 r0 = __vperm( tq0, tq1, vmx_dword_perm_plane_x );
__vector4 r1 = __vperm( tq0, tq1, vmx_dword_perm_plane_y );
__vector4 r2 = __vperm( tq2, tq3, vmx_dword_perm_plane_x );
__vector4 r3 = __vperm( tq2, tq3, vmx_dword_perm_plane_y );
__vector4 r4 = __vperm( tt0, tt1, vmx_dword_perm_plane_x );
__vector4 r5 = __vperm( tt0, tt1, vmx_dword_perm_plane_y );
__vector4 r6 = __vperm( tt2, tt3, vmx_dword_perm_plane_x );
__vector4 r7 = __vperm( tt2, tt3, vmx_dword_perm_plane_y );
r0 = __vperm( r0, r0, sw0 );
r1 = __vperm( r1, r1, sw1 );
r2 = __vperm( r2, r2, sw2 );
r3 = __vperm( r3, r3, sw3 );
__stvx( r0, q, 0*16 );
__stvx( r4, q, 1*16 );
__stvx( r1, q, 2*16 );
__stvx( r5, q, 3*16 );
__stvx( r2, q, 4*16 );
__stvx( r6, q, 5*16 );
__stvx( r3, q, 6*16 );
__stvx( r7, q, 7*16 );
}
int k0 = (3<<0)|(2<<2)|(1<<4)|(0<<6);
int k1 = (0<<0)|(1<<2)|(2<<4)|(3<<6);
int k2 = (1<<0)|(0<<2)|(3<<4)|(2<<6);
int k3 = (2<<0)|(3<<2)|(0<<4)|(1<<6);
float sign[2] = { 1.0f, -1.0f };
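	// scalar tail: k0..k3 are packed 2-bit index tables, one per conversion case,
	// giving the quaternion component each of the four computed values goes to;
	// sign[] maps the 0/1 selectors i0..i2 to +1/-1 factors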
for ( ; i < numJoints; i++, jointMats++, jointQuats++ ) {
const float *m = (float *)jointMats;
float *q = (float *)jointQuats;
int b0 = m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] > 0.0f;
int b1 = m[0 * 4 + 0] > m[1 * 4 + 1] && m[0 * 4 + 0] > m[2 * 4 + 2];
int b2 = m[1 * 4 + 1] > m[2 * 4 + 2];
int m0 = b0;
int m1 = !b0 & b1;
int m2 = !( b0 | b1 ) & b2;
int m3 = !( b0 | b1 | b2 );
int i0 = ( m2 | m3 );
int i1 = ( m1 | m3 );
int i2 = ( m1 | m2 );
float s0 = sign[i0];
float s1 = sign[i1];
float s2 = sign[i2];
int index = ( -m0 & k0 ) | ( -m1 & k1 ) | ( -m2 & k2 ) | ( -m3 & k3 );
float t = s0 * m[0 * 4 + 0] + s1 * m[1 * 4 + 1] + s2 * m[2 * 4 + 2] + 1.0f;
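		// one Newton-Raphson step on the __frsqrte estimate, folded with the 0.5
		// factor: s ~= 0.5f / sqrt( t )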
float s = __frsqrte( t );
s = ( t * s * s + -3.0f ) * ( s * -0.25f );
q[(index>>0)&3] = t * s;
q[(index>>2)&3] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;
q[(index>>4)&3] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;
q[(index>>6)&3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;
q[4] = m[0 * 4 + 3];
q[5] = m[1 * 4 + 3];
q[6] = m[2 * 4 + 3];
q[7] = 0.0f;
}
}
/*
============
idSIMD_Xenon::TransformJoints
============
*/
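// Concatenates each joint's 3x4 matrix with its parent's: jointMats[i] = jointMats[parents[i]] * jointMats[i].
// The parent's row elements are splatted and multiplied against the child's rows; the __vperm with
// vmx_dword_perm_matrix isolates the parent's translation in the last lane so it is only added to the
// fourth column. Parents must come before their children in the array (asserted below).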
void VPCALL idSIMD_Xenon::TransformJoints( idJointMat *__restrict jointMats, const int *__restrict parents, const int firstJoint, const int lastJoint ) {
for( int i = firstJoint; i <= lastJoint; i++ ) {
assert( parents[i] < i );
float *__restrict m1 = jointMats[parents[i]].ToFloatPtr();
float *__restrict m2 = jointMats[i].ToFloatPtr();
__vector4 m1a0 = __lvx( m1, 0 );
__vector4 m1b0 = __lvx( m1, 16 );
__vector4 m1c0 = __lvx( m1, 32 );
__vector4 m2a0 = __lvx( m2, 0 );
__vector4 m2b0 = __lvx( m2, 16 );
__vector4 m2c0 = __lvx( m2, 32 );
__vector4 ta0 = __vspltw( m1a0, 0 );
__vector4 tb0 = __vspltw( m1a0, 1 );
__vector4 tc0 = __vspltw( m1a0, 2 );
__vector4 td0 = __vspltw( m1b0, 0 );
__vector4 te0 = __vspltw( m1b0, 1 );
__vector4 tf0 = __vspltw( m1b0, 2 );
__vector4 tg0 = __vspltw( m1c0, 0 );
__vector4 th0 = __vspltw( m1c0, 1 );
__vector4 ti0 = __vspltw( m1c0, 2 );
__vector4 tj0 = __vperm( m1a0, vmx_float_zero, vmx_dword_perm_matrix );
__vector4 tk0 = __vperm( m1b0, vmx_float_zero, vmx_dword_perm_matrix );
__vector4 tl0 = __vperm( m1c0, vmx_float_zero, vmx_dword_perm_matrix );
__vector4 ra0 = __vmulfp( ta0, m2a0 );
__vector4 rb0 = __vmulfp( tb0, m2b0 );
__vector4 rc0 = __vmulfp( tc0, m2c0 );
__vector4 rd0 = __vmulfp( td0, m2a0 );
__vector4 re0 = __vmulfp( te0, m2b0 );
__vector4 rf0 = __vmulfp( tf0, m2c0 );
__vector4 rg0 = __vmulfp( tg0, m2a0 );
__vector4 rh0 = __vmulfp( th0, m2b0 );
__vector4 ri0 = __vmulfp( ti0, m2c0 );
__vector4 sa0 = __vaddfp( ra0, rb0 );
__vector4 sb0 = __vaddfp( rc0, tj0 );
__vector4 sc0 = __vaddfp( sa0, sb0 );
__vector4 sd0 = __vaddfp( rd0, re0 );
__vector4 se0 = __vaddfp( rf0, tk0 );
__vector4 sf0 = __vaddfp( sd0, se0 );
__vector4 sg0 = __vaddfp( rg0, rh0 );
__vector4 sh0 = __vaddfp( ri0, tl0 );
__vector4 si0 = __vaddfp( sg0, sh0 );
__stvx( sc0, m2, 0 );
__stvx( sf0, m2, 16 );
__stvx( si0, m2, 32 );
}
}
/*
============
idSIMD_Xenon::UntransformJoints
============
*/
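// Undoes TransformJoints: walks the joints from last to first and removes the parent transform,
// jointMats[i] = jointMats[parents[i]]^-1 * jointMats[i], subtracting the parent translation and then
// multiplying by the transpose of the parent's 3x3 part (which assumes that part is orthonormal).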
void VPCALL idSIMD_Xenon::UntransformJoints( idJointMat *__restrict jointMats, const int *__restrict parents, const int firstJoint, const int lastJoint ) {
for( int i = lastJoint; i >= firstJoint; i-- ) {
assert( parents[i] < i );
float *__restrict m1 = jointMats[parents[i]].ToFloatPtr();
float *__restrict m2 = jointMats[i].ToFloatPtr();
__vector4 m1a0 = __lvx( m1, 0 );
__vector4 m1b0 = __lvx( m1, 16 );
__vector4 m1c0 = __lvx( m1, 32 );
__vector4 m2a0 = __lvx( m2, 0 );
__vector4 m2b0 = __lvx( m2, 16 );
__vector4 m2c0 = __lvx( m2, 32 );
__vector4 ta0 = __vspltw( m1a0, 0 );
__vector4 tb0 = __vspltw( m1a0, 1 );
__vector4 tc0 = __vspltw( m1a0, 2 );
__vector4 td0 = __vspltw( m1b0, 0 );
__vector4 te0 = __vspltw( m1b0, 1 );
__vector4 tf0 = __vspltw( m1b0, 2 );
__vector4 tg0 = __vspltw( m1c0, 0 );
__vector4 th0 = __vspltw( m1c0, 1 );
__vector4 ti0 = __vspltw( m1c0, 2 );
__vector4 tj0 = __vperm( m1a0, vmx_float_zero, vmx_dword_perm_matrix );
__vector4 tk0 = __vperm( m1b0, vmx_float_zero, vmx_dword_perm_matrix );
__vector4 tl0 = __vperm( m1c0, vmx_float_zero, vmx_dword_perm_matrix );
m2a0 = __vsubfp( m2a0, tj0 );
m2b0 = __vsubfp( m2b0, tk0 );
m2c0 = __vsubfp( m2c0, tl0 );
__vector4 ra0 = __vmulfp( ta0, m2a0 );
__vector4 rb0 = __vmulfp( td0, m2b0 );
__vector4 rc0 = __vmulfp( tg0, m2c0 );
__vector4 rd0 = __vmulfp( tb0, m2a0 );
__vector4 re0 = __vmulfp( te0, m2b0 );
__vector4 rf0 = __vmulfp( th0, m2c0 );
__vector4 rg0 = __vmulfp( tc0, m2a0 );
__vector4 rh0 = __vmulfp( tf0, m2b0 );
__vector4 ri0 = __vmulfp( ti0, m2c0 );
__vector4 sa0 = __vaddfp( ra0, rb0 );
__vector4 sb0 = __vaddfp( rc0, sa0 );
__vector4 sd0 = __vaddfp( rd0, re0 );
__vector4 se0 = __vaddfp( rf0, sd0 );
__vector4 sg0 = __vaddfp( rg0, rh0 );
__vector4 sh0 = __vaddfp( ri0, sg0 );
__stvx( sb0, m2, 0 );
__stvx( se0, m2, 16 );
__stvx( sh0, m2, 32 );
}
}
/*
============
idSIMD_Xenon::MultiplyJoints
============
*/
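// result[i] = joints1[i] * joints2[i] for every joint, using the same 3x4 concatenation as
// TransformJoints, unrolled two joints per iteration with a one-joint tail loop.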
void VPCALL idSIMD_Xenon::MultiplyJoints( idJointMat *__restrict result, const idJointMat *__restrict joints1, const idJointMat *__restrict joints2, const int numJoints ) {
idJointMat *__restrict end = result + numJoints;
assert_16_byte_aligned( result );
assert_16_byte_aligned( joints1 );
assert_16_byte_aligned( joints2 );
assert_sizeof_16_byte_multiple( idJointMat );
for ( ; result + 1 < end; result += 2, joints1 += 2, joints2 += 2 ) {
__vector4 m1a0 = __lvx( joints1, 0 );
__vector4 m1b0 = __lvx( joints1, 16 );
__vector4 m1c0 = __lvx( joints1, 32 );
__vector4 m1a1 = __lvx( joints1, 48+0 );
__vector4 m1b1 = __lvx( joints1, 48+16 );
__vector4 m1c1 = __lvx( joints1, 48+32 );
__vector4 m2a0 = __lvx( joints2, 0 );
__vector4 m2b0 = __lvx( joints2, 16 );
__vector4 m2c0 = __lvx( joints2, 32 );
__vector4 m2a1 = __lvx( joints2, 48+0 );
__vector4 m2b1 = __lvx( joints2, 48+16 );
__vector4 m2c1 = __lvx( joints2, 48+32 );
__vector4 ta0 = __vspltw( m1a0, 0 );
__vector4 tb0 = __vspltw( m1a0, 1 );
__vector4 tc0 = __vspltw( m1a0, 2 );
__vector4 td0 = __vspltw( m1b0, 0 );
__vector4 te0 = __vspltw( m1b0, 1 );
__vector4 tf0 = __vspltw( m1b0, 2 );
__vector4 tg0 = __vspltw( m1c0, 0 );
__vector4 th0 = __vspltw( m1c0, 1 );
__vector4 ti0 = __vspltw( m1c0, 2 );
__vector4 ta1 = __vspltw( m1a1, 0 );
__vector4 tb1 = __vspltw( m1a1, 1 );
__vector4 tc1 = __vspltw( m1a1, 2 );
__vector4 td1 = __vspltw( m1b1, 0 );
__vector4 te1 = __vspltw( m1b1, 1 );
__vector4 tf1 = __vspltw( m1b1, 2 );
__vector4 tg1 = __vspltw( m1c1, 0 );
__vector4 th1 = __vspltw( m1c1, 1 );
__vector4 ti1 = __vspltw( m1c1, 2 );
__vector4 tj0 = __vperm( m1a0, vmx_float_zero, vmx_dword_perm_matrix );
__vector4 tk0 = __vperm( m1b0, vmx_float_zero, vmx_dword_perm_matrix );
__vector4 tl0 = __vperm( m1c0, vmx_float_zero, vmx_dword_perm_matrix );
__vector4 tj1 = __vperm( m1a1, vmx_float_zero, vmx_dword_perm_matrix );
__vector4 tk1 = __vperm( m1b1, vmx_float_zero, vmx_dword_perm_matrix );
__vector4 tl1 = __vperm( m1c1, vmx_float_zero, vmx_dword_perm_matrix );
__vector4 ra0 = __vmulfp( ta0, m2a0 );
__vector4 rb0 = __vmulfp( tb0, m2b0 );
__vector4 rc0 = __vmulfp( tc0, m2c0 );
__vector4 rd0 = __vmulfp( td0, m2a0 );
__vector4 re0 = __vmulfp( te0, m2b0 );
__vector4 rf0 = __vmulfp( tf0, m2c0 );
__vector4 rg0 = __vmulfp( tg0, m2a0 );
__vector4 rh0 = __vmulfp( th0, m2b0 );
__vector4 ri0 = __vmulfp( ti0, m2c0 );
__vector4 sa0 = __vaddfp( ra0, rb0 );
__vector4 sb0 = __vaddfp( rc0, tj0 );
__vector4 sd0 = __vaddfp( rd0, re0 );
__vector4 se0 = __vaddfp( rf0, tk0 );
__vector4 sg0 = __vaddfp( rg0, rh0 );
__vector4 sh0 = __vaddfp( ri0, tl0 );
__vector4 ra1 = __vmulfp( ta1, m2a1 );
__vector4 rb1 = __vmulfp( tb1, m2b1 );
__vector4 rc1 = __vmulfp( tc1, m2c1 );
__vector4 rd1 = __vmulfp( td1, m2a1 );
__vector4 re1 = __vmulfp( te1, m2b1 );
__vector4 rf1 = __vmulfp( tf1, m2c1 );
__vector4 rg1 = __vmulfp( tg1, m2a1 );
__vector4 rh1 = __vmulfp( th1, m2b1 );
__vector4 ri1 = __vmulfp( ti1, m2c1 );
__vector4 sa1 = __vaddfp( ra1, rb1 );
__vector4 sb1 = __vaddfp( rc1, tj1 );
__vector4 sd1 = __vaddfp( rd1, re1 );
__vector4 se1 = __vaddfp( rf1, tk1 );
__vector4 sg1 = __vaddfp( rg1, rh1 );
__vector4 sh1 = __vaddfp( ri1, tl1 );
__vector4 sc0 = __vaddfp( sa0, sb0 );
__vector4 sf0 = __vaddfp( sd0, se0 );
__vector4 si0 = __vaddfp( sg0, sh0 );
__vector4 sc1 = __vaddfp( sa1, sb1 );
__vector4 sf1 = __vaddfp( sd1, se1 );
__vector4 si1 = __vaddfp( sg1, sh1 );
__stvx( sc0, result, 0 );
__stvx( sf0, result, 16 );
__stvx( si0, result, 32 );
__stvx( sc1, result, 48+0 );
__stvx( sf1, result, 48+16 );
__stvx( si1, result, 48+32 );
}
for ( ; result < end; result++, joints1++, joints2++ ) {
__vector4 m1a0 = __lvx( joints1, 0 );
__vector4 m1b0 = __lvx( joints1, 16 );
__vector4 m1c0 = __lvx( joints1, 32 );
__vector4 m2a0 = __lvx( joints2, 0 );
__vector4 m2b0 = __lvx( joints2, 16 );
__vector4 m2c0 = __lvx( joints2, 32 );
__vector4 ta0 = __vspltw( m1a0, 0 );
__vector4 tb0 = __vspltw( m1a0, 1 );
__vector4 tc0 = __vspltw( m1a0, 2 );
__vector4 td0 = __vspltw( m1b0, 0 );
__vector4 te0 = __vspltw( m1b0, 1 );
__vector4 tf0 = __vspltw( m1b0, 2 );
__vector4 tg0 = __vspltw( m1c0, 0 );
__vector4 th0 = __vspltw( m1c0, 1 );
__vector4 ti0 = __vspltw( m1c0, 2 );
__vector4 tj0 = __vperm( m1a0, vmx_float_zero, vmx_dword_perm_matrix );
__vector4 tk0 = __vperm( m1b0, vmx_float_zero, vmx_dword_perm_matrix );
__vector4 tl0 = __vperm( m1c0, vmx_float_zero, vmx_dword_perm_matrix );
__vector4 ra0 = __vmulfp( ta0, m2a0 );
__vector4 rb0 = __vmulfp( tb0, m2b0 );
__vector4 rc0 = __vmulfp( tc0, m2c0 );
__vector4 rd0 = __vmulfp( td0, m2a0 );
__vector4 re0 = __vmulfp( te0, m2b0 );
__vector4 rf0 = __vmulfp( tf0, m2c0 );
__vector4 rg0 = __vmulfp( tg0, m2a0 );
__vector4 rh0 = __vmulfp( th0, m2b0 );
__vector4 ri0 = __vmulfp( ti0, m2c0 );
__vector4 sa0 = __vaddfp( ra0, rb0 );
__vector4 sb0 = __vaddfp( rc0, tj0 );
__vector4 sc0 = __vaddfp( sa0, sb0 );
__vector4 sd0 = __vaddfp( rd0, re0 );
__vector4 se0 = __vaddfp( rf0, tk0 );
__vector4 sf0 = __vaddfp( sd0, se0 );
__vector4 sg0 = __vaddfp( rg0, rh0 );
__vector4 sh0 = __vaddfp( ri0, tl0 );
__vector4 si0 = __vaddfp( sg0, sh0 );
__stvx( sc0, result, 0 );
__stvx( sf0, result, 16 );
__stvx( si0, result, 32 );
}
}
/*
============
idSIMD_Xenon::TransformVerts
============
*/
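// Skins the vertex positions: for each vertex the weighted joint matrices are accumulated against the
// corresponding base vectors (presumably weight-scaled positions stored as idVec4), the three rows are
// then summed horizontally with __vmsum4fp against a vector of ones, and x/y/z are scattered into the
// drawvert with __stvewx. A weight with nextVertexOffset == JOINTWEIGHT_SIZE is the last one for its vertex.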
void VPCALL idSIMD_Xenon::TransformVerts( idDrawVert *__restrict verts, const int numVerts, const idJointMat *__restrict joints, const idVec4 *__restrict base, const jointWeight_t *__restrict weights, const int numWeights ) {
const byte *__restrict jointsPtr = (byte *)joints;
const idDrawVert *__restrict end = verts + numVerts;
assert_16_byte_aligned( joints );
assert_16_byte_aligned( base );
assert_sizeof_16_byte_multiple( idJointMat );
assert_sizeof_16_byte_multiple( idVec4 );
for( ; verts < end; verts++, weights++, base++ ) {
__vector4 *matrix = (__vector4 *) ( jointsPtr + weights->jointMatOffset );
__vector4 baseVector = *(__vector4 *) base;
__vector4 m0 = matrix[0];
__vector4 m1 = matrix[1];
__vector4 m2 = matrix[2];
__vector4 vx = __vmulfp( m0, baseVector );
__vector4 vy = __vmulfp( m1, baseVector );
__vector4 vz = __vmulfp( m2, baseVector );
while( weights->nextVertexOffset != JOINTWEIGHT_SIZE ) {
weights++;
base++;
__vector4 *matrix = (__vector4 *) ( jointsPtr + weights->jointMatOffset );
__vector4 baseVector = *(__vector4 *) base;
__vector4 m0 = matrix[0];
__vector4 m1 = matrix[1];
__vector4 m2 = matrix[2];
vx = __vmaddfp( m0, baseVector, vx );
vy = __vmaddfp( m1, baseVector, vy );
vz = __vmaddfp( m2, baseVector, vz );
}
float *result = verts->xyz.ToFloatPtr();
vx = __vmsum4fp( vx, vmx_float_one );
vy = __vmsum4fp( vy, vmx_float_one );
vz = __vmsum4fp( vz, vmx_float_one );
__stvewx( vx, result, 0 );
__stvewx( vy, result, 4 );
__stvewx( vz, result, 8 );
}
}
/*
============
idSIMD_Xenon::TransformShadowVerts
============
*/
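// Shadow-volume variant of TransformVerts: only the first weight listed for each vertex is used
// (weightsPtr jumps straight to the next vertex's weights), and the source position is taken from
// every third drawvert in 'base'. The w component of each position is forced to 1 so the matrix
// translation is picked up by the __vmsum4fp dot products.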
void VPCALL idSIMD_Xenon::TransformShadowVerts( idDrawVert *__restrict verts, const int numVerts, const idJointMat *__restrict joints, const idDrawVert *__restrict base, const jointWeight_t *__restrict weights, const int numWeights ) {
const byte *__restrict jointsPtr = (byte *)joints;
const byte *__restrict weightsPtr = (byte *)weights;
const idDrawVert *__restrict end = verts + numVerts;
assert_16_byte_aligned( joints );
assert_16_byte_aligned( base );
assert_sizeof_16_byte_multiple( idJointMat );
assert_sizeof_16_byte_multiple( idVec4 );
for( ; verts + 7 < end; verts += 8, base += 3*8 ) {
__vector4 *matrix0 = (__vector4 *) ( jointsPtr + ((jointWeight_t *)weightsPtr)->jointMatOffset );
weightsPtr += ((jointWeight_t *)weightsPtr)->nextVertexOffset;
__vector4 *matrix1 = (__vector4 *) ( jointsPtr + ((jointWeight_t *)weightsPtr)->jointMatOffset );
weightsPtr += ((jointWeight_t *)weightsPtr)->nextVertexOffset;
__vector4 *matrix2 = (__vector4 *) ( jointsPtr + ((jointWeight_t *)weightsPtr)->jointMatOffset );
weightsPtr += ((jointWeight_t *)weightsPtr)->nextVertexOffset;
__vector4 *matrix3 = (__vector4 *) ( jointsPtr + ((jointWeight_t *)weightsPtr)->jointMatOffset );
weightsPtr += ((jointWeight_t *)weightsPtr)->nextVertexOffset;
__vector4 *matrix4 = (__vector4 *) ( jointsPtr + ((jointWeight_t *)weightsPtr)->jointMatOffset );
weightsPtr += ((jointWeight_t *)weightsPtr)->nextVertexOffset;
__vector4 *matrix5 = (__vector4 *) ( jointsPtr + ((jointWeight_t *)weightsPtr)->jointMatOffset );
weightsPtr += ((jointWeight_t *)weightsPtr)->nextVertexOffset;
__vector4 *matrix6 = (__vector4 *) ( jointsPtr + ((jointWeight_t *)weightsPtr)->jointMatOffset );
weightsPtr += ((jointWeight_t *)weightsPtr)->nextVertexOffset;
__vector4 *matrix7 = (__vector4 *) ( jointsPtr + ((jointWeight_t *)weightsPtr)->jointMatOffset );
weightsPtr += ((jointWeight_t *)weightsPtr)->nextVertexOffset;
float *result = verts->xyz.ToFloatPtr();
__vector4 mx0 = matrix0[0];
__vector4 my0 = matrix0[1];
__vector4 mz0 = matrix0[2];
__vector4 mx1 = matrix1[0];
__vector4 my1 = matrix1[1];
__vector4 mz1 = matrix1[2];
__vector4 mx2 = matrix2[0];
__vector4 my2 = matrix2[1];
__vector4 mz2 = matrix2[2];
__vector4 mx3 = matrix3[0];
__vector4 my3 = matrix3[1];
__vector4 mz3 = matrix3[2];
__vector4 mx4 = matrix4[0];
__vector4 my4 = matrix4[1];
__vector4 mz4 = matrix4[2];
__vector4 mx5 = matrix5[0];
__vector4 my5 = matrix5[1];
__vector4 mz5 = matrix5[2];
__vector4 mx6 = matrix6[0];
__vector4 my6 = matrix6[1];
__vector4 mz6 = matrix6[2];
__vector4 mx7 = matrix7[0];
__vector4 my7 = matrix7[1];
__vector4 mz7 = matrix7[2];
__vector4 b0 = (__vector4 &) base[0*3].xyz;
__vector4 b1 = (__vector4 &) base[1*3].xyz;
__vector4 b2 = (__vector4 &) base[2*3].xyz;
__vector4 b3 = (__vector4 &) base[3*3].xyz;
__vector4 b4 = (__vector4 &) base[4*3].xyz;
__vector4 b5 = (__vector4 &) base[5*3].xyz;
__vector4 b6 = (__vector4 &) base[6*3].xyz;
__vector4 b7 = (__vector4 &) base[7*3].xyz;
b0 = __vand( b0, vmx_dword_mask_clear_last );
b1 = __vand( b1, vmx_dword_mask_clear_last );
b2 = __vand( b2, vmx_dword_mask_clear_last );
b3 = __vand( b3, vmx_dword_mask_clear_last );
b4 = __vand( b4, vmx_dword_mask_clear_last );
b5 = __vand( b5, vmx_dword_mask_clear_last );
b6 = __vand( b6, vmx_dword_mask_clear_last );
b7 = __vand( b7, vmx_dword_mask_clear_last );
b0 = __vor( b0, vmx_float_last_one );
b1 = __vor( b1, vmx_float_last_one );
b2 = __vor( b2, vmx_float_last_one );
b3 = __vor( b3, vmx_float_last_one );
b4 = __vor( b4, vmx_float_last_one );
b5 = __vor( b5, vmx_float_last_one );
b6 = __vor( b6, vmx_float_last_one );
b7 = __vor( b7, vmx_float_last_one );
__vector4 vx0 = __vmsum4fp( mx0, b0 );
__vector4 vy0 = __vmsum4fp( my0, b0 );
__vector4 vz0 = __vmsum4fp( mz0, b0 );
__vector4 vx1 = __vmsum4fp( mx1, b1 );
__vector4 vy1 = __vmsum4fp( my1, b1 );
__vector4 vz1 = __vmsum4fp( mz1, b1 );
__vector4 vx2 = __vmsum4fp( mx2, b2 );
__vector4 vy2 = __vmsum4fp( my2, b2 );
__vector4 vz2 = __vmsum4fp( mz2, b2 );
__vector4 vx3 = __vmsum4fp( mx3, b3 );
__vector4 vy3 = __vmsum4fp( my3, b3 );
__vector4 vz3 = __vmsum4fp( mz3, b3 );
__stvewx( vx0, result, 0*DRAWVERT_SIZE+0 );
__stvewx( vy0, result, 0*DRAWVERT_SIZE+4 );
__stvewx( vz0, result, 0*DRAWVERT_SIZE+8 );
__stvewx( vx1, result, 1*DRAWVERT_SIZE+0 );
__stvewx( vy1, result, 1*DRAWVERT_SIZE+4 );
__stvewx( vz1, result, 1*DRAWVERT_SIZE+8 );
__stvewx( vx2, result, 2*DRAWVERT_SIZE+0 );
__stvewx( vy2, result, 2*DRAWVERT_SIZE+4 );
__stvewx( vz2, result, 2*DRAWVERT_SIZE+8 );
__stvewx( vx3, result, 3*DRAWVERT_SIZE+0 );
__stvewx( vy3, result, 3*DRAWVERT_SIZE+4 );
__stvewx( vz3, result, 3*DRAWVERT_SIZE+8 );
__vector4 vx4 = __vmsum4fp( mx4, b4 );
__vector4 vy4 = __vmsum4fp( my4, b4 );
__vector4 vz4 = __vmsum4fp( mz4, b4 );
__vector4 vx5 = __vmsum4fp( mx5, b5 );
__vector4 vy5 = __vmsum4fp( my5, b5 );
__vector4 vz5 = __vmsum4fp( mz5, b5 );
__vector4 vx6 = __vmsum4fp( mx6, b6 );
__vector4 vy6 = __vmsum4fp( my6, b6 );
__vector4 vz6 = __vmsum4fp( mz6, b6 );
__vector4 vx7 = __vmsum4fp( mx7, b7 );
__vector4 vy7 = __vmsum4fp( my7, b7 );
__vector4 vz7 = __vmsum4fp( mz7, b7 );
__stvewx( vx4, result, 4*DRAWVERT_SIZE+0 );
__stvewx( vy4, result, 4*DRAWVERT_SIZE+4 );
__stvewx( vz4, result, 4*DRAWVERT_SIZE+8 );
__stvewx( vx5, result, 5*DRAWVERT_SIZE+0 );
__stvewx( vy5, result, 5*DRAWVERT_SIZE+4 );
__stvewx( vz5, result, 5*DRAWVERT_SIZE+8 );
__stvewx( vx6, result, 6*DRAWVERT_SIZE+0 );
__stvewx( vy6, result, 6*DRAWVERT_SIZE+4 );
__stvewx( vz6, result, 6*DRAWVERT_SIZE+8 );
__stvewx( vx7, result, 7*DRAWVERT_SIZE+0 );
__stvewx( vy7, result, 7*DRAWVERT_SIZE+4 );
__stvewx( vz7, result, 7*DRAWVERT_SIZE+8 );
}
for( ; verts < end; verts++, base += 3 ) {
__vector4 *matrix = (__vector4 *) ( jointsPtr + ((jointWeight_t *)weightsPtr)->jointMatOffset );
		__vector4 baseVector = *(__vector4 *) base;
		// clear the word after xyz and force w to 1 so the translation row of the
		// matrix contributes exactly once, matching the unrolled loop above
		baseVector = __vand( baseVector, vmx_dword_mask_clear_last );
		baseVector = __vor( baseVector, vmx_float_last_one );
weightsPtr += ((jointWeight_t *)weightsPtr)->nextVertexOffset;
float *result = verts->xyz.ToFloatPtr();
__vector4 mx = matrix[0];
__vector4 my = matrix[1];
__vector4 mz = matrix[2];
__vector4 vx = __vmsum4fp( mx, baseVector );
__vector4 vy = __vmsum4fp( my, baseVector );
__vector4 vz = __vmsum4fp( mz, baseVector );
__stvewx( vx, result, 0 );
__stvewx( vy, result, 4 );
__stvewx( vz, result, 8 );
}
}
/*
============
idSIMD_Xenon::TracePointCull
============
*/
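// Computes per-vertex cull bits against the four trace planes. Going by the scalar tail below:
// after the final flip, bits 0-3 are set when the point is no further than 'radius' behind planes 0-3,
// and bits 4-7 are set when it is less than 'radius' in front of them; totalOr accumulates the bits
// of all vertices.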
void VPCALL idSIMD_Xenon::TracePointCull( byte *__restrict cullBits, byte &totalOr, const float radius, const idPlane *__restrict planes, const idDrawVert *__restrict verts, const int numVerts ) {
int i;
byte tOr;
tOr = 0;
__vector4 px = { planes[0][0], planes[1][0], planes[2][0], planes[3][0] };
__vector4 py = { planes[0][1], planes[1][1], planes[2][1], planes[3][1] };
__vector4 pz = { planes[0][2], planes[1][2], planes[2][2], planes[3][2] };
__vector4 pw = { planes[0][3], planes[1][3], planes[2][3], planes[3][3] };
__vector4 vradius = { radius, radius, radius, radius };
__vector4 vOr = vmx_byte_zero;
for ( i = 0; i < numVerts - 7; i += 8 ) {
__vector4 va_0 = *(__vector4 *)( &verts[i+0].xyz );
__vector4 vb_0 = *(__vector4 *)( &verts[i+1].xyz );
__vector4 vc_0 = *(__vector4 *)( &verts[i+2].xyz );
__vector4 vd_0 = *(__vector4 *)( &verts[i+3].xyz );
__vector4 va_1 = *(__vector4 *)( &verts[i+4].xyz );
__vector4 vb_1 = *(__vector4 *)( &verts[i+5].xyz );
__vector4 vc_1 = *(__vector4 *)( &verts[i+6].xyz );
__vector4 vd_1 = *(__vector4 *)( &verts[i+7].xyz );
__vector4 va0_0 = __vpermwi( va_0, SHUFFLE_D( 0, 0, 0, 0 ) );
__vector4 va1_0 = __vpermwi( va_0, SHUFFLE_D( 1, 1, 1, 1 ) );
__vector4 va2_0 = __vpermwi( va_0, SHUFFLE_D( 2, 2, 2, 2 ) );
__vector4 va0_1 = __vpermwi( va_1, SHUFFLE_D( 0, 0, 0, 0 ) );
__vector4 va1_1 = __vpermwi( va_1, SHUFFLE_D( 1, 1, 1, 1 ) );
__vector4 va2_1 = __vpermwi( va_1, SHUFFLE_D( 2, 2, 2, 2 ) );
__vector4 vb0_0 = __vpermwi( vb_0, SHUFFLE_D( 0, 0, 0, 0 ) );
__vector4 vb1_0 = __vpermwi( vb_0, SHUFFLE_D( 1, 1, 1, 1 ) );
__vector4 vb2_0 = __vpermwi( vb_0, SHUFFLE_D( 2, 2, 2, 2 ) );
__vector4 vb0_1 = __vpermwi( vb_1, SHUFFLE_D( 0, 0, 0, 0 ) );
__vector4 vb1_1 = __vpermwi( vb_1, SHUFFLE_D( 1, 1, 1, 1 ) );
__vector4 vb2_1 = __vpermwi( vb_1, SHUFFLE_D( 2, 2, 2, 2 ) );
__vector4 vc0_0 = __vpermwi( vc_0, SHUFFLE_D( 0, 0, 0, 0 ) );
__vector4 vc1_0 = __vpermwi( vc_0, SHUFFLE_D( 1, 1, 1, 1 ) );
__vector4 vc2_0 = __vpermwi( vc_0, SHUFFLE_D( 2, 2, 2, 2 ) );
__vector4 vc0_1 = __vpermwi( vc_1, SHUFFLE_D( 0, 0, 0, 0 ) );
__vector4 vc1_1 = __vpermwi( vc_1, SHUFFLE_D( 1, 1, 1, 1 ) );
__vector4 vc2_1 = __vpermwi( vc_1, SHUFFLE_D( 2, 2, 2, 2 ) );
__vector4 vd0_0 = __vpermwi( vd_0, SHUFFLE_D( 0, 0, 0, 0 ) );
__vector4 vd1_0 = __vpermwi( vd_0, SHUFFLE_D( 1, 1, 1, 1 ) );
__vector4 vd2_0 = __vpermwi( vd_0, SHUFFLE_D( 2, 2, 2, 2 ) );
__vector4 vd0_1 = __vpermwi( vd_1, SHUFFLE_D( 0, 0, 0, 0 ) );
__vector4 vd1_1 = __vpermwi( vd_1, SHUFFLE_D( 1, 1, 1, 1 ) );
__vector4 vd2_1 = __vpermwi( vd_1, SHUFFLE_D( 2, 2, 2, 2 ) );
__vector4 da_0 = __vmaddfp( va0_0, px, pw );
__vector4 db_0 = __vmaddfp( vb0_0, px, pw );
__vector4 dc_0 = __vmaddfp( vc0_0, px, pw );
__vector4 dd_0 = __vmaddfp( vd0_0, px, pw );
__vector4 da_1 = __vmaddfp( va0_1, px, pw );
__vector4 db_1 = __vmaddfp( vb0_1, px, pw );
__vector4 dc_1 = __vmaddfp( vc0_1, px, pw );
__vector4 dd_1 = __vmaddfp( vd0_1, px, pw );
da_0 = __vmaddfp( va1_0, py, da_0 );
db_0 = __vmaddfp( vb1_0, py, db_0 );
dc_0 = __vmaddfp( vc1_0, py, dc_0 );
dd_0 = __vmaddfp( vd1_0, py, dd_0 );
da_1 = __vmaddfp( va1_1, py, da_1 );
db_1 = __vmaddfp( vb1_1, py, db_1 );
dc_1 = __vmaddfp( vc1_1, py, dc_1 );
dd_1 = __vmaddfp( vd1_1, py, dd_1 );
da_0 = __vmaddfp( va2_0, pz, da_0 );
db_0 = __vmaddfp( vb2_0, pz, db_0 );
dc_0 = __vmaddfp( vc2_0, pz, dc_0 );
dd_0 = __vmaddfp( vd2_0, pz, dd_0 );
da_1 = __vmaddfp( va2_1, pz, da_1 );
db_1 = __vmaddfp( vb2_1, pz, db_1 );
dc_1 = __vmaddfp( vc2_1, pz, dc_1 );
dd_1 = __vmaddfp( vd2_1, pz, dd_1 );
__vector4 ta_0 = __vaddfp( da_0, vradius );
__vector4 tb_0 = __vaddfp( db_0, vradius );
__vector4 tc_0 = __vaddfp( dc_0, vradius );
__vector4 td_0 = __vaddfp( dd_0, vradius );
__vector4 ta_1 = __vaddfp( da_1, vradius );
__vector4 tb_1 = __vaddfp( db_1, vradius );
__vector4 tc_1 = __vaddfp( dc_1, vradius );
__vector4 td_1 = __vaddfp( dd_1, vradius );
__vector4 sa_0 = __vsubfp( da_0, vradius );
__vector4 sb_0 = __vsubfp( db_0, vradius );
__vector4 sc_0 = __vsubfp( dc_0, vradius );
__vector4 sd_0 = __vsubfp( dd_0, vradius );
__vector4 sa_1 = __vsubfp( da_1, vradius );
__vector4 sb_1 = __vsubfp( db_1, vradius );
__vector4 sc_1 = __vsubfp( dc_1, vradius );
__vector4 sd_1 = __vsubfp( dd_1, vradius );
ta_0 = __vcmpgtfp( ta_0, vmx_float_zero );
tb_0 = __vcmpgtfp( tb_0, vmx_float_zero );
tc_0 = __vcmpgtfp( tc_0, vmx_float_zero );
td_0 = __vcmpgtfp( td_0, vmx_float_zero );
ta_1 = __vcmpgtfp( ta_1, vmx_float_zero );
tb_1 = __vcmpgtfp( tb_1, vmx_float_zero );
tc_1 = __vcmpgtfp( tc_1, vmx_float_zero );
td_1 = __vcmpgtfp( td_1, vmx_float_zero );
sa_0 = __vcmpgtfp( sa_0, vmx_float_zero );
sb_0 = __vcmpgtfp( sb_0, vmx_float_zero );
sc_0 = __vcmpgtfp( sc_0, vmx_float_zero );
sd_0 = __vcmpgtfp( sd_0, vmx_float_zero );
sa_1 = __vcmpgtfp( sa_1, vmx_float_zero );
sb_1 = __vcmpgtfp( sb_1, vmx_float_zero );
sc_1 = __vcmpgtfp( sc_1, vmx_float_zero );
sd_1 = __vcmpgtfp( sd_1, vmx_float_zero );
ta_0 = __vand( ta_0, vmx_dword_trace_mask0 );
tb_0 = __vand( tb_0, vmx_dword_trace_mask1 );
tc_0 = __vand( tc_0, vmx_dword_trace_mask2 );
td_0 = __vand( td_0, vmx_dword_trace_mask3 );
ta_1 = __vand( ta_1, vmx_dword_trace_mask0 );
tb_1 = __vand( tb_1, vmx_dword_trace_mask1 );
tc_1 = __vand( tc_1, vmx_dword_trace_mask2 );
td_1 = __vand( td_1, vmx_dword_trace_mask3 );
sa_0 = __vand( sa_0, vmx_dword_trace_mask4 );
sb_0 = __vand( sb_0, vmx_dword_trace_mask5 );
sc_0 = __vand( sc_0, vmx_dword_trace_mask6 );
sd_0 = __vand( sd_0, vmx_dword_trace_mask7 );
sa_1 = __vand( sa_1, vmx_dword_trace_mask4 );
sb_1 = __vand( sb_1, vmx_dword_trace_mask5 );
sc_1 = __vand( sc_1, vmx_dword_trace_mask6 );
sd_1 = __vand( sd_1, vmx_dword_trace_mask7 );
ta_0 = __vor( ta_0, sa_0 );
tb_0 = __vor( tb_0, sb_0 );
tc_0 = __vor( tc_0, sc_0 );
td_0 = __vor( td_0, sd_0 );
ta_1 = __vor( ta_1, sa_1 );
tb_1 = __vor( tb_1, sb_1 );
tc_1 = __vor( tc_1, sc_1 );
td_1 = __vor( td_1, sd_1 );
ta_0 = __vor( ta_0, tb_0 );
tc_0 = __vor( tc_0, td_0 );
ta_1 = __vor( ta_1, tb_1 );
tc_1 = __vor( tc_1, td_1 );
ta_0 = __vor( ta_0, tc_0 );
ta_1 = __vor( ta_1, tc_1 );
__vector4 bits0_0 = __vpermwi( ta_0, SHUFFLE_D( 0, 0, 0, 0 ) );
__vector4 bits1_0 = __vpermwi( ta_0, SHUFFLE_D( 1, 1, 1, 1 ) );
__vector4 bits2_0 = __vpermwi( ta_0, SHUFFLE_D( 2, 2, 2, 2 ) );
__vector4 bits3_0 = __vpermwi( ta_0, SHUFFLE_D( 3, 3, 3, 3 ) );
__vector4 bits0_1 = __vpermwi( ta_1, SHUFFLE_D( 0, 0, 0, 0 ) );
__vector4 bits1_1 = __vpermwi( ta_1, SHUFFLE_D( 1, 1, 1, 1 ) );
__vector4 bits2_1 = __vpermwi( ta_1, SHUFFLE_D( 2, 2, 2, 2 ) );
__vector4 bits3_1 = __vpermwi( ta_1, SHUFFLE_D( 3, 3, 3, 3 ) );
bits0_0 = __vor( bits0_0, bits1_0 );
bits2_0 = __vor( bits2_0, bits3_0 );
bits0_0 = __vor( bits0_0, bits2_0 );
bits0_1 = __vor( bits0_1, bits1_1 );
bits2_1 = __vor( bits2_1, bits3_1 );
bits0_1 = __vor( bits0_1, bits2_1 );
bits0_0 = __vxor( bits0_0, vmx_dword_trace_xor );
bits0_1 = __vxor( bits0_1, vmx_dword_trace_xor );
vOr = __vor( vOr, bits0_0 );
vOr = __vor( vOr, bits0_1 );
bits0_0 = __vperm( bits0_0, bits0_0, vmx_dword_overlay_perm );
bits0_1 = __vperm( bits0_1, bits0_1, vmx_dword_overlay_perm );
__stvewx( bits0_0, cullBits, i );
__stvewx( bits0_1, cullBits, i+4 );
}
for ( ; i < numVerts - 3; i += 4 ) {
__vector4 va_0 = *(__vector4 *)( &verts[i+0].xyz );
__vector4 vb_0 = *(__vector4 *)( &verts[i+1].xyz );
__vector4 vc_0 = *(__vector4 *)( &verts[i+2].xyz );
__vector4 vd_0 = *(__vector4 *)( &verts[i+3].xyz );
__vector4 va0_0 = __vpermwi( va_0, SHUFFLE_D( 0, 0, 0, 0 ) );
__vector4 va1_0 = __vpermwi( va_0, SHUFFLE_D( 1, 1, 1, 1 ) );
__vector4 va2_0 = __vpermwi( va_0, SHUFFLE_D( 2, 2, 2, 2 ) );
__vector4 vb0_0 = __vpermwi( vb_0, SHUFFLE_D( 0, 0, 0, 0 ) );
__vector4 vb1_0 = __vpermwi( vb_0, SHUFFLE_D( 1, 1, 1, 1 ) );
__vector4 vb2_0 = __vpermwi( vb_0, SHUFFLE_D( 2, 2, 2, 2 ) );
__vector4 vc0_0 = __vpermwi( vc_0, SHUFFLE_D( 0, 0, 0, 0 ) );
__vector4 vc1_0 = __vpermwi( vc_0, SHUFFLE_D( 1, 1, 1, 1 ) );
__vector4 vc2_0 = __vpermwi( vc_0, SHUFFLE_D( 2, 2, 2, 2 ) );
__vector4 vd0_0 = __vpermwi( vd_0, SHUFFLE_D( 0, 0, 0, 0 ) );
__vector4 vd1_0 = __vpermwi( vd_0, SHUFFLE_D( 1, 1, 1, 1 ) );
__vector4 vd2_0 = __vpermwi( vd_0, SHUFFLE_D( 2, 2, 2, 2 ) );
__vector4 da_0 = __vmaddfp( va0_0, px, pw );
__vector4 db_0 = __vmaddfp( vb0_0, px, pw );
__vector4 dc_0 = __vmaddfp( vc0_0, px, pw );
__vector4 dd_0 = __vmaddfp( vd0_0, px, pw );
da_0 = __vmaddfp( va1_0, py, da_0 );
db_0 = __vmaddfp( vb1_0, py, db_0 );
dc_0 = __vmaddfp( vc1_0, py, dc_0 );
dd_0 = __vmaddfp( vd1_0, py, dd_0 );
da_0 = __vmaddfp( va2_0, pz, da_0 );
db_0 = __vmaddfp( vb2_0, pz, db_0 );
dc_0 = __vmaddfp( vc2_0, pz, dc_0 );
dd_0 = __vmaddfp( vd2_0, pz, dd_0 );
__vector4 ta_0 = __vaddfp( da_0, vradius );
__vector4 tb_0 = __vaddfp( db_0, vradius );
__vector4 tc_0 = __vaddfp( dc_0, vradius );
__vector4 td_0 = __vaddfp( dd_0, vradius );
__vector4 sa_0 = __vsubfp( da_0, vradius );
__vector4 sb_0 = __vsubfp( db_0, vradius );
__vector4 sc_0 = __vsubfp( dc_0, vradius );
__vector4 sd_0 = __vsubfp( dd_0, vradius );
ta_0 = __vcmpgtfp( ta_0, vmx_float_zero );
tb_0 = __vcmpgtfp( tb_0, vmx_float_zero );
tc_0 = __vcmpgtfp( tc_0, vmx_float_zero );
td_0 = __vcmpgtfp( td_0, vmx_float_zero );
sa_0 = __vcmpgtfp( sa_0, vmx_float_zero );
sb_0 = __vcmpgtfp( sb_0, vmx_float_zero );
sc_0 = __vcmpgtfp( sc_0, vmx_float_zero );
sd_0 = __vcmpgtfp( sd_0, vmx_float_zero );
ta_0 = __vand( ta_0, vmx_dword_trace_mask0 );
tb_0 = __vand( tb_0, vmx_dword_trace_mask1 );
tc_0 = __vand( tc_0, vmx_dword_trace_mask2 );
td_0 = __vand( td_0, vmx_dword_trace_mask3 );
sa_0 = __vand( sa_0, vmx_dword_trace_mask4 );
sb_0 = __vand( sb_0, vmx_dword_trace_mask5 );
sc_0 = __vand( sc_0, vmx_dword_trace_mask6 );
sd_0 = __vand( sd_0, vmx_dword_trace_mask7 );
ta_0 = __vor( ta_0, sa_0 );
tb_0 = __vor( tb_0, sb_0 );
tc_0 = __vor( tc_0, sc_0 );
td_0 = __vor( td_0, sd_0 );
ta_0 = __vor( ta_0, tb_0 );
tc_0 = __vor( tc_0, td_0 );
ta_0 = __vor( ta_0, tc_0 );
__vector4 bits0_0 = __vpermwi( ta_0, SHUFFLE_D( 0, 0, 0, 0 ) );
__vector4 bits1_0 = __vpermwi( ta_0, SHUFFLE_D( 1, 1, 1, 1 ) );
__vector4 bits2_0 = __vpermwi( ta_0, SHUFFLE_D( 2, 2, 2, 2 ) );
__vector4 bits3_0 = __vpermwi( ta_0, SHUFFLE_D( 3, 3, 3, 3 ) );
bits0_0 = __vor( bits0_0, bits1_0 );
bits2_0 = __vor( bits2_0, bits3_0 );
bits0_0 = __vor( bits0_0, bits2_0 );
bits0_0 = __vxor( bits0_0, vmx_dword_trace_xor );
vOr = __vor( vOr, bits0_0 );
bits0_0 = __vperm( bits0_0, bits0_0, vmx_dword_overlay_perm );
__stvewx( bits0_0, cullBits, i );
}
for ( ; i < numVerts - 1; i += 2 ) {
byte bits0, bits1;
const idVec3 &v0 = verts[i+0].xyz;
const idVec3 &v1 = verts[i+1].xyz;
float d0 = planes[0][0] * v0[0] + planes[0][1] * v0[1] + planes[0][2] * v0[2] + planes[0][3];
float d1 = planes[1][0] * v0[0] + planes[1][1] * v0[1] + planes[1][2] * v0[2] + planes[1][3];
float d2 = planes[2][0] * v0[0] + planes[2][1] * v0[1] + planes[2][2] * v0[2] + planes[2][3];
float d3 = planes[3][0] * v0[0] + planes[3][1] * v0[1] + planes[3][2] * v0[2] + planes[3][3];
float d4 = planes[0][0] * v1[0] + planes[0][1] * v1[1] + planes[0][2] * v1[2] + planes[0][3];
float d5 = planes[1][0] * v1[0] + planes[1][1] * v1[1] + planes[1][2] * v1[2] + planes[1][3];
float d6 = planes[2][0] * v1[0] + planes[2][1] * v1[1] + planes[2][2] * v1[2] + planes[2][3];
float d7 = planes[3][0] * v1[0] + planes[3][1] * v1[1] + planes[3][2] * v1[2] + planes[3][3];
float t0 = d0 + radius;
float t1 = d1 + radius;
float t2 = d2 + radius;
float t3 = d3 + radius;
float s0 = d0 - radius;
float s1 = d1 - radius;
float s2 = d2 - radius;
float s3 = d3 - radius;
float t4 = d4 + radius;
float t5 = d5 + radius;
float t6 = d6 + radius;
float t7 = d7 + radius;
float s4 = d4 - radius;
float s5 = d5 - radius;
float s6 = d6 - radius;
float s7 = d7 - radius;
bits0 = FLOATSIGNBITSET( t0 ) << 0;
bits0 |= FLOATSIGNBITSET( t1 ) << 1;
bits0 |= FLOATSIGNBITSET( t2 ) << 2;
bits0 |= FLOATSIGNBITSET( t3 ) << 3;
bits0 |= FLOATSIGNBITSET( s0 ) << 4;
		bits0 |= FLOATSIGNBITSET( s1 ) << 5;
		bits0 |= FLOATSIGNBITSET( s2 ) << 6;
		bits0 |= FLOATSIGNBITSET( s3 ) << 7;
bits1 = FLOATSIGNBITSET( t4 ) << 0;
bits1 |= FLOATSIGNBITSET( t5 ) << 1;
bits1 |= FLOATSIGNBITSET( t6 ) << 2;
bits1 |= FLOATSIGNBITSET( t7 ) << 3;
bits1 |= FLOATSIGNBITSET( s4 ) << 4;
		bits1 |= FLOATSIGNBITSET( s5 ) << 5;
		bits1 |= FLOATSIGNBITSET( s6 ) << 6;
		bits1 |= FLOATSIGNBITSET( s7 ) << 7;
bits0 ^= 0x0F; // flip lower four bits
bits1 ^= 0x0F; // flip lower four bits
tOr |= bits0;
tOr |= bits1;
cullBits[i+0] = bits0;
cullBits[i+1] = bits1;
}
for ( ; i < numVerts; i++ ) {
byte bits;
const idVec3 &v = verts[i].xyz;
float d0 = planes[0][0] * v[0] + planes[0][1] * v[1] + planes[0][2] * v[2] + planes[0][3];
float d1 = planes[1][0] * v[0] + planes[1][1] * v[1] + planes[1][2] * v[2] + planes[1][3];
float d2 = planes[2][0] * v[0] + planes[2][1] * v[1] + planes[2][2] * v[2] + planes[2][3];
float d3 = planes[3][0] * v[0] + planes[3][1] * v[1] + planes[3][2] * v[2] + planes[3][3];
float t0 = d0 + radius;
float t1 = d1 + radius;
float t2 = d2 + radius;
float t3 = d3 + radius;
float t4 = d0 - radius;
float t5 = d1 - radius;
float t6 = d2 - radius;
float t7 = d3 - radius;
bits = FLOATSIGNBITSET( t0 ) << 0;
bits |= FLOATSIGNBITSET( t1 ) << 1;
bits |= FLOATSIGNBITSET( t2 ) << 2;
bits |= FLOATSIGNBITSET( t3 ) << 3;
bits |= FLOATSIGNBITSET( t4 ) << 4;
bits |= FLOATSIGNBITSET( t5 ) << 5;
bits |= FLOATSIGNBITSET( t6 ) << 6;
bits |= FLOATSIGNBITSET( t7 ) << 7;
bits ^= 0x0F; // flip lower four bits
tOr |= bits;
cullBits[i] = bits;
}
byte *ptr = (byte *)&vOr;
totalOr = tOr | ptr[0] | ptr[1] | ptr[2] | ptr[3];
}
/*
============
idSIMD_Xenon::DecalPointCull
============
*/
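// Packs one bit per decal boundary plane (six planes) into each output byte; a set bit means the
// vertex lies on the front side of that plane, so a byte equal to 0x3F marks a vertex inside the
// decal volume.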
void VPCALL idSIMD_Xenon::DecalPointCull( byte *__restrict cullBits, const idPlane *__restrict planes, const idDrawVert *__restrict verts, const int numVerts ) {
int i;
__vector4 p0x = { planes[0][0], planes[1][0], planes[2][0], planes[3][0] };
__vector4 p0y = { planes[0][1], planes[1][1], planes[2][1], planes[3][1] };
__vector4 p0z = { planes[0][2], planes[1][2], planes[2][2], planes[3][2] };
__vector4 p0w = { planes[0][3], planes[1][3], planes[2][3], planes[3][3] };
__vector4 p1x = { planes[4][0], planes[5][0], planes[4][0], planes[5][0] };
__vector4 p1y = { planes[4][1], planes[5][1], planes[4][1], planes[5][1] };
__vector4 p1z = { planes[4][2], planes[5][2], planes[4][2], planes[5][2] };
__vector4 p1w = { planes[4][3], planes[5][3], planes[4][3], planes[5][3] };
for ( i = 0; i < numVerts - 7; i += 8 ) {
__vector4 va_0 = *(__vector4 *)( &verts[i+0].xyz );
__vector4 vb_0 = *(__vector4 *)( &verts[i+1].xyz );
__vector4 vc_0 = *(__vector4 *)( &verts[i+2].xyz );
__vector4 vd_0 = *(__vector4 *)( &verts[i+3].xyz );
__vector4 va_1 = *(__vector4 *)( &verts[i+4].xyz );
__vector4 vb_1 = *(__vector4 *)( &verts[i+5].xyz );
__vector4 vc_1 = *(__vector4 *)( &verts[i+6].xyz );
__vector4 vd_1 = *(__vector4 *)( &verts[i+7].xyz );
__vector4 ta_0 = __vmrghw( va_0, vb_0 );
__vector4 tb_0 = __vmrghw( vc_0, vd_0 );
__vector4 tc_0 = __vmrglw( va_0, vb_0 );
__vector4 td_0 = __vmrglw( vc_0, vd_0 );
__vector4 ta_1 = __vmrghw( va_1, vb_1 );
__vector4 tb_1 = __vmrghw( vc_1, vd_1 );
__vector4 tc_1 = __vmrglw( va_1, vb_1 );
__vector4 td_1 = __vmrglw( vc_1, vd_1 );
__vector4 va0_0 = __vpermwi( va_0, SHUFFLE_D( 0, 0, 0, 0 ) );
__vector4 va1_0 = __vpermwi( va_0, SHUFFLE_D( 1, 1, 1, 1 ) );
__vector4 va2_0 = __vpermwi( va_0, SHUFFLE_D( 2, 2, 2, 2 ) );
__vector4 va0_1 = __vpermwi( va_1, SHUFFLE_D( 0, 0, 0, 0 ) );
__vector4 va1_1 = __vpermwi( va_1, SHUFFLE_D( 1, 1, 1, 1 ) );
__vector4 va2_1 = __vpermwi( va_1, SHUFFLE_D( 2, 2, 2, 2 ) );
__vector4 vb0_0 = __vpermwi( vb_0, SHUFFLE_D( 0, 0, 0, 0 ) );
__vector4 vb1_0 = __vpermwi( vb_0, SHUFFLE_D( 1, 1, 1, 1 ) );
__vector4 vb2_0 = __vpermwi( vb_0, SHUFFLE_D( 2, 2, 2, 2 ) );
__vector4 vb0_1 = __vpermwi( vb_1, SHUFFLE_D( 0, 0, 0, 0 ) );
__vector4 vb1_1 = __vpermwi( vb_1, SHUFFLE_D( 1, 1, 1, 1 ) );
__vector4 vb2_1 = __vpermwi( vb_1, SHUFFLE_D( 2, 2, 2, 2 ) );
__vector4 vc0_0 = __vpermwi( vc_0, SHUFFLE_D( 0, 0, 0, 0 ) );
__vector4 vc1_0 = __vpermwi( vc_0, SHUFFLE_D( 1, 1, 1, 1 ) );
__vector4 vc2_0 = __vpermwi( vc_0, SHUFFLE_D( 2, 2, 2, 2 ) );
__vector4 vc0_1 = __vpermwi( vc_1, SHUFFLE_D( 0, 0, 0, 0 ) );
__vector4 vc1_1 = __vpermwi( vc_1, SHUFFLE_D( 1, 1, 1, 1 ) );
__vector4 vc2_1 = __vpermwi( vc_1, SHUFFLE_D( 2, 2, 2, 2 ) );
__vector4 vd0_0 = __vpermwi( vd_0, SHUFFLE_D( 0, 0, 0, 0 ) );
__vector4 vd1_0 = __vpermwi( vd_0, SHUFFLE_D( 1, 1, 1, 1 ) );
__vector4 vd2_0 = __vpermwi( vd_0, SHUFFLE_D( 2, 2, 2, 2 ) );
__vector4 vd0_1 = __vpermwi( vd_1, SHUFFLE_D( 0, 0, 0, 0 ) );
__vector4 vd1_1 = __vpermwi( vd_1, SHUFFLE_D( 1, 1, 1, 1 ) );
__vector4 vd2_1 = __vpermwi( vd_1, SHUFFLE_D( 2, 2, 2, 2 ) );
__vector4 ve0_0 = __vpermwi( ta_0, SHUFFLE_D( 0, 0, 1, 1 ) );
__vector4 ve1_0 = __vpermwi( ta_0, SHUFFLE_D( 2, 2, 3, 3 ) );
__vector4 ve2_0 = __vpermwi( tc_0, SHUFFLE_D( 0, 0, 1, 1 ) );
__vector4 ve0_1 = __vpermwi( ta_1, SHUFFLE_D( 0, 0, 1, 1 ) );
__vector4 ve1_1 = __vpermwi( ta_1, SHUFFLE_D( 2, 2, 3, 3 ) );
__vector4 ve2_1 = __vpermwi( tc_1, SHUFFLE_D( 0, 0, 1, 1 ) );
__vector4 vf0_0 = __vpermwi( tb_0, SHUFFLE_D( 0, 0, 1, 1 ) );
__vector4 vf1_0 = __vpermwi( tb_0, SHUFFLE_D( 2, 2, 3, 3 ) );
__vector4 vf2_0 = __vpermwi( td_0, SHUFFLE_D( 0, 0, 1, 1 ) );
__vector4 vf0_1 = __vpermwi( tb_1, SHUFFLE_D( 0, 0, 1, 1 ) );
__vector4 vf1_1 = __vpermwi( tb_1, SHUFFLE_D( 2, 2, 3, 3 ) );
__vector4 vf2_1 = __vpermwi( td_1, SHUFFLE_D( 0, 0, 1, 1 ) );
__vector4 da_0 = __vmaddfp( va0_0, p0x, p0w );
__vector4 db_0 = __vmaddfp( vb0_0, p0x, p0w );
__vector4 dc_0 = __vmaddfp( vc0_0, p0x, p0w );
__vector4 dd_0 = __vmaddfp( vd0_0, p0x, p0w );
__vector4 de_0 = __vmaddfp( ve0_0, p1x, p1w );
__vector4 df_0 = __vmaddfp( vf0_0, p1x, p1w );
__vector4 da_1 = __vmaddfp( va0_1, p0x, p0w );
__vector4 db_1 = __vmaddfp( vb0_1, p0x, p0w );
__vector4 dc_1 = __vmaddfp( vc0_1, p0x, p0w );
__vector4 dd_1 = __vmaddfp( vd0_1, p0x, p0w );
__vector4 de_1 = __vmaddfp( ve0_1, p1x, p1w );
__vector4 df_1 = __vmaddfp( vf0_1, p1x, p1w );
da_0 = __vmaddfp( va1_0, p0y, da_0 );
db_0 = __vmaddfp( vb1_0, p0y, db_0 );
dc_0 = __vmaddfp( vc1_0, p0y, dc_0 );
dd_0 = __vmaddfp( vd1_0, p0y, dd_0 );
de_0 = __vmaddfp( ve1_0, p1y, de_0 );
df_0 = __vmaddfp( vf1_0, p1y, df_0 );
da_1 = __vmaddfp( va1_1, p0y, da_1 );
db_1 = __vmaddfp( vb1_1, p0y, db_1 );
dc_1 = __vmaddfp( vc1_1, p0y, dc_1 );
dd_1 = __vmaddfp( vd1_1, p0y, dd_1 );
de_1 = __vmaddfp( ve1_1, p1y, de_1 );
df_1 = __vmaddfp( vf1_1, p1y, df_1 );
da_0 = __vmaddfp( va2_0, p0z, da_0 );
db_0 = __vmaddfp( vb2_0, p0z, db_0 );
dc_0 = __vmaddfp( vc2_0, p0z, dc_0 );
dd_0 = __vmaddfp( vd2_0, p0z, dd_0 );
de_0 = __vmaddfp( ve2_0, p1z, de_0 );
df_0 = __vmaddfp( vf2_0, p1z, df_0 );
da_1 = __vmaddfp( va2_1, p0z, da_1 );
db_1 = __vmaddfp( vb2_1, p0z, db_1 );
dc_1 = __vmaddfp( vc2_1, p0z, dc_1 );
dd_1 = __vmaddfp( vd2_1, p0z, dd_1 );
de_1 = __vmaddfp( ve2_1, p1z, de_1 );
df_1 = __vmaddfp( vf2_1, p1z, df_1 );
da_0 = __vcmpgtfp( da_0, vmx_float_zero );
db_0 = __vcmpgtfp( db_0, vmx_float_zero );
dc_0 = __vcmpgtfp( dc_0, vmx_float_zero );
dd_0 = __vcmpgtfp( dd_0, vmx_float_zero );
de_0 = __vcmpgtfp( de_0, vmx_float_zero );
df_0 = __vcmpgtfp( df_0, vmx_float_zero );
da_1 = __vcmpgtfp( da_1, vmx_float_zero );
db_1 = __vcmpgtfp( db_1, vmx_float_zero );
dc_1 = __vcmpgtfp( dc_1, vmx_float_zero );
dd_1 = __vcmpgtfp( dd_1, vmx_float_zero );
de_1 = __vcmpgtfp( de_1, vmx_float_zero );
df_1 = __vcmpgtfp( df_1, vmx_float_zero );
da_0 = __vand( da_0, vmx_dword_decal_mask0 ); // 0 1 2 3
db_0 = __vand( db_0, vmx_dword_decal_mask1 ); // 8 9 10 11
dc_0 = __vand( dc_0, vmx_dword_decal_mask2 ); // 16 17 18 19
dd_0 = __vand( dd_0, vmx_dword_decal_mask3 ); // 24 25 26 27
de_0 = __vand( de_0, vmx_dword_decal_mask4 ); // 4 5 12 13
df_0 = __vand( df_0, vmx_dword_decal_mask5 ); // 20 21 28 29
da_1 = __vand( da_1, vmx_dword_decal_mask0 ); // 0 1 2 3
db_1 = __vand( db_1, vmx_dword_decal_mask1 ); // 8 9 10 11
dc_1 = __vand( dc_1, vmx_dword_decal_mask2 ); // 16 17 18 19
dd_1 = __vand( dd_1, vmx_dword_decal_mask3 ); // 24 25 26 27
de_1 = __vand( de_1, vmx_dword_decal_mask4 ); // 4 5 12 13
df_1 = __vand( df_1, vmx_dword_decal_mask5 ); // 20 21 28 29
da_0 = __vor( da_0, db_0 );
dc_0 = __vor( dc_0, dd_0 );
de_0 = __vor( de_0, df_0 );
da_1 = __vor( da_1, db_1 );
dc_1 = __vor( dc_1, dd_1 );
de_1 = __vor( de_1, df_1 );
da_0 = __vor( da_0, dc_0 );
da_0 = __vor( da_0, de_0 );
da_1 = __vor( da_1, dc_1 );
da_1 = __vor( da_1, de_1 );
__vector4 bits0_0 = __vpermwi( da_0, SHUFFLE_D( 0, 0, 0, 0 ) );
__vector4 bits1_0 = __vpermwi( da_0, SHUFFLE_D( 1, 1, 1, 1 ) );
__vector4 bits2_0 = __vpermwi( da_0, SHUFFLE_D( 2, 2, 2, 2 ) );
__vector4 bits3_0 = __vpermwi( da_0, SHUFFLE_D( 3, 3, 3, 3 ) );
__vector4 bits0_1 = __vpermwi( da_1, SHUFFLE_D( 0, 0, 0, 0 ) );
__vector4 bits1_1 = __vpermwi( da_1, SHUFFLE_D( 1, 1, 1, 1 ) );
__vector4 bits2_1 = __vpermwi( da_1, SHUFFLE_D( 2, 2, 2, 2 ) );
__vector4 bits3_1 = __vpermwi( da_1, SHUFFLE_D( 3, 3, 3, 3 ) );
bits0_0 = __vor( bits0_0, bits1_0 );
bits2_0 = __vor( bits2_0, bits3_0 );
bits0_0 = __vor( bits0_0, bits2_0 );
bits0_1 = __vor( bits0_1, bits1_1 );
bits2_1 = __vor( bits2_1, bits3_1 );
bits0_1 = __vor( bits0_1, bits2_1 );
bits0_0 = __vperm( bits0_0, bits0_0, vmx_dword_overlay_perm );
bits0_1 = __vperm( bits0_1, bits0_1, vmx_dword_overlay_perm );
__stvewx( bits0_0, cullBits, i );
__stvewx( bits0_1, cullBits, i+4 );
}
for ( ; i < numVerts - 3; i += 4 ) {
__vector4 va_0 = *(__vector4 *)( &verts[i+0].xyz );
__vector4 vb_0 = *(__vector4 *)( &verts[i+1].xyz );
__vector4 vc_0 = *(__vector4 *)( &verts[i+2].xyz );
__vector4 vd_0 = *(__vector4 *)( &verts[i+3].xyz );
__vector4 ta_0 = __vmrghw( va_0, vb_0 );
__vector4 tb_0 = __vmrghw( vc_0, vd_0 );
__vector4 tc_0 = __vmrglw( va_0, vb_0 );
__vector4 td_0 = __vmrglw( vc_0, vd_0 );
__vector4 va0_0 = __vpermwi( va_0, SHUFFLE_D( 0, 0, 0, 0 ) );
__vector4 va1_0 = __vpermwi( va_0, SHUFFLE_D( 1, 1, 1, 1 ) );
__vector4 va2_0 = __vpermwi( va_0, SHUFFLE_D( 2, 2, 2, 2 ) );
__vector4 vb0_0 = __vpermwi( vb_0, SHUFFLE_D( 0, 0, 0, 0 ) );
__vector4 vb1_0 = __vpermwi( vb_0, SHUFFLE_D( 1, 1, 1, 1 ) );
__vector4 vb2_0 = __vpermwi( vb_0, SHUFFLE_D( 2, 2, 2, 2 ) );
__vector4 vc0_0 = __vpermwi( vc_0, SHUFFLE_D( 0, 0, 0, 0 ) );
__vector4 vc1_0 = __vpermwi( vc_0, SHUFFLE_D( 1, 1, 1, 1 ) );
__vector4 vc2_0 = __vpermwi( vc_0, SHUFFLE_D( 2, 2, 2, 2 ) );
__vector4 vd0_0 = __vpermwi( vd_0, SHUFFLE_D( 0, 0, 0, 0 ) );
__vector4 vd1_0 = __vpermwi( vd_0, SHUFFLE_D( 1, 1, 1, 1 ) );
__vector4 vd2_0 = __vpermwi( vd_0, SHUFFLE_D( 2, 2, 2, 2 ) );
__vector4 ve0_0 = __vpermwi( ta_0, SHUFFLE_D( 0, 0, 1, 1 ) );
__vector4 ve1_0 = __vpermwi( ta_0, SHUFFLE_D( 2, 2, 3, 3 ) );
__vector4 ve2_0 = __vpermwi( tc_0, SHUFFLE_D( 0, 0, 1, 1 ) );
__vector4 vf0_0 = __vpermwi( tb_0, SHUFFLE_D( 0, 0, 1, 1 ) );
__vector4 vf1_0 = __vpermwi( tb_0, SHUFFLE_D( 2, 2, 3, 3 ) );
__vector4 vf2_0 = __vpermwi( td_0, SHUFFLE_D( 0, 0, 1, 1 ) );
__vector4 da_0 = __vmaddfp( va0_0, p0x, p0w );
__vector4 db_0 = __vmaddfp( vb0_0, p0x, p0w );
__vector4 dc_0 = __vmaddfp( vc0_0, p0x, p0w );
__vector4 dd_0 = __vmaddfp( vd0_0, p0x, p0w );
__vector4 de_0 = __vmaddfp( ve0_0, p1x, p1w );
__vector4 df_0 = __vmaddfp( vf0_0, p1x, p1w );
da_0 = __vmaddfp( va1_0, p0y, da_0 );
db_0 = __vmaddfp( vb1_0, p0y, db_0 );
dc_0 = __vmaddfp( vc1_0, p0y, dc_0 );
dd_0 = __vmaddfp( vd1_0, p0y, dd_0 );
de_0 = __vmaddfp( ve1_0, p1y, de_0 );
df_0 = __vmaddfp( vf1_0, p1y, df_0 );
da_0 = __vmaddfp( va2_0, p0z, da_0 );
db_0 = __vmaddfp( vb2_0, p0z, db_0 );
dc_0 = __vmaddfp( vc2_0, p0z, dc_0 );
dd_0 = __vmaddfp( vd2_0, p0z, dd_0 );
de_0 = __vmaddfp( ve2_0, p1z, de_0 );
df_0 = __vmaddfp( vf2_0, p1z, df_0 );
da_0 = __vcmpgtfp( da_0, vmx_float_zero );
db_0 = __vcmpgtfp( db_0, vmx_float_zero );
dc_0 = __vcmpgtfp( dc_0, vmx_float_zero );
dd_0 = __vcmpgtfp( dd_0, vmx_float_zero );
de_0 = __vcmpgtfp( de_0, vmx_float_zero );
df_0 = __vcmpgtfp( df_0, vmx_float_zero );
da_0 = __vand( da_0, vmx_dword_decal_mask0 ); // 0 1 2 3
db_0 = __vand( db_0, vmx_dword_decal_mask1 ); // 8 9 10 11
dc_0 = __vand( dc_0, vmx_dword_decal_mask2 ); // 16 17 18 19
dd_0 = __vand( dd_0, vmx_dword_decal_mask3 ); // 24 25 26 27
de_0 = __vand( de_0, vmx_dword_decal_mask4 ); // 4 5 12 13
df_0 = __vand( df_0, vmx_dword_decal_mask5 ); // 20 21 28 29
da_0 = __vor( da_0, db_0 );
dc_0 = __vor( dc_0, dd_0 );
de_0 = __vor( de_0, df_0 );
da_0 = __vor( da_0, dc_0 );
da_0 = __vor( da_0, de_0 );
__vector4 bits0_0 = __vpermwi( da_0, SHUFFLE_D( 0, 0, 0, 0 ) );
__vector4 bits1_0 = __vpermwi( da_0, SHUFFLE_D( 1, 1, 1, 1 ) );
__vector4 bits2_0 = __vpermwi( da_0, SHUFFLE_D( 2, 2, 2, 2 ) );
__vector4 bits3_0 = __vpermwi( da_0, SHUFFLE_D( 3, 3, 3, 3 ) );
bits0_0 = __vor( bits0_0, bits1_0 );
bits2_0 = __vor( bits2_0, bits3_0 );
bits0_0 = __vor( bits0_0, bits2_0 );
bits0_0 = __vperm( bits0_0, bits0_0, vmx_dword_overlay_perm );
__stvewx( bits0_0, cullBits, i );
}
for ( ; i < numVerts - 1; i += 2 ) {
int bits0, bits1;
const idVec3 &v0 = verts[i+0].xyz;
const idVec3 &v1 = verts[i+1].xyz;
float d0 = planes[0][0] * v0[0] + planes[0][1] * v0[1] + planes[0][2] * v0[2] + planes[0][3];
float d1 = planes[1][0] * v0[0] + planes[1][1] * v0[1] + planes[1][2] * v0[2] + planes[1][3];
float d2 = planes[2][0] * v0[0] + planes[2][1] * v0[1] + planes[2][2] * v0[2] + planes[2][3];
float d3 = planes[3][0] * v0[0] + planes[3][1] * v0[1] + planes[3][2] * v0[2] + planes[3][3];
float d4 = planes[4][0] * v0[0] + planes[4][1] * v0[1] + planes[4][2] * v0[2] + planes[4][3];
float d5 = planes[5][0] * v0[0] + planes[5][1] * v0[1] + planes[5][2] * v0[2] + planes[5][3];
float d10 = planes[4][0] * v1[0] + planes[4][1] * v1[1] + planes[4][2] * v1[2] + planes[4][3];
float d11 = planes[5][0] * v1[0] + planes[5][1] * v1[1] + planes[5][2] * v1[2] + planes[5][3];
float d6 = planes[0][0] * v1[0] + planes[0][1] * v1[1] + planes[0][2] * v1[2] + planes[0][3];
float d7 = planes[1][0] * v1[0] + planes[1][1] * v1[1] + planes[1][2] * v1[2] + planes[1][3];
float d8 = planes[2][0] * v1[0] + planes[2][1] * v1[1] + planes[2][2] * v1[2] + planes[2][3];
float d9 = planes[3][0] * v1[0] + planes[3][1] * v1[1] + planes[3][2] * v1[2] + planes[3][3];
bits0 = FLOATSIGNBITSET( d0 ) << 0;
bits0 |= FLOATSIGNBITSET( d1 ) << 1;
bits0 |= FLOATSIGNBITSET( d2 ) << 2;
bits0 |= FLOATSIGNBITSET( d3 ) << 3;
bits0 |= FLOATSIGNBITSET( d4 ) << 4;
bits0 |= FLOATSIGNBITSET( d5 ) << 5;
bits1 = FLOATSIGNBITSET( d6 ) << 0;
bits1 |= FLOATSIGNBITSET( d7 ) << 1;
bits1 |= FLOATSIGNBITSET( d8 ) << 2;
bits1 |= FLOATSIGNBITSET( d9 ) << 3;
bits1 |= FLOATSIGNBITSET( d10 ) << 4;
bits1 |= FLOATSIGNBITSET( d11 ) << 5;
cullBits[i+0] = bits0 ^ 0x3F; // flip lower 6 bits
cullBits[i+1] = bits1 ^ 0x3F; // flip lower 6 bits
}
if ( numVerts & 1 ) {
byte bits;
float d0, d1, d2, d3, d4, d5;
const idVec3 &v = verts[numVerts - 1].xyz;
d0 = planes[0][0] * v[0] + planes[0][1] * v[1] + planes[0][2] * v[2] + planes[0][3];
d1 = planes[1][0] * v[0] + planes[1][1] * v[1] + planes[1][2] * v[2] + planes[1][3];
d2 = planes[2][0] * v[0] + planes[2][1] * v[1] + planes[2][2] * v[2] + planes[2][3];
d3 = planes[3][0] * v[0] + planes[3][1] * v[1] + planes[3][2] * v[2] + planes[3][3];
d4 = planes[4][0] * v[0] + planes[4][1] * v[1] + planes[4][2] * v[2] + planes[4][3];
d5 = planes[5][0] * v[0] + planes[5][1] * v[1] + planes[5][2] * v[2] + planes[5][3];
bits = FLOATSIGNBITSET( d0 ) << 0;
bits |= FLOATSIGNBITSET( d1 ) << 1;
bits |= FLOATSIGNBITSET( d2 ) << 2;
bits |= FLOATSIGNBITSET( d3 ) << 3;
bits |= FLOATSIGNBITSET( d4 ) << 4;
bits |= FLOATSIGNBITSET( d5 ) << 5;
cullBits[numVerts - 1] = bits ^ 0x3F; // flip lower 6 bits
}
}
/*
============
idSIMD_Xenon::OverlayPointCull
============
*/
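// Projects every vertex onto the two overlay planes to get its (s, t) texture coordinates, stores
// them into texCoords, and packs per-vertex bits for the s > 0, t > 0, s < 1 and t < 1 tests that
// are used to cull points falling outside the overlay.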
void VPCALL idSIMD_Xenon::OverlayPointCull( byte *__restrict cullBits, idVec2 *__restrict texCoords, const idPlane *__restrict planes, const idDrawVert *__restrict verts, const int numVerts ) {
int i;
const idPlane &p0 = planes[0];
const idPlane &p1 = planes[1];
__vector4 px = { p0[0], p1[0], p0[0], p1[0] };
__vector4 py = { p0[1], p1[1], p0[1], p1[1] };
__vector4 pz = { p0[2], p1[2], p0[2], p1[2] };
__vector4 pw = { p0[3], p1[3], p0[3], p1[3] };
for ( i = 0; i < numVerts - 7; i += 8 ) {
__vector4 va_0 = *(__vector4 *)( &verts[i+0].xyz );
__vector4 vb_0 = *(__vector4 *)( &verts[i+1].xyz );
__vector4 vc_0 = *(__vector4 *)( &verts[i+2].xyz );
__vector4 vd_0 = *(__vector4 *)( &verts[i+3].xyz );
__vector4 va_1 = *(__vector4 *)( &verts[i+4].xyz );
__vector4 vb_1 = *(__vector4 *)( &verts[i+5].xyz );
__vector4 vc_1 = *(__vector4 *)( &verts[i+6].xyz );
__vector4 vd_1 = *(__vector4 *)( &verts[i+7].xyz );
__vector4 ta_0 = __vmrghw( va_0, vb_0 ); // 00, 10, 01, 11
__vector4 tb_0 = __vmrghw( vc_0, vd_0 ); // 00, 10, 01, 11
__vector4 tc_0 = __vmrglw( va_0, vb_0 ); // 02, 12, 03, 13
__vector4 td_0 = __vmrglw( vc_0, vd_0 ); // 02, 12, 03, 13
__vector4 ta_1 = __vmrghw( va_1, vb_1 ); // 00, 10, 01, 11
__vector4 tb_1 = __vmrghw( vc_1, vd_1 ); // 00, 10, 01, 11
__vector4 tc_1 = __vmrglw( va_1, vb_1 ); // 02, 12, 03, 13
__vector4 td_1 = __vmrglw( vc_1, vd_1 ); // 02, 12, 03, 13
__vector4 sa_0 = __vpermwi( ta_0, SHUFFLE_D( 0, 0, 1, 1 ) );
__vector4 sb_0 = __vpermwi( ta_0, SHUFFLE_D( 2, 2, 3, 3 ) );
__vector4 sc_0 = __vpermwi( tc_0, SHUFFLE_D( 0, 0, 1, 1 ) );
__vector4 sa_1 = __vpermwi( ta_1, SHUFFLE_D( 0, 0, 1, 1 ) );
__vector4 sb_1 = __vpermwi( ta_1, SHUFFLE_D( 2, 2, 3, 3 ) );
__vector4 sc_1 = __vpermwi( tc_1, SHUFFLE_D( 0, 0, 1, 1 ) );
__vector4 sd_0 = __vpermwi( tb_0, SHUFFLE_D( 0, 0, 1, 1 ) );
__vector4 se_0 = __vpermwi( tb_0, SHUFFLE_D( 2, 2, 3, 3 ) );
__vector4 sf_0 = __vpermwi( td_0, SHUFFLE_D( 0, 0, 1, 1 ) );
__vector4 sd_1 = __vpermwi( tb_1, SHUFFLE_D( 0, 0, 1, 1 ) );
__vector4 se_1 = __vpermwi( tb_1, SHUFFLE_D( 2, 2, 3, 3 ) );
__vector4 sf_1 = __vpermwi( td_1, SHUFFLE_D( 0, 0, 1, 1 ) );
__vector4 da_0 = __vmaddfp( sa_0, px, pw );
__vector4 db_0 = __vmaddfp( sd_0, px, pw );
__vector4 da_1 = __vmaddfp( sa_1, px, pw );
__vector4 db_1 = __vmaddfp( sd_1, px, pw );
da_0 = __vmaddfp( sb_0, py, da_0 );
db_0 = __vmaddfp( se_0, py, db_0 );
da_1 = __vmaddfp( sb_1, py, da_1 );
db_1 = __vmaddfp( se_1, py, db_1 );
da_0 = __vmaddfp( sc_0, pz, da_0 );
db_0 = __vmaddfp( sf_0, pz, db_0 );
da_1 = __vmaddfp( sc_1, pz, da_1 );
db_1 = __vmaddfp( sf_1, pz, db_1 );
__stvx( da_0, texCoords, i*8+0 );
__stvx( db_0, texCoords, i*8+16 );
__stvx( da_1, texCoords, i*8+32 );
__stvx( db_1, texCoords, i*8+48 );
__vector4 ba_0 = __vcmpgtfp( da_0, vmx_float_zero );
__vector4 bb_0 = __vcmpgtfp( db_0, vmx_float_zero );
__vector4 ba_1 = __vcmpgtfp( da_1, vmx_float_zero );
__vector4 bb_1 = __vcmpgtfp( db_1, vmx_float_zero );
ba_0 = __vand( ba_0, vmx_dword_overlay_mask0 );
bb_0 = __vand( bb_0, vmx_dword_overlay_mask1 );
ba_1 = __vand( ba_1, vmx_dword_overlay_mask0 );
bb_1 = __vand( bb_1, vmx_dword_overlay_mask1 );
da_0 = __vsubfp( vmx_float_one, da_0 );
db_0 = __vsubfp( vmx_float_one, db_0 );
da_1 = __vsubfp( vmx_float_one, da_1 );
db_1 = __vsubfp( vmx_float_one, db_1 );
__vector4 bc_0 = __vcmpgtfp( da_0, vmx_float_zero );
__vector4 bd_0 = __vcmpgtfp( db_0, vmx_float_zero );
__vector4 bc_1 = __vcmpgtfp( da_1, vmx_float_zero );
__vector4 bd_1 = __vcmpgtfp( db_1, vmx_float_zero );
bc_0 = __vand( bc_0, vmx_dword_overlay_mask2 );
bd_0 = __vand( bd_0, vmx_dword_overlay_mask3 );
bc_1 = __vand( bc_1, vmx_dword_overlay_mask2 );
bd_1 = __vand( bd_1, vmx_dword_overlay_mask3 );
ba_0 = __vor( ba_0, bb_0 );
bc_0 = __vor( bc_0, bd_0 );
ba_0 = __vor( ba_0, bc_0 );
ba_1 = __vor( ba_1, bb_1 );
bc_1 = __vor( bc_1, bd_1 );
ba_1 = __vor( ba_1, bc_1 );
__vector4 bits0_0 = __vpermwi( ba_0, SHUFFLE_D( 0, 0, 0, 0 ) );
__vector4 bits1_0 = __vpermwi( ba_0, SHUFFLE_D( 1, 1, 1, 1 ) );
__vector4 bits2_0 = __vpermwi( ba_0, SHUFFLE_D( 2, 2, 2, 2 ) );
__vector4 bits3_0 = __vpermwi( ba_0, SHUFFLE_D( 3, 3, 3, 3 ) );
__vector4 bits0_1 = __vpermwi( ba_1, SHUFFLE_D( 0, 0, 0, 0 ) );
__vector4 bits1_1 = __vpermwi( ba_1, SHUFFLE_D( 1, 1, 1, 1 ) );
__vector4 bits2_1 = __vpermwi( ba_1, SHUFFLE_D( 2, 2, 2, 2 ) );
__vector4 bits3_1 = __vpermwi( ba_1, SHUFFLE_D( 3, 3, 3, 3 ) );
bits0_0 = __vor( bits0_0, bits1_0 );
bits2_0 = __vor( bits2_0, bits3_0 );
bits0_0 = __vor( bits0_0, bits2_0 );
bits0_1 = __vor( bits0_1, bits1_1 );
bits2_1 = __vor( bits2_1, bits3_1 );
bits0_1 = __vor( bits0_1, bits2_1 );
bits0_0 = __vxor( bits0_0, vmx_dword_overlay_xor );
bits0_1 = __vxor( bits0_1, vmx_dword_overlay_xor );
bits0_0 = __vperm( bits0_0, bits0_0, vmx_dword_overlay_perm );
bits0_1 = __vperm( bits0_1, bits0_1, vmx_dword_overlay_perm );
__stvewx( bits0_0, cullBits, i );
__stvewx( bits0_1, cullBits, i+4 );
}
for ( ; i < numVerts - 3; i += 4 ) {
__vector4 va_0 = *(__vector4 *)( &verts[i+0].xyz );
__vector4 vb_0 = *(__vector4 *)( &verts[i+1].xyz );
__vector4 vc_0 = *(__vector4 *)( &verts[i+2].xyz );
__vector4 vd_0 = *(__vector4 *)( &verts[i+3].xyz );
__vector4 ta_0 = __vmrghw( va_0, vb_0 ); // 00, 10, 01, 11
__vector4 tb_0 = __vmrghw( vc_0, vd_0 ); // 00, 10, 01, 11
__vector4 tc_0 = __vmrglw( va_0, vb_0 ); // 02, 12, 03, 13
__vector4 td_0 = __vmrglw( vc_0, vd_0 ); // 02, 12, 03, 13
__vector4 sa_0 = __vpermwi( ta_0, SHUFFLE_D( 0, 0, 1, 1 ) );
__vector4 sb_0 = __vpermwi( ta_0, SHUFFLE_D( 2, 2, 3, 3 ) );
__vector4 sc_0 = __vpermwi( tc_0, SHUFFLE_D( 0, 0, 1, 1 ) );
__vector4 sd_0 = __vpermwi( tb_0, SHUFFLE_D( 0, 0, 1, 1 ) );
__vector4 se_0 = __vpermwi( tb_0, SHUFFLE_D( 2, 2, 3, 3 ) );
__vector4 sf_0 = __vpermwi( td_0, SHUFFLE_D( 0, 0, 1, 1 ) );
__vector4 da_0 = __vmaddfp( sa_0, px, pw );
__vector4 db_0 = __vmaddfp( sd_0, px, pw );
da_0 = __vmaddfp( sb_0, py, da_0 );
db_0 = __vmaddfp( se_0, py, db_0 );
da_0 = __vmaddfp( sc_0, pz, da_0 );
db_0 = __vmaddfp( sf_0, pz, db_0 );
__stvx( da_0, texCoords, i*8+0 );
__stvx( db_0, texCoords, i*8+16 );
__vector4 ba_0 = __vcmpgtfp( da_0, vmx_float_zero );
__vector4 bb_0 = __vcmpgtfp( db_0, vmx_float_zero );
ba_0 = __vand( ba_0, vmx_dword_overlay_mask0 );
bb_0 = __vand( bb_0, vmx_dword_overlay_mask1 );
da_0 = __vsubfp( vmx_float_one, da_0 );
db_0 = __vsubfp( vmx_float_one, db_0 );
__vector4 bc_0 = __vcmpgtfp( da_0, vmx_float_zero );
__vector4 bd_0 = __vcmpgtfp( db_0, vmx_float_zero );
bc_0 = __vand( bc_0, vmx_dword_overlay_mask2 );
bd_0 = __vand( bd_0, vmx_dword_overlay_mask3 );
ba_0 = __vor( ba_0, bb_0 );
bc_0 = __vor( bc_0, bd_0 );
ba_0 = __vor( ba_0, bc_0 );
__vector4 bits0_0 = __vpermwi( ba_0, SHUFFLE_D( 0, 0, 0, 0 ) );
__vector4 bits1_0 = __vpermwi( ba_0, SHUFFLE_D( 1, 1, 1, 1 ) );
__vector4 bits2_0 = __vpermwi( ba_0, SHUFFLE_D( 2, 2, 2, 2 ) );
__vector4 bits3_0 = __vpermwi( ba_0, SHUFFLE_D( 3, 3, 3, 3 ) );
bits0_0 = __vor( bits0_0, bits1_0 );
bits2_0 = __vor( bits2_0, bits3_0 );
bits0_0 = __vor( bits0_0, bits2_0 );
bits0_0 = __vxor( bits0_0, vmx_dword_overlay_xor );
bits0_0 = __vperm( bits0_0, bits0_0, vmx_dword_overlay_perm );
__stvewx( bits0_0, cullBits, i );
}
for ( ; i < numVerts - 1; i += 2 ) {
unsigned int bits;
const idVec3 &v0 = verts[i+0].xyz;
const idVec3 &v1 = verts[i+1].xyz;
float d0 = p0[0] * v0[0] + p0[1] * v0[1] + p0[2] * v0[2] + p0[3];
float d1 = p1[0] * v0[0] + p1[1] * v0[1] + p1[2] * v0[2] + p1[3];
float d2 = p0[0] * v1[0] + p0[1] * v1[1] + p0[2] * v1[2] + p0[3];
float d3 = p1[0] * v1[0] + p1[1] * v1[1] + p1[2] * v1[2] + p1[3];
texCoords[i+0][0] = d0;
texCoords[i+0][1] = d1;
texCoords[i+1][0] = d2;
texCoords[i+1][1] = d3;
bits = FLOATSIGNBITSET( d0 ) << 0;
bits |= FLOATSIGNBITSET( d1 ) << 1;
bits |= FLOATSIGNBITSET( d2 ) << 8;
bits |= FLOATSIGNBITSET( d3 ) << 9;
d0 = 1.0f - d0;
d1 = 1.0f - d1;
d2 = 1.0f - d2;
d3 = 1.0f - d3;
bits |= FLOATSIGNBITSET( d0 ) << 2;
bits |= FLOATSIGNBITSET( d1 ) << 3;
bits |= FLOATSIGNBITSET( d2 ) << 10;
bits |= FLOATSIGNBITSET( d3 ) << 11;
cullBits[i+0] = ( bits >> 0 ) & 0xFF;
cullBits[i+1] = ( bits >> 8 ) & 0xFF;
}
if ( numVerts & 1 ) {
byte bits;
float d0, d1;
const idPlane &p0 = planes[0];
const idPlane &p1 = planes[1];
const idVec3 &v0 = verts[numVerts - 1].xyz;
d0 = p0[0] * v0[0] + p0[1] * v0[1] + p0[2] * v0[2] + p0[3];
d1 = p1[0] * v0[0] + p1[1] * v0[1] + p1[2] * v0[2] + p1[3];
texCoords[numVerts - 1][0] = d0;
texCoords[numVerts - 1][1] = d1;
bits = FLOATSIGNBITSET( d0 ) << 0;
bits |= FLOATSIGNBITSET( d1 ) << 1;
d0 = 1.0f - d0;
d1 = 1.0f - d1;
bits |= FLOATSIGNBITSET( d0 ) << 2;
bits |= FLOATSIGNBITSET( d1 ) << 3;
cullBits[numVerts - 1] = bits;
}
}
/*
============
idSIMD_Xenon::DeriveTriPlanes
Derives a plane equation for each triangle.
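For a triangle with vertices a, b, c the normal is the cross product
( b - a ) x ( c - a ), normalized with the VMX reciprocal square root estimate
(so it is only approximately unit length), and the plane distance is -( normal . a ).
The unrolled paths transpose 8 and then 4 triangles into SoA form before doing the math.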
============
*/
void VPCALL idSIMD_Xenon::DeriveTriPlanes( idPlane *__restrict planes, const idDrawVert *__restrict verts, const int numVerts, const vertIndex_t *__restrict indexes, const int numIndexes ) {
const vertIndex_t *end = indexes + numIndexes;
assert_16_byte_aligned( planes );
assert_16_byte_aligned( verts );
assert_sizeof_16_byte_multiple( idPlane );
assert_sizeof_16_byte_multiple( idDrawVert );
for ( ; indexes + 7*3 < end; indexes += 8*3, planes += 8 ) {
__vector4 a0_0 = *(__vector4 *)( verts + indexes[0 * 3 + 0] );
__vector4 b0_0 = *(__vector4 *)( verts + indexes[0 * 3 + 1] );
__vector4 c0_0 = *(__vector4 *)( verts + indexes[0 * 3 + 2] );
__vector4 a1_0 = *(__vector4 *)( verts + indexes[1 * 3 + 0] );
__vector4 b1_0 = *(__vector4 *)( verts + indexes[1 * 3 + 1] );
__vector4 c1_0 = *(__vector4 *)( verts + indexes[1 * 3 + 2] );
__vector4 a2_0 = *(__vector4 *)( verts + indexes[2 * 3 + 0] );
__vector4 b2_0 = *(__vector4 *)( verts + indexes[2 * 3 + 1] );
__vector4 c2_0 = *(__vector4 *)( verts + indexes[2 * 3 + 2] );
__vector4 a3_0 = *(__vector4 *)( verts + indexes[3 * 3 + 0] );
__vector4 b3_0 = *(__vector4 *)( verts + indexes[3 * 3 + 1] );
__vector4 c3_0 = *(__vector4 *)( verts + indexes[3 * 3 + 2] );
__vector4 a0_1 = *(__vector4 *)( verts + indexes[4 * 3 + 0] );
__vector4 b0_1 = *(__vector4 *)( verts + indexes[4 * 3 + 1] );
__vector4 c0_1 = *(__vector4 *)( verts + indexes[4 * 3 + 2] );
__vector4 a1_1 = *(__vector4 *)( verts + indexes[5 * 3 + 0] );
__vector4 b1_1 = *(__vector4 *)( verts + indexes[5 * 3 + 1] );
__vector4 c1_1 = *(__vector4 *)( verts + indexes[5 * 3 + 2] );
__vector4 a2_1 = *(__vector4 *)( verts + indexes[6 * 3 + 0] );
__vector4 b2_1 = *(__vector4 *)( verts + indexes[6 * 3 + 1] );
__vector4 c2_1 = *(__vector4 *)( verts + indexes[6 * 3 + 2] );
__vector4 a3_1 = *(__vector4 *)( verts + indexes[7 * 3 + 0] );
__vector4 b3_1 = *(__vector4 *)( verts + indexes[7 * 3 + 1] );
__vector4 c3_1 = *(__vector4 *)( verts + indexes[7 * 3 + 2] );
__vector4 ta0_0 = __vmrghw( a0_0, a1_0 );
__vector4 ta1_0 = __vmrghw( a2_0, a3_0 );
__vector4 ta2_0 = __vmrglw( a0_0, a1_0 );
__vector4 ta3_0 = __vmrglw( a2_0, a3_0 );
__vector4 ta0_1 = __vmrghw( a0_1, a1_1 );
__vector4 ta1_1 = __vmrghw( a2_1, a3_1 );
__vector4 ta2_1 = __vmrglw( a0_1, a1_1 );
__vector4 ta3_1 = __vmrglw( a2_1, a3_1 );
__vector4 tb0_0 = __vmrghw( b0_0, b1_0 );
__vector4 tb1_0 = __vmrghw( b2_0, b3_0 );
__vector4 tb2_0 = __vmrglw( b0_0, b1_0 );
__vector4 tb3_0 = __vmrglw( b2_0, b3_0 );
__vector4 tb0_1 = __vmrghw( b0_1, b1_1 );
__vector4 tb1_1 = __vmrghw( b2_1, b3_1 );
__vector4 tb2_1 = __vmrglw( b0_1, b1_1 );
__vector4 tb3_1 = __vmrglw( b2_1, b3_1 );
__vector4 tc0_0 = __vmrghw( c0_0, c1_0 );
__vector4 tc1_0 = __vmrghw( c2_0, c3_0 );
__vector4 tc2_0 = __vmrglw( c0_0, c1_0 );
__vector4 tc3_0 = __vmrglw( c2_0, c3_0 );
__vector4 tc0_1 = __vmrghw( c0_1, c1_1 );
__vector4 tc1_1 = __vmrghw( c2_1, c3_1 );
__vector4 tc2_1 = __vmrglw( c0_1, c1_1 );
__vector4 tc3_1 = __vmrglw( c2_1, c3_1 );
__vector4 ax_0 = __vperm( ta0_0, ta1_0, vmx_dword_perm_plane_x );
__vector4 ay_0 = __vperm( ta0_0, ta1_0, vmx_dword_perm_plane_y );
__vector4 az_0 = __vperm( ta2_0, ta3_0, vmx_dword_perm_plane_x );
__vector4 ax_1 = __vperm( ta0_1, ta1_1, vmx_dword_perm_plane_x );
__vector4 ay_1 = __vperm( ta0_1, ta1_1, vmx_dword_perm_plane_y );
__vector4 az_1 = __vperm( ta2_1, ta3_1, vmx_dword_perm_plane_x );
__vector4 bx_0 = __vperm( tb0_0, tb1_0, vmx_dword_perm_plane_x );
__vector4 by_0 = __vperm( tb0_0, tb1_0, vmx_dword_perm_plane_y );
__vector4 bz_0 = __vperm( tb2_0, tb3_0, vmx_dword_perm_plane_x );
__vector4 bx_1 = __vperm( tb0_1, tb1_1, vmx_dword_perm_plane_x );
__vector4 by_1 = __vperm( tb0_1, tb1_1, vmx_dword_perm_plane_y );
__vector4 bz_1 = __vperm( tb2_1, tb3_1, vmx_dword_perm_plane_x );
__vector4 cx_0 = __vperm( tc0_0, tc1_0, vmx_dword_perm_plane_x );
__vector4 cy_0 = __vperm( tc0_0, tc1_0, vmx_dword_perm_plane_y );
__vector4 cz_0 = __vperm( tc2_0, tc3_0, vmx_dword_perm_plane_x );
__vector4 cx_1 = __vperm( tc0_1, tc1_1, vmx_dword_perm_plane_x );
__vector4 cy_1 = __vperm( tc0_1, tc1_1, vmx_dword_perm_plane_y );
__vector4 cz_1 = __vperm( tc2_1, tc3_1, vmx_dword_perm_plane_x );
__vector4 d0_0 = __vsubfp( bx_0, ax_0 );
__vector4 d1_0 = __vsubfp( by_0, ay_0 );
__vector4 d2_0 = __vsubfp( bz_0, az_0 );
__vector4 d0_1 = __vsubfp( bx_1, ax_1 );
__vector4 d1_1 = __vsubfp( by_1, ay_1 );
__vector4 d2_1 = __vsubfp( bz_1, az_1 );
__vector4 d3_0 = __vsubfp( cx_0, ax_0 );
__vector4 d4_0 = __vsubfp( cy_0, ay_0 );
__vector4 d5_0 = __vsubfp( cz_0, az_0 );
__vector4 d3_1 = __vsubfp( cx_1, ax_1 );
__vector4 d4_1 = __vsubfp( cy_1, ay_1 );
__vector4 d5_1 = __vsubfp( cz_1, az_1 );
__vector4 m0_0 = __vmulfp( d4_0, d2_0 );
__vector4 m1_0 = __vmulfp( d5_0, d0_0 );
__vector4 m2_0 = __vmulfp( d3_0, d1_0 );
__vector4 m0_1 = __vmulfp( d4_1, d2_1 );
__vector4 m1_1 = __vmulfp( d5_1, d0_1 );
__vector4 m2_1 = __vmulfp( d3_1, d1_1 );
__vector4 m3_0 = __vmulfp( d5_0, d1_0 );
__vector4 m4_0 = __vmulfp( d3_0, d2_0 );
__vector4 m5_0 = __vmulfp( d4_0, d0_0 );
__vector4 m3_1 = __vmulfp( d5_1, d1_1 );
__vector4 m4_1 = __vmulfp( d3_1, d2_1 );
__vector4 m5_1 = __vmulfp( d4_1, d0_1 );
__vector4 vx_0 = __vsubfp( m0_0, m3_0 );
__vector4 vy_0 = __vsubfp( m1_0, m4_0 );
__vector4 vz_0 = __vsubfp( m2_0, m5_0 );
__vector4 vx_1 = __vsubfp( m0_1, m3_1 );
__vector4 vy_1 = __vsubfp( m1_1, m4_1 );
__vector4 vz_1 = __vsubfp( m2_1, m5_1 );
__vector4 s0_0 = __vmulfp( vx_0, vx_0 );
__vector4 s1_0 = __vmulfp( vy_0, vy_0 );
__vector4 s2_0 = __vmulfp( vz_0, vz_0 );
__vector4 s0_1 = __vmulfp( vx_1, vx_1 );
__vector4 s1_1 = __vmulfp( vy_1, vy_1 );
__vector4 s2_1 = __vmulfp( vz_1, vz_1 );
__vector4 e0_0 = __vaddfp( s0_0, s1_0 );
__vector4 e1_0 = __vaddfp( e0_0, s2_0 );
__vector4 e0_1 = __vaddfp( s0_1, s1_1 );
__vector4 e1_1 = __vaddfp( e0_1, s2_1 );
__vector4 rp_0 = __vrsqrtefp( e1_0 );
__vector4 rp_1 = __vrsqrtefp( e1_1 );
__vector4 nx_0 = __vmulfp( vx_0, rp_0 );
__vector4 ny_0 = __vmulfp( vy_0, rp_0 );
__vector4 nz_0 = __vmulfp( vz_0, rp_0 );
__vector4 nx_1 = __vmulfp( vx_1, rp_1 );
__vector4 ny_1 = __vmulfp( vy_1, rp_1 );
__vector4 nz_1 = __vmulfp( vz_1, rp_1 );
__vector4 s3_0 = __vmulfp( nx_0, ax_0 );
__vector4 s4_0 = __vmaddfp( ny_0, ay_0, s3_0 );
__vector4 vw_0 = __vmaddfp( nz_0, az_0, s4_0 );
__vector4 s3_1 = __vmulfp( nx_1, ax_1 );
__vector4 s4_1 = __vmaddfp( ny_1, ay_1, s3_1 );
__vector4 vw_1 = __vmaddfp( nz_1, az_1, s4_1 );
__vector4 nw_0 = __vxor( vw_0, vmx_float_sign_bit );
__vector4 nw_1 = __vxor( vw_1, vmx_float_sign_bit );
__vector4 tp0_0 = __vmrghw( nx_0, ny_0 );
__vector4 tp1_0 = __vmrghw( nz_0, nw_0 );
__vector4 tp2_0 = __vmrglw( nx_0, ny_0 );
__vector4 tp3_0 = __vmrglw( nz_0, nw_0 );
__vector4 tp0_1 = __vmrghw( nx_1, ny_1 );
__vector4 tp1_1 = __vmrghw( nz_1, nw_1 );
__vector4 tp2_1 = __vmrglw( nx_1, ny_1 );
__vector4 tp3_1 = __vmrglw( nz_1, nw_1 );
__vector4 p0_0 = __vperm( tp0_0, tp1_0, vmx_dword_perm_plane_x );
__vector4 p1_0 = __vperm( tp0_0, tp1_0, vmx_dword_perm_plane_y );
__vector4 p2_0 = __vperm( tp2_0, tp3_0, vmx_dword_perm_plane_x );
__vector4 p3_0 = __vperm( tp2_0, tp3_0, vmx_dword_perm_plane_y );
__vector4 p0_1 = __vperm( tp0_1, tp1_1, vmx_dword_perm_plane_x );
__vector4 p1_1 = __vperm( tp0_1, tp1_1, vmx_dword_perm_plane_y );
__vector4 p2_1 = __vperm( tp2_1, tp3_1, vmx_dword_perm_plane_x );
__vector4 p3_1 = __vperm( tp2_1, tp3_1, vmx_dword_perm_plane_y );
__stvx( p0_0, planes, 0*16 );
__stvx( p1_0, planes, 1*16 );
__stvx( p2_0, planes, 2*16 );
__stvx( p3_0, planes, 3*16 );
__stvx( p0_1, planes, 4*16 );
__stvx( p1_1, planes, 5*16 );
__stvx( p2_1, planes, 6*16 );
__stvx( p3_1, planes, 7*16 );
}
for ( ; indexes + 3*3 < end; indexes += 4*3, planes += 4 ) {
__vector4 a0_0 = *(__vector4 *)( verts + indexes[0 * 3 + 0] );
__vector4 b0_0 = *(__vector4 *)( verts + indexes[0 * 3 + 1] );
__vector4 c0_0 = *(__vector4 *)( verts + indexes[0 * 3 + 2] );
__vector4 a1_0 = *(__vector4 *)( verts + indexes[1 * 3 + 0] );
__vector4 b1_0 = *(__vector4 *)( verts + indexes[1 * 3 + 1] );
__vector4 c1_0 = *(__vector4 *)( verts + indexes[1 * 3 + 2] );
__vector4 a2_0 = *(__vector4 *)( verts + indexes[2 * 3 + 0] );
__vector4 b2_0 = *(__vector4 *)( verts + indexes[2 * 3 + 1] );
__vector4 c2_0 = *(__vector4 *)( verts + indexes[2 * 3 + 2] );
__vector4 a3_0 = *(__vector4 *)( verts + indexes[3 * 3 + 0] );
__vector4 b3_0 = *(__vector4 *)( verts + indexes[3 * 3 + 1] );
__vector4 c3_0 = *(__vector4 *)( verts + indexes[3 * 3 + 2] );
__vector4 ta0_0 = __vmrghw( a0_0, a1_0 );
__vector4 ta1_0 = __vmrghw( a2_0, a3_0 );
__vector4 ta2_0 = __vmrglw( a0_0, a1_0 );
__vector4 ta3_0 = __vmrglw( a2_0, a3_0 );
__vector4 tb0_0 = __vmrghw( b0_0, b1_0 );
__vector4 tb1_0 = __vmrghw( b2_0, b3_0 );
__vector4 tb2_0 = __vmrglw( b0_0, b1_0 );
__vector4 tb3_0 = __vmrglw( b2_0, b3_0 );
__vector4 tc0_0 = __vmrghw( c0_0, c1_0 );
__vector4 tc1_0 = __vmrghw( c2_0, c3_0 );
__vector4 tc2_0 = __vmrglw( c0_0, c1_0 );
__vector4 tc3_0 = __vmrglw( c2_0, c3_0 );
__vector4 ax_0 = __vperm( ta0_0, ta1_0, vmx_dword_perm_plane_x );
__vector4 ay_0 = __vperm( ta0_0, ta1_0, vmx_dword_perm_plane_y );
__vector4 az_0 = __vperm( ta2_0, ta3_0, vmx_dword_perm_plane_x );
__vector4 bx_0 = __vperm( tb0_0, tb1_0, vmx_dword_perm_plane_x );
__vector4 by_0 = __vperm( tb0_0, tb1_0, vmx_dword_perm_plane_y );
__vector4 bz_0 = __vperm( tb2_0, tb3_0, vmx_dword_perm_plane_x );
__vector4 cx_0 = __vperm( tc0_0, tc1_0, vmx_dword_perm_plane_x );
__vector4 cy_0 = __vperm( tc0_0, tc1_0, vmx_dword_perm_plane_y );
__vector4 cz_0 = __vperm( tc2_0, tc3_0, vmx_dword_perm_plane_x );
__vector4 d0_0 = __vsubfp( bx_0, ax_0 );
__vector4 d1_0 = __vsubfp( by_0, ay_0 );
__vector4 d2_0 = __vsubfp( bz_0, az_0 );
__vector4 d3_0 = __vsubfp( cx_0, ax_0 );
__vector4 d4_0 = __vsubfp( cy_0, ay_0 );
__vector4 d5_0 = __vsubfp( cz_0, az_0 );
__vector4 m0_0 = __vmulfp( d4_0, d2_0 );
__vector4 m1_0 = __vmulfp( d5_0, d0_0 );
__vector4 m2_0 = __vmulfp( d3_0, d1_0 );
__vector4 m3_0 = __vmulfp( d5_0, d1_0 );
__vector4 m4_0 = __vmulfp( d3_0, d2_0 );
__vector4 m5_0 = __vmulfp( d4_0, d0_0 );
__vector4 vx_0 = __vsubfp( m0_0, m3_0 );
__vector4 vy_0 = __vsubfp( m1_0, m4_0 );
__vector4 vz_0 = __vsubfp( m2_0, m5_0 );
__vector4 s0_0 = __vmulfp( vx_0, vx_0 );
__vector4 s1_0 = __vmulfp( vy_0, vy_0 );
__vector4 s2_0 = __vmulfp( vz_0, vz_0 );
__vector4 e0_0 = __vaddfp( s0_0, s1_0 );
__vector4 e1_0 = __vaddfp( e0_0, s2_0 );
__vector4 rp_0 = __vrsqrtefp( e1_0 );
__vector4 nx_0 = __vmulfp( vx_0, rp_0 );
__vector4 ny_0 = __vmulfp( vy_0, rp_0 );
__vector4 nz_0 = __vmulfp( vz_0, rp_0 );
__vector4 s3_0 = __vmulfp( nx_0, ax_0 );
__vector4 s4_0 = __vmaddfp( ny_0, ay_0, s3_0 );
__vector4 vw_0 = __vmaddfp( nz_0, az_0, s4_0 );
__vector4 nw_0 = __vxor( vw_0, vmx_float_sign_bit );
__vector4 tp0_0 = __vmrghw( nx_0, ny_0 );
__vector4 tp1_0 = __vmrghw( nz_0, nw_0 );
__vector4 tp2_0 = __vmrglw( nx_0, ny_0 );
__vector4 tp3_0 = __vmrglw( nz_0, nw_0 );
__vector4 p0_0 = __vperm( tp0_0, tp1_0, vmx_dword_perm_plane_x );
__vector4 p1_0 = __vperm( tp0_0, tp1_0, vmx_dword_perm_plane_y );
__vector4 p2_0 = __vperm( tp2_0, tp3_0, vmx_dword_perm_plane_x );
__vector4 p3_0 = __vperm( tp2_0, tp3_0, vmx_dword_perm_plane_y );
__stvx( p0_0, planes, 0 );
__stvx( p1_0, planes, 16 );
__stvx( p2_0, planes, 32 );
__stvx( p3_0, planes, 48 );
}
for ( ; indexes < end; indexes += 3, planes++ ) {
__vector4 a = *(__vector4 *)( verts + indexes[0] );
__vector4 b = *(__vector4 *)( verts + indexes[1] );
__vector4 c = *(__vector4 *)( verts + indexes[2] );
__vector4 da = __vsubfp( b, a );
__vector4 db = __vsubfp( c, a );
__vector4 dc = __vpermwi( da, SHUFFLE_D( 1, 2, 0, 3 ) );
__vector4 dd = __vpermwi( db, SHUFFLE_D( 1, 2, 0, 3 ) );
__vector4 de = __vmulfp( db, dc );
__vector4 df = __vmulfp( da, dd );
__vector4 dg = __vsubfp( de, df );
__vector4 dh = __vmsum3fp( dg, dg );
__vector4 di = __vrsqrtefp( dh );
__vector4 np = __vmulfp( dg, di );
__vector4 normal = __vpermwi( np, SHUFFLE_D( 1, 2, 0, 3 ) );
__vector4 dj = __vmsum3fp( normal, a );
__vector4 dist = __vxor( dj, vmx_float_sign_bit );
__vector4 plane = __vperm( normal, dist, vmx_dword_perm_replacelast );
__stvx( plane, planes, 0 );
}
}
/*
============
idSIMD_Xenon::CalculateFacing
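facing[i] is set to 1 when the 4D dot product of the triangle plane with the light
position is positive (see the scalar tail loop). One extra entry, facing[numTriangles],
is always set to 1 for dangling edges to reference.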
============
*/
void VPCALL idSIMD_Xenon::CalculateFacing( byte *__restrict facing, const idPlane *__restrict planes, const int numTriangles, const idVec4 &light ) {
int i;
__vector4 vlight = (__vector4 &) light;
for ( i = 0; i < numTriangles - 15; i += 16 ) {
__vector4 p0_0 = __lvx( planes, i*16+ 0*16 );
__vector4 p1_0 = __lvx( planes, i*16+ 1*16 );
__vector4 p2_0 = __lvx( planes, i*16+ 2*16 );
__vector4 p3_0 = __lvx( planes, i*16+ 3*16 );
__vector4 p0_1 = __lvx( planes, i*16+ 4*16 );
__vector4 p1_1 = __lvx( planes, i*16+ 5*16 );
__vector4 p2_1 = __lvx( planes, i*16+ 6*16 );
__vector4 p3_1 = __lvx( planes, i*16+ 7*16 );
__vector4 p0_2 = __lvx( planes, i*16+ 8*16 );
__vector4 p1_2 = __lvx( planes, i*16+ 9*16 );
__vector4 p2_2 = __lvx( planes, i*16+10*16 );
__vector4 p3_2 = __lvx( planes, i*16+11*16 );
__vector4 p0_3 = __lvx( planes, i*16+12*16 );
__vector4 p1_3 = __lvx( planes, i*16+13*16 );
__vector4 p2_3 = __lvx( planes, i*16+14*16 );
__vector4 p3_3 = __lvx( planes, i*16+15*16 );
__vector4 d0_0 = __vmsum4fp( p0_0, vlight );
__vector4 d1_0 = __vmsum4fp( p1_0, vlight );
__vector4 d2_0 = __vmsum4fp( p2_0, vlight );
__vector4 d3_0 = __vmsum4fp( p3_0, vlight );
__vector4 d0_1 = __vmsum4fp( p0_1, vlight );
__vector4 d1_1 = __vmsum4fp( p1_1, vlight );
__vector4 d2_1 = __vmsum4fp( p2_1, vlight );
__vector4 d3_1 = __vmsum4fp( p3_1, vlight );
__vector4 d0_2 = __vmsum4fp( p0_2, vlight );
__vector4 d1_2 = __vmsum4fp( p1_2, vlight );
__vector4 d2_2 = __vmsum4fp( p2_2, vlight );
__vector4 d3_2 = __vmsum4fp( p3_2, vlight );
__vector4 d0_3 = __vmsum4fp( p0_3, vlight );
__vector4 d1_3 = __vmsum4fp( p1_3, vlight );
__vector4 d2_3 = __vmsum4fp( p2_3, vlight );
__vector4 d3_3 = __vmsum4fp( p3_3, vlight );
__vector4 b0_0 = __vcmpgtfp( d0_0, vmx_float_zero );
__vector4 b1_0 = __vcmpgtfp( d1_0, vmx_float_zero );
__vector4 b2_0 = __vcmpgtfp( d2_0, vmx_float_zero );
__vector4 b3_0 = __vcmpgtfp( d3_0, vmx_float_zero );
__vector4 b0_1 = __vcmpgtfp( d0_1, vmx_float_zero );
__vector4 b1_1 = __vcmpgtfp( d1_1, vmx_float_zero );
__vector4 b2_1 = __vcmpgtfp( d2_1, vmx_float_zero );
__vector4 b3_1 = __vcmpgtfp( d3_1, vmx_float_zero );
__vector4 b0_2 = __vcmpgtfp( d0_2, vmx_float_zero );
__vector4 b1_2 = __vcmpgtfp( d1_2, vmx_float_zero );
__vector4 b2_2 = __vcmpgtfp( d2_2, vmx_float_zero );
__vector4 b3_2 = __vcmpgtfp( d3_2, vmx_float_zero );
__vector4 b0_3 = __vcmpgtfp( d0_3, vmx_float_zero );
__vector4 b1_3 = __vcmpgtfp( d1_3, vmx_float_zero );
__vector4 b2_3 = __vcmpgtfp( d2_3, vmx_float_zero );
__vector4 b3_3 = __vcmpgtfp( d3_3, vmx_float_zero );
b0_0 = __vand( b0_0, vmx_dword_facing_mask0 );
b1_0 = __vand( b1_0, vmx_dword_facing_mask1 );
b2_0 = __vand( b2_0, vmx_dword_facing_mask2 );
b3_0 = __vand( b3_0, vmx_dword_facing_mask3 );
b0_1 = __vand( b0_1, vmx_dword_facing_mask0 );
b1_1 = __vand( b1_1, vmx_dword_facing_mask1 );
b2_1 = __vand( b2_1, vmx_dword_facing_mask2 );
b3_1 = __vand( b3_1, vmx_dword_facing_mask3 );
b0_2 = __vand( b0_2, vmx_dword_facing_mask0 );
b1_2 = __vand( b1_2, vmx_dword_facing_mask1 );
b2_2 = __vand( b2_2, vmx_dword_facing_mask2 );
b3_2 = __vand( b3_2, vmx_dword_facing_mask3 );
b0_3 = __vand( b0_3, vmx_dword_facing_mask0 );
b1_3 = __vand( b1_3, vmx_dword_facing_mask1 );
b2_3 = __vand( b2_3, vmx_dword_facing_mask2 );
b3_3 = __vand( b3_3, vmx_dword_facing_mask3 );
b0_0 = __vor( b0_0, b1_0 );
b2_0 = __vor( b2_0, b3_0 );
b0_0 = __vor( b0_0, b2_0 );
b0_1 = __vor( b0_1, b1_1 );
b2_1 = __vor( b2_1, b3_1 );
b0_1 = __vor( b0_1, b2_1 );
b0_2 = __vor( b0_2, b1_2 );
b2_2 = __vor( b2_2, b3_2 );
b0_2 = __vor( b0_2, b2_2 );
b0_3 = __vor( b0_3, b1_3 );
b2_3 = __vor( b2_3, b3_3 );
b0_3 = __vor( b0_3, b2_3 );
__stvewx( b0_0, facing, i+ 0 );
__stvewx( b0_1, facing, i+ 4 );
__stvewx( b0_2, facing, i+ 8 );
__stvewx( b0_3, facing, i+12 );
}
for ( ; i < numTriangles - 3; i += 4 ) {
__vector4 p0_0 = __lvx( planes, i*16+ 0 );
__vector4 p1_0 = __lvx( planes, i*16+16 );
__vector4 p2_0 = __lvx( planes, i*16+32 );
__vector4 p3_0 = __lvx( planes, i*16+48 );
__vector4 d0_0 = __vmsum4fp( p0_0, vlight );
__vector4 d1_0 = __vmsum4fp( p1_0, vlight );
__vector4 d2_0 = __vmsum4fp( p2_0, vlight );
__vector4 d3_0 = __vmsum4fp( p3_0, vlight );
__vector4 b0_0 = __vcmpgtfp( d0_0, vmx_float_zero );
__vector4 b1_0 = __vcmpgtfp( d1_0, vmx_float_zero );
__vector4 b2_0 = __vcmpgtfp( d2_0, vmx_float_zero );
__vector4 b3_0 = __vcmpgtfp( d3_0, vmx_float_zero );
b0_0 = __vand( b0_0, vmx_dword_facing_mask0 );
b1_0 = __vand( b1_0, vmx_dword_facing_mask1 );
b2_0 = __vand( b2_0, vmx_dword_facing_mask2 );
b3_0 = __vand( b3_0, vmx_dword_facing_mask3 );
b0_0 = __vor( b0_0, b1_0 );
b2_0 = __vor( b2_0, b3_0 );
b0_0 = __vor( b0_0, b2_0 );
__stvewx( b0_0, facing, i );
}
for ( ; i < numTriangles; i++ ) {
facing[i] = planes[i][0] * light.x + planes[i][1] * light.y + planes[i][2] * light.z + planes[i][3] * light.w > 0.0f;
}
facing[numTriangles] = 1; // for dangling edges to reference
}
/*
============
idSIMD_Xenon::CalculateCullBits
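For every vertex, bit i of cullBits is set when the vertex lies behind light plane i
(negative plane distance). Planes flagged in frontBits are skipped because the surface
is already completely in front of them. The VMX path negates each plane and tests > 0,
which matches the scalar loop for non-zero distances.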
============
*/
void VPCALL idSIMD_Xenon::CalculateCullBits( byte * __restrict cullBits, const idDrawVert * __restrict verts, const int numVerts, const int frontBits, const idPlane lightPlanes[NUM_LIGHT_PLANES] ) {
int i, j;
assert( NUM_LIGHT_PLANES <= sizeof( cullBits[0] ) * 8 );
XMemSet( cullBits, 0, numVerts * sizeof( cullBits[0] ) );
for ( i = 0; i < NUM_LIGHT_PLANES; i++ ) {
// skip this plane if the surface is completely in front of this clipping plane
if ( frontBits & ( 1 << i ) ) {
continue;
}
__vector4 plane = (__vector4 &) lightPlanes[i];
__vector4i mask0 = { (1<<24) << i, (1<<24) << i, (1<<24) << i, (1<<24) << i };
__vector4i mask1 = { (1<<16) << i, (1<<16) << i, (1<<16) << i, (1<<16) << i };
__vector4i mask2 = { (1<< 8) << i, (1<< 8) << i, (1<< 8) << i, (1<< 8) << i };
__vector4i mask3 = { (1<< 0) << i, (1<< 0) << i, (1<< 0) << i, (1<< 0) << i };
plane = __vxor( plane, vmx_float_sign_bit );
__vector4 perm = vmx_dword_perm_replacelast;
__vector4 zero = vmx_float_zero;
for ( j = 0; j < numVerts - 15; j += 16 ) {
__vector4 bits = __lvx( cullBits, j );
__vector4 v0_0 = __lvx( verts, j*DRAWVERT_SIZE+ 0*DRAWVERT_SIZE );
__vector4 v1_0 = __lvx( verts, j*DRAWVERT_SIZE+ 1*DRAWVERT_SIZE );
__vector4 v2_0 = __lvx( verts, j*DRAWVERT_SIZE+ 2*DRAWVERT_SIZE );
__vector4 v3_0 = __lvx( verts, j*DRAWVERT_SIZE+ 3*DRAWVERT_SIZE );
__vector4 v0_1 = __lvx( verts, j*DRAWVERT_SIZE+ 4*DRAWVERT_SIZE );
__vector4 v1_1 = __lvx( verts, j*DRAWVERT_SIZE+ 5*DRAWVERT_SIZE );
__vector4 v2_1 = __lvx( verts, j*DRAWVERT_SIZE+ 6*DRAWVERT_SIZE );
__vector4 v3_1 = __lvx( verts, j*DRAWVERT_SIZE+ 7*DRAWVERT_SIZE );
__vector4 v0_2 = __lvx( verts, j*DRAWVERT_SIZE+ 8*DRAWVERT_SIZE );
__vector4 v1_2 = __lvx( verts, j*DRAWVERT_SIZE+ 9*DRAWVERT_SIZE );
__vector4 v2_2 = __lvx( verts, j*DRAWVERT_SIZE+10*DRAWVERT_SIZE );
__vector4 v3_2 = __lvx( verts, j*DRAWVERT_SIZE+11*DRAWVERT_SIZE );
__vector4 v0_3 = __lvx( verts, j*DRAWVERT_SIZE+12*DRAWVERT_SIZE );
__vector4 v1_3 = __lvx( verts, j*DRAWVERT_SIZE+13*DRAWVERT_SIZE );
__vector4 v2_3 = __lvx( verts, j*DRAWVERT_SIZE+14*DRAWVERT_SIZE );
__vector4 v3_3 = __lvx( verts, j*DRAWVERT_SIZE+15*DRAWVERT_SIZE );
v0_0 = __vand( v0_0, vmx_dword_mask_clear_last );
v1_0 = __vand( v1_0, vmx_dword_mask_clear_last );
v2_0 = __vand( v2_0, vmx_dword_mask_clear_last );
v3_0 = __vand( v3_0, vmx_dword_mask_clear_last );
v0_1 = __vand( v0_1, vmx_dword_mask_clear_last );
v1_1 = __vand( v1_1, vmx_dword_mask_clear_last );
v2_1 = __vand( v2_1, vmx_dword_mask_clear_last );
v3_1 = __vand( v3_1, vmx_dword_mask_clear_last );
v0_2 = __vand( v0_2, vmx_dword_mask_clear_last );
v1_2 = __vand( v1_2, vmx_dword_mask_clear_last );
v2_2 = __vand( v2_2, vmx_dword_mask_clear_last );
v3_2 = __vand( v3_2, vmx_dword_mask_clear_last );
v0_3 = __vand( v0_3, vmx_dword_mask_clear_last );
v1_3 = __vand( v1_3, vmx_dword_mask_clear_last );
v2_3 = __vand( v2_3, vmx_dword_mask_clear_last );
v3_3 = __vand( v3_3, vmx_dword_mask_clear_last );
v0_0 = __vor( v0_0, vmx_float_last_one );
v1_0 = __vor( v1_0, vmx_float_last_one );
v2_0 = __vor( v2_0, vmx_float_last_one );
v3_0 = __vor( v3_0, vmx_float_last_one );
v0_1 = __vor( v0_1, vmx_float_last_one );
v1_1 = __vor( v1_1, vmx_float_last_one );
v2_1 = __vor( v2_1, vmx_float_last_one );
v3_1 = __vor( v3_1, vmx_float_last_one );
v0_2 = __vor( v0_2, vmx_float_last_one );
v1_2 = __vor( v1_2, vmx_float_last_one );
v2_2 = __vor( v2_2, vmx_float_last_one );
v3_2 = __vor( v3_2, vmx_float_last_one );
v0_3 = __vor( v0_3, vmx_float_last_one );
v1_3 = __vor( v1_3, vmx_float_last_one );
v2_3 = __vor( v2_3, vmx_float_last_one );
v3_3 = __vor( v3_3, vmx_float_last_one );
__vector4 d0_0 = __vmsum4fp( plane, v0_0 );
__vector4 d1_0 = __vmsum4fp( plane, v1_0 );
__vector4 d2_0 = __vmsum4fp( plane, v2_0 );
__vector4 d3_0 = __vmsum4fp( plane, v3_0 );
__vector4 d0_1 = __vmsum4fp( plane, v0_1 );
__vector4 d1_1 = __vmsum4fp( plane, v1_1 );
__vector4 d2_1 = __vmsum4fp( plane, v2_1 );
__vector4 d3_1 = __vmsum4fp( plane, v3_1 );
__vector4 d0_2 = __vmsum4fp( plane, v0_2 );
__vector4 d1_2 = __vmsum4fp( plane, v1_2 );
__vector4 d2_2 = __vmsum4fp( plane, v2_2 );
__vector4 d3_2 = __vmsum4fp( plane, v3_2 );
__vector4 d0_3 = __vmsum4fp( plane, v0_3 );
__vector4 d1_3 = __vmsum4fp( plane, v1_3 );
__vector4 d2_3 = __vmsum4fp( plane, v2_3 );
__vector4 d3_3 = __vmsum4fp( plane, v3_3 );
__vector4 b0_0 = __vcmpgtfp( d0_0, zero );
__vector4 b1_0 = __vcmpgtfp( d1_0, zero );
__vector4 b2_0 = __vcmpgtfp( d2_0, zero );
__vector4 b3_0 = __vcmpgtfp( d3_0, zero );
__vector4 b0_1 = __vcmpgtfp( d0_1, zero );
__vector4 b1_1 = __vcmpgtfp( d1_1, zero );
__vector4 b2_1 = __vcmpgtfp( d2_1, zero );
__vector4 b3_1 = __vcmpgtfp( d3_1, zero );
__vector4 b0_2 = __vcmpgtfp( d0_2, zero );
__vector4 b1_2 = __vcmpgtfp( d1_2, zero );
__vector4 b2_2 = __vcmpgtfp( d2_2, zero );
__vector4 b3_2 = __vcmpgtfp( d3_2, zero );
__vector4 b0_3 = __vcmpgtfp( d0_3, zero );
__vector4 b1_3 = __vcmpgtfp( d1_3, zero );
__vector4 b2_3 = __vcmpgtfp( d2_3, zero );
__vector4 b3_3 = __vcmpgtfp( d3_3, zero );
b0_0 = __vand( b0_0, (__vector4 &)mask0 );
b1_0 = __vand( b1_0, (__vector4 &)mask1 );
b2_0 = __vand( b2_0, (__vector4 &)mask2 );
b3_0 = __vand( b3_0, (__vector4 &)mask3 );
b0_1 = __vand( b0_1, (__vector4 &)mask0 );
b1_1 = __vand( b1_1, (__vector4 &)mask1 );
b2_1 = __vand( b2_1, (__vector4 &)mask2 );
b3_1 = __vand( b3_1, (__vector4 &)mask3 );
b0_2 = __vand( b0_2, (__vector4 &)mask0 );
b1_2 = __vand( b1_2, (__vector4 &)mask1 );
b2_2 = __vand( b2_2, (__vector4 &)mask2 );
b3_2 = __vand( b3_2, (__vector4 &)mask3 );
b0_3 = __vand( b0_3, (__vector4 &)mask0 );
b1_3 = __vand( b1_3, (__vector4 &)mask1 );
b2_3 = __vand( b2_3, (__vector4 &)mask2 );
b3_3 = __vand( b3_3, (__vector4 &)mask3 );
b0_0 = __vor( b0_0, b1_0 );
b2_0 = __vor( b2_0, b3_0 );
b0_0 = __vor( b0_0, b2_0 );
b0_1 = __vor( b0_1, b1_1 );
b2_1 = __vor( b2_1, b3_1 );
b0_1 = __vor( b0_1, b2_1 );
b0_2 = __vor( b0_2, b1_2 );
b2_2 = __vor( b2_2, b3_2 );
b0_2 = __vor( b0_2, b2_2 );
b0_3 = __vor( b0_3, b1_3 );
b2_3 = __vor( b2_3, b3_3 );
b0_3 = __vor( b0_3, b2_3 );
b0_0 = __vor( b0_0, bits );
b0_1 = __vor( b0_1, bits );
b0_2 = __vor( b0_2, bits );
b0_3 = __vor( b0_3, bits );
__stvewx( b0_0, cullBits, j+ 0 );
__stvewx( b0_1, cullBits, j+ 4 );
__stvewx( b0_2, cullBits, j+ 8 );
__stvewx( b0_3, cullBits, j+12 );
}
for ( ; j < numVerts - 3; j += 4 ) {
__vector4 bits = __lvewx( cullBits, j );
__vector4 v0_0 = __lvx( verts, j*DRAWVERT_SIZE+ 0*DRAWVERT_SIZE );
__vector4 v1_0 = __lvx( verts, j*DRAWVERT_SIZE+ 1*DRAWVERT_SIZE );
__vector4 v2_0 = __lvx( verts, j*DRAWVERT_SIZE+ 2*DRAWVERT_SIZE );
__vector4 v3_0 = __lvx( verts, j*DRAWVERT_SIZE+ 3*DRAWVERT_SIZE );
v0_0 = __vand( v0_0, vmx_dword_mask_clear_last );
v1_0 = __vand( v1_0, vmx_dword_mask_clear_last );
v2_0 = __vand( v2_0, vmx_dword_mask_clear_last );
v3_0 = __vand( v3_0, vmx_dword_mask_clear_last );
v0_0 = __vor( v0_0, vmx_float_last_one );
v1_0 = __vor( v1_0, vmx_float_last_one );
v2_0 = __vor( v2_0, vmx_float_last_one );
v3_0 = __vor( v3_0, vmx_float_last_one );
__vector4 d0_0 = __vmsum4fp( plane, v0_0 );
__vector4 d1_0 = __vmsum4fp( plane, v1_0 );
__vector4 d2_0 = __vmsum4fp( plane, v2_0 );
__vector4 d3_0 = __vmsum4fp( plane, v3_0 );
__vector4 b0_0 = __vcmpgtfp( d0_0, vmx_float_zero );
__vector4 b1_0 = __vcmpgtfp( d1_0, vmx_float_zero );
__vector4 b2_0 = __vcmpgtfp( d2_0, vmx_float_zero );
__vector4 b3_0 = __vcmpgtfp( d3_0, vmx_float_zero );
b0_0 = __vand( b0_0, (__vector4 &)mask0 );
b1_0 = __vand( b1_0, (__vector4 &)mask1 );
b2_0 = __vand( b2_0, (__vector4 &)mask2 );
b3_0 = __vand( b3_0, (__vector4 &)mask3 );
b0_0 = __vor( b0_0, b1_0 );
b2_0 = __vor( b2_0, b3_0 );
b0_0 = __vor( b0_0, b2_0 );
b0_0 = __vor( b0_0, bits );
__stvewx( b0_0, cullBits, j );
}
for ( ; j < numVerts; j++ ) {
int bit = lightPlanes[i][0] * verts[j].xyz.x + lightPlanes[i][1] * verts[j].xyz.y + lightPlanes[i][2] * verts[j].xyz.z + lightPlanes[i][3] < 0.0f;
cullBits[j] |= bit << i;
}
}
}
/*
============
idSIMD_Xenon::CreateShadowCache
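Writes two homogeneous positions per input vertex into the shadow vertex cache:
( x, y, z, 1 ) at the even entry and ( x, y, z, 0 ) at the odd entry, produced by
clearing the fourth component and OR-ing in a literal 1.0f where needed.
Returns the number of cache entries written, which is numVerts * 2.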
============
*/
int VPCALL idSIMD_Xenon::CreateShadowCache( idVec4 *__restrict vertexCache, const idDrawVert *__restrict verts, const int numVerts ) {
const idDrawVert *end = verts + numVerts;
assert_16_byte_aligned( vertexCache );
assert_16_byte_aligned( verts );
assert_sizeof_16_byte_multiple( idVec4 );
assert_sizeof_16_byte_multiple( idDrawVert );
__vector4 clear_last = vmx_dword_mask_clear_last;
__vector4 last_one = vmx_float_last_one;
for( ; verts + 7 < end; verts += 8, vertexCache += 2*8 ) {
__vector4 v0 = (__vector4 &) verts[0].xyz;
__vector4 v1 = (__vector4 &) verts[1].xyz;
__vector4 v2 = (__vector4 &) verts[2].xyz;
__vector4 v3 = (__vector4 &) verts[3].xyz;
__vector4 v4 = (__vector4 &) verts[4].xyz;
__vector4 v5 = (__vector4 &) verts[5].xyz;
__vector4 v6 = (__vector4 &) verts[6].xyz;
__vector4 v7 = (__vector4 &) verts[7].xyz;
__vector4 b0 = __vand( v0, clear_last );
__vector4 b1 = __vand( v1, clear_last );
__vector4 b2 = __vand( v2, clear_last );
__vector4 b3 = __vand( v3, clear_last );
__vector4 b4 = __vand( v4, clear_last );
__vector4 b5 = __vand( v5, clear_last );
__vector4 b6 = __vand( v6, clear_last );
__vector4 b7 = __vand( v7, clear_last );
__vector4 a0 = __vor( b0, last_one );
__vector4 a1 = __vor( b1, last_one );
__vector4 a2 = __vor( b2, last_one );
__vector4 a3 = __vor( b3, last_one );
__vector4 a4 = __vor( b4, last_one );
__vector4 a5 = __vor( b5, last_one );
__vector4 a6 = __vor( b6, last_one );
__vector4 a7 = __vor( b7, last_one );
__stvx( b0, vertexCache, 0*32+16 );
__stvx( b1, vertexCache, 1*32+16 );
__stvx( b2, vertexCache, 2*32+16 );
__stvx( b3, vertexCache, 3*32+16 );
__stvx( b4, vertexCache, 4*32+16 );
__stvx( b5, vertexCache, 5*32+16 );
__stvx( b6, vertexCache, 6*32+16 );
__stvx( b7, vertexCache, 7*32+16 );
__stvx( a0, vertexCache, 0*32+0 );
__stvx( a1, vertexCache, 1*32+0 );
__stvx( a2, vertexCache, 2*32+0 );
__stvx( a3, vertexCache, 3*32+0 );
__stvx( a4, vertexCache, 4*32+0 );
__stvx( a5, vertexCache, 5*32+0 );
__stvx( a6, vertexCache, 6*32+0 );
__stvx( a7, vertexCache, 7*32+0 );
}
for( ; verts < end; verts++, vertexCache += 2 ) {
__vector4 v0 = (__vector4 &) verts->xyz;
__vector4 b0 = __vand( v0, clear_last );
__vector4 a0 = __vor( b0, last_one );
__stvx( a0, vertexCache, 0 );
__stvx( b0, vertexCache, 16 );
}
return numVerts * 2;
}
/*
============
idSIMD_Xenon::ShadowVolume_CountFacing
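Returns the number of light-facing triangles by summing the facing bytes,
using 16 independent accumulators in the unrolled loop.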
============
*/
int VPCALL idSIMD_Xenon::ShadowVolume_CountFacing( const byte *__restrict facing, const int numFaces ) {
int i, n0, n1, n2, n3, n4, n5, n6, n7, n8, n9, n10, n11, n12, n13, n14, n15;
n0 = n1 = n2 = n3 = n4 = n5 = n6 = n7 = n8 = n9 = n10 = n11 = n12 = n13 = n14 = n15 = 0;
for ( i = 0; i < numFaces-15; i += 16 ) {
n0 += facing[i+0];
n1 += facing[i+1];
n2 += facing[i+2];
n3 += facing[i+3];
n4 += facing[i+4];
n5 += facing[i+5];
n6 += facing[i+6];
n7 += facing[i+7];
n8 += facing[i+8];
n9 += facing[i+9];
n10 += facing[i+10];
n11 += facing[i+11];
n12 += facing[i+12];
n13 += facing[i+13];
n14 += facing[i+14];
n15 += facing[i+15];
}
for ( ; i < numFaces; i++ ) {
n0 += facing[i];
}
return n0 + n1 + n2 + n3 + n4 + n5 + n6 + n7 + n8 + n9 + n10 + n11 + n12 + n13 + n14 + n15;
}
/*
============
idSIMD_Xenon::ShadowVolume_CountFacingCull
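In addition to counting, this forces facing[i] to 1 when all three vertices of a
triangle share a set cull bit ( cull[i0] & cull[i1] & cull[i2] != 0 ), i.e. when the
triangle is completely on the culled side of one of the planes, and returns the
number of triangles that are facing after this update.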
============
*/
int VPCALL idSIMD_Xenon::ShadowVolume_CountFacingCull( byte *__restrict facing, const int numFaces, const vertIndex_t *__restrict indexes, const byte *cull ) {
int i, n0, n1, n2, n3;
n0 = n1 = n2 = n3 = 0;
for ( i = 0; i < numFaces - 3; i += 4 ) {
int c0 = cull[indexes[0*3+0]] & cull[indexes[0*3+1]] & cull[indexes[0*3+2]];
int c1 = cull[indexes[1*3+0]] & cull[indexes[1*3+1]] & cull[indexes[1*3+2]];
int c2 = cull[indexes[2*3+0]] & cull[indexes[2*3+1]] & cull[indexes[2*3+2]];
int c3 = cull[indexes[3*3+0]] & cull[indexes[3*3+1]] & cull[indexes[3*3+2]];
int f0 = facing[i+0] | ( (-c0) >> 31 ) & 1;
int f1 = facing[i+1] | ( (-c1) >> 31 ) & 1;
int f2 = facing[i+2] | ( (-c2) >> 31 ) & 1;
int f3 = facing[i+3] | ( (-c3) >> 31 ) & 1;
n0 += f0;
n1 += f1;
n2 += f2;
n3 += f3;
facing[i+0] = f0;
facing[i+1] = f1;
facing[i+2] = f2;
facing[i+3] = f3;
indexes += 4*3;
}
for ( ; i < numFaces; i++ ) {
int c = cull[indexes[0]] & cull[indexes[1]] & cull[indexes[2]];
facing[i] |= ( (-c) >> 31 ) & 1;
n0 += facing[i];
indexes += 3;
}
return n0 + n1 + n2 + n3;
}
/*
============
idSIMD_Xenon::ShadowVolume_CreateSilTriangles
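For every silhouette edge whose two adjacent triangles have different facing this
emits a quad (two triangles, 6 indexes) connecting the edge to its projected copy;
XOR-ing a vertex index with the facing byte or with 1 toggles the low bit to pick
between the paired shadow cache entries (see CreateShadowCache) and sets the winding.
The 6 indexes are always written, but si only advances ( inc[1] == 6 ) when the
facings differ, so non-silhouette edges are simply overwritten by the next edge.
Returns the number of indexes written.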
============
*/
int VPCALL idSIMD_Xenon::ShadowVolume_CreateSilTriangles( vertIndex_t *__restrict shadowIndexes, const byte *__restrict facing, const silEdge_t *__restrict silEdges, const int numSilEdges ) {
const silEdge_t *__restrict sil, *__restrict end;
vertIndex_t *__restrict si;
byte inc[2] = { 0, 6 };
si = shadowIndexes;
end = silEdges + numSilEdges;
for ( sil = silEdges; sil + 3 < end; sil += 4 ) {
byte f1a = facing[sil[0].p1];
byte f2a = facing[sil[0].p2];
byte f1b = facing[sil[1].p1];
byte f2b = facing[sil[1].p2];
byte f1c = facing[sil[2].p1];
byte f2c = facing[sil[2].p2];
byte f1d = facing[sil[3].p1];
byte f2d = facing[sil[3].p2];
byte t0 = inc[ f1a ^ f2a ];
int v1a = sil[0].v1;
int v2a = sil[0].v2;
si[0] = v1a;
si[1] = v2a ^ f1a;
si[2] = v2a ^ f2a;
si[3] = v1a ^ f2a;
si[4] = v1a ^ f1a;
si[5] = v2a ^ 1;
si += t0;
byte t1 = inc[ f1b ^ f2b ];
int v1b = sil[1].v1;
int v2b = sil[1].v2;
si[0] = v1b;
si[1] = v2b ^ f1b;
si[2] = v2b ^ f2b;
si[3] = v1b ^ f2b;
si[4] = v1b ^ f1b;
si[5] = v2b ^ 1;
si += t1;
byte t2 = inc[ f1c ^ f2c ];
int v1c = sil[2].v1;
int v2c = sil[2].v2;
si[0] = v1c;
si[1] = v2c ^ f1c;
si[2] = v2c ^ f2c;
si[3] = v1c ^ f2c;
si[4] = v1c ^ f1c;
si[5] = v2c ^ 1;
si += t2;
byte t3 = inc[ f1d ^ f2d ];
int v1d = sil[3].v1;
int v2d = sil[3].v2;
si[0] = v1d;
si[1] = v2d ^ f1d;
si[2] = v2d ^ f2d;
si[3] = v1d ^ f2d;
si[4] = v1d ^ f1d;
si[5] = v2d ^ 1;
si += t3;
}
for ( ; sil < end; sil++ ) {
byte f1 = facing[sil->p1];
byte f2 = facing[sil->p2];
byte t = inc[ f1 ^ f2 ];
int v1 = sil->v1;
int v2 = sil->v2;
si[0] = v1;
si[1] = v2 ^ f1;
si[2] = v2 ^ f2;
si[3] = v1 ^ f2;
si[4] = v1 ^ f1;
si[5] = v2 ^ 1;
si += t;
}
return si - shadowIndexes;
}
/*
============
idSIMD_Xenon::ShadowVolume_CreateSilTrianglesParallel
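Parallel-light variant: a single triangle (3 indexes) is emitted per silhouette edge
instead of a quad, with the winding selected branchlessly through the mask m = f1 - f2.
As above the indexes are always written, but si only advances ( inc[1] == 3 ) when the
two adjacent triangles have different facing. Returns the number of indexes written.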
============
*/
int VPCALL idSIMD_Xenon::ShadowVolume_CreateSilTrianglesParallel( vertIndex_t *__restrict shadowIndexes, const byte *__restrict facing, const silEdge_t *__restrict silEdges, const int numSilEdges ) {
const silEdge_t *__restrict sil, *__restrict end;
vertIndex_t *__restrict si;
byte inc[2] = { 0, 3 };
si = shadowIndexes;
end = silEdges + numSilEdges;
for ( sil = silEdges; sil + 3 < end; sil += 4 ) {
byte f1a = facing[sil[0].p1];
byte f2a = facing[sil[0].p2];
byte t0 = inc[ f1a ^ f2a ];
int v1a = sil[0].v1;
int v2a = sil[0].v2;
int m0 = f1a - f2a;
si[0] = v1a;
si[1] = ( m0 & v2a ) | f1a;
si[2] = ( ~m0 & v2a ) | f2a;
si += t0;
byte f1b = facing[sil[1].p1];
byte f2b = facing[sil[1].p2];
byte t1 = inc[ f1b ^ f2b ];
int v1b = sil[1].v1;
int v2b = sil[1].v2;
int m1 = f1b - f2b;
si[0] = v1b;
si[1] = ( m1 & v2b ) | f1b;
si[2] = ( ~m1 & v2b ) | f2b;
si += t1;
byte f1c = facing[sil[2].p1];
byte f2c = facing[sil[2].p2];
byte t2 = inc[ f1c ^ f2c ];
int v1c = sil[2].v1;
int v2c = sil[2].v2;
int m2 = f1c - f2c;
si[0] = v1c;
si[1] = ( m2 & v2c ) | f1c;
si[2] = ( ~m2 & v2c ) | f2c;
si += t2;
byte f1d = facing[sil[3].p1];
byte f2d = facing[sil[3].p2];
byte t3 = inc[ f1d ^ f2d ];
int v1d = sil[3].v1;
int v2d = sil[3].v2;
int m3 = f1d - f2d;
si[0] = v1d;
si[1] = ( m3 & v2d ) | f1d;
si[2] = ( ~m3 & v2d ) | f2d;
si += t3;
}
for ( ; sil < end; sil++ ) {
byte f1 = facing[sil->p1];
byte f2 = facing[sil->p2];
byte t = inc[ f1 ^ f2 ];
int v1 = sil->v1;
int v2 = sil->v2;
int m = f1 - f2;
si[0] = v1;
si[1] = ( m & v2 ) | f1;
si[2] = ( ~m & v2 ) | f2;
si += t;
}
return si - shadowIndexes;
}
/*
============
idSIMD_Xenon::ShadowVolume_CreateCapTriangles
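For every triangle that does not face the light this emits the near cap with reversed
winding (even shadow cache indexes) followed by the far cap (odd indexes). Facing
triangles leave si unadvanced ( inc[1] == 0 ) and are overwritten on the next iteration.
Returns the number of indexes written.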
============
*/
int VPCALL idSIMD_Xenon::ShadowVolume_CreateCapTriangles( vertIndex_t *__restrict shadowIndexes, const byte *__restrict facing, const vertIndex_t *__restrict indexes, const int numIndexes ) {
int i, j;
vertIndex_t *__restrict si;
byte inc[2] = { 6, 0 };
si = shadowIndexes;
for ( i = 0, j = 0; i < numIndexes - 3*4; i += 4*3, j += 4 ) {
byte t0 = inc[facing[j+0]];
byte t1 = inc[facing[j+1]];
byte t2 = inc[facing[j+2]];
byte t3 = inc[facing[j+3]];
int i0 = indexes[i+0*3+0] << 1;
int i1 = indexes[i+0*3+1] << 1;
int i2 = indexes[i+0*3+2] << 1;
si[0] = i2;
si[1] = i1;
si[2] = i0;
si[3] = i0 + 1;
si[4] = i1 + 1;
si[5] = i2 + 1;
si += t0;
int i3 = indexes[i+1*3+0] << 1;
int i4 = indexes[i+1*3+1] << 1;
int i5 = indexes[i+1*3+2] << 1;
si[0] = i5;
si[1] = i4;
si[2] = i3;
si[3] = i3 + 1;
si[4] = i4 + 1;
si[5] = i5 + 1;
si += t1;
int i6 = indexes[i+2*3+0] << 1;
int i7 = indexes[i+2*3+1] << 1;
int i8 = indexes[i+2*3+2] << 1;
si[0] = i8;
si[1] = i7;
si[2] = i6;
si[3] = i6 + 1;
si[4] = i7 + 1;
si[5] = i8 + 1;
si += t2;
int i9 = indexes[i+3*3+0] << 1;
int i10 = indexes[i+3*3+1] << 1;
int i11 = indexes[i+3*3+2] << 1;
si[0] = i11;
si[1] = i10;
si[2] = i9;
si[3] = i9 + 1;
si[4] = i10 + 1;
si[5] = i11 + 1;
si += t3;
}
for ( ; i < numIndexes; i += 3, j++ ) {
byte t = inc[facing[j]];
int i0 = indexes[i+0] << 1;
int i1 = indexes[i+1] << 1;
int i2 = indexes[i+2] << 1;
si[0] = i2;
si[1] = i1;
si[2] = i0;
si[3] = i0 + 1;
si[4] = i1 + 1;
si[5] = i2 + 1;
si += t;
}
return si - shadowIndexes;
}
/*
============
idSIMD_Xenon::ShadowVolume_CreateCapTrianglesParallel
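Parallel-light variant of the cap generation: only the reversed near cap (even shadow
cache indexes) is emitted for each triangle that does not face the light; no far cap
indexes are written. Returns the number of indexes written.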
============
*/
int VPCALL idSIMD_Xenon::ShadowVolume_CreateCapTrianglesParallel( vertIndex_t *__restrict shadowIndexes, const byte *__restrict facing, const vertIndex_t *__restrict indexes, const int numIndexes ) {
int i, j;
vertIndex_t *__restrict si;
byte inc[2] = { 3, 0 };
si = shadowIndexes;
for ( i = 0, j = 0; i < numIndexes - 3*4; i += 3*4, j += 4 ) {
byte t0 = inc[facing[j+0]];
byte t1 = inc[facing[j+1]];
byte t2 = inc[facing[j+2]];
byte t3 = inc[facing[j+3]];
si[0] = indexes[i+0*3+2] << 1;
si[1] = indexes[i+0*3+1] << 1;
si[2] = indexes[i+0*3+0] << 1;
si += t0;
si[0] = indexes[i+1*3+2] << 1;
si[1] = indexes[i+1*3+1] << 1;
si[2] = indexes[i+1*3+0] << 1;
si += t1;
si[0] = indexes[i+2*3+2] << 1;
si[1] = indexes[i+2*3+1] << 1;
si[2] = indexes[i+2*3+0] << 1;
si += t2;
si[0] = indexes[i+3*3+2] << 1;
si[1] = indexes[i+3*3+1] << 1;
si[2] = indexes[i+3*3+0] << 1;
si += t3;
}
for ( ; i < numIndexes; i += 3, j++ ) {
byte t = inc[facing[j]];
si[0] = indexes[i+2] << 1;
si[1] = indexes[i+1] << 1;
si[2] = indexes[i+0] << 1;
si += t;
}
return si - shadowIndexes;
}
#endif