From 9c37079c16015fc58de29d3de366e0d93dc11f8a Mon Sep 17 00:00:00 2001
From: Brian Harris <brian@idsoftware.com>
Date: Wed, 29 May 2013 13:12:13 -0500
Subject: [PATCH] Restored generic (non-SIMD) code

---
 neo/idlib/geometry/DrawVert.h                 |   14 +
 neo/idlib/geometry/DrawVert_intrinsics.h      |    6 +
 neo/idlib/geometry/RenderMatrix.cpp           | 1200 +++++++++++++++++
 neo/idlib/math/Lcp.cpp                        |  100 +-
 neo/idlib/math/MatX.cpp                       |   15 +
 neo/idlib/math/MatX.h                         |   18 +-
 neo/idlib/math/Math.cpp                       |    2 +
 neo/idlib/math/Math.h                         |   73 +
 neo/idlib/math/VecX.h                         |   22 +-
 neo/idlib/math/Vector.h                       |    4 +
 neo/idlib/sys/sys_defines.h                   |   58 +
 neo/idlib/sys/sys_intrinsics.h                |   20 +
 neo/renderer/BufferObject.cpp                 |   10 +
 neo/renderer/DXT/DXTCodec.h                   |   20 +
 neo/renderer/DXT/DXTEncoder.cpp               |   57 +
 neo/renderer/DXT/DXTEncoder_SSE2.cpp          | 1198 ++++++++++++++++
 neo/renderer/GLMatrix.cpp                     |   36 +
 neo/renderer/ModelDecal.cpp                   |   52 +
 neo/renderer/ModelOverlay.cpp                 |   88 ++
 neo/renderer/Model_md5.cpp                    |   21 +
 neo/renderer/jobs/ShadowShared.cpp            |   49 +
 .../DynamicShadowVolume.cpp                   |  191 +++
 neo/renderer/tr_trace.cpp                     |   98 ++
 23 files changed, 3328 insertions(+), 24 deletions(-)

diff --git a/neo/idlib/geometry/DrawVert.h b/neo/idlib/geometry/DrawVert.h
index 2797d427..0d1d1dbf 100644
--- a/neo/idlib/geometry/DrawVert.h
+++ b/neo/idlib/geometry/DrawVert.h
@@ -193,6 +193,7 @@ Assumes input is in the range [-1, 1]
 ID_INLINE void VertexFloatToByte( const float & x, const float & y, const float & z, byte * bval ) {
 	assert_4_byte_aligned( bval );	// for __stvebx
 
+#ifdef ID_WIN_X86_SSE2_INTRIN
 
 	const __m128 vector_float_one			= { 1.0f, 1.0f, 1.0f, 1.0f };
 	const __m128 vector_float_half			= { 0.5f, 0.5f, 0.5f, 0.5f };
@@ -209,6 +210,13 @@ ID_INLINE void VertexFloatToByte( const float & x, const float & y, const float
 	bval[1] = (byte)_mm_extract_epi16( xyz16, 1 );
 	bval[2] = (byte)_mm_extract_epi16( xyz16, 2 );
 
+#else
+
+	bval[0] = VERTEX_FLOAT_TO_BYTE( x );
+	bval[1] = VERTEX_FLOAT_TO_BYTE( y );
+	bval[2] = VERTEX_FLOAT_TO_BYTE( z );
+
+#endif
 }
 
 /*
@@ -609,6 +617,7 @@ ID_INLINE void WriteDrawVerts16( idDrawVert * destVerts, const idDrawVert * loca
 	assert_16_byte_aligned( destVerts );
 	assert_16_byte_aligned( localVerts );
 
+#ifdef ID_WIN_X86_SSE2_INTRIN
 
 	for ( int i = 0; i < numVerts; i++ ) {
 		__m128i v0 = _mm_load_si128( (const __m128i *)( (byte *)( localVerts + i ) +  0 ) );
@@ -617,6 +626,11 @@ ID_INLINE void WriteDrawVerts16( idDrawVert * destVerts, const idDrawVert * loca
 		_mm_stream_si128( (__m128i *)( (byte *)( destVerts + i ) + 16 ), v1 );
 	}
 
+#else
+
+	memcpy( destVerts, localVerts, numVerts * sizeof( idDrawVert ) );
+
+#endif
 }
 
 /*
diff --git a/neo/idlib/geometry/DrawVert_intrinsics.h b/neo/idlib/geometry/DrawVert_intrinsics.h
index 97df61c3..dd5a1aba 100644
--- a/neo/idlib/geometry/DrawVert_intrinsics.h
+++ b/neo/idlib/geometry/DrawVert_intrinsics.h
@@ -29,6 +29,7 @@ If you have questions concerning this license or the applicable additional terms
 #ifndef __DRAWVERT_INTRINSICS_H__
 #define __DRAWVERT_INTRINSICS_H__
 
+#ifdef ID_WIN_X86_SSE2_INTRIN
 
 static const __m128i vector_int_f32_sign_mask					= _mm_set1_epi32( 1U << IEEE_FLT_SIGN_BIT );
 static const __m128i vector_int_f32_exponent_mask				= _mm_set1_epi32( ( ( 1U << IEEE_FLT_EXPONENT_BITS ) - 1 ) << IEEE_FLT_MANTISSA_BITS );
@@ -50,12 +51,14 @@ static const __m128 vector_float_last_one						= {   0.0f,	  0.0f,   0.0f,   1.0
 static const __m128 vector_float_1_over_255						= { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f };
 static const __m128 vector_float_1_over_4						= { 1.0f / 4.0f, 1.0f / 4.0f, 1.0f / 4.0f, 1.0f / 4.0f };
 
+#endif
 
 /*
 ====================
 FastF32toF16
 ====================
 */
+#ifdef ID_WIN_X86_SSE2_INTRIN
 
 ID_INLINE_EXTERN __m128i FastF32toF16( __m128i f32_bits ) {
 	__m128i f16_sign     = _mm_srli_epi32( _mm_and_si128( f32_bits, vector_int_f32_sign_mask     ), f32_to_f16_sign_shift );
@@ -77,6 +80,7 @@ ID_INLINE_EXTERN __m128i FastF32toF16( __m128i f32_bits ) {
 	return _mm_packs_epi32( flt16, flt16 );
 }
 
+#endif
 
 ID_INLINE_EXTERN halfFloat_t Scalar_FastF32toF16( float f32 ) {
 	const int f32_sign_mask				= 1U << IEEE_FLT_SIGN_BIT;
@@ -115,6 +119,7 @@ ID_INLINE_EXTERN halfFloat_t Scalar_FastF32toF16( float f32 ) {
 LoadSkinnedDrawVertPosition
 ====================
 */
+#ifdef ID_WIN_X86_SSE2_INTRIN
 
 ID_INLINE_EXTERN __m128 LoadSkinnedDrawVertPosition( const idDrawVert & base, const idJointMat * joints ) {
 	const idJointMat & j0 = joints[base.color[0]];
@@ -176,6 +181,7 @@ ID_INLINE_EXTERN __m128 LoadSkinnedDrawVertPosition( const idDrawVert & base, co
 	return r0;
 }
 
+#endif
 
 ID_INLINE_EXTERN idVec3 Scalar_LoadSkinnedDrawVertPosition( const idDrawVert & vert, const idJointMat * joints ) {
 	const idJointMat & j0 = joints[vert.color[0]];
diff --git a/neo/idlib/geometry/RenderMatrix.cpp b/neo/idlib/geometry/RenderMatrix.cpp
index 618cc16c..b15e702a 100644
--- a/neo/idlib/geometry/RenderMatrix.cpp
+++ b/neo/idlib/geometry/RenderMatrix.cpp
@@ -92,6 +92,7 @@ SIMD constants
 ================================================================================================
 */
 
+#ifdef ID_WIN_X86_SSE2_INTRIN
 
 static const __m128i vector_int_1							= _mm_set1_epi32( 1 );
 static const __m128i vector_int_4							= _mm_set1_epi32( 4 );
@@ -117,6 +118,7 @@ static const __m128 vector_float_pos_one					= { +1.0f, +1.0f, +1.0f, +1.0f };
 static const __m128 vector_float_neg_one					= { -1.0f, -1.0f, -1.0f, -1.0f };
 static const __m128 vector_float_last_one					= { 0.0f, 0.0f, 0.0f, 1.0f };
 
+#endif
 
 /*
 ================================================================================================
@@ -531,6 +533,7 @@ front bits:
   bit 5 = pos-Z is front facing
 ========================
 */
+#ifdef ID_WIN_X86_SSE2_INTRIN
 
 static int GetBoxFrontBits_SSE2( const __m128 & b0, const __m128 & b1, const __m128 & viewOrigin ) {
 	const __m128 dir0 = _mm_sub_ps( viewOrigin, b0 );
@@ -542,6 +545,22 @@ static int GetBoxFrontBits_SSE2( const __m128 & b0, const __m128 & b1, const __m
 	return frontBits;
 }
 
+#else
+
+static int GetBoxFrontBits_Generic( const idBounds & bounds, const idVec3 & viewOrigin ) {
+	// one bit per box face, taken from the sign bit of the per-axis offset to the view origin
+	idVec3 fromMins = viewOrigin - bounds[0];
+	idVec3 toMaxs = bounds[1] - viewOrigin;
+	int frontBits = 0;
+	for ( int axis = 0; axis < 3; axis++ ) {
+		// bits 0-2 come from the bounds[0] side, bits 3-5 from the bounds[1] side
+		frontBits |= IEEE_FLT_SIGNBITSET( fromMins[axis] ) << axis;
+		frontBits |= IEEE_FLT_SIGNBITSET( toMaxs[axis] ) << ( axis + 3 );
+	}
+	return frontBits;
+}
+
+#endif
 
 /*
 ================================================================================================
@@ -720,6 +739,7 @@ The result matrix will transform the unit-cube to exactly cover the bounds.
 void idRenderMatrix::OffsetScaleForBounds( const idRenderMatrix & src, const idBounds & bounds, idRenderMatrix & out ) {
 	assert( &src != &out );
 
+#ifdef ID_WIN_X86_SSE2_INTRIN
 
 	__m128 b0 = _mm_loadu_bounds_0( bounds );
 	__m128 b1 = _mm_loadu_bounds_1( bounds );
@@ -766,6 +786,32 @@ void idRenderMatrix::OffsetScaleForBounds( const idRenderMatrix & src, const idB
 	_mm_storeu_ps( out.m + 2*4, a2 );
 	_mm_storeu_ps( out.m + 3*4, a3 );
 
+#else
+
+	const idVec3 offset = ( bounds[1] + bounds[0] ) * 0.5f;
+	const idVec3 scale = ( bounds[1] - bounds[0] ) * 0.5f;
+
+	out[0][0] = src[0][0] * scale[0];
+	out[0][1] = src[0][1] * scale[1];
+	out[0][2] = src[0][2] * scale[2];
+	out[0][3] = src[0][3] + src[0][0] * offset[0] + src[0][1] * offset[1] + src[0][2] * offset[2];
+
+	out[1][0] = src[1][0] * scale[0];
+	out[1][1] = src[1][1] * scale[1];
+	out[1][2] = src[1][2] * scale[2];
+	out[1][3] = src[1][3] + src[1][0] * offset[0] + src[1][1] * offset[1] + src[1][2] * offset[2];
+
+	out[2][0] = src[2][0] * scale[0];
+	out[2][1] = src[2][1] * scale[1];
+	out[2][2] = src[2][2] * scale[2];
+	out[2][3] = src[2][3] + src[2][0] * offset[0] + src[2][1] * offset[1] + src[2][2] * offset[2];
+
+	out[3][0] = src[3][0] * scale[0];
+	out[3][1] = src[3][1] * scale[1];
+	out[3][2] = src[3][2] * scale[2];
+	out[3][3] = src[3][3] + src[3][0] * offset[0] + src[3][1] * offset[1] + src[3][2] * offset[2];
+
+#endif
 }
 
 /*
@@ -779,6 +825,7 @@ The result matrix will transform the bounds to exactly cover the unit-cube.
 void idRenderMatrix::InverseOffsetScaleForBounds( const idRenderMatrix & src, const idBounds & bounds, idRenderMatrix & out ) {
 	assert( &src != &out );
 
+#ifdef ID_WIN_X86_SSE2_INTRIN
 
 	__m128 b0 = _mm_loadu_bounds_0( bounds );
 	__m128 b1 = _mm_loadu_bounds_1( bounds );
@@ -810,6 +857,32 @@ void idRenderMatrix::InverseOffsetScaleForBounds( const idRenderMatrix & src, co
 	_mm_storeu_ps( out.m + 2*4, a2 );
 	_mm_storeu_ps( out.m + 3*4, a3 );
 
+#else
+
+	const idVec3 offset = -0.5f * ( bounds[1] + bounds[0] );
+	const idVec3 scale = 2.0f / ( bounds[1] - bounds[0] );
+
+	out[0][0] = scale[0] * src[0][0];
+	out[0][1] = scale[0] * src[0][1];
+	out[0][2] = scale[0] * src[0][2];
+	out[0][3] = scale[0] * ( src[0][3] + offset[0] );
+						
+	out[1][0] = scale[1] * src[1][0];
+	out[1][1] = scale[1] * src[1][1];
+	out[1][2] = scale[1] * src[1][2];
+	out[1][3] = scale[1] * ( src[1][3] + offset[1] );
+						
+	out[2][0] = scale[2] * src[2][0];
+	out[2][1] = scale[2] * src[2][1];
+	out[2][2] = scale[2] * src[2][2];
+	out[2][3] = scale[2] * ( src[2][3] + offset[2] );
+
+	out[3][0] = src[3][0];
+	out[3][1] = src[3][1];
+	out[3][2] = src[3][2];
+	out[3][3] = src[3][3];
+
+#endif
 }
 
 /*
@@ -820,6 +893,7 @@ idRenderMatrix::Transpose
 void idRenderMatrix::Transpose( const idRenderMatrix & src, idRenderMatrix & out ) {
 	assert( &src != &out );
 
+#ifdef ID_WIN_X86_SSE2_INTRIN
 
 	const __m128 a0 = _mm_loadu_ps( src.m + 0*4 );
 	const __m128 a1 = _mm_loadu_ps( src.m + 1*4 );
@@ -841,6 +915,15 @@ void idRenderMatrix::Transpose( const idRenderMatrix & src, idRenderMatrix & out
 	_mm_storeu_ps( out.m + 2*4, t2 );
 	_mm_storeu_ps( out.m + 3*4, t3 );
 
+#else
+
+	for ( int i = 0; i < 4; i++ ) {
+		for ( int j = 0; j < 4; j++ ) {
+			out[i][j] = src[j][i];
+		}
+	}
+
+#endif
 }
 
 /*
@@ -850,6 +933,7 @@ idRenderMatrix::Multiply
 */
 void idRenderMatrix::Multiply( const idRenderMatrix & a, const idRenderMatrix & b, idRenderMatrix & out ) {
 
+#ifdef ID_WIN_X86_SSE2_INTRIN
 
 	__m128 a0 = _mm_loadu_ps( a.m + 0*4 );
 	__m128 a1 = _mm_loadu_ps( a.m + 1*4 );
@@ -886,6 +970,41 @@ void idRenderMatrix::Multiply( const idRenderMatrix & a, const idRenderMatrix &
 	_mm_storeu_ps( out.m + 2*4, t2 );
 	_mm_storeu_ps( out.m + 3*4, t3 );
 
+#else
+
+	/*
+	for ( int i = 0 ; i < 4 ; i++ ) {
+		for ( int j = 0 ; j < 4 ; j++ ) {
+			out.m[ i * 4 + j ] =
+				a.m[ i * 4 + 0 ] * b.m[ 0 * 4 + j ] +
+				a.m[ i * 4 + 1 ] * b.m[ 1 * 4 + j ] +
+				a.m[ i * 4 + 2 ] * b.m[ 2 * 4 + j ] +
+				a.m[ i * 4 + 3 ] * b.m[ 3 * 4 + j ];
+		}
+	}
+	*/
+
+	out.m[0*4+0] = a.m[0*4+0]*b.m[0*4+0] + a.m[0*4+1]*b.m[1*4+0] + a.m[0*4+2]*b.m[2*4+0] + a.m[0*4+3]*b.m[3*4+0];
+	out.m[0*4+1] = a.m[0*4+0]*b.m[0*4+1] + a.m[0*4+1]*b.m[1*4+1] + a.m[0*4+2]*b.m[2*4+1] + a.m[0*4+3]*b.m[3*4+1];
+	out.m[0*4+2] = a.m[0*4+0]*b.m[0*4+2] + a.m[0*4+1]*b.m[1*4+2] + a.m[0*4+2]*b.m[2*4+2] + a.m[0*4+3]*b.m[3*4+2];
+	out.m[0*4+3] = a.m[0*4+0]*b.m[0*4+3] + a.m[0*4+1]*b.m[1*4+3] + a.m[0*4+2]*b.m[2*4+3] + a.m[0*4+3]*b.m[3*4+3];
+
+	out.m[1*4+0] = a.m[1*4+0]*b.m[0*4+0] + a.m[1*4+1]*b.m[1*4+0] + a.m[1*4+2]*b.m[2*4+0] + a.m[1*4+3]*b.m[3*4+0];
+	out.m[1*4+1] = a.m[1*4+0]*b.m[0*4+1] + a.m[1*4+1]*b.m[1*4+1] + a.m[1*4+2]*b.m[2*4+1] + a.m[1*4+3]*b.m[3*4+1];
+	out.m[1*4+2] = a.m[1*4+0]*b.m[0*4+2] + a.m[1*4+1]*b.m[1*4+2] + a.m[1*4+2]*b.m[2*4+2] + a.m[1*4+3]*b.m[3*4+2];
+	out.m[1*4+3] = a.m[1*4+0]*b.m[0*4+3] + a.m[1*4+1]*b.m[1*4+3] + a.m[1*4+2]*b.m[2*4+3] + a.m[1*4+3]*b.m[3*4+3];
+
+	out.m[2*4+0] = a.m[2*4+0]*b.m[0*4+0] + a.m[2*4+1]*b.m[1*4+0] + a.m[2*4+2]*b.m[2*4+0] + a.m[2*4+3]*b.m[3*4+0];
+	out.m[2*4+1] = a.m[2*4+0]*b.m[0*4+1] + a.m[2*4+1]*b.m[1*4+1] + a.m[2*4+2]*b.m[2*4+1] + a.m[2*4+3]*b.m[3*4+1];
+	out.m[2*4+2] = a.m[2*4+0]*b.m[0*4+2] + a.m[2*4+1]*b.m[1*4+2] + a.m[2*4+2]*b.m[2*4+2] + a.m[2*4+3]*b.m[3*4+2];
+	out.m[2*4+3] = a.m[2*4+0]*b.m[0*4+3] + a.m[2*4+1]*b.m[1*4+3] + a.m[2*4+2]*b.m[2*4+3] + a.m[2*4+3]*b.m[3*4+3];
+
+	out.m[3*4+0] = a.m[3*4+0]*b.m[0*4+0] + a.m[3*4+1]*b.m[1*4+0] + a.m[3*4+2]*b.m[2*4+0] + a.m[3*4+3]*b.m[3*4+0];
+	out.m[3*4+1] = a.m[3*4+0]*b.m[0*4+1] + a.m[3*4+1]*b.m[1*4+1] + a.m[3*4+2]*b.m[2*4+1] + a.m[3*4+3]*b.m[3*4+1];
+	out.m[3*4+2] = a.m[3*4+0]*b.m[0*4+2] + a.m[3*4+1]*b.m[1*4+2] + a.m[3*4+2]*b.m[2*4+2] + a.m[3*4+3]*b.m[3*4+2];
+	out.m[3*4+3] = a.m[3*4+0]*b.m[0*4+3] + a.m[3*4+1]*b.m[1*4+3] + a.m[3*4+2]*b.m[2*4+3] + a.m[3*4+3]*b.m[3*4+3];
+
+#endif
 }
 
 /*
@@ -905,6 +1024,7 @@ can get really, really small.
 */
 bool idRenderMatrix::Inverse( const idRenderMatrix & src, idRenderMatrix & out ) {
 
+#ifdef ID_WIN_X86_SSE2_INTRIN
 
 	const __m128 r0 = _mm_loadu_ps( src.m + 0 * 4 );
 	const __m128 r1 = _mm_loadu_ps( src.m + 1 * 4 );
@@ -1009,6 +1129,87 @@ bool idRenderMatrix::Inverse( const idRenderMatrix & src, idRenderMatrix & out )
 	_mm_storeu_ps( out.m + 2 * 4, _mm_mul_ps( adjoint_r2, rcpDet ) );
 	_mm_storeu_ps( out.m + 3 * 4, _mm_mul_ps( adjoint_r3, rcpDet ) );
 
+#else
+
+	const int FRL = 4;
+
+	// 84+4+16 = 104 multiplications
+	//			   1 division
+
+	// 2x2 sub-determinants required to calculate 4x4 determinant
+	const float det2_01_01 = src.m[0*FRL+0] * src.m[1*FRL+1] - src.m[0*FRL+1] * src.m[1*FRL+0];
+	const float det2_01_02 = src.m[0*FRL+0] * src.m[1*FRL+2] - src.m[0*FRL+2] * src.m[1*FRL+0];
+	const float det2_01_03 = src.m[0*FRL+0] * src.m[1*FRL+3] - src.m[0*FRL+3] * src.m[1*FRL+0];
+	const float det2_01_12 = src.m[0*FRL+1] * src.m[1*FRL+2] - src.m[0*FRL+2] * src.m[1*FRL+1];
+	const float det2_01_13 = src.m[0*FRL+1] * src.m[1*FRL+3] - src.m[0*FRL+3] * src.m[1*FRL+1];
+	const float det2_01_23 = src.m[0*FRL+2] * src.m[1*FRL+3] - src.m[0*FRL+3] * src.m[1*FRL+2];
+
+	// 3x3 sub-determinants required to calculate 4x4 determinant
+	const float det3_201_012 = src.m[2*FRL+0] * det2_01_12 - src.m[2*FRL+1] * det2_01_02 + src.m[2*FRL+2] * det2_01_01;
+	const float det3_201_013 = src.m[2*FRL+0] * det2_01_13 - src.m[2*FRL+1] * det2_01_03 + src.m[2*FRL+3] * det2_01_01;
+	const float det3_201_023 = src.m[2*FRL+0] * det2_01_23 - src.m[2*FRL+2] * det2_01_03 + src.m[2*FRL+3] * det2_01_02;
+	const float det3_201_123 = src.m[2*FRL+1] * det2_01_23 - src.m[2*FRL+2] * det2_01_13 + src.m[2*FRL+3] * det2_01_12;
+
+	const float det = ( - det3_201_123 * src.m[3*FRL+0] + det3_201_023 * src.m[3*FRL+1] - det3_201_013 * src.m[3*FRL+2] + det3_201_012 * src.m[3*FRL+3] );
+
+	if ( idMath::Fabs( det ) < RENDER_MATRIX_INVERSE_EPSILON ) {
+		return false;
+	}
+
+	const float rcpDet = 1.0f / det;
+
+	// remaining 2x2 sub-determinants
+	const float det2_03_01 = src.m[0*FRL+0] * src.m[3*FRL+1] - src.m[0*FRL+1] * src.m[3*FRL+0];
+	const float det2_03_02 = src.m[0*FRL+0] * src.m[3*FRL+2] - src.m[0*FRL+2] * src.m[3*FRL+0];
+	const float det2_03_03 = src.m[0*FRL+0] * src.m[3*FRL+3] - src.m[0*FRL+3] * src.m[3*FRL+0];
+	const float det2_03_12 = src.m[0*FRL+1] * src.m[3*FRL+2] - src.m[0*FRL+2] * src.m[3*FRL+1];
+	const float det2_03_13 = src.m[0*FRL+1] * src.m[3*FRL+3] - src.m[0*FRL+3] * src.m[3*FRL+1];
+	const float det2_03_23 = src.m[0*FRL+2] * src.m[3*FRL+3] - src.m[0*FRL+3] * src.m[3*FRL+2];
+
+	const float det2_13_01 = src.m[1*FRL+0] * src.m[3*FRL+1] - src.m[1*FRL+1] * src.m[3*FRL+0];
+	const float det2_13_02 = src.m[1*FRL+0] * src.m[3*FRL+2] - src.m[1*FRL+2] * src.m[3*FRL+0];
+	const float det2_13_03 = src.m[1*FRL+0] * src.m[3*FRL+3] - src.m[1*FRL+3] * src.m[3*FRL+0];
+	const float det2_13_12 = src.m[1*FRL+1] * src.m[3*FRL+2] - src.m[1*FRL+2] * src.m[3*FRL+1];
+	const float det2_13_13 = src.m[1*FRL+1] * src.m[3*FRL+3] - src.m[1*FRL+3] * src.m[3*FRL+1];
+	const float det2_13_23 = src.m[1*FRL+2] * src.m[3*FRL+3] - src.m[1*FRL+3] * src.m[3*FRL+2];
+
+	// remaining 3x3 sub-determinants
+	const float det3_203_012 = src.m[2*FRL+0] * det2_03_12 - src.m[2*FRL+1] * det2_03_02 + src.m[2*FRL+2] * det2_03_01;
+	const float det3_203_013 = src.m[2*FRL+0] * det2_03_13 - src.m[2*FRL+1] * det2_03_03 + src.m[2*FRL+3] * det2_03_01;
+	const float det3_203_023 = src.m[2*FRL+0] * det2_03_23 - src.m[2*FRL+2] * det2_03_03 + src.m[2*FRL+3] * det2_03_02;
+	const float det3_203_123 = src.m[2*FRL+1] * det2_03_23 - src.m[2*FRL+2] * det2_03_13 + src.m[2*FRL+3] * det2_03_12;
+
+	const float det3_213_012 = src.m[2*FRL+0] * det2_13_12 - src.m[2*FRL+1] * det2_13_02 + src.m[2*FRL+2] * det2_13_01;
+	const float det3_213_013 = src.m[2*FRL+0] * det2_13_13 - src.m[2*FRL+1] * det2_13_03 + src.m[2*FRL+3] * det2_13_01;
+	const float det3_213_023 = src.m[2*FRL+0] * det2_13_23 - src.m[2*FRL+2] * det2_13_03 + src.m[2*FRL+3] * det2_13_02;
+	const float det3_213_123 = src.m[2*FRL+1] * det2_13_23 - src.m[2*FRL+2] * det2_13_13 + src.m[2*FRL+3] * det2_13_12;
+
+	const float det3_301_012 = src.m[3*FRL+0] * det2_01_12 - src.m[3*FRL+1] * det2_01_02 + src.m[3*FRL+2] * det2_01_01;
+	const float det3_301_013 = src.m[3*FRL+0] * det2_01_13 - src.m[3*FRL+1] * det2_01_03 + src.m[3*FRL+3] * det2_01_01;
+	const float det3_301_023 = src.m[3*FRL+0] * det2_01_23 - src.m[3*FRL+2] * det2_01_03 + src.m[3*FRL+3] * det2_01_02;
+	const float det3_301_123 = src.m[3*FRL+1] * det2_01_23 - src.m[3*FRL+2] * det2_01_13 + src.m[3*FRL+3] * det2_01_12;
+
+	out.m[0*FRL+0] = - det3_213_123 * rcpDet;
+	out.m[1*FRL+0] = + det3_213_023 * rcpDet;
+	out.m[2*FRL+0] = - det3_213_013 * rcpDet;
+	out.m[3*FRL+0] = + det3_213_012 * rcpDet;
+
+	out.m[0*FRL+1] = + det3_203_123 * rcpDet;
+	out.m[1*FRL+1] = - det3_203_023 * rcpDet;
+	out.m[2*FRL+1] = + det3_203_013 * rcpDet;
+	out.m[3*FRL+1] = - det3_203_012 * rcpDet;
+
+	out.m[0*FRL+2] = + det3_301_123 * rcpDet;
+	out.m[1*FRL+2] = - det3_301_023 * rcpDet;
+	out.m[2*FRL+2] = + det3_301_013 * rcpDet;
+	out.m[3*FRL+2] = - det3_301_012 * rcpDet;
+
+	out.m[0*FRL+3] = - det3_201_123 * rcpDet;
+	out.m[1*FRL+3] = + det3_201_023 * rcpDet;
+	out.m[2*FRL+3] = - det3_201_013 * rcpDet;
+	out.m[3*FRL+3] = + det3_201_012 * rcpDet;
+
+#endif
 
 	return true;
 }
@@ -1133,6 +1334,7 @@ bool idRenderMatrix::InverseByDoubles( const idRenderMatrix & src, idRenderMatri
 DeterminantIsNegative
 ========================
 */
+#ifdef ID_WIN_X86_SSE2_INTRIN
 
 void DeterminantIsNegative( bool & negativeDeterminant, const __m128 & r0, const __m128 & r1, const __m128 & r2, const __m128 & r3 ) {
 
@@ -1177,6 +1379,30 @@ void DeterminantIsNegative( bool & negativeDeterminant, const __m128 & r0, const
 	negativeDeterminant	= _mm_movemask_ps( result ) & 1;
 }
 
+#else
+
+void DeterminantIsNegative( bool & negativeDeterminant, const float * row0, const float * row1, const float * row2, const float * row3 ) {
+	// scalar overload: row0..row3 are each read at indices 0..3, i.e. one 4-float matrix row apiece
+	// 2x2 sub-determinants required to calculate 4x4 determinant
+	const float det2_01_01 = row0[0] * row1[1] - row0[1] * row1[0];
+	const float det2_01_02 = row0[0] * row1[2] - row0[2] * row1[0];
+	const float det2_01_03 = row0[0] * row1[3] - row0[3] * row1[0];
+	const float det2_01_12 = row0[1] * row1[2] - row0[2] * row1[1];
+	const float det2_01_13 = row0[1] * row1[3] - row0[3] * row1[1];
+	const float det2_01_23 = row0[2] * row1[3] - row0[3] * row1[2];
+
+	// 3x3 sub-determinants required to calculate 4x4 determinant
+	const float det3_201_012 = row2[0] * det2_01_12 - row2[1] * det2_01_02 + row2[2] * det2_01_01;
+	const float det3_201_013 = row2[0] * det2_01_13 - row2[1] * det2_01_03 + row2[3] * det2_01_01;
+	const float det3_201_023 = row2[0] * det2_01_23 - row2[2] * det2_01_03 + row2[3] * det2_01_02;
+	const float det3_201_123 = row2[1] * det2_01_23 - row2[2] * det2_01_13 + row2[3] * det2_01_12;
+	// 4x4 determinant: the 3x3 terms combined with the row3 entries using alternating signs
+	const float det = ( - det3_201_123 * row3[0] + det3_201_023 * row3[1] - det3_201_013 * row3[2] + det3_201_012 * row3[3] );
+	// strict comparison: a determinant of exactly zero is reported as not negative
+	negativeDeterminant = ( det < 0.0f );
+}
+
+#endif
 
 /*
 ========================
@@ -1189,6 +1415,7 @@ void idRenderMatrix::CopyMatrix( const idRenderMatrix & matrix, idVec4 & row0, i
 	assert_16_byte_aligned( row2.ToFloatPtr() );
 	assert_16_byte_aligned( row3.ToFloatPtr() );
 
+#ifdef ID_WIN_X86_SSE2_INTRIN
 
 	const __m128 r0 = _mm_loadu_ps( matrix.m + 0 * 4 );
 	const __m128 r1 = _mm_loadu_ps( matrix.m + 1 * 4 );
@@ -1200,6 +1427,14 @@ void idRenderMatrix::CopyMatrix( const idRenderMatrix & matrix, idVec4 & row0, i
 	_mm_store_ps( row2.ToFloatPtr(), r2 );
 	_mm_store_ps( row3.ToFloatPtr(), r3 );
 
+#else
+
+	memcpy( row0.ToFloatPtr(), matrix[0], sizeof( idVec4 ) );
+	memcpy( row1.ToFloatPtr(), matrix[1], sizeof( idVec4 ) );
+	memcpy( row2.ToFloatPtr(), matrix[2], sizeof( idVec4 ) );
+	memcpy( row3.ToFloatPtr(), matrix[3], sizeof( idVec4 ) );
+
+#endif
 }
 
 /*
@@ -1213,6 +1448,7 @@ void idRenderMatrix::SetMVP( const idRenderMatrix & mvp, idVec4 & row0, idVec4 &
 	assert_16_byte_aligned( row2.ToFloatPtr() );
 	assert_16_byte_aligned( row3.ToFloatPtr() );
 
+#ifdef ID_WIN_X86_SSE2_INTRIN
 
 	const __m128 r0 = _mm_loadu_ps( mvp.m + 0 * 4 );
 	const __m128 r1 = _mm_loadu_ps( mvp.m + 1 * 4 );
@@ -1226,6 +1462,16 @@ void idRenderMatrix::SetMVP( const idRenderMatrix & mvp, idVec4 & row0, idVec4 &
 
 	DeterminantIsNegative( negativeDeterminant, r0, r1, r2, r3 );
 
+#else
+
+	memcpy( row0.ToFloatPtr(), mvp[0], sizeof( idVec4 ) );
+	memcpy( row1.ToFloatPtr(), mvp[1], sizeof( idVec4 ) );
+	memcpy( row2.ToFloatPtr(), mvp[2], sizeof( idVec4 ) );
+	memcpy( row3.ToFloatPtr(), mvp[3], sizeof( idVec4 ) );
+
+	DeterminantIsNegative( negativeDeterminant, mvp[0], mvp[1], mvp[2], mvp[3] );
+
+#endif
 }
 
 /*
@@ -1239,6 +1485,7 @@ void idRenderMatrix::SetMVPForBounds( const idRenderMatrix & mvp, const idBounds
 	assert_16_byte_aligned( row2.ToFloatPtr() );
 	assert_16_byte_aligned( row3.ToFloatPtr() );
 
+#ifdef ID_WIN_X86_SSE2_INTRIN
 
 	__m128 b0 = _mm_loadu_bounds_0( bounds );
 	__m128 b1 = _mm_loadu_bounds_1( bounds );
@@ -1287,6 +1534,34 @@ void idRenderMatrix::SetMVPForBounds( const idRenderMatrix & mvp, const idBounds
 
 	DeterminantIsNegative( negativeDeterminant, r0, r1, r2, r3 );
 
+#else
+
+	const idVec3 offset = ( bounds[1] + bounds[0] ) * 0.5f;
+	const idVec3 scale = ( bounds[1] - bounds[0] ) * 0.5f;
+
+	row0[0] = mvp[0][0] * scale[0];
+	row0[1] = mvp[0][1] * scale[1];
+	row0[2] = mvp[0][2] * scale[2];
+	row0[3] = mvp[0][3] + mvp[0][0] * offset[0] + mvp[0][1] * offset[1] + mvp[0][2] * offset[2];
+
+	row1[0] = mvp[1][0] * scale[0];
+	row1[1] = mvp[1][1] * scale[1];
+	row1[2] = mvp[1][2] * scale[2];
+	row1[3] = mvp[1][3] + mvp[1][0] * offset[0] + mvp[1][1] * offset[1] + mvp[1][2] * offset[2];
+
+	row2[0] = mvp[2][0] * scale[0];
+	row2[1] = mvp[2][1] * scale[1];
+	row2[2] = mvp[2][2] * scale[2];
+	row2[3] = mvp[2][3] + mvp[2][0] * offset[0] + mvp[2][1] * offset[1] + mvp[2][2] * offset[2];
+
+	row3[0] = mvp[3][0] * scale[0];
+	row3[1] = mvp[3][1] * scale[1];
+	row3[2] = mvp[3][2] * scale[2];
+	row3[3] = mvp[3][3] + mvp[3][0] * offset[0] + mvp[3][1] * offset[1] + mvp[3][2] * offset[2];
+
+	DeterminantIsNegative( negativeDeterminant, row0.ToFloatPtr(), row1.ToFloatPtr(), row2.ToFloatPtr(), row3.ToFloatPtr() );
+
+#endif
 }
 
 /*
@@ -1300,6 +1575,7 @@ void idRenderMatrix::SetMVPForInverseProject( const idRenderMatrix & mvp, const
 	assert_16_byte_aligned( row2.ToFloatPtr() );
 	assert_16_byte_aligned( row3.ToFloatPtr() );
 
+#ifdef ID_WIN_X86_SSE2_INTRIN
 
 	__m128 r0 = _mm_loadu_ps( mvp.m + 0 * 4 );
 	__m128 r1 = _mm_loadu_ps( mvp.m + 1 * 4 );
@@ -1338,6 +1614,31 @@ void idRenderMatrix::SetMVPForInverseProject( const idRenderMatrix & mvp, const
 
 	DeterminantIsNegative( negativeDeterminant, t0, t1, t2, t3 );
 
+#else
+
+	row0[0] = mvp.m[0*4+0]*inverseProject.m[0*4+0] + mvp.m[0*4+1]*inverseProject.m[1*4+0] + mvp.m[0*4+2]*inverseProject.m[2*4+0] + mvp.m[0*4+3]*inverseProject.m[3*4+0];
+	row0[1] = mvp.m[0*4+0]*inverseProject.m[0*4+1] + mvp.m[0*4+1]*inverseProject.m[1*4+1] + mvp.m[0*4+2]*inverseProject.m[2*4+1] + mvp.m[0*4+3]*inverseProject.m[3*4+1];
+	row0[2] = mvp.m[0*4+0]*inverseProject.m[0*4+2] + mvp.m[0*4+1]*inverseProject.m[1*4+2] + mvp.m[0*4+2]*inverseProject.m[2*4+2] + mvp.m[0*4+3]*inverseProject.m[3*4+2];
+	row0[3] = mvp.m[0*4+0]*inverseProject.m[0*4+3] + mvp.m[0*4+1]*inverseProject.m[1*4+3] + mvp.m[0*4+2]*inverseProject.m[2*4+3] + mvp.m[0*4+3]*inverseProject.m[3*4+3];
+
+	row1[0] = mvp.m[1*4+0]*inverseProject.m[0*4+0] + mvp.m[1*4+1]*inverseProject.m[1*4+0] + mvp.m[1*4+2]*inverseProject.m[2*4+0] + mvp.m[1*4+3]*inverseProject.m[3*4+0];
+	row1[1] = mvp.m[1*4+0]*inverseProject.m[0*4+1] + mvp.m[1*4+1]*inverseProject.m[1*4+1] + mvp.m[1*4+2]*inverseProject.m[2*4+1] + mvp.m[1*4+3]*inverseProject.m[3*4+1];
+	row1[2] = mvp.m[1*4+0]*inverseProject.m[0*4+2] + mvp.m[1*4+1]*inverseProject.m[1*4+2] + mvp.m[1*4+2]*inverseProject.m[2*4+2] + mvp.m[1*4+3]*inverseProject.m[3*4+2];
+	row1[3] = mvp.m[1*4+0]*inverseProject.m[0*4+3] + mvp.m[1*4+1]*inverseProject.m[1*4+3] + mvp.m[1*4+2]*inverseProject.m[2*4+3] + mvp.m[1*4+3]*inverseProject.m[3*4+3];
+
+	row2[0] = mvp.m[2*4+0]*inverseProject.m[0*4+0] + mvp.m[2*4+1]*inverseProject.m[1*4+0] + mvp.m[2*4+2]*inverseProject.m[2*4+0] + mvp.m[2*4+3]*inverseProject.m[3*4+0];
+	row2[1] = mvp.m[2*4+0]*inverseProject.m[0*4+1] + mvp.m[2*4+1]*inverseProject.m[1*4+1] + mvp.m[2*4+2]*inverseProject.m[2*4+1] + mvp.m[2*4+3]*inverseProject.m[3*4+1];
+	row2[2] = mvp.m[2*4+0]*inverseProject.m[0*4+2] + mvp.m[2*4+1]*inverseProject.m[1*4+2] + mvp.m[2*4+2]*inverseProject.m[2*4+2] + mvp.m[2*4+3]*inverseProject.m[3*4+2];
+	row2[3] = mvp.m[2*4+0]*inverseProject.m[0*4+3] + mvp.m[2*4+1]*inverseProject.m[1*4+3] + mvp.m[2*4+2]*inverseProject.m[2*4+3] + mvp.m[2*4+3]*inverseProject.m[3*4+3];
+
+	row3[0] = mvp.m[3*4+0]*inverseProject.m[0*4+0] + mvp.m[3*4+1]*inverseProject.m[1*4+0] + mvp.m[3*4+2]*inverseProject.m[2*4+0] + mvp.m[3*4+3]*inverseProject.m[3*4+0];
+	row3[1] = mvp.m[3*4+0]*inverseProject.m[0*4+1] + mvp.m[3*4+1]*inverseProject.m[1*4+1] + mvp.m[3*4+2]*inverseProject.m[2*4+1] + mvp.m[3*4+3]*inverseProject.m[3*4+1];
+	row3[2] = mvp.m[3*4+0]*inverseProject.m[0*4+2] + mvp.m[3*4+1]*inverseProject.m[1*4+2] + mvp.m[3*4+2]*inverseProject.m[2*4+2] + mvp.m[3*4+3]*inverseProject.m[3*4+2];
+	row3[3] = mvp.m[3*4+0]*inverseProject.m[0*4+3] + mvp.m[3*4+1]*inverseProject.m[1*4+3] + mvp.m[3*4+2]*inverseProject.m[2*4+3] + mvp.m[3*4+3]*inverseProject.m[3*4+3];
+
+	DeterminantIsNegative( negativeDeterminant, row0.ToFloatPtr(), row1.ToFloatPtr(), row2.ToFloatPtr(), row3.ToFloatPtr() );
+
+#endif
 }
 
 /*
@@ -1398,6 +1699,7 @@ frustum plane, but only while also being behind another one.
 */
 bool idRenderMatrix::CullBoundsToMVPbits( const idRenderMatrix & mvp, const idBounds & bounds, byte * outBits, bool zeroToOne ) {
 
+#ifdef ID_WIN_X86_SSE2_INTRIN
 
 	__m128 mvp0 = _mm_loadu_ps( mvp[0] );
 	__m128 mvp1 = _mm_loadu_ps( mvp[1] );
@@ -1497,6 +1799,48 @@ bool idRenderMatrix::CullBoundsToMVPbits( const idRenderMatrix & mvp, const idBo
 
 	return ( bits != 63 );
 
+#else
+
+	int bits = 0;
+
+	idVec3 v;
+	for ( int x = 0; x < 2; x++ ) {
+		v[0] = bounds[x][0];
+		for ( int y = 0; y < 2; y++ ) {
+			v[1] = bounds[y][1];
+			for ( int z = 0; z < 2; z++ ) {
+				v[2] = bounds[z][2];
+
+				idVec4 c;
+				for ( int i = 0; i < 4; i++ ) {
+					c[i] = v[0] * mvp[i][0] + v[1] * mvp[i][1] + v[2] * mvp[i][2] + mvp[i][3];
+				}
+
+				const float minW = zeroToOne ? 0.0f : -c[3];
+				const float maxW = c[3];
+#if defined( CLIP_SPACE_D3D )	// the D3D clip space Z is in the range [0,1] so always compare Z vs zero whether 'zeroToOne' is true or false
+				const float minZ = 0.0f;
+#else
+				const float minZ = minW;
+#endif
+
+				if ( c[0] > minW ) { bits |= ( 1 << 0 ); }
+				if ( c[0] < maxW ) { bits |= ( 1 << 1 ); }
+				if ( c[1] > minW ) { bits |= ( 1 << 2 ); }
+				if ( c[1] < maxW ) { bits |= ( 1 << 3 ); }
+				if ( c[2] > minZ ) { bits |= ( 1 << 4 ); }	// NOTE: using minZ
+				if ( c[2] < maxW ) { bits |= ( 1 << 5 ); }
+			}
+		}
+	}
+
+	// store out a bit set for each side where the bounds is outside the clip space
+	*outBits = (byte)( bits ^ 63 );
+
+	// if any bits weren't set, the bounds is completely off one side of the frustum
+	return ( bits != 63 );
+
+#endif
 }
 
 /*
@@ -1519,6 +1863,7 @@ frustum plane, but only while also being behind another one.
 bool idRenderMatrix::CullExtrudedBoundsToMVPbits( const idRenderMatrix & mvp, const idBounds & bounds, const idVec3 & extrudeDirection, const idPlane & clipPlane, byte * outBits, bool zeroToOne ) {
 	assert( idMath::Fabs( extrudeDirection * clipPlane.Normal() ) >= idMath::FLT_SMALLEST_NON_DENORMAL );
 
+#ifdef ID_WIN_X86_SSE2_INTRIN
 
 	__m128 mvp0 = _mm_loadu_ps( mvp[0] );
 	__m128 mvp1 = _mm_loadu_ps( mvp[1] );
@@ -1736,6 +2081,62 @@ bool idRenderMatrix::CullExtrudedBoundsToMVPbits( const idRenderMatrix & mvp, co
 
 	return ( bits != 63 );
 
+#else
+
+	int bits = 0;
+
+	float closing = extrudeDirection * clipPlane.Normal();
+	float invClosing = -1.0f / closing;
+
+	idVec3 v;
+	for ( int x = 0; x < 2; x++ ) {
+		v[0] = bounds[x][0];
+		for ( int y = 0; y < 2; y++ ) {
+			v[1] = bounds[y][1];
+			for ( int z = 0; z < 2; z++ ) {
+				v[2] = bounds[z][2];
+
+				for ( int extrude = 0; extrude <= 1; extrude++ ) {
+
+					idVec3 test;
+					if ( extrude ) {
+						const float extrudeDist = clipPlane.Distance( v ) * invClosing;
+						test = v + extrudeDirection * extrudeDist;
+					} else {
+						test = v;
+					}
+
+					idVec4 c;
+					for ( int i = 0; i < 4; i++ ) {
+						c[i] = test[0] * mvp[i][0] + test[1] * mvp[i][1] + test[2] * mvp[i][2] + mvp[i][3];
+					}
+
+					const float minW = zeroToOne ? 0.0f : -c[3];
+					const float maxW = c[3];
+#if defined( CLIP_SPACE_D3D )	// the D3D clip space Z is in the range [0,1] so always compare Z vs zero whether 'zeroToOne' is true or false
+					const float minZ = 0.0f;
+#else
+					const float minZ = minW;
+#endif
+
+					if ( c[0] > minW ) { bits |= ( 1 << 0 ); }
+					if ( c[0] < maxW ) { bits |= ( 1 << 1 ); }
+					if ( c[1] > minW ) { bits |= ( 1 << 2 ); }
+					if ( c[1] < maxW ) { bits |= ( 1 << 3 ); }
+					if ( c[2] > minZ ) { bits |= ( 1 << 4 ); }	// NOTE: using minZ
+					if ( c[2] < maxW ) { bits |= ( 1 << 5 ); }
+				}
+			}
+		}
+	}
+
+	// store out a bit set for each side where the bounds is outside the clip space
+	*outBits = (byte)(bits ^ 63);
+
+	// if any bits weren't set, the bounds is completely off one side of the frustum
+	return ( bits != 63 );
+
+#endif
 }
 
 /*
@@ -1753,6 +2154,7 @@ is W=0 clipped.
 ========================
 */
 void idRenderMatrix::ProjectedBounds( idBounds & projected, const idRenderMatrix & mvp, const idBounds & bounds, bool windowSpace ) {
+#ifdef ID_WIN_X86_SSE2_INTRIN
 
 	__m128 mvp0 = _mm_loadu_ps( mvp[0] );
 	__m128 mvp1 = _mm_loadu_ps( mvp[1] );
@@ -1885,6 +2287,78 @@ void idRenderMatrix::ProjectedBounds( idBounds & projected, const idRenderMatrix
 	_mm_store_ss( & projected[1].y, maxY );
 	_mm_store_ss( & projected[1].z, maxZ );
 
+#else
+
+	for ( int i = 0; i < 3; i++ ) {
+		projected[0][i] = RENDER_MATRIX_INFINITY;
+		projected[1][i] = - RENDER_MATRIX_INFINITY;
+	}
+
+	idVec3 v;
+	for ( int x = 0; x < 2; x++ ) {
+		v[0] = bounds[x][0];
+		for ( int y = 0; y < 2; y++ ) {
+			v[1] = bounds[y][1];
+			for ( int z = 0; z < 2; z++ ) {
+				v[2] = bounds[z][2];
+
+				float tx = v[0] * mvp[0][0] + v[1] * mvp[0][1] + v[2] * mvp[0][2] + mvp[0][3];
+				float ty = v[0] * mvp[1][0] + v[1] * mvp[1][1] + v[2] * mvp[1][2] + mvp[1][3];
+				float tz = v[0] * mvp[2][0] + v[1] * mvp[2][1] + v[2] * mvp[2][2] + mvp[2][3];
+				float tw = v[0] * mvp[3][0] + v[1] * mvp[3][1] + v[2] * mvp[3][2] + mvp[3][3];
+
+				if ( tw <= idMath::FLT_SMALLEST_NON_DENORMAL ) {
+					projected[0][0] = -RENDER_MATRIX_INFINITY;
+					projected[0][1] = -RENDER_MATRIX_INFINITY;
+					projected[0][2] = -RENDER_MATRIX_INFINITY;
+					projected[1][0] = RENDER_MATRIX_INFINITY;
+					projected[1][1] = RENDER_MATRIX_INFINITY;
+					// NOTE: projected[1][2] (the maximum Z) is not reset here and is still valid
+					continue;
+				}
+
+				float rw = 1.0f / tw;
+
+				tx = tx * rw;
+				ty = ty * rw;
+				tz = tz * rw;
+
+				projected[0][0] = Min( projected[0][0], tx );
+				projected[0][1] = Min( projected[0][1], ty );
+				projected[0][2] = Min( projected[0][2], tz );
+
+				projected[1][0] = Max( projected[1][0], tx );
+				projected[1][1] = Max( projected[1][1], ty );
+				projected[1][2] = Max( projected[1][2], tz );
+			}
+		}
+	}
+
+	if ( windowSpace ) {
+		// convert to window coords
+		projected[0][0] = projected[0][0] * 0.5f + 0.5f;
+		projected[1][0] = projected[1][0] * 0.5f + 0.5f;
+
+		projected[0][1] = projected[0][1] * 0.5f + 0.5f;
+		projected[1][1] = projected[1][1] * 0.5f + 0.5f;
+
+#if !defined( CLIP_SPACE_D3D )	// the D3D clip space Z is already in the range [0,1]
+		projected[0][2] = projected[0][2] * 0.5f + 0.5f;
+		projected[1][2] = projected[1][2] * 0.5f + 0.5f;
+#endif
+
+		// clamp to [0, 1] range
+		projected[0][0] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][0] );
+		projected[1][0] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][0] );
+
+		projected[0][1] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][1] );
+		projected[1][1] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][1] );
+
+		projected[0][2] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][2] );
+		projected[1][2] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][2] );
+	}
+
+#endif
 }
 
 /*
@@ -1920,6 +2394,7 @@ void idRenderMatrix::ProjectedNearClippedBounds( idBounds & projected, const idR
 	    - X +
 */
 
+#ifdef ID_WIN_X86_SSE2_INTRIN
 
 	const __m128 mvp0 = _mm_loadu_ps( mvp[0] );
 	const __m128 mvp1 = _mm_loadu_ps( mvp[1] );
@@ -2181,6 +2656,306 @@ void idRenderMatrix::ProjectedNearClippedBounds( idBounds & projected, const idR
 	_mm_store_ss( & projected[1].y, maxY );
 	_mm_store_ss( & projected[1].z, maxZ );
 
+#elif 1
+
+{
+	const idVec3 points[8] = {
+		idVec3( bounds[0][0], bounds[0][1], bounds[0][2] ),
+		idVec3( bounds[1][0], bounds[0][1], bounds[0][2] ),
+		idVec3( bounds[1][0], bounds[1][1], bounds[0][2] ),
+		idVec3( bounds[0][0], bounds[1][1], bounds[0][2] ),
+		idVec3( bounds[0][0], bounds[0][1], bounds[1][2] ),
+		idVec3( bounds[1][0], bounds[0][1], bounds[1][2] ),
+		idVec3( bounds[1][0], bounds[1][1], bounds[1][2] ),
+		idVec3( bounds[0][0], bounds[1][1], bounds[1][2] )
+	};
+	// transform the eight box corners with the MVP into homogeneous clip space
+	idVec4 projectedPoints[8];
+	for ( int i = 0; i < 8; i++ ) {
+		const idVec3 & v = points[i];
+		projectedPoints[i].x = v[0] * mvp[0][0] + v[1] * mvp[0][1] + v[2] * mvp[0][2] + mvp[0][3];
+		projectedPoints[i].y = v[0] * mvp[1][0] + v[1] * mvp[1][1] + v[2] * mvp[1][2] + mvp[1][3];
+		projectedPoints[i].z = v[0] * mvp[2][0] + v[1] * mvp[2][1] + v[2] * mvp[2][2] + mvp[2][3];
+		projectedPoints[i].w = v[0] * mvp[3][0] + v[1] * mvp[3][1] + v[2] * mvp[3][2] + mvp[3][3];
+	}
+
+	const idVec4 & p0 = projectedPoints[0];
+	const idVec4 & p1 = projectedPoints[1];
+	const idVec4 & p2 = projectedPoints[2];
+	const idVec4 & p3 = projectedPoints[3];
+	const idVec4 & p4 = projectedPoints[4];
+	const idVec4 & p5 = projectedPoints[5];
+	const idVec4 & p6 = projectedPoints[6];
+	const idVec4 & p7 = projectedPoints[7];
+	// per corner near plane term: zero on the near plane, negative values select the end point that gets clipped below
+#if defined( CLIP_SPACE_D3D )	// the D3D near plane is at Z=0 instead of Z=-1
+	const float d0 = p0.z;
+	const float d1 = p1.z;
+	const float d2 = p2.z;
+	const float d3 = p3.z;
+	const float d4 = p4.z;
+	const float d5 = p5.z;
+	const float d6 = p6.z;
+	const float d7 = p7.z;
+#else
+	const float d0 = p0.z + p0.w;
+	const float d1 = p1.z + p1.w;
+	const float d2 = p2.z + p2.w;
+	const float d3 = p3.z + p3.w;
+	const float d4 = p4.z + p4.w;
+	const float d5 = p5.z + p5.w;
+	const float d6 = p6.z + p6.w;
+	const float d7 = p7.z + p7.w;
+#endif
+	// the twelve box edges: A-D ring through corners 0-3, E-H ring through corners 4-7, I-L connect corner n to corner n+4
+	const float deltaA = d0 - d1;
+	const float deltaB = d1 - d2;
+	const float deltaC = d2 - d3;
+	const float deltaD = d3 - d0;
+
+	const float deltaE = d4 - d5;
+	const float deltaF = d5 - d6;
+	const float deltaG = d6 - d7;
+	const float deltaH = d7 - d4;
+
+	const float deltaI = d0 - d4;
+	const float deltaJ = d1 - d5;
+	const float deltaK = d2 - d6;
+	const float deltaL = d3 - d7;
+	// fraction along each edge at which the near plane term interpolates to zero ( 0 when the delta is too small to divide by )
+	const float fractionA = ( fabs( deltaA ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d0 / deltaA ) : 0.0f;
+	const float fractionB = ( fabs( deltaB ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d1 / deltaB ) : 0.0f;
+	const float fractionC = ( fabs( deltaC ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d2 / deltaC ) : 0.0f;
+	const float fractionD = ( fabs( deltaD ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d3 / deltaD ) : 0.0f;
+
+	const float fractionE = ( fabs( deltaE ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d4 / deltaE ) : 0.0f;
+	const float fractionF = ( fabs( deltaF ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d5 / deltaF ) : 0.0f;
+	const float fractionG = ( fabs( deltaG ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d6 / deltaG ) : 0.0f;
+	const float fractionH = ( fabs( deltaH ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d7 / deltaH ) : 0.0f;
+
+	const float fractionI = ( fabs( deltaI ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d0 / deltaI ) : 0.0f;
+	const float fractionJ = ( fabs( deltaJ ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d1 / deltaJ ) : 0.0f;
+	const float fractionK = ( fabs( deltaK ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d2 / deltaK ) : 0.0f;
+	const float fractionL = ( fabs( deltaL ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( d3 / deltaL ) : 0.0f;
+	// an edge is only clipped when the crossing lies strictly between its two end points
+	const bool clipA = ( fractionA > 0.0f && fractionA < 1.0f );
+	const bool clipB = ( fractionB > 0.0f && fractionB < 1.0f );
+	const bool clipC = ( fractionC > 0.0f && fractionC < 1.0f );
+	const bool clipD = ( fractionD > 0.0f && fractionD < 1.0f );
+
+	const bool clipE = ( fractionE > 0.0f && fractionE < 1.0f );
+	const bool clipF = ( fractionF > 0.0f && fractionF < 1.0f );
+	const bool clipG = ( fractionG > 0.0f && fractionG < 1.0f );
+	const bool clipH = ( fractionH > 0.0f && fractionH < 1.0f );
+
+	const bool clipI = ( fractionI > 0.0f && fractionI < 1.0f );
+	const bool clipJ = ( fractionJ > 0.0f && fractionJ < 1.0f );
+	const bool clipK = ( fractionK > 0.0f && fractionK < 1.0f );
+	const bool clipL = ( fractionL > 0.0f && fractionL < 1.0f );
+
+	const idVec4 intersectionA = p0 + fractionA * ( p1 - p0 );
+	const idVec4 intersectionB = p1 + fractionB * ( p2 - p1 );
+	const idVec4 intersectionC = p2 + fractionC * ( p3 - p2 );
+	const idVec4 intersectionD = p3 + fractionD * ( p0 - p3 );
+
+	const idVec4 intersectionE = p4 + fractionE * ( p5 - p4 );
+	const idVec4 intersectionF = p5 + fractionF * ( p6 - p5 );
+	const idVec4 intersectionG = p6 + fractionG * ( p7 - p6 );
+	const idVec4 intersectionH = p7 + fractionH * ( p4 - p7 );
+
+	const idVec4 intersectionI = p0 + fractionI * ( p4 - p0 );
+	const idVec4 intersectionJ = p1 + fractionJ * ( p5 - p1 );
+	const idVec4 intersectionK = p2 + fractionK * ( p6 - p2 );
+	const idVec4 intersectionL = p3 + fractionL * ( p7 - p3 );
+
+	idVec4 edgeVerts[24];
+	// store both end points of every edge, substituting the intersection for an end point with a negative near plane term
+	edgeVerts[ 0] = ( clipA && d0 < 0.0f ) ? intersectionA : p0;
+	edgeVerts[ 2] = ( clipB && d1 < 0.0f ) ? intersectionB : p1;
+	edgeVerts[ 4] = ( clipC && d2 < 0.0f ) ? intersectionC : p2;
+	edgeVerts[ 6] = ( clipD && d3 < 0.0f ) ? intersectionD : p3;
+
+	edgeVerts[ 1] = ( clipA && d1 < 0.0f ) ? intersectionA : p1;
+	edgeVerts[ 3] = ( clipB && d2 < 0.0f ) ? intersectionB : p2;
+	edgeVerts[ 5] = ( clipC && d3 < 0.0f ) ? intersectionC : p3;
+	edgeVerts[ 7] = ( clipD && d0 < 0.0f ) ? intersectionD : p0;
+
+	edgeVerts[ 8] = ( clipE && d4 < 0.0f ) ? intersectionE : p4;
+	edgeVerts[10] = ( clipF && d5 < 0.0f ) ? intersectionF : p5;
+	edgeVerts[12] = ( clipG && d6 < 0.0f ) ? intersectionG : p6;
+	edgeVerts[14] = ( clipH && d7 < 0.0f ) ? intersectionH : p7;
+
+	edgeVerts[ 9] = ( clipE && d5 < 0.0f ) ? intersectionE : p5;
+	edgeVerts[11] = ( clipF && d6 < 0.0f ) ? intersectionF : p6;
+	edgeVerts[13] = ( clipG && d7 < 0.0f ) ? intersectionG : p7;
+	edgeVerts[15] = ( clipH && d4 < 0.0f ) ? intersectionH : p4;
+
+	edgeVerts[16] = ( clipI && d0 < 0.0f ) ? intersectionI : p0;
+	edgeVerts[18] = ( clipJ && d1 < 0.0f ) ? intersectionJ : p1;
+	edgeVerts[20] = ( clipK && d2 < 0.0f ) ? intersectionK : p2;
+	edgeVerts[22] = ( clipL && d3 < 0.0f ) ? intersectionL : p3;
+
+	edgeVerts[17] = ( clipI && d4 < 0.0f ) ? intersectionI : p4;
+	edgeVerts[19] = ( clipJ && d5 < 0.0f ) ? intersectionJ : p5;
+	edgeVerts[21] = ( clipK && d6 < 0.0f ) ? intersectionK : p6;
+	edgeVerts[23] = ( clipL && d7 < 0.0f ) ? intersectionL : p7;
+
+	idBounds projBnds;
+	for ( int i = 0; i < 3; i++ ) {
+		projBnds[0][i] = RENDER_MATRIX_INFINITY;
+		projBnds[1][i] = - RENDER_MATRIX_INFINITY;
+	}
+
+	for ( int i = 0; i < 24; i++ ) {
+		const idVec4 & v = edgeVerts[i];
+
+		if ( v.w <= idMath::FLT_SMALLEST_NON_DENORMAL ) {
+			continue;
+		}
+
+		const float rw = 1.0f / v.w;
+
+		const float px = v.x * rw;
+		const float py = v.y * rw;
+		const float pz = v.z * rw;
+
+		projBnds[0][0] = Min( projBnds[0][0], px );
+		projBnds[0][1] = Min( projBnds[0][1], py );
+		projBnds[0][2] = Min( projBnds[0][2], pz );
+
+		projBnds[1][0] = Max( projBnds[1][0], px );
+		projBnds[1][1] = Max( projBnds[1][1], py );
+		projBnds[1][2] = Max( projBnds[1][2], pz );
+	}
+
+	if ( windowSpace ) {
+		// convert to window coords
+		projBnds[0][0] = projBnds[0][0] * 0.5f + 0.5f;
+		projBnds[1][0] = projBnds[1][0] * 0.5f + 0.5f;
+
+		projBnds[0][1] = projBnds[0][1] * 0.5f + 0.5f;
+		projBnds[1][1] = projBnds[1][1] * 0.5f + 0.5f;
+
+#if !defined( CLIP_SPACE_D3D )	// the D3D clip space Z is already in the range [0,1]
+		projBnds[0][2] = projBnds[0][2] * 0.5f + 0.5f;
+		projBnds[1][2] = projBnds[1][2] * 0.5f + 0.5f;
+#endif
+
+		// clamp to [0, 1] range
+		projBnds[0][0] = idMath::ClampFloat( 0.0f, 1.0f, projBnds[0][0] );
+		projBnds[1][0] = idMath::ClampFloat( 0.0f, 1.0f, projBnds[1][0] );
+
+		projBnds[0][1] = idMath::ClampFloat( 0.0f, 1.0f, projBnds[0][1] );
+		projBnds[1][1] = idMath::ClampFloat( 0.0f, 1.0f, projBnds[1][1] );
+
+		projBnds[0][2] = idMath::ClampFloat( 0.0f, 1.0f, projBnds[0][2] );
+		projBnds[1][2] = idMath::ClampFloat( 0.0f, 1.0f, projBnds[1][2] );
+	}
+	// this is the branch compiled when the SSE2 path is not, so it has to write 'projected' rather than assert against it
+	projected[0] = projBnds[0];
+	projected[1] = projBnds[1];
+}
+
+#else
+
+	const idVec3 points[8] = {
+		idVec3( bounds[0][0], bounds[0][1], bounds[0][2] ),
+		idVec3( bounds[1][0], bounds[0][1], bounds[0][2] ),
+		idVec3( bounds[1][0], bounds[1][1], bounds[0][2] ),
+		idVec3( bounds[0][0], bounds[1][1], bounds[0][2] ),
+		idVec3( bounds[0][0], bounds[0][1], bounds[1][2] ),
+		idVec3( bounds[1][0], bounds[0][1], bounds[1][2] ),
+		idVec3( bounds[1][0], bounds[1][1], bounds[1][2] ),
+		idVec3( bounds[0][0], bounds[1][1], bounds[1][2] )
+	};
+
+	idVec4 projectedPoints[8];
+	for ( int i = 0; i < 8; i++ ) {
+		const idVec3 & v = points[i];
+		projectedPoints[i].x = v[0] * mvp[0][0] + v[1] * mvp[0][1] + v[2] * mvp[0][2] + mvp[0][3];
+		projectedPoints[i].y = v[0] * mvp[1][0] + v[1] * mvp[1][1] + v[2] * mvp[1][2] + mvp[1][3];
+		projectedPoints[i].z = v[0] * mvp[2][0] + v[1] * mvp[2][1] + v[2] * mvp[2][2] + mvp[2][3];
+		projectedPoints[i].w = v[0] * mvp[3][0] + v[1] * mvp[3][1] + v[2] * mvp[3][2] + mvp[3][3];
+	}
+
+	idVec4 edgeVerts[24];
+	for ( int i = 0; i < 3; i++ ) {
+		int offset0 = ( i & 1 ) * 4;
+		int offset1 = ( i & 1 ) * 4 + ( i & 2 ) * 2;
+		int offset3 = ~( i >> 1 ) & 1;
+		for ( int j = 0; j < 4; j++ ) {
+			const idVec4 p0 = projectedPoints[offset0 + ( ( j + 0 ) & 3 )];
+			const idVec4 p1 = projectedPoints[offset1 + ( ( j + offset3 ) & 3 )];
+
+#if defined( CLIP_SPACE_D3D )	// the D3D near plane is at Z=0 instead of Z=-1
+			const float d0 = p0.z;
+			const float d1 = p1.z;
+#else
+			const float d0 = p0.z + p0.w;
+			const float d1 = p1.z + p1.w;
+#endif
+			const float delta = d0 - d1;
+			const float fraction = idMath::Fabs( delta ) > idMath::FLT_SMALLEST_NON_DENORMAL ? ( d0 / delta ) : 1.0f;
+			const bool clip = ( fraction > 0.0f && fraction < 1.0f );
+			const idVec4 intersection = p0 + fraction * ( p1 - p0 );
+
+			edgeVerts[i * 8 + j * 2 + 0] = ( clip && d0 < 0.0f ) ? intersection : p0;
+			edgeVerts[i * 8 + j * 2 + 1] = ( clip && d1 < 0.0f ) ? intersection : p1;
+		}
+	}
+
+	for ( int i = 0; i < 3; i++ ) {
+		projected[0][i] = RENDER_MATRIX_INFINITY;
+		projected[1][i] = - RENDER_MATRIX_INFINITY;
+	}
+
+	for ( int i = 0; i < 24; i++ ) {
+		const idVec4 & v = edgeVerts[i];
+
+		if ( v.w <= idMath::FLT_SMALLEST_NON_DENORMAL ) {
+			continue;
+		}
+
+		const float rw = 1.0f / v.w;
+
+		const float px = v.x * rw;
+		const float py = v.y * rw;
+		const float pz = v.z * rw;
+
+		projected[0][0] = Min( projected[0][0], px );
+		projected[0][1] = Min( projected[0][1], py );
+		projected[0][2] = Min( projected[0][2], pz );
+
+		projected[1][0] = Max( projected[1][0], px );
+		projected[1][1] = Max( projected[1][1], py );
+		projected[1][2] = Max( projected[1][2], pz );
+	}
+
+	if ( windowSpace ) {
+		// convert to window coords
+		projected[0][0] = projected[0][0] * 0.5f + 0.5f;
+		projected[1][0] = projected[1][0] * 0.5f + 0.5f;
+
+		projected[0][1] = projected[0][1] * 0.5f + 0.5f;
+		projected[1][1] = projected[1][1] * 0.5f + 0.5f;
+
+#if !defined( CLIP_SPACE_D3D )	// the D3D clip space Z is already in the range [0,1]
+		projected[0][2] = projected[0][2] * 0.5f + 0.5f;
+		projected[1][2] = projected[1][2] * 0.5f + 0.5f;
+#endif
+
+		// clamp to [0, 1] range
+		projected[0][0] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][0] );
+		projected[1][0] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][0] );
+
+		projected[0][1] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][1] );
+		projected[1][1] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][1] );
+
+		projected[0][2] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][2] );
+		projected[1][2] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][2] );
+	}
+
+#endif
 }
 
 #if 0
@@ -2233,6 +3008,7 @@ static idVec3 LocalNearClipCenterFromMVP( const idRenderMatrix & mvp ) {
 	return idVec3( x * invW, y * invW, z * invW );
 }
 
+#ifdef ID_WIN_X86_SSE2_INTRIN
 
 /*
 ========================
@@ -2383,6 +3159,90 @@ static int ClipHomogeneousPolygonToUnitCube_SSE2( idVec4 * points, int numPoints
 	return numPoints;
 }
 
+#else
+
+/*
+========================
+ClipHomogeneousLineToSide
+
+Clips a line with homogeneous coordinates to the axis aligned plane[axis] = side.
+========================
+*/
+static idVec4 ClipHomogeneousLineToSide( const idVec4 & p0, const idVec4 & p1, int axis, float side ) {
+	const float dist0 = p0.w * side - p0[axis];	// zero when p0 lies on the plane [axis] = side * w
+	const float dist1 = p1.w * side - p1[axis];	// same term for the other end point
+	const float span = dist0 - dist1;
+	const float fraction = ( idMath::Fabs( span ) > idMath::FLT_SMALLEST_NON_DENORMAL ) ? ( dist0 / span ) : 1.0f;	// 1 when the span is too small to divide by
+	const float t = idMath::ClampFloat( 0.0f, 1.0f, fraction );	// never step outside of the p0-p1 segment
+	return p0 + t * ( p1 - p0 );
+}
+
+/*
+========================
+ClipHomogeneousPolygonToSide_Generic
+
+Clips a polygon with homogeneous coordinates to the axis aligned plane[axis] = sign * offset.
+========================
+*/
+static int ClipHomogeneousPolygonToSide_Generic( idVec4 * __restrict newPoints, idVec4 * __restrict points, int numPoints, int axis, float sign, float offset ) {
+	assert( newPoints != points );
+	// newPoints temporarily receives two entries per input point and is then compacted in place; returns the new point count
+	assert( numPoints < 16 );
+	int sides[16] = { 0 };	// zeroed so sides[0] is defined even when an already fully clipped (empty) polygon is passed in
+
+	const float side = sign * offset;
+
+	// calculate the plane side for each original point and calculate all potential new points
+	for ( int i = 0; i < numPoints; i++ ) {
+		const int j = ( i + 1 < numPoints ) ? ( i + 1 ) : 0;	// next point with wrap around, without shifting a negative int
+		sides[i] = sign * points[i][axis] < offset * points[i].w;	// non-zero = the point is kept by the compaction below
+		newPoints[i * 2 + 0] = points[i];
+		newPoints[i * 2 + 1] = ClipHomogeneousLineToSide( points[i], points[j], axis, side );
+	}
+
+	// repeat the first side at the end to avoid having to wrap around
+	sides[numPoints] = sides[0];
+
+	// compact the array of points
+	int numNewPoints = 0;
+	for ( int i = 0; i < numPoints; i++ ) {
+		if ( sides[i + 0] != 0 ) {
+			newPoints[numNewPoints++] = newPoints[i * 2 + 0];
+		}
+		if ( ( sides[i + 0] ^ sides[i + 1] ) != 0 ) {
+			newPoints[numNewPoints++] = newPoints[i * 2 + 1];
+		}
+	}
+
+	assert( numNewPoints <= 16 );
+	return numNewPoints;
+}
+
+/*
+========================
+ClipHomogeneousPolygonToUnitCube_Generic
+
+Clips a polygon with homogeneous coordinates to all six axis aligned unit cube planes.
+========================
+*/
+static int ClipHomogeneousPolygonToUnitCube_Generic( idVec4 * points, int numPoints ) {
+	assert( numPoints < 16 - 6 );
+	ALIGNTYPE16 idVec4 scratch[2 * 16];	// second buffer for the six passes; the C clip code temporarily doubles the points
+#if defined( CLIP_SPACE_D3D )	// the D3D near plane is at Z=0 instead of Z=-1
+	const float nearOffset = 0.0f;
+#else
+	const float nearOffset = 1.0f;
+#endif
+	numPoints = ClipHomogeneousPolygonToSide_Generic( scratch, points, numPoints, 2, -1.0f, nearOffset );	// near
+	numPoints = ClipHomogeneousPolygonToSide_Generic( points, scratch, numPoints, 2, +1.0f, 1.0f );	// far
+	numPoints = ClipHomogeneousPolygonToSide_Generic( scratch, points, numPoints, 1, -1.0f, 1.0f );	// bottom
+	numPoints = ClipHomogeneousPolygonToSide_Generic( points, scratch, numPoints, 1, +1.0f, 1.0f );	// top
+	numPoints = ClipHomogeneousPolygonToSide_Generic( scratch, points, numPoints, 0, -1.0f, 1.0f );	// left
+	numPoints = ClipHomogeneousPolygonToSide_Generic( points, scratch, numPoints, 0, +1.0f, 1.0f );	// right, result ends up in 'points'
+	return numPoints;
+}
+
+#endif
 
 /*
 ========================
@@ -2401,6 +3261,7 @@ the given bounds in which case the projected bounds should be set to fully cover
 ========================
 */
 void idRenderMatrix::ProjectedFullyClippedBounds( idBounds & projected, const idRenderMatrix & mvp, const idBounds & bounds, bool windowSpace ) {
+#ifdef ID_WIN_X86_SSE2_INTRIN
 
 	const __m128 mvp0 = _mm_loadu_ps( mvp[0] );
 	const __m128 mvp1 = _mm_loadu_ps( mvp[1] );
@@ -2551,6 +3412,95 @@ void idRenderMatrix::ProjectedFullyClippedBounds( idBounds & projected, const id
 	_mm_store_ss( & projected[1].y, maxY );
 	_mm_store_ss( & projected[1].z, maxZ );
 
+#else
+
+	const idVec3 points[8] = {
+		idVec3( bounds[0][0], bounds[0][1], bounds[0][2] ),
+		idVec3( bounds[1][0], bounds[0][1], bounds[0][2] ),
+		idVec3( bounds[1][0], bounds[1][1], bounds[0][2] ),
+		idVec3( bounds[0][0], bounds[1][1], bounds[0][2] ),
+		idVec3( bounds[0][0], bounds[0][1], bounds[1][2] ),
+		idVec3( bounds[1][0], bounds[0][1], bounds[1][2] ),
+		idVec3( bounds[1][0], bounds[1][1], bounds[1][2] ),
+		idVec3( bounds[0][0], bounds[1][1], bounds[1][2] )
+	};
+
+	idVec4 projectedPoints[8];
+	for ( int i = 0; i < 8; i++ ) {
+		const idVec3 & v = points[i];
+		projectedPoints[i].x = v[0] * mvp[0][0] + v[1] * mvp[0][1] + v[2] * mvp[0][2] + mvp[0][3];
+		projectedPoints[i].y = v[0] * mvp[1][0] + v[1] * mvp[1][1] + v[2] * mvp[1][2] + mvp[1][3];
+		projectedPoints[i].z = v[0] * mvp[2][0] + v[1] * mvp[2][1] + v[2] * mvp[2][2] + mvp[2][3];
+		projectedPoints[i].w = v[0] * mvp[3][0] + v[1] * mvp[3][1] + v[2] * mvp[3][2] + mvp[3][3];
+	}
+
+	idVec4 clippedPoints[6 * 16];
+	int numClippedPoints = 0;
+	for ( int i = 0; i < 6; i++ ) {
+		clippedPoints[numClippedPoints + 0] = projectedPoints[boxPolygonVertices[i][0]];
+		clippedPoints[numClippedPoints + 1] = projectedPoints[boxPolygonVertices[i][1]];
+		clippedPoints[numClippedPoints + 2] = projectedPoints[boxPolygonVertices[i][2]];
+		clippedPoints[numClippedPoints + 3] = projectedPoints[boxPolygonVertices[i][3]];
+		numClippedPoints += ClipHomogeneousPolygonToUnitCube_Generic( &clippedPoints[numClippedPoints], 4 );
+	}
+
+	// test if the center of the near clip plane is inside the given bounding box
+	const idVec3 localNearClipCenter = LocalNearClipCenterFromMVP( mvp );
+	const bool inside = bounds.Expand( RENDER_MATRIX_PROJECTION_EPSILON ).ContainsPoint( localNearClipCenter );
+
+	for ( int i = 0; i < 3; i++ ) {
+		projected[0][i] = RENDER_MATRIX_INFINITY;
+		projected[1][i] = - RENDER_MATRIX_INFINITY;
+	}
+	if ( inside ) {
+		projected[0][2] = -1.0f;
+	}
+
+	for ( int i = 0; i < numClippedPoints; i++ ) {
+		const idVec4 & c = clippedPoints[i];
+
+		assert( c.w > idMath::FLT_SMALLEST_NON_DENORMAL );
+
+		const float rw = 1.0f / c.w;
+
+		const float px = c.x * rw;
+		const float py = c.y * rw;
+		const float pz = c.z * rw;
+
+		projected[0][0] = Min( projected[0][0], px );
+		projected[0][1] = Min( projected[0][1], py );
+		projected[0][2] = Min( projected[0][2], pz );
+
+		projected[1][0] = Max( projected[1][0], px );
+		projected[1][1] = Max( projected[1][1], py );
+		projected[1][2] = Max( projected[1][2], pz );
+	}
+
+	if ( windowSpace ) {
+		// convert to window coords
+		projected[0][0] = projected[0][0] * 0.5f + 0.5f;
+		projected[1][0] = projected[1][0] * 0.5f + 0.5f;
+
+		projected[0][1] = projected[0][1] * 0.5f + 0.5f;
+		projected[1][1] = projected[1][1] * 0.5f + 0.5f;
+
+#if !defined( CLIP_SPACE_D3D )	// the D3D clip space Z is already in the range [0,1]
+		projected[0][2] = projected[0][2] * 0.5f + 0.5f;
+		projected[1][2] = projected[1][2] * 0.5f + 0.5f;
+#endif
+
+		// clamp to [0, 1] range
+		projected[0][0] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][0] );
+		projected[1][0] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][0] );
+
+		projected[0][1] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][1] );
+		projected[1][1] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][1] );
+
+		projected[0][2] = idMath::ClampFloat( 0.0f, 1.0f, projected[0][2] );
+		projected[1][2] = idMath::ClampFloat( 0.0f, 1.0f, projected[1][2] );
+	}
+
+#endif
 }
 
 /*
@@ -2564,6 +3514,7 @@ The given bounding box is not clipped to the MVP so the depth bounds may not be
 ========================
 */
 void idRenderMatrix::DepthBoundsForBounds( float & min, float & max, const idRenderMatrix & mvp, const idBounds & bounds, bool windowSpace ) {
+#ifdef ID_WIN_X86_SSE2_INTRIN
 
 	__m128 mvp2 = _mm_loadu_ps( mvp[2] );
 	__m128 mvp3 = _mm_loadu_ps( mvp[3] );
@@ -2630,6 +3581,46 @@ void idRenderMatrix::DepthBoundsForBounds( float & min, float & max, const idRen
 	_mm_store_ss( & min, minv );
 	_mm_store_ss( & max, maxv );
 
+#else
+
+	// accumulate the depth range of the eight corners in locals; 'min' and 'max' are written once at the end
+	float localMin = RENDER_MATRIX_INFINITY;
+	float localMax = - RENDER_MATRIX_INFINITY;
+	idVec3 v;
+	for ( int x = 0; x < 2; x++ ) {
+		v[0] = bounds[x][0];
+		for ( int y = 0; y < 2; y++ ) {
+			v[1] = bounds[y][1];
+			for ( int z = 0; z < 2; z++ ) {
+				v[2] = bounds[z][2];
+				float tz = v[0] * mvp[2][0] + v[1] * mvp[2][1] + v[2] * mvp[2][2] + mvp[2][3];
+				float tw = v[0] * mvp[3][0] + v[1] * mvp[3][1] + v[2] * mvp[3][2] + mvp[3][3];
+				if ( tw > idMath::FLT_SMALLEST_NON_DENORMAL ) {
+					tz = tz / tw;
+				} else {
+					tz = -RENDER_MATRIX_INFINITY;	// W too small to divide by
+				}
+				localMin = Min( localMin, tz );
+				localMax = Max( localMax, tz );
+			}
+		}
+	}
+
+	if ( windowSpace ) {
+		// convert to window coords
+#if !defined( CLIP_SPACE_D3D )	// the D3D clip space Z is already in the range [0,1]
+		localMin = localMin * 0.5f + 0.5f;
+		localMax = localMax * 0.5f + 0.5f;
+#endif
+		// clamp to the [0, 1] range
+		localMin = Max( localMin, 0.0f );
+		localMax = Min( localMax, 1.0f );
+	}
+	// always store the results like the SSE2 path does, also when windowSpace is false or CLIP_SPACE_D3D is defined
+	min = localMin;
+	max = localMax;
+
+#endif
 }
 
 /*
@@ -2646,6 +3637,7 @@ The extruded bounding box is not clipped to the MVP so the depth bounds may not
 void idRenderMatrix::DepthBoundsForExtrudedBounds( float & min, float & max, const idRenderMatrix & mvp, const idBounds & bounds, const idVec3 & extrudeDirection, const idPlane & clipPlane, bool windowSpace ) {
 	assert( idMath::Fabs( extrudeDirection * clipPlane.Normal() ) >= idMath::FLT_SMALLEST_NON_DENORMAL );
 
+#ifdef ID_WIN_X86_SSE2_INTRIN
 
  	__m128 mvp2 = _mm_loadu_ps( mvp[2] );
 	__m128 mvp3 = _mm_loadu_ps( mvp[3] );
@@ -2800,6 +3792,60 @@ void idRenderMatrix::DepthBoundsForExtrudedBounds( float & min, float & max, con
 	_mm_store_ss( & min, minv );
 	_mm_store_ss( & max, maxv );
 
+#else
+
+	const float closing = extrudeDirection * clipPlane.Normal();
+	const float invClosing = -1.0f / closing;
+
+	float localMin = RENDER_MATRIX_INFINITY;
+	float localMax = - RENDER_MATRIX_INFINITY;
+	idVec3 v;
+	for ( int x = 0; x < 2; x++ ) {
+		v[0] = bounds[x][0];
+		for ( int y = 0; y < 2; y++ ) {
+			v[1] = bounds[y][1];
+			for ( int z = 0; z < 2; z++ ) {
+				v[2] = bounds[z][2];
+
+				for ( int extrude = 0; extrude <= 1; extrude++ ) {
+					// test the corner itself and the corner moved along extrudeDirection ( presumably onto clipPlane, given idPlane::Distance is the signed plane distance )
+					idVec3 test;
+					if ( extrude ) {
+						float extrudeDist = clipPlane.Distance( v ) * invClosing;
+						test = v + extrudeDirection * extrudeDist;
+					} else {
+						test = v;
+					}
+					float tz = test[0] * mvp[2][0] + test[1] * mvp[2][1] + test[2] * mvp[2][2] + mvp[2][3];
+					float tw = test[0] * mvp[3][0] + test[1] * mvp[3][1] + test[2] * mvp[3][2] + mvp[3][3];
+					if ( tw > idMath::FLT_SMALLEST_NON_DENORMAL ) {
+						tz = tz / tw;
+					} else {
+						tz = -RENDER_MATRIX_INFINITY;
+					}
+
+					localMin = Min( localMin, tz );
+					localMax = Max( localMax, tz );
+				}
+			}
+		}
+	}
+
+	if ( windowSpace ) {
+		// convert to window coords
+#if !defined( CLIP_SPACE_D3D )	// the D3D clip space Z is already in the range [0,1]
+		localMin = localMin * 0.5f + 0.5f;
+		localMax = localMax * 0.5f + 0.5f;
+#endif
+		// clamp to the [0, 1] range
+		localMin = Max( localMin, 0.0f );
+		localMax = Min( localMax, 1.0f );
+	}
+	// always store the results like the SSE2 path does, also when windowSpace is false or CLIP_SPACE_D3D is defined
+	min = localMin;
+	max = localMax;
+
+#endif
 }
 
 /*
@@ -2855,6 +3901,7 @@ testing if the center of the far clipping plane is contained inside the shadow v
 ========================
 */
 void idRenderMatrix::DepthBoundsForShadowBounds( float & min, float & max, const idRenderMatrix & mvp, const idBounds & bounds, const idVec3 & localLightOrigin, bool windowSpace ) {
+#ifdef ID_WIN_X86_SSE2_INTRIN
 
 	const __m128 mvp0 = _mm_loadu_ps( mvp[0] );
 	const __m128 mvp1 = _mm_loadu_ps( mvp[1] );
@@ -3029,6 +4076,108 @@ void idRenderMatrix::DepthBoundsForShadowBounds( float & min, float & max, const
 	_mm_store_ss( & min, minZ );
 	_mm_store_ss( & max, maxZ );
 
+#else
+
+	const idVec3 points[8] = {
+		idVec3( bounds[0][0], bounds[0][1], bounds[0][2] ),
+		idVec3( bounds[1][0], bounds[0][1], bounds[0][2] ),
+		idVec3( bounds[1][0], bounds[1][1], bounds[0][2] ),
+		idVec3( bounds[0][0], bounds[1][1], bounds[0][2] ),
+		idVec3( bounds[0][0], bounds[0][1], bounds[1][2] ),
+		idVec3( bounds[1][0], bounds[0][1], bounds[1][2] ),
+		idVec3( bounds[1][0], bounds[1][1], bounds[1][2] ),
+		idVec3( bounds[0][0], bounds[1][1], bounds[1][2] )
+	};
+
+	// calculate the front facing polygon bits
+	int frontBits = GetBoxFrontBits_Generic( bounds, localLightOrigin );
+
+	// bounding box corners
+	ALIGNTYPE16 idVec4 projectedNearPoints[8];
+	for ( int i = 0; i < 8; i++ ) {
+		const idVec3 & v = points[i];
+		projectedNearPoints[i].x = v[0] * mvp[0][0] + v[1] * mvp[0][1] + v[2] * mvp[0][2] + mvp[0][3];
+		projectedNearPoints[i].y = v[0] * mvp[1][0] + v[1] * mvp[1][1] + v[2] * mvp[1][2] + mvp[1][3];
+		projectedNearPoints[i].z = v[0] * mvp[2][0] + v[1] * mvp[2][1] + v[2] * mvp[2][2] + mvp[2][3];
+		projectedNearPoints[i].w = v[0] * mvp[3][0] + v[1] * mvp[3][1] + v[2] * mvp[3][2] + mvp[3][3];
+	}
+
+	// bounding box corners projected to infinity from the light position
+	ALIGNTYPE16 idVec4 projectedFarPoints[8];
+	for ( int i = 0; i < 8; i++ ) {
+		const idVec3 v = points[i] - localLightOrigin;
+		projectedFarPoints[i].x = v[0] * mvp[0][0] + v[1] * mvp[0][1] + v[2] * mvp[0][2];
+		projectedFarPoints[i].y = v[0] * mvp[1][0] + v[1] * mvp[1][1] + v[2] * mvp[1][2];
+		projectedFarPoints[i].z = v[0] * mvp[2][0] + v[1] * mvp[2][1] + v[2] * mvp[2][2];
+		projectedFarPoints[i].w = v[0] * mvp[3][0] + v[1] * mvp[3][1] + v[2] * mvp[3][2];
+	}
+
+	ALIGNTYPE16 idVec4 clippedPoints[( 6 + 12 ) * 16];
+	int numClippedPoints = 0;
+
+	// clip the front facing bounding box polygons at the near cap
+	const frontPolygons_t & frontPolygons = boxFrontPolygonsForFrontBits[frontBits];
+	for ( int i = 0; i < frontPolygons.count; i++ ) {
+		const int polygon = frontPolygons.indices[i];
+		clippedPoints[numClippedPoints + 0] = projectedNearPoints[boxPolygonVertices[polygon][0]];
+		clippedPoints[numClippedPoints + 1] = projectedNearPoints[boxPolygonVertices[polygon][1]];
+		clippedPoints[numClippedPoints + 2] = projectedNearPoints[boxPolygonVertices[polygon][2]];
+		clippedPoints[numClippedPoints + 3] = projectedNearPoints[boxPolygonVertices[polygon][3]];
+		numClippedPoints += ClipHomogeneousPolygonToUnitCube_Generic( &clippedPoints[numClippedPoints], 4 );
+	}
+
+	// clip the front facing bounding box polygons projected to the far cap
+	for ( int i = 0; i < frontPolygons.count; i++ ) {
+		const int polygon = frontPolygons.indices[i];
+		clippedPoints[numClippedPoints + 0] = projectedFarPoints[boxPolygonVertices[polygon][0]];
+		clippedPoints[numClippedPoints + 1] = projectedFarPoints[boxPolygonVertices[polygon][1]];
+		clippedPoints[numClippedPoints + 2] = projectedFarPoints[boxPolygonVertices[polygon][2]];
+		clippedPoints[numClippedPoints + 3] = projectedFarPoints[boxPolygonVertices[polygon][3]];
+		numClippedPoints += ClipHomogeneousPolygonToUnitCube_Generic( &clippedPoints[numClippedPoints], 4 );
+	}
+
+	// clip the silhouette edge polygons that stretch to infinity
+	const silhouetteEdges_t & silhouetteEdges = boxSilhouetteEdgesForFrontBits[frontBits];
+	for ( int i = 0; i < silhouetteEdges.count; i++ ) {
+		const int edge = silhouetteEdges.indices[i];
+		clippedPoints[numClippedPoints + 0] = projectedNearPoints[boxEdgeVertices[edge][0]];
+		clippedPoints[numClippedPoints + 1] = projectedNearPoints[boxEdgeVertices[edge][1]];
+		clippedPoints[numClippedPoints + 2] = projectedFarPoints[boxEdgeVertices[edge][1]];
+		clippedPoints[numClippedPoints + 3] = projectedFarPoints[boxEdgeVertices[edge][0]];
+		numClippedPoints += ClipHomogeneousPolygonToUnitCube_Generic( &clippedPoints[numClippedPoints], 4 );
+	}
+
+	// test if the center of the near clip plane is inside the infinite shadow volume
+	const idVec3 localNearClipCenter = LocalNearClipCenterFromMVP( mvp );
+	const bool inside = PointInsideInfiniteShadow( bounds, localLightOrigin, localNearClipCenter, RENDER_MATRIX_PROJECTION_EPSILON );
+
+	min = inside ? -1.0f : RENDER_MATRIX_INFINITY;
+	max = - RENDER_MATRIX_INFINITY;
+
+	for ( int i = 0; i < numClippedPoints; i++ ) {
+		const idVec4 & c = clippedPoints[i];
+
+		assert( c.w > idMath::FLT_SMALLEST_NON_DENORMAL );
+
+		const float rw = 1.0f / c.w;
+		const float pz = c.z * rw;
+
+		min = Min( min, pz );
+		max = Max( max, pz );
+	}
+
+	if ( windowSpace ) {
+		// convert to window coords
+#if !defined( CLIP_SPACE_D3D )	// the D3D clip space Z is already in the range [0,1]
+		min = min * 0.5f + 0.5f;
+		max = max * 0.5f + 0.5f;
+#endif
+		// clamp to [0, 1] range
+		min = idMath::ClampFloat( 0.0f, 1.0f, min );
+		max = idMath::ClampFloat( 0.0f, 1.0f, max );
+	}
+
+#endif
 }
 
 /*
@@ -3122,6 +4271,7 @@ idRenderMatrix::GetFrustumCorners
 void idRenderMatrix::GetFrustumCorners( frustumCorners_t & corners, const idRenderMatrix & frustumTransform, const idBounds & frustumBounds ) {
 	assert_16_byte_aligned( &corners );
 
+#ifdef ID_WIN_X86_SSE2_INTRIN
 
 	__m128 mvp0 = _mm_loadu_ps( frustumTransform[0] );
 	__m128 mvp1 = _mm_loadu_ps( frustumTransform[1] );
@@ -3194,6 +4344,33 @@ void idRenderMatrix::GetFrustumCorners( frustumCorners_t & corners, const idRend
 	_mm_store_ps( corners.z + 0, z0 );
 	_mm_store_ps( corners.z + 4, z1 );
 
+#else
+
+	idVec3 v;
+	for ( int x = 0; x < 2; x++ ) {
+		v[0] = frustumBounds[x][0];
+		for ( int y = 0; y < 2; y++ ) {
+			v[1] = frustumBounds[y][1];
+			for ( int z = 0; z < 2; z++ ) {
+				v[2] = frustumBounds[z][2];
+
+				float tx = v[0] * frustumTransform[0][0] + v[1] * frustumTransform[0][1] + v[2] * frustumTransform[0][2] + frustumTransform[0][3];
+				float ty = v[0] * frustumTransform[1][0] + v[1] * frustumTransform[1][1] + v[2] * frustumTransform[1][2] + frustumTransform[1][3];
+				float tz = v[0] * frustumTransform[2][0] + v[1] * frustumTransform[2][1] + v[2] * frustumTransform[2][2] + frustumTransform[2][3];
+				float tw = v[0] * frustumTransform[3][0] + v[1] * frustumTransform[3][1] + v[2] * frustumTransform[3][2] + frustumTransform[3][3];
+
+				assert( tw > idMath::FLT_SMALLEST_NON_DENORMAL );
+
+				float rw = 1.0f / tw;
+
+				corners.x[(z<<2)|(y<<1)|(x<<0)] = tx * rw;
+				corners.y[(z<<2)|(y<<1)|(x<<0)] = ty * rw;
+				corners.z[(z<<2)|(y<<1)|(x<<0)] = tz * rw;
+			}
+		}
+	}
+
+#endif
 }
 
 /*
@@ -3204,6 +4381,7 @@ idRenderMatrix::CullFrustumCornersToPlane
 frustumCull_t idRenderMatrix::CullFrustumCornersToPlane( const frustumCorners_t & corners, const idPlane & plane ) {
 	assert_16_byte_aligned( &corners );
 
+#ifdef ID_WIN_X86_SSE2_INTRIN
 
 	__m128 vp = _mm_loadu_ps( plane.ToFloatPtr() );
 
@@ -3235,4 +4413,26 @@ frustumCull_t idRenderMatrix::CullFrustumCornersToPlane( const frustumCorners_t
 
 	return (frustumCull_t) ( front | ( back << 1 ) );
 
+#else
+
+	bool front = false;
+	bool back = false;
+	for ( int i = 0; i < 8; i++ ) {
+		const float d = corners.x[i] * plane[0] + corners.y[i] * plane[1] + corners.z[i] * plane[2] + plane[3];
+		if ( d >= 0.0f ) {
+		    front = true;
+		} else if ( d <= 0.0f ) {
+		    back = true;
+		}
+		if ( back && front ) {
+			return FRUSTUM_CULL_CROSS;
+		}
+	}
+	if ( front ) {
+		return FRUSTUM_CULL_FRONT;
+	} else {
+		return FRUSTUM_CULL_BACK;
+	}
+
+#endif
 }
diff --git a/neo/idlib/math/Lcp.cpp b/neo/idlib/math/Lcp.cpp
index 01d4c5a5..c24037f5 100644
--- a/neo/idlib/math/Lcp.cpp
+++ b/neo/idlib/math/Lcp.cpp
@@ -44,6 +44,7 @@ const float LCP_DELTA_FORCE_EPSILON		= 1e-9f;
 #define IGNORE_UNSATISFIABLE_VARIABLES
 
 
+#if defined( ID_WIN_X86_SSE_ASM ) || defined( ID_WIN_X86_SSE_INTRIN )
 
 ALIGN16( const __m128 SIMD_SP_zero )							= { 0.0f, 0.0f, 0.0f, 0.0f };
 ALIGN16( const __m128 SIMD_SP_one )								= { 1.0f, 1.0f, 1.0f, 1.0f };
@@ -67,6 +68,8 @@ ALIGN16( const unsigned int SIMD_DW_four[4] )					= { 4, 4, 4, 4 };
 ALIGN16( const unsigned int SIMD_DW_index[4] )					= { 0, 1, 2, 3 };
 ALIGN16( const int SIMD_DW_not3[4] )							= { ~3, ~3, ~3, ~3 };
 
+#endif
+
 /*
 ========================
 Multiply_SIMD
@@ -82,6 +85,7 @@ static void Multiply_SIMD( float * dst, const float * src0, const float * src1,
 		dst[i] = src0[i] * src1[i];
 	}
 
+#ifdef ID_WIN_X86_SSE_INTRIN
 
 	for ( ; i + 4 <= count; i += 4 ) {
 		assert_16_byte_aligned( &dst[i] );
@@ -94,6 +98,20 @@ static void Multiply_SIMD( float * dst, const float * src0, const float * src1,
 		_mm_store_ps( dst + i, s0 );
 	}
 
+#else
+
+	for ( ; i + 4 <= count; i += 4 ) {
+		assert_16_byte_aligned( &dst[i] );
+		assert_16_byte_aligned( &src0[i] );
+		assert_16_byte_aligned( &src1[i] );
+
+		dst[i+0] = src0[i+0] * src1[i+0];
+		dst[i+1] = src0[i+1] * src1[i+1];
+		dst[i+2] = src0[i+2] * src1[i+2];
+		dst[i+3] = src0[i+3] * src1[i+3];
+	}
+
+#endif
 
 	for ( ; i < count; i++ ) {
 		dst[i] = src0[i] * src1[i];
@@ -115,6 +133,7 @@ static void MultiplyAdd_SIMD( float * dst, const float constant, const float * s
 		dst[i] += constant * src[i];
 	}
 
+#ifdef ID_WIN_X86_SSE_INTRIN
 
 	__m128 c = _mm_load1_ps( & constant );
 	for ( ; i + 4 <= count; i += 4 ) {
@@ -127,6 +146,19 @@ static void MultiplyAdd_SIMD( float * dst, const float constant, const float * s
 		_mm_store_ps( dst + i, s );
 	}
 
+#else
+
+	for ( ; i + 4 <= count; i += 4 ) {
+		assert_16_byte_aligned( &src[i] );
+		assert_16_byte_aligned( &dst[i] );
+
+		dst[i+0] += constant * src[i+0];
+		dst[i+1] += constant * src[i+1];
+		dst[i+2] += constant * src[i+2];
+		dst[i+3] += constant * src[i+3];
+	}
+
+#endif
 
 	for ( ; i < count; i++ ) {
 		dst[i] += constant * src[i];
@@ -144,7 +176,7 @@ static float DotProduct_SIMD( const float * src0, const float * src1, const int
 	assert_16_byte_aligned( src0 );
 	assert_16_byte_aligned( src1 );
 
-#ifndef _lint
+#ifdef ID_WIN_X86_SSE_INTRIN
 
 	__m128 sum = (__m128 &) SIMD_SP_zero;
 	int i = 0;
@@ -266,7 +298,7 @@ static void LowerTriangularSolve_SIMD( const idMatX & L, float * x, const float
 
 	int i = skip;
 
-#ifndef _lint
+#ifdef ID_WIN_X86_SSE_INTRIN
 
 	// work up to a multiple of 4 rows
 	for ( ; ( i & 3 ) != 0 && i < n; i++ ) {
@@ -520,7 +552,7 @@ static void LowerTriangularSolveTranspose_SIMD( const idMatX & L, float * x, con
 	const float * lptr = L.ToFloatPtr() + m * nc + m - 4;
 	float * xptr = x + m;
 
-#ifndef _lint
+#ifdef ID_WIN_X86_SSE2_INTRIN
 
 	// process 4 rows at a time
 	for ( int i = m; i >= 4; i -= 4 ) {
@@ -850,7 +882,7 @@ static bool LDLT_Factor_SIMD( idMatX & mat, idVecX & invDiag, const int n ) {
 		mptr[j*nc+3] = ( mptr[j*nc+3] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] - v[2] * mptr[j*nc+2] ) * d;
 	}
 
-#ifndef _lint
+#ifdef ID_WIN_X86_SSE2_INTRIN
 
 	__m128 vzero = _mm_setzero_ps();
 	for ( int i = 4; i < n; i += 4 ) {
@@ -1210,6 +1242,7 @@ static void GetMaxStep_SIMD( const float * f, const float * a, const float * del
 							const float * lo, const float * hi, const int * side, int numUnbounded, int numClamped,
 							int d, float dir, float & maxStep, int & limit, int & limitSide ) {
 
+#ifdef ID_WIN_X86_SSE2_INTRIN
 
 	__m128 vMaxStep;
 	__m128i vLimit;
@@ -1332,6 +1365,65 @@ static void GetMaxStep_SIMD( const float * f, const float * a, const float * del
 	_mm_store_ss( & maxStep, vMaxStep );
 	limit = _mm_cvtsi128_si32( vLimit );
 	limitSide = _mm_cvtsi128_si32( vLimitSide );
+
+#else
+
+	// default to a full step for the current variable
+	{
+		float negAccel = -a[d];
+		float deltaAccel = delta_a[d];
+		int m0 = ( fabs( deltaAccel ) > LCP_DELTA_ACCEL_EPSILON );
+		float step = negAccel / ( m0 ? deltaAccel : 1.0f );
+		maxStep = m0 ? step : 0.0f;
+		limit = d;
+		limitSide = 0;
+	}
+
+	// test the current variable
+	{
+		float deltaForce = dir;
+		float forceLimit = ( deltaForce < 0.0f ) ? lo[d] : hi[d];
+		float step = ( forceLimit - f[d] ) / deltaForce;
+		int setSide = ( deltaForce < 0.0f ) ? -1 : 1;
+		int m0 = ( fabs( deltaForce ) > LCP_DELTA_FORCE_EPSILON );
+		int m1 = ( fabs( forceLimit ) != idMath::INFINITY );
+		int m2 = ( step < maxStep );
+		int m3 = ( m0 & m1 & m2 );
+		maxStep = m3 ? step : maxStep;
+		limit = m3 ? d : limit;
+		limitSide = m3 ? setSide : limitSide;
+	}
+
+	// test the clamped bounded variables
+	for ( int i = numUnbounded; i < numClamped; i++ ) {
+		float deltaForce = delta_f[i];
+		float forceLimit = ( deltaForce < 0.0f ) ? lo[i] : hi[i];
+		int m0 = ( fabs( deltaForce ) > LCP_DELTA_FORCE_EPSILON );
+		float step = ( forceLimit - f[i] ) / ( m0 ? deltaForce : 1.0f );
+		int setSide = ( deltaForce < 0.0f ) ? -1 : 1;
+		int m1 = ( fabs( forceLimit ) != idMath::INFINITY );
+		int m2 = ( step < maxStep );
+		int m3 = ( m0 & m1 & m2 );
+		maxStep = m3 ? step : maxStep;
+		limit = m3 ? i : limit;
+		limitSide = m3 ? setSide : limitSide;
+	}
+
+	// test the not clamped bounded variables
+	for ( int i = numClamped; i < d; i++ ) {
+		float negAccel = -a[i];
+		float deltaAccel = delta_a[i];
+		int m0 = ( side[i] * deltaAccel > LCP_DELTA_ACCEL_EPSILON );
+		float step = negAccel / ( m0 ? deltaAccel : 1.0f );
+		int m1 = ( lo[i] < -LCP_BOUND_EPSILON || hi[i] > LCP_BOUND_EPSILON );
+		int m2 = ( step < maxStep );
+		int m3 = ( m0 & m1 & m2 );
+		maxStep = m3 ? step : maxStep;
+		limit = m3 ? i : limit;
+		limitSide = m3 ? 0 : limitSide;
+	}
+
+#endif
 }
 
 /*
diff --git a/neo/idlib/math/MatX.cpp b/neo/idlib/math/MatX.cpp
index 78cbff33..dc1702c1 100644
--- a/neo/idlib/math/MatX.cpp
+++ b/neo/idlib/math/MatX.cpp
@@ -171,6 +171,7 @@ void idMatX::CopyLowerToUpperTriangle() {
 	assert( ( GetNumColumns() & 3 ) == 0 );
 	assert( GetNumColumns() >= GetNumRows() );
 
+#ifdef ID_WIN_X86_SSE_INTRIN
 
 	const int n = GetNumColumns();
 	const int m = GetNumRows();
@@ -307,6 +308,20 @@ void idMatX::CopyLowerToUpperTriangle() {
 		_mm_store_ps( basePtr + n0, r0 );
 	}
 
+#else
+
+	const int n = GetNumColumns();
+	const int m = GetNumRows();
+	for ( int i = 0; i < m; i++ ) {
+		const float * __restrict ptr = ToFloatPtr() + ( i + 1 ) * n + i;
+		float * __restrict dstPtr = ToFloatPtr() + i * n;
+		for ( int j = i + 1; j < m; j++ ) {
+			dstPtr[j] = ptr[0];
+			ptr += n;
+		}
+	}
+
+#endif
 
 #ifdef _DEBUG
 	for ( int i = 0; i < numRows; i++ ) {
diff --git a/neo/idlib/math/MatX.h b/neo/idlib/math/MatX.h
index 059a5f3f..876a9091 100644
--- a/neo/idlib/math/MatX.h
+++ b/neo/idlib/math/MatX.h
@@ -389,7 +389,7 @@ idMatX::operator=
 ID_INLINE idMatX &idMatX::operator=( const idMatX &a ) {
 	SetSize( a.numRows, a.numColumns );
 	int s = a.numRows * a.numColumns;
-#ifdef MATX_SIMD
+#if defined(ID_WIN_X86_SSE_INTRIN) && defined(MATX_SIMD)
 	for ( int i = 0; i < s; i += 4 ) {
 		_mm_store_ps( mat + i, _mm_load_ps( a.mat + i ) );
 	}
@@ -410,7 +410,7 @@ ID_INLINE idMatX idMatX::operator*( const float a ) const {
 
 	m.SetTempSize( numRows, numColumns );
 	int s = numRows * numColumns;
-#ifdef MATX_SIMD
+#if defined(ID_WIN_X86_SSE_INTRIN) && defined(MATX_SIMD)
 	__m128 va = _mm_load1_ps( & a );
 	for ( int i = 0; i < s; i += 4 ) {
 		_mm_store_ps( m.mat + i, _mm_mul_ps( _mm_load_ps( mat + i ), va ) );
@@ -462,7 +462,7 @@ ID_INLINE idMatX idMatX::operator+( const idMatX &a ) const {
 	assert( numRows == a.numRows && numColumns == a.numColumns );
 	m.SetTempSize( numRows, numColumns );
 	int s = numRows * numColumns;
-#ifdef MATX_SIMD
+#if defined(ID_WIN_X86_SSE_INTRIN) && defined(MATX_SIMD)
 	for ( int i = 0; i < s; i += 4 ) {
 		_mm_store_ps( m.mat + i, _mm_add_ps( _mm_load_ps( mat + i ), _mm_load_ps( a.mat + i ) ) );
 	}
@@ -485,7 +485,7 @@ ID_INLINE idMatX idMatX::operator-( const idMatX &a ) const {
 	assert( numRows == a.numRows && numColumns == a.numColumns );
 	m.SetTempSize( numRows, numColumns );
 	int s = numRows * numColumns;
-#ifdef MATX_SIMD
+#if defined(ID_WIN_X86_SSE_INTRIN) && defined(MATX_SIMD)
 	for ( int i = 0; i < s; i += 4 ) {
 		_mm_store_ps( m.mat + i, _mm_sub_ps( _mm_load_ps( mat + i ), _mm_load_ps( a.mat + i ) ) );
 	}
@@ -504,7 +504,7 @@ idMatX::operator*=
 */
 ID_INLINE idMatX &idMatX::operator*=( const float a ) {
 	int s = numRows * numColumns;
-#ifdef MATX_SIMD
+#if defined(ID_WIN_X86_SSE_INTRIN) && defined(MATX_SIMD)
 	__m128 va = _mm_load1_ps( & a );
 	for ( int i = 0; i < s; i += 4 ) {
 		_mm_store_ps( mat + i, _mm_mul_ps( _mm_load_ps( mat + i ), va ) );
@@ -537,7 +537,7 @@ idMatX::operator+=
 ID_INLINE idMatX &idMatX::operator+=( const idMatX &a ) {
 	assert( numRows == a.numRows && numColumns == a.numColumns );
 	int s = numRows * numColumns;
-#ifdef MATX_SIMD
+#if defined(ID_WIN_X86_SSE_INTRIN) && defined(MATX_SIMD)
 	for ( int i = 0; i < s; i += 4 ) {
 		_mm_store_ps( mat + i, _mm_add_ps( _mm_load_ps( mat + i ), _mm_load_ps( a.mat + i ) ) );
 	}
@@ -558,7 +558,7 @@ idMatX::operator-=
 ID_INLINE idMatX &idMatX::operator-=( const idMatX &a ) {
 	assert( numRows == a.numRows && numColumns == a.numColumns );
 	int s = numRows * numColumns;
-#ifdef MATX_SIMD
+#if defined(ID_WIN_X86_SSE_INTRIN) && defined(MATX_SIMD)
 	for ( int i = 0; i < s; i += 4 ) {
 		_mm_store_ps( mat + i, _mm_sub_ps( _mm_load_ps( mat + i ), _mm_load_ps( a.mat + i ) ) );
 	}
@@ -744,7 +744,7 @@ idMatX::Zero
 */
 ID_INLINE void idMatX::Zero() {
 	int s = numRows * numColumns;
-#ifdef MATX_SIMD
+#if defined(ID_WIN_X86_SSE_INTRIN) && defined(MATX_SIMD)
 	for ( int i = 0; i < s; i += 4 ) {
 		_mm_store_ps( mat + i, _mm_setzero_ps() );
 	}
@@ -838,7 +838,7 @@ idMatX::Negate
 */
 ID_INLINE void idMatX::Negate() {
 	int s = numRows * numColumns;
-#ifdef MATX_SIMD
+#if defined(ID_WIN_X86_SSE_INTRIN) && defined(MATX_SIMD)
 	ALIGN16( const unsigned int signBit[4] ) = { IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK };
 	for ( int i = 0; i < s; i += 4 ) {
 		_mm_store_ps( mat + i, _mm_xor_ps( _mm_load_ps( mat + i ), (__m128 &) signBit[0] ) );
diff --git a/neo/idlib/math/Math.cpp b/neo/idlib/math/Math.cpp
index ce338b5a..7f7d44e5 100644
--- a/neo/idlib/math/Math.cpp
+++ b/neo/idlib/math/Math.cpp
@@ -51,6 +51,7 @@ const float	idMath::INFINITY		= 1e30f;
 const float idMath::FLT_EPSILON		= 1.192092896e-07f;
 const float idMath::FLT_SMALLEST_NON_DENORMAL	= * reinterpret_cast< const float * >( & SMALLEST_NON_DENORMAL );	// 1.1754944e-038f
 
+#if defined( ID_WIN_X86_SSE_INTRIN )
 const __m128 idMath::SIMD_SP_zero				= { 0.0f, 0.0f, 0.0f, 0.0f };
 const __m128 idMath::SIMD_SP_255				= { 255.0f, 255.0f, 255.0f, 255.0f };
 const __m128 idMath::SIMD_SP_min_char			= { -128.0f, -128.0f, -128.0f, -128.0f };
@@ -61,6 +62,7 @@ const __m128 idMath::SIMD_SP_smallestNonDenorm	= { FLT_SMALLEST_NON_DENORMAL, FL
 const __m128 idMath::SIMD_SP_tiny				= { 1e-4f, 1e-4f, 1e-4f, 1e-4f };
 const __m128 idMath::SIMD_SP_rsqrt_c0			= { 3.0f, 3.0f, 3.0f, 3.0f };
 const __m128 idMath::SIMD_SP_rsqrt_c1			= { -0.5f, -0.5f, -0.5f, -0.5f };
+#endif
 
 bool		idMath::initialized		= false;
 dword		idMath::iSqrt[SQRT_TABLE_SIZE];		// inverse square root lookup table
diff --git a/neo/idlib/math/Math.h b/neo/idlib/math/Math.h
index d08fcac4..abf90b5b 100644
--- a/neo/idlib/math/Math.h
+++ b/neo/idlib/math/Math.h
@@ -419,6 +419,7 @@ public:
 	static const float			FLT_EPSILON;				// smallest positive number such that 1.0+FLT_EPSILON != 1.0
 	static const float			FLT_SMALLEST_NON_DENORMAL;	// smallest non-denormal 32-bit floating point value
 
+#if defined( ID_WIN_X86_SSE_INTRIN )
 	static const __m128				SIMD_SP_zero;
 	static const __m128				SIMD_SP_255;
 	static const __m128				SIMD_SP_min_char;
@@ -429,6 +430,7 @@ public:
 	static const __m128				SIMD_SP_tiny;
 	static const __m128				SIMD_SP_rsqrt_c0;
 	static const __m128				SIMD_SP_rsqrt_c1;
+#endif
 
 private:
 	enum {
@@ -460,9 +462,15 @@ idMath::InvSqrt
 ========================
 */
 ID_INLINE float idMath::InvSqrt( float x ) {
+#ifdef ID_WIN_X86_SSE_INTRIN
 
 	return ( x > FLT_SMALLEST_NON_DENORMAL ) ? sqrtf( 1.0f / x ) : INFINITY;
 
+#else
+
+	return ( x > FLT_SMALLEST_NON_DENORMAL ) ? sqrtf( 1.0f / x ) : INFINITY;
+
+#endif
 }
 
 /*
@@ -471,9 +479,15 @@ idMath::InvSqrt16
 ========================
 */
 ID_INLINE float idMath::InvSqrt16( float x ) {
+#ifdef ID_WIN_X86_SSE_INTRIN
 
 	return ( x > FLT_SMALLEST_NON_DENORMAL ) ? sqrtf( 1.0f / x ) : INFINITY;
 
+#else
+
+	return ( x > FLT_SMALLEST_NON_DENORMAL ) ? sqrtf( 1.0f / x ) : INFINITY;
+
+#endif
 }
 
 /*
@@ -482,7 +496,11 @@ idMath::Sqrt
 ========================
 */
 ID_INLINE float idMath::Sqrt( float x ) {
+#ifdef ID_WIN_X86_SSE_INTRIN
 	return ( x >= 0.0f ) ?  x * InvSqrt( x ) : 0.0f;
+#else
+	return ( x >= 0.0f ) ? sqrtf( x ) : 0.0f;
+#endif
 }
 
 /*
@@ -491,7 +509,11 @@ idMath::Sqrt16
 ========================
 */
 ID_INLINE float idMath::Sqrt16( float x ) {
+#ifdef ID_WIN_X86_SSE_INTRIN
 	return ( x >= 0.0f ) ?  x * InvSqrt16( x ) : 0.0f;
+#else
+	return ( x >= 0.0f ) ? sqrtf( x ) : 0.0f;
+#endif
 }
 
 /*
@@ -601,6 +623,7 @@ idMath::SinCos
 ========================
 */
 ID_INLINE void idMath::SinCos( float a, float &s, float &c ) {
+#if defined( ID_WIN_X86_ASM )
 	_asm {
 		fld		a
 		fsincos
@@ -609,6 +632,10 @@ ID_INLINE void idMath::SinCos( float a, float &s, float &c ) {
 		fstp	dword ptr [ecx]
 		fstp	dword ptr [edx]
 	}
+#else
+	s = sinf( a );
+	c = cosf( a );
+#endif
 }
 
 /*
@@ -1128,11 +1155,24 @@ idMath::Ftoi
 ========================
 */
 ID_INLINE int idMath::Ftoi( float f ) {
+#ifdef ID_WIN_X86_SSE_INTRIN
 	// If a converted result is larger than the maximum signed doubleword integer,
 	// the floating-point invalid exception is raised, and if this exception is masked,
 	// the indefinite integer value (80000000H) is returned.
 	__m128 x = _mm_load_ss( &f );
 	return _mm_cvttss_si32( x );
+#elif 0 // round chop (C/C++ standard)
+	int i, s, e, m, shift;
+	i = *reinterpret_cast<int *>(&f);
+	s = i >> IEEE_FLT_SIGN_BIT;
+	e = ( ( i >> IEEE_FLT_MANTISSA_BITS ) & ( ( 1 << IEEE_FLT_EXPONENT_BITS ) - 1 ) ) - IEEE_FLT_EXPONENT_BIAS;
+	m = ( i & ( ( 1 << IEEE_FLT_MANTISSA_BITS ) - 1 ) ) | ( 1 << IEEE_FLT_MANTISSA_BITS );
+	shift = e - IEEE_FLT_MANTISSA_BITS;
+	return ( ( ( ( m >> -shift ) | ( m << shift ) ) & ~( e >> INT32_SIGN_BIT ) ) ^ s ) - s;
+#else
+	// If a converted result is larger than the maximum signed doubleword integer the result is undefined.
+	return C_FLOAT_TO_INT( f );
+#endif
 }
 
 /*
@@ -1141,10 +1181,21 @@ idMath::Ftoi8
 ========================
 */
 ID_INLINE char idMath::Ftoi8( float f ) {
+#ifdef ID_WIN_X86_SSE_INTRIN
 	__m128 x = _mm_load_ss( &f );
 	x = _mm_max_ss( x, SIMD_SP_min_char );
 	x = _mm_min_ss( x, SIMD_SP_max_char );
 	return static_cast<char>( _mm_cvttss_si32( x ) );
+#else
+	// Clamp in the float domain (like the SSE path) before converting, because an
+	// out-of-range float-to-int conversion is undefined; NaN yields -128.
+	if ( !( f > -128.0f ) ) {
+		return -128;
+	} else if ( f > 127.0f ) {
+		return 127;
+	}
+	return static_cast<char>( C_FLOAT_TO_INT( f ) );
+#endif
 }
 
 /*
@@ -1153,10 +1204,21 @@ idMath::Ftoi16
 ========================
 */
 ID_INLINE short idMath::Ftoi16( float f ) {
+#ifdef ID_WIN_X86_SSE_INTRIN
 	__m128 x = _mm_load_ss( &f );
 	x = _mm_max_ss( x, SIMD_SP_min_short );
 	x = _mm_min_ss( x, SIMD_SP_max_short );
 	return static_cast<short>( _mm_cvttss_si32( x ) );
+#else
+	// Clamp in the float domain (like the SSE path) before converting, because an
+	// out-of-range float-to-int conversion is undefined; NaN yields -32768.
+	if ( !( f > -32768.0f ) ) {
+		return -32768;
+	} else if ( f > 32767.0f ) {
+		return 32767;
+	}
+	return static_cast<short>( C_FLOAT_TO_INT( f ) );
+#endif
 }
 
 /*
@@ -1183,12 +1245,23 @@ idMath::Ftob
 ========================
 */
 ID_INLINE byte idMath::Ftob( float f ) {
+#ifdef ID_WIN_X86_SSE_INTRIN
 	// If a converted result is negative the value (0) is returned and if the
 	// converted result is larger than the maximum byte the value (255) is returned.
 	__m128 x = _mm_load_ss( &f );
 	x = _mm_max_ss( x, SIMD_SP_zero );
 	x = _mm_min_ss( x, SIMD_SP_255 );
 	return static_cast<byte>( _mm_cvttss_si32( x ) );
+#else
+	// Clamp in the float domain (like the SSE path) before converting, because an
+	// out-of-range float-to-int conversion is undefined; NaN yields 0.
+	if ( !( f > 0.0f ) ) {
+		return 0;
+	} else if ( f > 255.0f ) {
+		return 255;
+	}
+	return static_cast<byte>( C_FLOAT_TO_INT( f ) );
+#endif
 }
 
 /*
diff --git a/neo/idlib/math/VecX.h b/neo/idlib/math/VecX.h
index 4086a1a9..e8ff421a 100644
--- a/neo/idlib/math/VecX.h
+++ b/neo/idlib/math/VecX.h
@@ -213,7 +213,7 @@ ID_INLINE idVecX idVecX::operator-() const {
 	idVecX m;
 
 	m.SetTempSize( size );
-#ifdef VECX_SIMD
+#if defined(ID_WIN_X86_SSE_INTRIN) && defined(VECX_SIMD)
 	ALIGN16( unsigned int signBit[4] ) = { IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK };
 	for ( int i = 0; i < size; i += 4 ) {
 		_mm_store_ps( m.p + i, _mm_xor_ps( _mm_load_ps( p + i ), (__m128 &) signBit[0] ) );
@@ -233,7 +233,7 @@ idVecX::operator=
 */
 ID_INLINE idVecX &idVecX::operator=( const idVecX &a ) { 
 	SetSize( a.size );
-#ifdef VECX_SIMD
+#if defined(ID_WIN_X86_SSE_INTRIN) && defined(VECX_SIMD)
 	for ( int i = 0; i < a.size; i += 4 ) {
 		_mm_store_ps( p + i, _mm_load_ps( a.p + i ) );
 	}
@@ -254,7 +254,7 @@ ID_INLINE idVecX idVecX::operator+( const idVecX &a ) const {
 
 	assert( size == a.size );
 	m.SetTempSize( size );
-#ifdef VECX_SIMD
+#if defined(ID_WIN_X86_SSE_INTRIN) && defined(VECX_SIMD)
 	for ( int i = 0; i < size; i += 4 ) {
 		_mm_store_ps( m.p + i, _mm_add_ps( _mm_load_ps( p + i ), _mm_load_ps( a.p + i ) ) );
 	}
@@ -276,7 +276,7 @@ ID_INLINE idVecX idVecX::operator-( const idVecX &a ) const {
 
 	assert( size == a.size );
 	m.SetTempSize( size );
-#ifdef VECX_SIMD
+#if defined(ID_WIN_X86_SSE_INTRIN) && defined(VECX_SIMD)
 	for ( int i = 0; i < size; i += 4 ) {
 		_mm_store_ps( m.p + i, _mm_sub_ps( _mm_load_ps( p + i ), _mm_load_ps( a.p + i ) ) );
 	}
@@ -295,7 +295,7 @@ idVecX::operator+=
 */
 ID_INLINE idVecX &idVecX::operator+=( const idVecX &a ) {
 	assert( size == a.size );
-#ifdef VECX_SIMD
+#if defined(ID_WIN_X86_SSE_INTRIN) && defined(VECX_SIMD)
 	for ( int i = 0; i < size; i += 4 ) {
 		_mm_store_ps( p + i, _mm_add_ps( _mm_load_ps( p + i ), _mm_load_ps( a.p + i ) ) );
 	}
@@ -315,7 +315,7 @@ idVecX::operator-=
 */
 ID_INLINE idVecX &idVecX::operator-=( const idVecX &a ) {
 	assert( size == a.size );
-#ifdef VECX_SIMD
+#if defined(ID_WIN_X86_SSE_INTRIN) && defined(VECX_SIMD)
 	for ( int i = 0; i < size; i += 4 ) {
 		_mm_store_ps( p + i, _mm_sub_ps( _mm_load_ps( p + i ), _mm_load_ps( a.p + i ) ) );
 	}
@@ -337,7 +337,7 @@ ID_INLINE idVecX idVecX::operator*( const float a ) const {
 	idVecX m;
 
 	m.SetTempSize( size );
-#ifdef VECX_SIMD
+#if defined(ID_WIN_X86_SSE_INTRIN) && defined(VECX_SIMD)
 	__m128 va = _mm_load1_ps( & a );
 	for ( int i = 0; i < size; i += 4 ) {
 		_mm_store_ps( m.p + i, _mm_mul_ps( _mm_load_ps( p + i ), va ) );
@@ -356,7 +356,7 @@ idVecX::operator*=
 ========================
 */
 ID_INLINE idVecX &idVecX::operator*=( const float a ) {
-#ifdef VECX_SIMD
+#if defined(ID_WIN_X86_SSE_INTRIN) && defined(VECX_SIMD)
 	__m128 va = _mm_load1_ps( & a );
 	for ( int i = 0; i < size; i += 4 ) {
 		_mm_store_ps( p + i, _mm_mul_ps( _mm_load_ps( p + i ), va ) );
@@ -551,7 +551,7 @@ idVecX::Zero
 ========================
 */
 ID_INLINE void idVecX::Zero() {
-#ifdef VECX_SIMD
+#if defined(ID_WIN_X86_SSE_INTRIN) && defined(VECX_SIMD)
 	for ( int i = 0; i < size; i += 4 ) {
 		_mm_store_ps( p + i, _mm_setzero_ps() );
 	}
@@ -567,7 +567,7 @@ idVecX::Zero
 */
 ID_INLINE void idVecX::Zero( int length ) {
 	SetSize( length );
-#ifdef VECX_SIMD
+#if defined(ID_WIN_X86_SSE_INTRIN) && defined(VECX_SIMD)
 	for ( int i = 0; i < length; i += 4 ) {
 		_mm_store_ps( p + i, _mm_setzero_ps() );
 	}
@@ -611,7 +611,7 @@ idVecX::Negate
 ========================
 */
 ID_INLINE void idVecX::Negate() {
-#ifdef VECX_SIMD
+#if defined(ID_WIN_X86_SSE_INTRIN) && defined(VECX_SIMD)
 	ALIGN16( const unsigned int signBit[4] ) = { IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK };
 	for ( int i = 0; i < size; i += 4 ) {
 		_mm_store_ps( p + i, _mm_xor_ps( _mm_load_ps( p + i ), (__m128 &) signBit[0] ) );
diff --git a/neo/idlib/math/Vector.h b/neo/idlib/math/Vector.h
index f194618d..c0a010ea 100644
--- a/neo/idlib/math/Vector.h
+++ b/neo/idlib/math/Vector.h
@@ -451,6 +451,10 @@ ID_INLINE idVec3 operator*( const float a, const idVec3 b ) {
 	return idVec3( b.x * a, b.y * a, b.z * a );
 }
 
+ID_INLINE idVec3 operator/( const float a, const idVec3 b ) {
+	return idVec3( a / b.x, a / b.y, a / b.z );
+}
+
 ID_INLINE idVec3 idVec3::operator+( const idVec3 &a ) const {
 	return idVec3( x + a.x, y + a.y, z + a.z );
 }
diff --git a/neo/idlib/sys/sys_defines.h b/neo/idlib/sys/sys_defines.h
index 4a052a00..b20e6169 100644
--- a/neo/idlib/sys/sys_defines.h
+++ b/neo/idlib/sys/sys_defines.h
@@ -28,6 +28,58 @@ If you have questions concerning this license or the applicable additional terms
 #ifndef SYS_DEFINES_H
 #define SYS_DEFINES_H
 
+/*
+================================================================================================
+
+	Platform Specific ID_ Defines
+
+	The ID_ defines are the only platform defines we should be using.
+
+================================================================================================
+*/
+
+#undef ID_PC
+#undef ID_PC_WIN
+#undef ID_PC_WIN64
+#undef ID_CONSOLE
+#undef ID_WIN32
+#undef ID_LITTLE_ENDIAN
+
+#if defined(_WIN32)
+	// _WIN32 always defined
+	// _WIN64 also defined for x64 target
+/*
+	#if !defined( _MANAGED )
+		#if !defined( _WIN64 )
+			#define ID_WIN_X86_ASM
+			#define ID_WIN_X86_MMX_ASM
+			#define ID_WIN_X86_MMX_INTRIN
+			#define ID_WIN_X86_SSE_ASM
+			#define ID_WIN_X86_SSE_INTRIN
+			#define ID_WIN_X86_SSE2_ASM
+			#define ID_WIN_X86_SSE2_INTRIN
+			// the 32 bit build is now as close to the console builds as possible
+			#define ID_CONSOLE
+		#else
+			#define ID_PC_WIN64
+			#define ID_WIN_X86_MMX_INTRIN
+			#define ID_WIN_X86_SSE_INTRIN
+			#define ID_WIN_X86_SSE2_INTRIN
+			#define ID_WIN_X86_SSE3_INTRIN
+		#endif
+	#endif
+*/
+
+	#define ID_PC
+	#define ID_PC_WIN
+	#define ID_WIN32
+	#define ID_LITTLE_ENDIAN
+#else
+#error Unknown Platform
+#endif
+
+#define ID_OPENGL
+
 /*
 ================================================================================================
 
@@ -36,6 +88,7 @@ If you have questions concerning this license or the applicable additional terms
 ================================================================================================
 */
 
+#ifdef ID_PC_WIN
 
 #define	CPUSTRING						"x86"
 
@@ -69,6 +122,8 @@ If you have questions concerning this license or the applicable additional terms
 	#define WIN32
 #endif
 
+#endif
+
 /*
 ================================================================================================
 
@@ -108,6 +163,8 @@ bulk of the codebase, so it is the best place for analyze pragmas.
 ================================================================================================
 */
 
+#if defined( ID_WIN32 )
+
 // disable some /analyze warnings here
 #pragma warning( disable: 6255 )	// warning C6255: _alloca indicates failure by raising a stack overflow exception. Consider using _malloca instead. (Note: _malloca requires _freea.)
 #pragma warning( disable: 6262 )	// warning C6262: Function uses '36924' bytes of stack: exceeds /analyze:stacksize'32768'. Consider moving some data to heap
@@ -135,6 +192,7 @@ bulk of the codebase, so it is the best place for analyze pragmas.
 // guaranteed to be false in the following code
 #define NO_RETURN __declspec(noreturn)
 
+#endif
 
 // I don't want to disable "warning C6031: Return value ignored" from /analyze
 // but there are several cases with sprintf where we pre-initialized the variables
diff --git a/neo/idlib/sys/sys_intrinsics.h b/neo/idlib/sys/sys_intrinsics.h
index 12ad78dd..93e5a515 100644
--- a/neo/idlib/sys/sys_intrinsics.h
+++ b/neo/idlib/sys/sys_intrinsics.h
@@ -56,6 +56,8 @@ ID_INLINE_EXTERN float __frndz( float x )						{	return (float)( (int)( x ) ); }
 ================================================================================================
 */
 
+#ifdef ID_WIN_X86_SSE2_INTRIN
+
 // The code below assumes that a cache line is 64 bytes.
 // We specify the cache line size as 128 here to make the code consistent with the consoles.
 #define CACHE_LINE_SIZE						128
@@ -84,6 +86,24 @@ ID_FORCE_INLINE void FlushCacheLine( const void * ptr, int offset ) {
 	_mm_clflush( bytePtr + 64 );
 }
 
+/*
+================================================
+	Other
+================================================
+*/
+#else
+
+#define CACHE_LINE_SIZE						128
+
+ID_INLINE void Prefetch( const void * ptr, int offset ) {}
+ID_INLINE void ZeroCacheLine( void * ptr, int offset ) {
+	byte * bytePtr = (byte *)( ( ( (UINT_PTR) ( ptr ) ) + ( offset ) ) & ~( CACHE_LINE_SIZE - 1 ) );
+	memset( bytePtr, 0, CACHE_LINE_SIZE );
+}
+ID_INLINE void FlushCacheLine( const void * ptr, int offset ) {}
+
+#endif
+
 /*
 ================================================
 	Block Clear Macros
diff --git a/neo/renderer/BufferObject.cpp b/neo/renderer/BufferObject.cpp
index 2e42d2fb..4a537aa1 100644
--- a/neo/renderer/BufferObject.cpp
+++ b/neo/renderer/BufferObject.cpp
@@ -72,6 +72,7 @@ void UnbindBufferObjects() {
 	qglBindBufferARB( GL_ELEMENT_ARRAY_BUFFER_ARB, 0 );
 }
 
+#ifdef ID_WIN_X86_SSE2_INTRIN
 
 void CopyBuffer( byte * dst, const byte * src, int numBytes ) {
 	assert_16_byte_aligned( dst );
@@ -109,6 +110,15 @@ void CopyBuffer( byte * dst, const byte * src, int numBytes ) {
 	_mm_sfence();
 }
 
+#else
+
+void CopyBuffer( byte * dst, const byte * src, int numBytes ) {
+	assert_16_byte_aligned( dst );
+	assert_16_byte_aligned( src );
+	memcpy( dst, src, numBytes );
+}
+
+#endif
 
 /*
 ================================================================================================
diff --git a/neo/renderer/DXT/DXTCodec.h b/neo/renderer/DXT/DXTCodec.h
index cd84d33a..76d76daf 100644
--- a/neo/renderer/DXT/DXTCodec.h
+++ b/neo/renderer/DXT/DXTCodec.h
@@ -258,7 +258,11 @@ idDxtEncoder::CompressImageDXT1Fast
 ========================
 */
 ID_INLINE void idDxtEncoder::CompressImageDXT1Fast( const byte *inBuf, byte *outBuf, int width, int height ) {
+#ifdef ID_WIN_X86_SSE2_INTRIN
 	CompressImageDXT1Fast_SSE2( inBuf, outBuf, width, height );
+#else
+	CompressImageDXT1Fast_Generic( inBuf, outBuf, width, height );
+#endif
 }
 
 /*
@@ -267,7 +271,11 @@ idDxtEncoder::CompressImageDXT1AlphaFast
 ========================
 */
 ID_INLINE void idDxtEncoder::CompressImageDXT1AlphaFast( const byte *inBuf, byte *outBuf, int width, int height ) {
+#ifdef ID_WIN_X86_SSE2_INTRIN
 	CompressImageDXT1AlphaFast_SSE2( inBuf, outBuf, width, height );
+#else
+	CompressImageDXT1AlphaFast_Generic( inBuf, outBuf, width, height );
+#endif
 }
 
 /*
@@ -276,7 +284,11 @@ idDxtEncoder::CompressImageDXT5Fast
 ========================
 */
 ID_INLINE void idDxtEncoder::CompressImageDXT5Fast( const byte *inBuf, byte *outBuf, int width, int height ) {
+#ifdef ID_WIN_X86_SSE2_INTRIN
 	CompressImageDXT5Fast_SSE2( inBuf, outBuf, width, height );
+#else
+	CompressImageDXT5Fast_Generic( inBuf, outBuf, width, height );
+#endif
 }
 
 /*
@@ -294,7 +306,11 @@ idDxtEncoder::CompressYCoCgDXT5Fast
 ========================
 */
 ID_INLINE void idDxtEncoder::CompressYCoCgDXT5Fast( const byte *inBuf, byte *outBuf, int width, int height ) {
+#ifdef ID_WIN_X86_SSE2_INTRIN
 	CompressYCoCgDXT5Fast_SSE2( inBuf, outBuf, width, height );
+#else
+	CompressYCoCgDXT5Fast_Generic( inBuf, outBuf, width, height );
+#endif
 }
 
 /*
@@ -312,7 +328,11 @@ idDxtEncoder::CompressNormalMapDXT5Fast
 ========================
 */
 ID_INLINE void idDxtEncoder::CompressNormalMapDXT5Fast( const byte *inBuf, byte *outBuf, int width, int height ) {
+#ifdef ID_WIN_X86_SSE2_INTRIN
 	CompressNormalMapDXT5Fast_SSE2( inBuf, outBuf, width, height );
+#else
+	CompressNormalMapDXT5Fast_Generic( inBuf, outBuf, width, height );
+#endif
 }
 
 /*
diff --git a/neo/renderer/DXT/DXTEncoder.cpp b/neo/renderer/DXT/DXTEncoder.cpp
index 3336008e..411edc3f 100644
--- a/neo/renderer/DXT/DXTEncoder.cpp
+++ b/neo/renderer/DXT/DXTEncoder.cpp
@@ -52,6 +52,7 @@ idDxtEncoder::NV4XHardwareBugFix
 ========================
 */
 void idDxtEncoder::NV4XHardwareBugFix( byte *minColor, byte *maxColor ) const {
+#ifdef ID_WIN_X86_ASM
 	int minq = ( ( minColor[0] << 16 ) | ( minColor[1] << 8 ) | minColor[2] ) & 0x00F8FCF8;
 	int maxq = ( ( maxColor[0] << 16 ) | ( maxColor[1] << 8 ) | maxColor[2] ) & 0x00F8FCF8;
 	int mask = -( minq > maxq ) & 0x00FFFFFF;
@@ -62,6 +63,13 @@ void idDxtEncoder::NV4XHardwareBugFix( byte *minColor, byte *maxColor ) const {
 	min ^= max;
 	*(int *)minColor = min;
 	*(int *)maxColor = max;
+#else
+	if ( ColorTo565( minColor ) > ColorTo565( maxColor ) ) {
+		SwapValues( minColor[0], maxColor[0] );
+		SwapValues( minColor[1], maxColor[1] );
+		SwapValues( minColor[2], maxColor[2] );
+	}
+#endif
 }
 
 /*
@@ -950,6 +958,7 @@ int idDxtEncoder::GetMinMaxNormalYHQ( const byte *colorBlock, byte *minColor, by
 	return bestError;
 }
 
+#if defined( ID_WIN_X86_ASM )	// the constant tables and R_SHUFFLE_PS below are compiled only for inline-asm builds
 ALIGN16( static float SIMD_SSE2_float_scale[4] ) = { 2.0f / 255.0f, 2.0f / 255.0f, 2.0f / 255.0f, 2.0f / 255.0f };
 ALIGN16( static float SIMD_SSE2_float_descale[4] ) = { 255.0f / 2.0f, 255.0f / 2.0f, 255.0f / 2.0f, 255.0f / 2.0f };
 ALIGN16( static float SIMD_SSE2_float_zero[4] ) = { 0.0f, 0.0f, 0.0f, 0.0f };
@@ -961,6 +970,7 @@ ALIGN16( static float SIMD_SP_rsqrt_c1[4] ) = { -0.5f, -0.5f, -0.5f, -0.5f };
 ALIGN16( static dword SIMD_SSE2_dword_maskFirstThree[4] ) = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 };
 ALIGN16( static dword SIMD_SSE2_dword_maskWords[4] ) = { 0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x00000000 };
 #define R_SHUFFLE_PS( x, y, z, w )	(( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
+#endif	// ID_WIN_X86_ASM
 
 /*
 ========================
@@ -968,6 +978,7 @@ NormalDistanceDXT1
 ========================
 */
 int NormalDistanceDXT1( const int *vector, const int *normalized ) {
+#if defined( ID_WIN_X86_ASM )	// inline-asm implementation
 	int result;
 	__asm {
 		mov			esi, vector
@@ -1007,6 +1018,24 @@ int NormalDistanceDXT1( const int *vector, const int *normalized ) {
 		movd		result, xmm0
 	}
 	return result;
+#else	// generic implementation: renormalize 'vector' and measure its squared distance to 'normalized'
+	float floatNormal[3];
+	byte intNormal[4];	// only elements 0..2 are written and read below
+	floatNormal[0] = vector[0] * ( 2.0f / 255.0f ) - 1.0f;	// map a [0, 255] component to [-1, 1]
+	floatNormal[1] = vector[1] * ( 2.0f / 255.0f ) - 1.0f;
+	floatNormal[2] = vector[2] * ( 2.0f / 255.0f ) - 1.0f;
+	float rcplen = idMath::InvSqrt( floatNormal[0] * floatNormal[0] + floatNormal[1] * floatNormal[1] + floatNormal[2] * floatNormal[2] );	// presumably 1 / length - confirm idMath::InvSqrt
+	floatNormal[0] *= rcplen;	// scale to unit length
+	floatNormal[1] *= rcplen;
+	floatNormal[2] *= rcplen;
+	intNormal[0] = idMath::Ftob( ( floatNormal[0] + 1.0f ) * ( 255.0f / 2.0f ) + 0.5f );	// map [-1, 1] back to a byte; the + 0.5f assumes Ftob truncates - TODO confirm
+	intNormal[1] = idMath::Ftob( ( floatNormal[1] + 1.0f ) * ( 255.0f / 2.0f ) + 0.5f );
+	intNormal[2] = idMath::Ftob( ( floatNormal[2] + 1.0f ) * ( 255.0f / 2.0f ) + 0.5f );
+	int result =	( ( intNormal[ 0 ] - normalized[ 0 ] ) * ( intNormal[ 0 ] - normalized[ 0 ] ) ) +
+					( ( intNormal[ 1 ] - normalized[ 1 ] ) * ( intNormal[ 1 ] - normalized[ 1 ] ) ) +
+					( ( intNormal[ 2 ] - normalized[ 2 ] ) * ( intNormal[ 2 ] - normalized[ 2 ] ) );	// sum of squared per-component differences
+	return result;
+#endif	// ID_WIN_X86_ASM
 }
 
 /*
@@ -1015,6 +1044,7 @@ NormalDistanceDXT5
 ========================
 */
 int NormalDistanceDXT5( const int *vector, const int *normalized ) {
+#if defined( ID_WIN_X86_ASM )	// inline-asm implementation
 	int result;
 	__asm {
 		mov			esi, vector
@@ -1064,6 +1094,33 @@ int NormalDistanceDXT5( const int *vector, const int *normalized ) {
 		movd		result, xmm0
 	}
 	return result;
+#else	// generic implementation: same scheme as the DXT1 variant, but on selectable input channels
+#if 0	// object-space
+	const int c0 = 0;
+	const int c1 = 1;
+	const int c2 = 3;
+#else	// active configuration: elements 1, 2 and 3 of 'vector' / 'normalized' are used
+	const int c0 = 1;
+	const int c1 = 2;
+	const int c2 = 3;
+#endif
+	float floatNormal[3];
+	byte intNormal[4];	// indexed by c0/c1/c2, so it needs room for index 3
+	floatNormal[0] = vector[c0] / 255.0f * 2.0f - 1.0f;	// map a [0, 255] component to [-1, 1]
+	floatNormal[1] = vector[c1] / 255.0f * 2.0f - 1.0f;
+	floatNormal[2] = vector[c2] / 255.0f * 2.0f - 1.0f;
+	float rcplen = idMath::InvSqrt( floatNormal[0] * floatNormal[0] + floatNormal[1] * floatNormal[1] + floatNormal[2] * floatNormal[2] );	// presumably 1 / length - confirm idMath::InvSqrt
+	floatNormal[0] *= rcplen;	// scale to unit length
+	floatNormal[1] *= rcplen;
+	floatNormal[2] *= rcplen;
+	intNormal[c0] = idMath::Ftob( ( floatNormal[0] + 1.0f ) / 2.0f * 255.0f + 0.5f );	// map [-1, 1] back to a byte; the + 0.5f assumes Ftob truncates - TODO confirm
+	intNormal[c1] = idMath::Ftob( ( floatNormal[1] + 1.0f ) / 2.0f * 255.0f + 0.5f );
+	intNormal[c2] = idMath::Ftob( ( floatNormal[2] + 1.0f ) / 2.0f * 255.0f + 0.5f );
+	int result =	( ( intNormal[ c0 ] - normalized[ c0 ] ) * ( intNormal[ c0 ] - normalized[ c0 ] ) ) +
+					( ( intNormal[ c1 ] - normalized[ c1 ] ) * ( intNormal[ c1 ] - normalized[ c1 ] ) ) +
+					( ( intNormal[ c2 ] - normalized[ c2 ] ) * ( intNormal[ c2 ] - normalized[ c2 ] ) );	// sum of squared per-component differences
+	return result;
+#endif	// ID_WIN_X86_ASM
 }
 
 /*
diff --git a/neo/renderer/DXT/DXTEncoder_SSE2.cpp b/neo/renderer/DXT/DXTEncoder_SSE2.cpp
index e7f73657..1d09a4b2 100644
--- a/neo/renderer/DXT/DXTEncoder_SSE2.cpp
+++ b/neo/renderer/DXT/DXTEncoder_SSE2.cpp
@@ -34,6 +34,7 @@ Contains the DxtEncoder implementation for SSE2.
 #include "DXTCodec_local.h"
 #include "DXTCodec.h"
 
+#if defined( ID_WIN_X86_SSE2_INTRIN ) || ( ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) ) )
 
 //#define TEST_COMPRESSION
 #ifdef TEST_COMPRESSION
@@ -142,10 +143,30 @@ paramO:	colorBlock	- 4*4 output tile, 4 bytes per pixel
 ========================
 */
 ID_INLINE void idDxtEncoder::ExtractBlock_SSE2( const byte * inPtr, int width, byte * colorBlock ) const {
+#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )	// inline-asm path; checked first, so it wins when the intrinsics macro is also defined
+	__asm {
+		mov			esi, inPtr
+		mov			edi, colorBlock
+		mov			eax, width
+		shl			eax, 2								// eax = width * 4 = byte offset from one image row to the next
+		movdqa		xmm0, xmmword ptr [esi]				// row 0; movdqa needs 16-byte aligned addresses on both the load and the store side
+		movdqa		xmmword ptr [edi+ 0], xmm0
+		movdqa		xmm1, xmmword ptr [esi+eax]			// + 4 * width
+		movdqa		xmmword ptr [edi+16], xmm1
+		movdqa		xmm2, xmmword ptr [esi+eax*2]		// + 8 * width
+		add			esi, eax							// advance one row so that esi+eax*2 reaches the fourth row
+		movdqa		xmmword ptr [edi+32], xmm2
+		movdqa		xmm3, xmmword ptr [esi+eax*2]		// + 12 * width
+		movdqa		xmmword ptr [edi+48], xmm3
+	}
+#elif defined ( ID_WIN_X86_SSE2_INTRIN )	// intrinsics path: same four aligned 16-byte row copies
 	*((__m128i *)(&colorBlock[ 0])) = _mm_load_si128( (__m128i *)( inPtr + width * 4 * 0 ) );
 	*((__m128i *)(&colorBlock[16])) = _mm_load_si128( (__m128i *)( inPtr + width * 4 * 1 ) );
 	*((__m128i *)(&colorBlock[32])) = _mm_load_si128( (__m128i *)( inPtr + width * 4 * 2 ) );
 	*((__m128i *)(&colorBlock[48])) = _mm_load_si128( (__m128i *)( inPtr + width * 4 * 3 ) );
+#else	// neither implementation compiled in
+	assert( false );	// presumably unreachable: the file-level #if requires one of the macros tested above
+#endif
 }
 
 /*
@@ -160,6 +181,31 @@ paramO:	maxColor	- Max 4 byte output color
 ========================
 */
 ID_INLINE void idDxtEncoder::GetMinMaxBBox_SSE2( const byte * colorBlock, byte * minColor, byte * maxColor ) const {
+#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )	// inline-asm path; checked first, so it wins when the intrinsics macro is also defined
+	__asm {
+		mov			eax, colorBlock
+		mov			esi, minColor
+		mov			edi, maxColor
+		movdqa		xmm0, xmmword ptr [eax+ 0]			// xmm0 accumulates the per-byte minimum, starting from row 0
+		movdqa		xmm1, xmmword ptr [eax+ 0]			// xmm1 accumulates the per-byte maximum, starting from row 0
+		pminub		xmm0, xmmword ptr [eax+16]
+		pmaxub		xmm1, xmmword ptr [eax+16]
+		pminub		xmm0, xmmword ptr [eax+32]
+		pmaxub		xmm1, xmmword ptr [eax+32]
+		pminub		xmm0, xmmword ptr [eax+48]
+		pmaxub		xmm1, xmmword ptr [eax+48]
+		pshufd		xmm3, xmm0, R_SHUFFLE_D( 2, 3, 2, 3 )	// presumably brings dwords 2,3 down to 0,1 (R_SHUFFLE_D is defined outside this patch)
+		pshufd		xmm4, xmm1, R_SHUFFLE_D( 2, 3, 2, 3 )
+		pminub		xmm0, xmm3							// fold the four pixels of the row-wise result into two
+		pmaxub		xmm1, xmm4
+		pshuflw		xmm6, xmm0, R_SHUFFLE_D( 2, 3, 2, 3 )
+		pshuflw		xmm7, xmm1, R_SHUFFLE_D( 2, 3, 2, 3 )
+		pminub		xmm0, xmm6							// fold two pixels into one; the low dword now holds the result
+		pmaxub		xmm1, xmm7
+		movd		dword ptr [esi], xmm0				// 4-byte store to minColor
+		movd		dword ptr [edi], xmm1				// 4-byte store to maxColor
+	}
+#elif defined ( ID_WIN_X86_SSE2_INTRIN )	// intrinsics path, used only when no inline-asm macro is defined
 	__m128i block0 = *((__m128i *)(&colorBlock[ 0]));
 	__m128i block1 = *((__m128i *)(&colorBlock[16]));
 	__m128i block2 = *((__m128i *)(&colorBlock[32]));
@@ -187,6 +233,9 @@ ID_INLINE void idDxtEncoder::GetMinMaxBBox_SSE2( const byte * colorBlock, byte *
 
 	*((int *)maxColor) = _mm_cvtsi128_si32( max6 );
 	*((int *)minColor) = _mm_cvtsi128_si32( min6 );
+#else	// neither implementation compiled in
+	assert( false );	// presumably unreachable: the file-level #if requires one of the macros tested above
+#endif
 }
 
 /*
@@ -195,6 +244,25 @@ idDxtEncoder::InsetColorsBBox_SSE2
 ========================
 */
 ID_INLINE void idDxtEncoder::InsetColorsBBox_SSE2( byte * minColor, byte * maxColor ) const {
+#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )	// inline-asm path; checked first, so it wins when the intrinsics macro is also defined
+	__asm {
+		mov			esi, minColor
+		mov			edi, maxColor
+		movd		xmm0, dword ptr [esi]				// xmm0 = the 4 bytes of minColor
+		movd		xmm1, dword ptr [edi]				// xmm1 = the 4 bytes of maxColor
+		punpcklbw	xmm0, SIMD_SSE2_byte_0				// interleave with the constant; widens bytes to words assuming the constant is all zero
+		punpcklbw	xmm1, SIMD_SSE2_byte_0
+		movdqa		xmm2, xmm1
+		psubw		xmm2, xmm0							// xmm2 = max - min per channel
+		pmulhw		xmm2, SIMD_SSE2_word_insetShift		// inset = high 16 bits of ( max - min ) * constant
+		paddw		xmm0, xmm2							// min += inset
+		psubw		xmm1, xmm2							// max -= inset
+		packuswb	xmm0, xmm0							// back to bytes with unsigned saturation
+		packuswb	xmm1, xmm1
+		movd		dword ptr [esi], xmm0				// write both colors back in place
+		movd		dword ptr [edi], xmm1
+	}
+#elif defined ( ID_WIN_X86_SSE2_INTRIN )	// intrinsics path, used only when no inline-asm macro is defined
 	__m128i min = _mm_cvtsi32_si128( *(int *)minColor );
 	__m128i max = _mm_cvtsi32_si128( *(int *)maxColor );
 
@@ -213,6 +281,9 @@ ID_INLINE void idDxtEncoder::InsetColorsBBox_SSE2( byte * minColor, byte * maxCo
 
 	*((int *)minColor) = _mm_cvtsi128_si32( xmm0 );
 	*((int *)maxColor) = _mm_cvtsi128_si32( xmm1 );
+#else	// neither implementation compiled in
+	assert( false );	// presumably unreachable: the file-level #if requires one of the macros tested above
+#endif
 }
 
 /*
@@ -226,6 +297,165 @@ return: 4 byte color index block
 ========================
 */
 void idDxtEncoder::EmitColorIndices_SSE2( const byte * colorBlock, const byte * minColor_, const byte * maxColor_ ) {
+#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )	// inline-asm path; checked first, so it wins when the intrinsics macro is also defined
+	ALIGN16( byte color0[16] );
+	ALIGN16( byte color1[16] );
+	ALIGN16( byte color2[16] );
+	ALIGN16( byte color3[16] );
+	ALIGN16( byte result[16] );
+	byte *outPtr = outData;	// local copy of the output cursor for use inside the asm block
+
+	__asm {
+		mov			esi, maxColor_
+		mov			edi, minColor_
+		pxor		xmm7, xmm7				// xmm7 = 0
+		movdqa		result, xmm7			// clear the index accumulator
+
+		movd		xmm0, dword ptr [esi]	// xmm0 = maxColor
+		pand		xmm0, SIMD_SSE2_byte_colorMask
+		punpcklbw	xmm0, xmm7				// widen bytes to words
+		pshuflw		xmm4, xmm0, R_SHUFFLE_D( 0, 3, 2, 3 )
+		pshuflw		xmm5, xmm0, R_SHUFFLE_D( 3, 1, 3, 3 )
+		psrlw		xmm4, 5
+		psrlw		xmm5, 6
+		por			xmm0, xmm4				// OR shifted-down copies of selected channels back into the color
+		por			xmm0, xmm5
+
+		movd		xmm1, dword ptr [edi]	// xmm1 = minColor, same treatment as above
+		pand		xmm1, SIMD_SSE2_byte_colorMask
+		punpcklbw	xmm1, xmm7
+		pshuflw		xmm4, xmm1, R_SHUFFLE_D( 0, 3, 2, 3 )
+		pshuflw		xmm5, xmm1, R_SHUFFLE_D( 3, 1, 3, 3 )
+		psrlw		xmm4, 5
+		psrlw		xmm5, 6
+		por			xmm1, xmm4
+		por			xmm1, xmm5
+
+		movdqa		xmm2, xmm0
+		packuswb	xmm2, xmm7
+		pshufd		xmm2, xmm2, R_SHUFFLE_D( 0, 1, 0, 1 )
+		movdqa		color0, xmm2			// color0 = processed max endpoint
+
+		movdqa		xmm6, xmm0
+		paddw		xmm6, xmm0
+		paddw		xmm6, xmm1				// xmm6 = 2 * max + min
+		pmulhw		xmm6, SIMD_SSE2_word_div_by_3	// * ( ( 1 << 16 ) / 3 + 1 ) ) >> 16
+		packuswb	xmm6, xmm7
+		pshufd		xmm6, xmm6, R_SHUFFLE_D( 0, 1, 0, 1 )
+		movdqa		color2, xmm6			// color2 = ( 2 * max + min ) / 3
+
+		movdqa		xmm3, xmm1
+		packuswb	xmm3, xmm7
+		pshufd		xmm3, xmm3, R_SHUFFLE_D( 0, 1, 0, 1 )
+		movdqa		color1, xmm3			// color1 = processed min endpoint
+
+		paddw		xmm1, xmm1
+		paddw		xmm0, xmm1				// xmm0 = max + 2 * min
+		pmulhw		xmm0, SIMD_SSE2_word_div_by_3	// * ( ( 1 << 16 ) / 3 + 1 ) ) >> 16
+		packuswb	xmm0, xmm7
+		pshufd		xmm0, xmm0, R_SHUFFLE_D( 0, 1, 0, 1 )
+		movdqa		color3, xmm0			// color3 = ( max + 2 * min ) / 3
+
+		mov			eax, 32					// first pass reads block bytes 32..63, second pass bytes 0..31
+		mov			esi, colorBlock
+
+	loop1:			// iterates 2 times
+		movq		xmm3, qword ptr [esi+eax+0]
+		pshufd		xmm3, xmm3, R_SHUFFLE_D( 0, 2, 1, 3 )		// punpckldq	xmm4, SIMD_SSE2_dword_0
+		movq		xmm5, qword ptr [esi+eax+8]
+		pshufd		xmm5, xmm5, R_SHUFFLE_D( 0, 2, 1, 3 )		// punpckldq	xmm5, SIMD_SSE2_dword_0
+
+		movdqa		xmm0, xmm3
+		movdqa		xmm6, xmm5
+		psadbw		xmm0, color0			// psadbw = sum of absolute byte differences per 8-byte half
+		psadbw		xmm6, color0
+		packssdw	xmm0, xmm6
+		movdqa		xmm1, xmm3
+		movdqa		xmm6, xmm5
+		psadbw		xmm1, color1
+		psadbw		xmm6, color1
+		packssdw	xmm1, xmm6
+		movdqa		xmm2, xmm3
+		movdqa		xmm6, xmm5
+		psadbw		xmm2, color2
+		psadbw		xmm6, color2
+		packssdw	xmm2, xmm6
+		psadbw		xmm3, color3
+		psadbw		xmm5, color3
+		packssdw	xmm3, xmm5
+
+		movq		xmm4, qword ptr [esi+eax+16]
+		pshufd		xmm4, xmm4, R_SHUFFLE_D( 0, 2, 1, 3 )
+		movq		xmm5, qword ptr [esi+eax+24]
+		pshufd		xmm5, xmm5, R_SHUFFLE_D( 0, 2, 1, 3 )
+
+		movdqa		xmm6, xmm4
+		movdqa		xmm7, xmm5
+		psadbw		xmm6, color0
+		psadbw		xmm7, color0
+		packssdw	xmm6, xmm7
+		packssdw	xmm0, xmm6				// d0
+		movdqa		xmm6, xmm4
+		movdqa		xmm7, xmm5
+		psadbw		xmm6, color1
+		psadbw		xmm7, color1
+		packssdw	xmm6, xmm7
+		packssdw	xmm1, xmm6				// d1
+		movdqa		xmm6, xmm4
+		movdqa		xmm7, xmm5
+		psadbw		xmm6, color2
+		psadbw		xmm7, color2
+		packssdw	xmm6, xmm7
+		packssdw	xmm2, xmm6				// d2
+		psadbw		xmm4, color3
+		psadbw		xmm5, color3
+		packssdw	xmm4, xmm5
+		packssdw	xmm3, xmm4				// d3
+
+		movdqa		xmm7, result
+		pslld		xmm7, 16				// make room for this pass's indices
+
+		movdqa		xmm4, xmm0
+		movdqa		xmm5, xmm1
+		pcmpgtw		xmm0, xmm3				// b0
+		pcmpgtw		xmm1, xmm2				// b1
+		pcmpgtw		xmm4, xmm2				// b2
+		pcmpgtw		xmm5, xmm3				// b3
+		pcmpgtw		xmm2, xmm3				// b4
+		pand		xmm4, xmm1				// x0
+		pand		xmm5, xmm0				// x1
+		pand		xmm2, xmm0				// x2
+		por			xmm4, xmm5
+		pand		xmm2, SIMD_SSE2_word_1
+		pand		xmm4, SIMD_SSE2_word_2
+		por			xmm2, xmm4				// per-pixel index assembled from the two masked compare results
+
+		pshufd		xmm5, xmm2, R_SHUFFLE_D( 2, 3, 0, 1 )
+		punpcklwd	xmm2, SIMD_SSE2_word_0
+		punpcklwd	xmm5, SIMD_SSE2_word_0
+		pslld		xmm5, 8
+		por			xmm7, xmm5
+		por			xmm7, xmm2
+		movdqa		result, xmm7
+
+		sub			eax, 32
+		jge			loop1					// continues while the offset is still >= 0
+
+		mov			esi, outPtr
+		pshufd		xmm4, xmm7, R_SHUFFLE_D( 1, 2, 3, 0 )
+		pshufd		xmm5, xmm7, R_SHUFFLE_D( 2, 3, 0, 1 )
+		pshufd		xmm6, xmm7, R_SHUFFLE_D( 3, 0, 1, 2 )
+		pslld		xmm4, 2
+		pslld		xmm5, 4
+		pslld		xmm6, 6
+		por			xmm7, xmm4
+		por			xmm7, xmm5
+		por			xmm7, xmm6
+		movd		dword ptr [esi], xmm7	// 4-byte store of the combined indices at the output cursor
+	}
+
+	outData += 4;	// advance the output cursor past the 4 bytes stored above
+#elif defined ( ID_WIN_X86_SSE2_INTRIN )	// intrinsics path, used only when no inline-asm macro is defined
 	__m128c zero = SIMD_SSE2_zero;
 	__m128c result = SIMD_SSE2_zero;
 	__m128c color0, color1, color2, color3;
@@ -359,6 +589,9 @@ void idDxtEncoder::EmitColorIndices_SSE2( const byte * colorBlock, const byte *
 
 	unsigned int out = _mm_cvtsi128_si32( temp7 );
 	EmitUInt( out );
+#else	// neither implementation compiled in
+	assert( false );	// presumably unreachable: the file-level #if requires one of the macros tested above
+#endif
 }
 
 /*
@@ -372,6 +605,162 @@ return: 4 byte color index block
 ========================
 */
 void idDxtEncoder::EmitColorAlphaIndices_SSE2( const byte *colorBlock, const byte *minColor_, const byte *maxColor_ ) {
+#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )	// inline-asm path; checked first, so it wins when the intrinsics macro is also defined
+	ALIGN16( byte color0[16] );
+	ALIGN16( byte color1[16] );
+	ALIGN16( byte color2[16] );
+	ALIGN16( byte color3[16] );
+	ALIGN16( byte result[16] );
+	byte *outPtr = outData;	// local copy of the output cursor for use inside the asm block
+
+	__asm {
+		mov			esi, maxColor_
+		mov			edi, minColor_
+		pxor		xmm7, xmm7				// xmm7 = 0
+		movdqa		result, xmm7			// clear the index accumulator
+
+		movd		xmm0, dword ptr [esi]	// xmm0 = maxColor
+		pand		xmm0, SIMD_SSE2_byte_colorMask
+		punpcklbw	xmm0, xmm7				// widen bytes to words
+		pshuflw		xmm4, xmm0, R_SHUFFLE_D( 0, 3, 2, 3 )
+		pshuflw		xmm5, xmm0, R_SHUFFLE_D( 3, 1, 3, 3 )
+		psrlw		xmm4, 5
+		psrlw		xmm5, 6
+		por			xmm0, xmm4
+		por			xmm0, xmm5
+
+		movd		xmm1, dword ptr [edi]	// xmm1 = minColor, same treatment as above
+		pand		xmm1, SIMD_SSE2_byte_colorMask
+		punpcklbw	xmm1, xmm7
+		pshuflw		xmm4, xmm1, R_SHUFFLE_D( 0, 3, 2, 3 )
+		pshuflw		xmm5, xmm1, R_SHUFFLE_D( 3, 1, 3, 3 )
+		psrlw		xmm4, 5
+		psrlw		xmm5, 6
+		por			xmm1, xmm4
+		por			xmm1, xmm5
+
+		movdqa		xmm2, xmm0
+		packuswb	xmm2, xmm7
+		pshufd		xmm2, xmm2, R_SHUFFLE_D( 0, 1, 0, 1 )
+		movdqa		color0, xmm2			// color0 = processed max endpoint
+
+		movdqa		xmm6, xmm0
+		paddw		xmm6, xmm1
+		psrlw		xmm6, 1					// ( max + min ) / 2
+		packuswb	xmm6, xmm7
+		pshufd		xmm6, xmm6, R_SHUFFLE_D( 0, 1, 0, 1 )
+		movdqa		color2, xmm6			// color2 = midpoint of the two endpoints
+
+		movdqa		xmm3, xmm1
+		packuswb	xmm3, xmm7
+		pshufd		xmm3, xmm3, R_SHUFFLE_D( 0, 1, 0, 1 )
+		movdqa		color1, xmm3			// color1 = processed min endpoint
+
+		movdqa		color3, xmm7			// color3 = all zero
+
+		mov			eax, 32					// first pass reads block bytes 32..63, second pass bytes 0..31
+		mov			esi, colorBlock
+
+	loop1:			// iterates 2 times
+		movq		xmm3, qword ptr [esi+eax+0]
+		pshufd		xmm3, xmm3, R_SHUFFLE_D( 0, 2, 1, 3 )
+		movq		xmm5, qword ptr [esi+eax+8]
+		pshufd		xmm5, xmm5, R_SHUFFLE_D( 0, 2, 1, 3 )
+
+		movdqa		xmm0, xmm3
+		movdqa		xmm6, xmm5
+		psadbw		xmm0, color0			// psadbw = sum of absolute byte differences per 8-byte half
+		psadbw		xmm6, color0
+		packssdw	xmm0, xmm6
+		movdqa		xmm1, xmm3
+		movdqa		xmm6, xmm5
+		psadbw		xmm1, color1
+		psadbw		xmm6, color1
+		packssdw	xmm1, xmm6
+		movdqa		xmm2, xmm3
+		movdqa		xmm6, xmm5
+		psadbw		xmm2, color2
+		psadbw		xmm6, color2
+		packssdw	xmm2, xmm6
+
+		shufps		xmm3, xmm5, R_SHUFFLE_D( 0, 2, 0, 2 )
+		psrld		xmm3, 24				// keep only the top byte of each pixel dword
+		packssdw	xmm3, xmm3
+
+		movq		xmm4, qword ptr [esi+eax+16]
+		pshufd		xmm4, xmm4, R_SHUFFLE_D( 0, 2, 1, 3 )
+		movq		xmm5, qword ptr [esi+eax+24]
+		pshufd		xmm5, xmm5, R_SHUFFLE_D( 0, 2, 1, 3 )
+
+		movdqa		xmm6, xmm4
+		movdqa		xmm7, xmm5
+		psadbw		xmm6, color0
+		psadbw		xmm7, color0
+		packssdw	xmm6, xmm7
+		packssdw	xmm0, xmm6					// d0
+		movdqa		xmm6, xmm4
+		movdqa		xmm7, xmm5
+		psadbw		xmm6, color1
+		psadbw		xmm7, color1
+		packssdw	xmm6, xmm7
+		packssdw	xmm1, xmm6					// d1
+		movdqa		xmm6, xmm4
+		movdqa		xmm7, xmm5
+		psadbw		xmm6, color2
+		psadbw		xmm7, color2
+		packssdw	xmm6, xmm7
+		packssdw	xmm2, xmm6					// d2
+
+		shufps		xmm4, xmm5, R_SHUFFLE_D( 0, 2, 0, 2 )
+		psrld		xmm4, 24					// keep only the top byte of each pixel dword
+		packssdw	xmm4, xmm4
+
+		punpcklqdq	xmm3, xmm4					// c3
+
+		movdqa		xmm7, result
+		pslld		xmm7, 16					// make room for this pass's indices
+
+		movdqa		xmm4, xmm2
+		pcmpgtw		xmm2, xmm0					// b0
+		pcmpgtw		xmm4, xmm1					// b1
+		pcmpgtw		xmm1, xmm0					// b2
+		pmaxsw		xmm3, SIMD_SSE2_word_127	// b3
+		pcmpeqw		xmm3, SIMD_SSE2_word_127	// max( v, c ) == c, i.e. all-ones where the top byte is <= the constant
+
+		pand		xmm2, xmm4
+		por			xmm2, xmm3					// b0 & b1 | b3
+		pxor		xmm1, xmm4
+		por			xmm1, xmm3					// b2 ^ b1 | b3
+		pand		xmm2, SIMD_SSE2_word_2
+		pand		xmm1, SIMD_SSE2_word_1
+		por			xmm2, xmm1
+
+		pshufd		xmm5, xmm2, R_SHUFFLE_D( 2, 3, 0, 1 )
+		punpcklwd	xmm2, SIMD_SSE2_word_0
+		punpcklwd	xmm5, SIMD_SSE2_word_0
+		pslld		xmm5, 8
+		por			xmm7, xmm5
+		por			xmm7, xmm2
+		movdqa		result, xmm7
+
+		sub			eax, 32
+		jge			loop1						// continues while the offset is still >= 0
+
+		mov			esi, outPtr
+		pshufd		xmm4, xmm7, R_SHUFFLE_D( 1, 2, 3, 0 )
+		pshufd		xmm5, xmm7, R_SHUFFLE_D( 2, 3, 0, 1 )
+		pshufd		xmm6, xmm7, R_SHUFFLE_D( 3, 0, 1, 2 )
+		pslld		xmm4, 2
+		pslld		xmm5, 4
+		pslld		xmm6, 6
+		por			xmm7, xmm4
+		por			xmm7, xmm5
+		por			xmm7, xmm6
+		movd		dword ptr [esi], xmm7		// 4-byte store of the combined indices at the output cursor
+	}
+
+	outData += 4;	// advance the output cursor past the 4 bytes stored above
+#elif defined ( ID_WIN_X86_SSE2_INTRIN )	// intrinsics path, used only when no inline-asm macro is defined
 	__m128c zero = SIMD_SSE2_zero;
 	__m128c result = SIMD_SSE2_zero;
 	__m128c color0, color1, color2;
@@ -508,6 +897,9 @@ void idDxtEncoder::EmitColorAlphaIndices_SSE2( const byte *colorBlock, const byt
 
 	unsigned int out = _mm_cvtsi128_si32( temp7 );
 	EmitUInt( out );
+#else	// neither implementation compiled in
+	assert( false );	// presumably unreachable: the file-level #if requires one of the macros tested above
+#endif
 }
 
 /*
@@ -521,6 +913,147 @@ return: 4 byte color index block
 ========================
 */
 void idDxtEncoder::EmitCoCgIndices_SSE2( const byte *colorBlock, const byte *minColor_, const byte *maxColor_ ) {
+#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )	// inline-asm path; checked first, so it wins when the intrinsics macro is also defined
+	ALIGN16( byte color0[16] );
+	ALIGN16( byte color1[16] );
+	ALIGN16( byte color2[16] );
+	ALIGN16( byte color3[16] );
+	ALIGN16( byte result[16] );
+	byte *outPtr = outData;	// local copy of the output cursor for use inside the asm block
+
+	__asm {
+		mov			esi, maxColor_
+		mov			edi, minColor_
+		pxor		xmm7, xmm7				// xmm7 = 0
+		movdqa		result, xmm7			// clear the index accumulator
+
+		movd		xmm0, dword ptr [esi]	// xmm0 = maxColor
+		pand		xmm0, SIMD_SSE2_byte_colorMask2
+		pshufd		xmm0, xmm0, R_SHUFFLE_D( 0, 1, 0, 1 )
+		movdqa		color0, xmm0			// color0 = masked max endpoint
+
+		movd		xmm1, dword ptr [edi]	// xmm1 = minColor
+		pand		xmm1, SIMD_SSE2_byte_colorMask2
+		pshufd		xmm1, xmm1, R_SHUFFLE_D( 0, 1, 0, 1 )
+		movdqa		color1, xmm1			// color1 = masked min endpoint
+
+		punpcklbw	xmm0, xmm7				// widen both endpoints to words for the interpolation
+		punpcklbw	xmm1, xmm7
+
+		movdqa		xmm6, xmm1				// xmm6 = min
+		paddw		xmm1, xmm0				// xmm1 = max + min
+		paddw		xmm0, xmm1				// xmm0 = 2 * max + min
+		pmulhw		xmm0, SIMD_SSE2_word_div_by_3	// * ( ( 1 << 16 ) / 3 + 1 ) ) >> 16
+		packuswb	xmm0, xmm7
+		pshufd		xmm0, xmm0, R_SHUFFLE_D( 0, 1, 0, 1 )
+		movdqa		color2, xmm0			// color2 = ( 2 * max + min ) / 3
+
+		paddw		xmm1, xmm6				// xmm1 = max + 2 * min
+		pmulhw		xmm1, SIMD_SSE2_word_div_by_3	// * ( ( 1 << 16 ) / 3 + 1 ) ) >> 16
+		packuswb	xmm1, xmm7
+		pshufd		xmm1, xmm1, R_SHUFFLE_D( 0, 1, 0, 1 )
+		movdqa		color3, xmm1			// color3 = ( max + 2 * min ) / 3
+
+		mov			eax, 32					// first pass reads block bytes 32..63, second pass bytes 0..31
+		mov			esi, colorBlock
+
+	loop1:			// iterates 2 times
+		movq		xmm3, qword ptr [esi+eax+0]
+		pshufd		xmm3, xmm3, R_SHUFFLE_D( 0, 2, 1, 3 )		// punpckldq	xmm4, SIMD_SSE2_dword_0
+		movq		xmm5, qword ptr [esi+eax+8]
+		pshufd		xmm5, xmm5, R_SHUFFLE_D( 0, 2, 1, 3 )		// punpckldq	xmm5, SIMD_SSE2_dword_0
+
+		movdqa		xmm0, xmm3
+		movdqa		xmm6, xmm5
+		psadbw		xmm0, color0			// psadbw = sum of absolute byte differences per 8-byte half
+		psadbw		xmm6, color0
+		packssdw	xmm0, xmm6
+		movdqa		xmm1, xmm3
+		movdqa		xmm6, xmm5
+		psadbw		xmm1, color1
+		psadbw		xmm6, color1
+		packssdw	xmm1, xmm6
+		movdqa		xmm2, xmm3
+		movdqa		xmm6, xmm5
+		psadbw		xmm2, color2
+		psadbw		xmm6, color2
+		packssdw	xmm2, xmm6
+		psadbw		xmm3, color3
+		psadbw		xmm5, color3
+		packssdw	xmm3, xmm5
+
+		movq		xmm4, qword ptr [esi+eax+16]
+		pshufd		xmm4, xmm4, R_SHUFFLE_D( 0, 2, 1, 3 )
+		movq		xmm5, qword ptr [esi+eax+24]
+		pshufd		xmm5, xmm5, R_SHUFFLE_D( 0, 2, 1, 3 )
+
+		movdqa		xmm6, xmm4
+		movdqa		xmm7, xmm5
+		psadbw		xmm6, color0
+		psadbw		xmm7, color0
+		packssdw	xmm6, xmm7
+		packssdw	xmm0, xmm6				// d0
+		movdqa		xmm6, xmm4
+		movdqa		xmm7, xmm5
+		psadbw		xmm6, color1
+		psadbw		xmm7, color1
+		packssdw	xmm6, xmm7
+		packssdw	xmm1, xmm6				// d1
+		movdqa		xmm6, xmm4
+		movdqa		xmm7, xmm5
+		psadbw		xmm6, color2
+		psadbw		xmm7, color2
+		packssdw	xmm6, xmm7
+		packssdw	xmm2, xmm6				// d2
+		psadbw		xmm4, color3
+		psadbw		xmm5, color3
+		packssdw	xmm4, xmm5
+		packssdw	xmm3, xmm4				// d3
+
+		movdqa		xmm7, result
+		pslld		xmm7, 16				// make room for this pass's indices
+
+		movdqa		xmm4, xmm0
+		movdqa		xmm5, xmm1
+		pcmpgtw		xmm0, xmm3				// b0
+		pcmpgtw		xmm1, xmm2				// b1
+		pcmpgtw		xmm4, xmm2				// b2
+		pcmpgtw		xmm5, xmm3				// b3
+		pcmpgtw		xmm2, xmm3				// b4
+		pand		xmm4, xmm1				// x0
+		pand		xmm5, xmm0				// x1
+		pand		xmm2, xmm0				// x2
+		por			xmm4, xmm5
+		pand		xmm2, SIMD_SSE2_word_1
+		pand		xmm4, SIMD_SSE2_word_2
+		por			xmm2, xmm4				// per-pixel index assembled from the two masked compare results
+
+		pshufd		xmm5, xmm2, R_SHUFFLE_D( 2, 3, 0, 1 )
+		punpcklwd	xmm2, SIMD_SSE2_word_0
+		punpcklwd	xmm5, SIMD_SSE2_word_0
+		pslld		xmm5, 8
+		por			xmm7, xmm5
+		por			xmm7, xmm2
+		movdqa		result, xmm7
+
+		sub			eax, 32
+		jge			loop1					// continues while the offset is still >= 0
+
+		mov			esi, outPtr
+		pshufd		xmm4, xmm7, R_SHUFFLE_D( 1, 2, 3, 0 )
+		pshufd		xmm5, xmm7, R_SHUFFLE_D( 2, 3, 0, 1 )
+		pshufd		xmm6, xmm7, R_SHUFFLE_D( 3, 0, 1, 2 )
+		pslld		xmm4, 2
+		pslld		xmm5, 4
+		pslld		xmm6, 6
+		por			xmm7, xmm4
+		por			xmm7, xmm5
+		por			xmm7, xmm6
+		movd		dword ptr [esi], xmm7	// 4-byte store of the combined indices at the output cursor
+	}
+
+	outData += 4;	// advance the output cursor past the 4 bytes stored above
+#elif defined ( ID_WIN_X86_SSE2_INTRIN )	// intrinsics path, used only when no inline-asm macro is defined
 	__m128c zero = SIMD_SSE2_zero;
 	__m128c result = SIMD_SSE2_zero;
 	__m128c color0, color1, color2, color3;
@@ -640,6 +1173,9 @@ void idDxtEncoder::EmitCoCgIndices_SSE2( const byte *colorBlock, const byte *min
 
 	unsigned int out = _mm_cvtsi128_si32( temp7 );
 	EmitUInt( out );
+#else	// neither implementation compiled in
+	assert( false );	// presumably unreachable: the file-level #if requires one of the macros tested above
+#endif
 }
 
 /*
@@ -652,6 +1188,144 @@ paramO:	maxAlpha	- Max alpha found
 ========================
 */
 void idDxtEncoder::EmitAlphaIndices_SSE2( const byte *block, const int minAlpha_, const int maxAlpha_ ) {
+#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )	// inline-asm path; checked first, so it wins when the intrinsics macro is also defined
+	assert( maxAlpha_ >= minAlpha_ );
+
+	byte *outPtr = outData;	// local copy of the output cursor for use inside the asm block
+
+	__asm {
+		mov			esi, block
+		movdqa		xmm0, xmmword ptr [esi+	0]				// aligned loads of the four 16-byte rows
+		movdqa		xmm5, xmmword ptr [esi+16]
+		movdqa		xmm6, xmmword ptr [esi+32]
+		movdqa		xmm4, xmmword ptr [esi+48]
+
+		psrld		xmm0, 24								// move the top byte of every pixel dword down to bits 0..7
+		psrld		xmm5, 24
+		psrld		xmm6, 24
+		psrld		xmm4, 24
+
+		packuswb	xmm0, xmm5
+		packuswb	xmm6, xmm4
+
+		//---------------------
+
+		// ab0 = (  7 * maxAlpha +  7 * minAlpha + ALPHA_RANGE ) / 14
+		// ab3 = (  9 * maxAlpha +  5 * minAlpha + ALPHA_RANGE ) / 14
+		// ab2 = ( 11 * maxAlpha +  3 * minAlpha + ALPHA_RANGE ) / 14
+		// ab1 = ( 13 * maxAlpha +  1 * minAlpha + ALPHA_RANGE ) / 14
+
+		// ab4 = (  7 * maxAlpha +  7 * minAlpha + ALPHA_RANGE ) / 14
+		// ab5 = (  5 * maxAlpha +  9 * minAlpha + ALPHA_RANGE ) / 14
+		// ab6 = (  3 * maxAlpha + 11 * minAlpha + ALPHA_RANGE ) / 14
+		// ab7 = (  1 * maxAlpha + 13 * minAlpha + ALPHA_RANGE ) / 14
+
+		movd		xmm5, maxAlpha_
+		pshuflw		xmm5, xmm5,	R_SHUFFLE_D( 0,	0, 0, 0	)
+		pshufd		xmm5, xmm5,	R_SHUFFLE_D( 0,	0, 0, 0	)
+		movdqa		xmm7, xmm5
+
+		movd		xmm2, minAlpha_
+		pshuflw		xmm2, xmm2,	R_SHUFFLE_D( 0,	0, 0, 0	)
+		pshufd		xmm2, xmm2,	R_SHUFFLE_D( 0,	0, 0, 0	)
+		movdqa		xmm3, xmm2
+
+		pmullw		xmm5, SIMD_SSE2_word_scale_7_9_11_13
+		pmullw		xmm7, SIMD_SSE2_word_scale_7_5_3_1
+		pmullw		xmm2, SIMD_SSE2_word_scale_7_5_3_1
+		pmullw		xmm3, SIMD_SSE2_word_scale_7_9_11_13
+
+		paddw		xmm5, xmm2
+		paddw		xmm7, xmm3
+
+		paddw		xmm5, SIMD_SSE2_word_7
+		paddw		xmm7, SIMD_SSE2_word_7
+
+		pmulhw		xmm5, SIMD_SSE2_word_div_by_14			// * ( ( 1 << 16 ) / 14	+ 1	) )	>> 16
+		pmulhw		xmm7, SIMD_SSE2_word_div_by_14			// * ( ( 1 << 16 ) / 14	+ 1	) )	>> 16
+
+		pshufd		xmm1, xmm5,	R_SHUFFLE_D( 3, 3, 3, 3	)
+		pshufd		xmm2, xmm5,	R_SHUFFLE_D( 2, 2, 2, 2	)
+		pshufd		xmm3, xmm5,	R_SHUFFLE_D( 1, 1, 1, 1	)
+		packuswb	xmm1, xmm1								// ab1
+		packuswb	xmm2, xmm2								// ab2
+		packuswb	xmm3, xmm3								// ab3
+
+		packuswb	xmm0, xmm6								// alpha block
+
+		pshufd		xmm4, xmm7,	R_SHUFFLE_D( 0,	0, 0, 0	)
+		pshufd		xmm5, xmm7,	R_SHUFFLE_D( 1,	1, 1, 1	)
+		pshufd		xmm6, xmm7,	R_SHUFFLE_D( 2,	2, 2, 2	)
+		pshufd		xmm7, xmm7,	R_SHUFFLE_D( 3,	3, 3, 3	)
+		packuswb	xmm4, xmm4								// ab4
+		packuswb	xmm5, xmm5								// ab5
+		packuswb	xmm6, xmm6								// ab6
+		packuswb	xmm7, xmm7								// ab7
+
+		pmaxub		xmm1, xmm0
+		pmaxub		xmm2, xmm0
+		pmaxub		xmm3, xmm0
+		pcmpeqb		xmm1, xmm0								// max( ab, alpha ) == alpha, i.e. 0xFF where alpha >= ab1
+		pcmpeqb		xmm2, xmm0
+		pcmpeqb		xmm3, xmm0
+		pmaxub		xmm4, xmm0
+		pmaxub		xmm5, xmm0
+		pmaxub		xmm6, xmm0
+		pmaxub		xmm7, xmm0
+		pcmpeqb		xmm4, xmm0
+		pcmpeqb		xmm5, xmm0
+		pcmpeqb		xmm6, xmm0
+		pcmpeqb		xmm7, xmm0
+		movdqa		xmm0, SIMD_SSE2_byte_8
+		paddsb		xmm0, xmm1								// every 0xFF compare result adds -1 to the running per-byte sum
+		paddsb		xmm2, xmm3
+		paddsb		xmm4, xmm5
+		paddsb		xmm6, xmm7
+		paddsb		xmm0, xmm2
+		paddsb		xmm4, xmm6
+		paddsb		xmm0, xmm4
+		pand		xmm0, SIMD_SSE2_byte_7
+		movdqa		xmm1, SIMD_SSE2_byte_2
+		pcmpgtb		xmm1, xmm0
+		pand		xmm1, SIMD_SSE2_byte_1
+		pxor		xmm0, xmm1
+		movdqa		xmm1, xmm0
+		movdqa		xmm2, xmm0
+		movdqa		xmm3, xmm0
+		movdqa		xmm4, xmm0
+		movdqa		xmm5, xmm0
+		movdqa		xmm6, xmm0
+		movdqa		xmm7, xmm0
+		psrlq		xmm1,  8- 3								// byte k of each qword is moved down to bit 3 * k
+		psrlq		xmm2, 16- 6
+		psrlq		xmm3, 24- 9
+		psrlq		xmm4, 32-12
+		psrlq		xmm5, 40-15
+		psrlq		xmm6, 48-18
+		psrlq		xmm7, 56-21
+		pand		xmm0, SIMD_SSE2_dword_alpha_bit_mask0
+		pand		xmm1, SIMD_SSE2_dword_alpha_bit_mask1
+		pand		xmm2, SIMD_SSE2_dword_alpha_bit_mask2
+		pand		xmm3, SIMD_SSE2_dword_alpha_bit_mask3
+		pand		xmm4, SIMD_SSE2_dword_alpha_bit_mask4
+		pand		xmm5, SIMD_SSE2_dword_alpha_bit_mask5
+		pand		xmm6, SIMD_SSE2_dword_alpha_bit_mask6
+		pand		xmm7, SIMD_SSE2_dword_alpha_bit_mask7
+		por			xmm0, xmm1
+		por			xmm2, xmm3
+		por			xmm4, xmm5
+		por			xmm6, xmm7
+		por			xmm0, xmm2
+		por			xmm4, xmm6
+		por			xmm0, xmm4
+		mov			esi, outPtr
+		movd		[esi+0], xmm0							// 4-byte store covering output bytes 0..3
+		pshufd		xmm1, xmm0, R_SHUFFLE_D( 2, 3, 0, 1 )
+		movd		[esi+3], xmm1							// 4-byte store covering output bytes 3..6
+	}
+
+	outData += 6;	// the cursor advances by 6 although the second store above also touches byte 6
+#elif defined ( ID_WIN_X86_SSE2_INTRIN )	// intrinsics path, used only when no inline-asm macro is defined
 	__m128i block0 = *((__m128i *)(&block[ 0]));
 	__m128i block1 = *((__m128i *)(&block[16]));
 	__m128i block2 = *((__m128i *)(&block[32]));
@@ -777,6 +1451,9 @@ void idDxtEncoder::EmitAlphaIndices_SSE2( const byte *block, const int minAlpha_
 	out = _mm_cvtsi128_si32( temp1 );
 	EmitUInt( out );
 	outData--;
+#else	// neither implementation compiled in
+	assert( false );	// presumably unreachable: the file-level #if requires one of the macros tested above
+#endif
 }
 
 /*
@@ -785,6 +1462,151 @@ idDxtEncoder::EmitAlphaIndices_SSE2
 ========================
 */
 void idDxtEncoder::EmitAlphaIndices_SSE2( const byte *block, const int channelBitOffset, const int minAlpha_, const int maxAlpha_ ) {
+#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )	// inline-asm path; checked first, so it wins when the intrinsics macro is also defined
+	assert( maxAlpha_ >= minAlpha_ );
+
+	byte *outPtr = outData;	// local copy of the output cursor for use inside the asm block
+
+	__asm {
+		movd		xmm7, channelBitOffset					// shift count that selects which bits of each pixel dword are used
+
+		mov			esi, block
+		movdqa		xmm0, xmmword ptr [esi+	0]				// aligned loads of the four 16-byte rows
+		movdqa		xmm5, xmmword ptr [esi+16]
+		movdqa		xmm6, xmmword ptr [esi+32]
+		movdqa		xmm4, xmmword ptr [esi+48]
+
+		psrld		xmm0, xmm7								// shift the selected channel down to bit 0
+		psrld		xmm5, xmm7
+		psrld		xmm6, xmm7
+		psrld		xmm4, xmm7
+
+		pand		xmm0, SIMD_SSE2_dword_byte_mask			// presumably keeps only the low byte of each dword - confirm the constant
+		pand		xmm5, SIMD_SSE2_dword_byte_mask
+		pand		xmm6, SIMD_SSE2_dword_byte_mask
+		pand		xmm4, SIMD_SSE2_dword_byte_mask
+
+		packuswb	xmm0, xmm5
+		packuswb	xmm6, xmm4
+
+		//---------------------
+
+		// ab0 = (  7 * maxAlpha +  7 * minAlpha + ALPHA_RANGE ) / 14
+		// ab3 = (  9 * maxAlpha +  5 * minAlpha + ALPHA_RANGE ) / 14
+		// ab2 = ( 11 * maxAlpha +  3 * minAlpha + ALPHA_RANGE ) / 14
+		// ab1 = ( 13 * maxAlpha +  1 * minAlpha + ALPHA_RANGE ) / 14
+
+		// ab4 = (  7 * maxAlpha +  7 * minAlpha + ALPHA_RANGE ) / 14
+		// ab5 = (  5 * maxAlpha +  9 * minAlpha + ALPHA_RANGE ) / 14
+		// ab6 = (  3 * maxAlpha + 11 * minAlpha + ALPHA_RANGE ) / 14
+		// ab7 = (  1 * maxAlpha + 13 * minAlpha + ALPHA_RANGE ) / 14
+
+		movd		xmm5, maxAlpha_
+		pshuflw		xmm5, xmm5,	R_SHUFFLE_D( 0,	0, 0, 0	)
+		pshufd		xmm5, xmm5,	R_SHUFFLE_D( 0,	0, 0, 0	)
+		movdqa		xmm7, xmm5
+
+		movd		xmm2, minAlpha_
+		pshuflw		xmm2, xmm2,	R_SHUFFLE_D( 0,	0, 0, 0	)
+		pshufd		xmm2, xmm2,	R_SHUFFLE_D( 0,	0, 0, 0	)
+		movdqa		xmm3, xmm2
+
+		pmullw		xmm5, SIMD_SSE2_word_scale_7_9_11_13
+		pmullw		xmm7, SIMD_SSE2_word_scale_7_5_3_1
+		pmullw		xmm2, SIMD_SSE2_word_scale_7_5_3_1
+		pmullw		xmm3, SIMD_SSE2_word_scale_7_9_11_13
+
+		paddw		xmm5, xmm2
+		paddw		xmm7, xmm3
+
+		paddw		xmm5, SIMD_SSE2_word_7
+		paddw		xmm7, SIMD_SSE2_word_7
+
+		pmulhw		xmm5, SIMD_SSE2_word_div_by_14			// * ( ( 1 << 16 ) / 14	+ 1	) )	>> 16
+		pmulhw		xmm7, SIMD_SSE2_word_div_by_14			// * ( ( 1 << 16 ) / 14	+ 1	) )	>> 16
+
+		pshufd		xmm1, xmm5,	R_SHUFFLE_D( 3, 3, 3, 3	)
+		pshufd		xmm2, xmm5,	R_SHUFFLE_D( 2, 2, 2, 2	)
+		pshufd		xmm3, xmm5,	R_SHUFFLE_D( 1, 1, 1, 1	)
+		packuswb	xmm1, xmm1								// ab1
+		packuswb	xmm2, xmm2								// ab2
+		packuswb	xmm3, xmm3								// ab3
+
+		packuswb	xmm0, xmm6								// alpha block
+
+		pshufd		xmm4, xmm7,	R_SHUFFLE_D( 0,	0, 0, 0	)
+		pshufd		xmm5, xmm7,	R_SHUFFLE_D( 1,	1, 1, 1	)
+		pshufd		xmm6, xmm7,	R_SHUFFLE_D( 2,	2, 2, 2	)
+		pshufd		xmm7, xmm7,	R_SHUFFLE_D( 3,	3, 3, 3	)
+		packuswb	xmm4, xmm4								// ab4
+		packuswb	xmm5, xmm5								// ab5
+		packuswb	xmm6, xmm6								// ab6
+		packuswb	xmm7, xmm7								// ab7
+
+		pmaxub		xmm1, xmm0
+		pmaxub		xmm2, xmm0
+		pmaxub		xmm3, xmm0
+		pcmpeqb		xmm1, xmm0								// max( ab, alpha ) == alpha, i.e. 0xFF where alpha >= ab1
+		pcmpeqb		xmm2, xmm0
+		pcmpeqb		xmm3, xmm0
+		pmaxub		xmm4, xmm0
+		pmaxub		xmm5, xmm0
+		pmaxub		xmm6, xmm0
+		pmaxub		xmm7, xmm0
+		pcmpeqb		xmm4, xmm0
+		pcmpeqb		xmm5, xmm0
+		pcmpeqb		xmm6, xmm0
+		pcmpeqb		xmm7, xmm0
+		movdqa		xmm0, SIMD_SSE2_byte_8
+		paddsb		xmm0, xmm1								// every 0xFF compare result adds -1 to the running per-byte sum
+		paddsb		xmm2, xmm3
+		paddsb		xmm4, xmm5
+		paddsb		xmm6, xmm7
+		paddsb		xmm0, xmm2
+		paddsb		xmm4, xmm6
+		paddsb		xmm0, xmm4
+		pand		xmm0, SIMD_SSE2_byte_7
+		movdqa		xmm1, SIMD_SSE2_byte_2
+		pcmpgtb		xmm1, xmm0
+		pand		xmm1, SIMD_SSE2_byte_1
+		pxor		xmm0, xmm1
+		movdqa		xmm1, xmm0
+		movdqa		xmm2, xmm0
+		movdqa		xmm3, xmm0
+		movdqa		xmm4, xmm0
+		movdqa		xmm5, xmm0
+		movdqa		xmm6, xmm0
+		movdqa		xmm7, xmm0
+		psrlq		xmm1,  8- 3								// byte k of each qword is moved down to bit 3 * k
+		psrlq		xmm2, 16- 6
+		psrlq		xmm3, 24- 9
+		psrlq		xmm4, 32-12
+		psrlq		xmm5, 40-15
+		psrlq		xmm6, 48-18
+		psrlq		xmm7, 56-21
+		pand		xmm0, SIMD_SSE2_dword_alpha_bit_mask0
+		pand		xmm1, SIMD_SSE2_dword_alpha_bit_mask1
+		pand		xmm2, SIMD_SSE2_dword_alpha_bit_mask2
+		pand		xmm3, SIMD_SSE2_dword_alpha_bit_mask3
+		pand		xmm4, SIMD_SSE2_dword_alpha_bit_mask4
+		pand		xmm5, SIMD_SSE2_dword_alpha_bit_mask5
+		pand		xmm6, SIMD_SSE2_dword_alpha_bit_mask6
+		pand		xmm7, SIMD_SSE2_dword_alpha_bit_mask7
+		por			xmm0, xmm1
+		por			xmm2, xmm3
+		por			xmm4, xmm5
+		por			xmm6, xmm7
+		por			xmm0, xmm2
+		por			xmm4, xmm6
+		por			xmm0, xmm4
+		mov			esi, outPtr
+		movd		[esi+0], xmm0							// 4-byte store covering output bytes 0..3
+		pshufd		xmm1, xmm0, R_SHUFFLE_D( 2, 3, 0, 1 )
+		movd		[esi+3], xmm1							// 4-byte store covering output bytes 3..6
+	}
+
+	outData += 6;	// the cursor advances by 6 although the second store above also touches byte 6
+#elif defined ( ID_WIN_X86_SSE2_INTRIN )	// intrinsics path, used only when no inline-asm macro is defined
 	__m128i block0 = *((__m128i *)(&block[ 0]));
 	__m128i block1 = *((__m128i *)(&block[16]));
 	__m128i block2 = *((__m128i *)(&block[32]));
@@ -917,6 +1739,9 @@ void idDxtEncoder::EmitAlphaIndices_SSE2( const byte *block, const int channelBi
 	out = _mm_cvtsi128_si32( temp1 );
 	EmitUInt( out );
 	outData--;
+#else	// neither implementation compiled in
+	assert( false );	// presumably unreachable: the file-level #if requires one of the macros tested above
+#endif
 }
 
 /*
@@ -1102,6 +1927,108 @@ idDxtEncoder::ScaleYCoCg_SSE2
 ========================
 */
 ID_INLINE void idDxtEncoder::ScaleYCoCg_SSE2( byte *colorBlock, byte *minColor, byte *maxColor ) const {
+#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )	// inline-asm path; checked first, so it wins when the intrinsics macro is also defined
+	__asm {
+		mov			esi, colorBlock
+		mov			edx, minColor
+		mov			ecx, maxColor
+
+		movd		xmm0, dword ptr [edx]				// xmm0 = minColor
+		movd		xmm1, dword ptr [ecx]				// xmm1 = maxColor
+
+		punpcklbw	xmm0, SIMD_SSE2_byte_0
+		punpcklbw	xmm1, SIMD_SSE2_byte_0
+
+		movdqa		xmm6, SIMD_SSE2_word_center_128
+		movdqa		xmm7, SIMD_SSE2_word_center_128
+
+		psubw		xmm6, xmm0							// center - min
+		psubw		xmm7, xmm1							// center - max
+
+		psubw		xmm0, SIMD_SSE2_word_center_128		// min - center
+		psubw		xmm1, SIMD_SSE2_word_center_128		// max - center
+
+		pmaxsw		xmm6, xmm0							// max of both signed differences = | min - center |
+		pmaxsw		xmm7, xmm1							// | max - center |
+
+		pmaxsw		xmm6, xmm7							// larger of the two distances per channel
+		pshuflw		xmm7, xmm6, R_SHUFFLE_D( 1, 0, 1, 0 )
+		pmaxsw		xmm6, xmm7
+		pshufd		xmm6, xmm6, R_SHUFFLE_D( 0, 0, 0, 0 )
+
+		movdqa		xmm7, xmm6
+		pcmpgtw		xmm6, SIMD_SSE2_word_63				// mask0
+		pcmpgtw		xmm7, SIMD_SSE2_word_31				// mask1
+
+		pandn		xmm7, SIMD_SSE2_byte_2
+		por			xmm7, SIMD_SSE2_byte_1
+		pandn		xmm6, xmm7
+		movdqa		xmm3, xmm6
+		movdqa		xmm7, xmm6
+		pxor		xmm7, SIMD_SSE2_byte_not
+		por			xmm7, SIMD_SSE2_byte_scale_mask0	// 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0xFF, 0x00, 0x00
+		paddw		xmm6, SIMD_SSE2_byte_1
+		pand		xmm6, SIMD_SSE2_byte_scale_mask1	// 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0x00, 0xFF
+		por			xmm6, SIMD_SSE2_byte_scale_mask2	// 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00
+
+		movd		xmm4, dword ptr [edx]				// reload the original minColor / maxColor bytes
+		movd		xmm5, dword ptr [ecx]
+
+		pand		xmm4, SIMD_SSE2_byte_scale_mask3	// 0x00, 0x00, 0x00, 0x00, 0xFF, 0x00, 0xFF, 0xFF
+		pand		xmm5, SIMD_SSE2_byte_scale_mask3
+
+		pslld		xmm3, 3
+		pand		xmm3, SIMD_SSE2_byte_scale_mask4	// 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00
+
+		por			xmm4, xmm3
+		por			xmm5, xmm3
+
+		paddb		xmm4, SIMD_SSE2_byte_minus_128_0
+		paddb		xmm5, SIMD_SSE2_byte_minus_128_0
+
+		pmullw		xmm4, xmm6							// multiply by the factors assembled in xmm6
+		pmullw		xmm5, xmm6
+
+		pand		xmm4, xmm7
+		pand		xmm5, xmm7
+
+		psubb		xmm4, SIMD_SSE2_byte_minus_128_0
+		psubb		xmm5, SIMD_SSE2_byte_minus_128_0
+
+		movd		dword ptr [edx], xmm4				// write the modified minColor back in place
+		movd		dword ptr [ecx], xmm5				// write the modified maxColor back in place
+
+		movdqa		xmm0, xmmword ptr [esi+ 0*4]		// load the four 16-byte rows of the block
+		movdqa		xmm1, xmmword ptr [esi+ 4*4]
+		movdqa		xmm2, xmmword ptr [esi+ 8*4]
+		movdqa		xmm3, xmmword ptr [esi+12*4]
+
+		paddb		xmm0, SIMD_SSE2_byte_minus_128_0	// same add / multiply / mask / subtract sequence as applied to the endpoints above
+		paddb		xmm1, SIMD_SSE2_byte_minus_128_0
+		paddb		xmm2, SIMD_SSE2_byte_minus_128_0
+		paddb		xmm3, SIMD_SSE2_byte_minus_128_0
+
+		pmullw		xmm0, xmm6
+		pmullw		xmm1, xmm6
+		pmullw		xmm2, xmm6
+		pmullw		xmm3, xmm6
+
+		pand		xmm0, xmm7
+		pand		xmm1, xmm7
+		pand		xmm2, xmm7
+		pand		xmm3, xmm7
+
+		psubb		xmm0, SIMD_SSE2_byte_minus_128_0
+		psubb		xmm1, SIMD_SSE2_byte_minus_128_0
+		psubb		xmm2, SIMD_SSE2_byte_minus_128_0
+		psubb		xmm3, SIMD_SSE2_byte_minus_128_0
+
+		movdqa		xmmword ptr [esi+ 0*4], xmm0		// store the rows back over the input block
+		movdqa		xmmword ptr [esi+ 4*4], xmm1
+		movdqa		xmmword ptr [esi+ 8*4], xmm2
+		movdqa		xmmword ptr [esi+12*4], xmm3
+	}
+#elif defined ( ID_WIN_X86_SSE2_INTRIN )	// intrinsics path, used only when no inline-asm macro is defined
 	__m128i block0 = *((__m128i *)(&colorBlock[ 0]));
 	__m128i block1 = *((__m128i *)(&colorBlock[16]));
 	__m128i block2 = *((__m128i *)(&colorBlock[32]));
@@ -1189,6 +2116,9 @@ ID_INLINE void idDxtEncoder::ScaleYCoCg_SSE2( byte *colorBlock, byte *minColor,
 	*((__m128i *)(&colorBlock[16])) = _mm_sub_epi8( temp1, (const __m128i &)SIMD_SSE2_byte_minus_128_0 );
 	*((__m128i *)(&colorBlock[32])) = _mm_sub_epi8( temp2, (const __m128i &)SIMD_SSE2_byte_minus_128_0 );
 	*((__m128i *)(&colorBlock[48])) = _mm_sub_epi8( temp3, (const __m128i &)SIMD_SSE2_byte_minus_128_0 );
+#else	// neither implementation compiled in
+	assert( false );	// presumably unreachable: the file-level #if requires one of the macros tested above
+#endif
 }
 
 /*
@@ -1197,6 +2127,40 @@ idDxtEncoder::InsetYCoCgBBox_SSE2
 ========================
 */
 ID_INLINE void idDxtEncoder::InsetYCoCgBBox_SSE2( byte *minColor, byte *maxColor ) const {
+#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )
+	__asm {
+		mov			esi, minColor
+		mov			edi, maxColor
+		movd		xmm0, dword ptr [esi]
+		movd		xmm1, dword ptr [edi]
+		punpcklbw	xmm0, SIMD_SSE2_byte_0
+		punpcklbw	xmm1, SIMD_SSE2_byte_0
+		movdqa		xmm2, xmm1
+		psubw		xmm2, xmm0
+		psubw		xmm2, SIMD_SSE2_word_insetYCoCgRound
+		pand		xmm2, SIMD_SSE2_word_insetYCoCgMask
+		pmullw		xmm0, SIMD_SSE2_word_insetYCoCgShiftUp
+		pmullw		xmm1, SIMD_SSE2_word_insetYCoCgShiftUp
+		paddw		xmm0, xmm2
+		psubw		xmm1, xmm2
+		pmulhw		xmm0, SIMD_SSE2_word_insetYCoCgShiftDown
+		pmulhw		xmm1, SIMD_SSE2_word_insetYCoCgShiftDown
+		pmaxsw		xmm0, SIMD_SSE2_word_0
+		pmaxsw		xmm1, SIMD_SSE2_word_0
+		pand		xmm0, SIMD_SSE2_word_insetYCoCgQuantMask
+		pand		xmm1, SIMD_SSE2_word_insetYCoCgQuantMask
+		movdqa		xmm2, xmm0
+		movdqa		xmm3, xmm1
+		pmulhw		xmm2, SIMD_SSE2_word_insetYCoCgRep
+		pmulhw		xmm3, SIMD_SSE2_word_insetYCoCgRep
+		por			xmm0, xmm2
+		por			xmm1, xmm3
+		packuswb	xmm0, xmm0
+		packuswb	xmm1, xmm1
+		movd		dword ptr [esi], xmm0
+		movd		dword ptr [edi], xmm1
+	}
+#elif defined ( ID_WIN_X86_SSE2_INTRIN )
 	__m128c temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
 
 	temp0 = _mm_cvtsi32_si128( *(int *)minColor );
@@ -1227,6 +2191,9 @@ ID_INLINE void idDxtEncoder::InsetYCoCgBBox_SSE2( byte *minColor, byte *maxColor
 
 	*(int *)minColor = _mm_cvtsi128_si32( temp0 );
 	*(int *)maxColor = _mm_cvtsi128_si32( temp1 );
+#else
+	assert( false );
+#endif
 }
 
 /*
@@ -1240,6 +2207,80 @@ return: diagonal to use
 ========================
 */
 ID_INLINE void idDxtEncoder::SelectYCoCgDiagonal_SSE2( const byte *colorBlock, byte *minColor, byte *maxColor ) const {
+#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )
+	__asm {
+		mov			esi, colorBlock
+		mov			edx, minColor
+		mov			ecx, maxColor
+
+		movdqa		xmm0, xmmword ptr [esi+	0]
+		movdqa		xmm1, xmmword ptr [esi+16]
+		movdqa		xmm2, xmmword ptr [esi+32]
+		movdqa		xmm3, xmmword ptr [esi+48]
+
+		pand		xmm0, SIMD_SSE2_dword_word_mask
+		pand		xmm1, SIMD_SSE2_dword_word_mask
+		pand		xmm2, SIMD_SSE2_dword_word_mask
+		pand		xmm3, SIMD_SSE2_dword_word_mask
+
+		pslldq		xmm1, 2
+		pslldq		xmm3, 2
+		por			xmm0, xmm1
+		por			xmm2, xmm3
+
+		movd		xmm1, dword ptr [edx]					// minColor
+		movd		xmm3, dword ptr [ecx]					// maxColor
+
+		movdqa		xmm6, xmm1
+		movdqa		xmm7, xmm3
+
+		pavgb		xmm1, xmm3
+		pshuflw		xmm1, xmm1, R_SHUFFLE_D( 0, 0, 0, 0 )
+		pshufd		xmm1, xmm1, R_SHUFFLE_D( 0, 0, 0, 0 )
+		movdqa		xmm3, xmm1
+
+		pmaxub		xmm1, xmm0
+		pmaxub		xmm3, xmm2
+		pcmpeqb		xmm1, xmm0
+		pcmpeqb		xmm3, xmm2
+
+		movdqa		xmm0, xmm1
+		movdqa		xmm2, xmm3
+		psrldq		xmm0, 1
+		psrldq		xmm2, 1
+
+		pxor		xmm0, xmm1
+		pxor		xmm2, xmm3
+		pand		xmm0, SIMD_SSE2_word_1
+		pand		xmm2, SIMD_SSE2_word_1
+
+		paddw		xmm0, xmm2
+		psadbw		xmm0, SIMD_SSE2_byte_0
+		pshufd		xmm1, xmm0, R_SHUFFLE_D( 2, 3, 0, 1 )
+
+#ifdef NVIDIA_7X_HARDWARE_BUG_FIX
+		paddw		xmm1, xmm0								// side
+		pcmpgtw		xmm1, SIMD_SSE2_word_8					// mask = -( side > 8 )
+		pand		xmm1, SIMD_SSE2_byte_diagonalMask
+		movdqa		xmm0, xmm6
+		pcmpeqb		xmm0, xmm7								// mask &= -( minColor[0] != maxColor[0] )
+		pslldq		xmm0, 1
+		pandn		xmm0, xmm1
+#else
+		paddw		xmm0, xmm1								// side
+		pcmpgtw		xmm0, SIMD_SSE2_word_8					// mask = -( side > 8 )
+		pand		xmm0, SIMD_SSE2_byte_diagonalMask
+#endif
+
+		pxor		xmm6, xmm7
+		pand		xmm0, xmm6
+		pxor		xmm7, xmm0
+		pxor		xmm6, xmm7
+
+		movd		dword ptr [edx], xmm6
+		movd		dword ptr [ecx], xmm7
+	}
+#elif defined ( ID_WIN_X86_SSE2_INTRIN )
 	__m128i block0 = *((__m128i *)(&colorBlock[ 0]));
 	__m128i block1 = *((__m128i *)(&colorBlock[16]));
 	__m128i block2 = *((__m128i *)(&colorBlock[32]));
@@ -1300,6 +2341,9 @@ ID_INLINE void idDxtEncoder::SelectYCoCgDiagonal_SSE2( const byte *colorBlock, b
 
 	*(int *)minColor = _mm_cvtsi128_si32( temp6 );
 	*(int *)maxColor = _mm_cvtsi128_si32( temp7 );
+#else
+	assert( false );
+#endif
 }
 
 /*
@@ -1376,6 +2420,113 @@ paramO:	maxGreen	- Maximal normal Y found
 ========================
 */
 void idDxtEncoder::EmitGreenIndices_SSE2( const byte *block, const int channelBitOffset, const int minGreen, const int maxGreen ) {
+#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )
+	assert( maxGreen >= minGreen );
+
+	byte *outPtr = outData;
+
+	__asm {
+		movd		xmm7, channelBitOffset
+
+		mov			esi, block
+		movdqa		xmm0, xmmword ptr [esi+	0]
+		movdqa		xmm5, xmmword ptr [esi+16]
+		movdqa		xmm6, xmmword ptr [esi+32]
+		movdqa		xmm4, xmmword ptr [esi+48]
+
+		psrld		xmm0, xmm7
+		psrld		xmm5, xmm7
+		psrld		xmm6, xmm7
+		psrld		xmm4, xmm7
+
+		pand		xmm0, SIMD_SSE2_dword_byte_mask
+		pand		xmm5, SIMD_SSE2_dword_byte_mask
+		pand		xmm6, SIMD_SSE2_dword_byte_mask
+		pand		xmm4, SIMD_SSE2_dword_byte_mask
+
+		packuswb	xmm0, xmm5
+		packuswb	xmm6, xmm4
+
+		//---------------------
+
+		movd		xmm2, maxGreen
+		pshuflw		xmm2, xmm2, R_SHUFFLE_D( 0, 0, 0, 0 )
+
+		movd		xmm3, minGreen
+		pshuflw		xmm3, xmm3, R_SHUFFLE_D( 0, 0, 0, 0 )
+
+		pmullw		xmm2, SIMD_SSE2_word_scale_5_3_1
+		pmullw		xmm3, SIMD_SSE2_word_scale_1_3_5
+		paddw		xmm2, SIMD_SSE2_word_3
+		paddw		xmm3, xmm2
+		pmulhw		xmm3, SIMD_SSE2_word_div_by_6
+
+		pshuflw		xmm1, xmm3, R_SHUFFLE_D( 0, 0, 0, 0 )
+		pshuflw		xmm2, xmm3, R_SHUFFLE_D( 1, 1, 1, 1 )
+		pshuflw		xmm3, xmm3, R_SHUFFLE_D( 2, 2, 2, 2 )
+
+		pshufd		xmm1, xmm1, R_SHUFFLE_D( 0, 0, 0, 0 )
+		pshufd		xmm2, xmm2, R_SHUFFLE_D( 0, 0, 0, 0 )
+		pshufd		xmm3, xmm3, R_SHUFFLE_D( 0, 0, 0, 0 )
+
+		packuswb	xmm1, xmm1
+		packuswb	xmm2, xmm2
+		packuswb	xmm3, xmm3
+
+		packuswb	xmm0, xmm6
+
+		pmaxub		xmm1, xmm0
+		pmaxub		xmm2, xmm0
+		pmaxub		xmm3, xmm0
+		pcmpeqb		xmm1, xmm0
+		pcmpeqb		xmm2, xmm0
+		pcmpeqb		xmm3, xmm0
+		movdqa		xmm0, SIMD_SSE2_byte_4
+		paddsb		xmm0, xmm1
+		paddsb		xmm2, xmm3
+		paddsb		xmm0, xmm2
+		pand		xmm0, SIMD_SSE2_byte_3
+		movdqa		xmm4, SIMD_SSE2_byte_2
+		pcmpgtb		xmm4, xmm0
+		pand		xmm4, SIMD_SSE2_byte_1
+		pxor		xmm0, xmm4
+		movdqa		xmm4, xmm0
+		movdqa		xmm5, xmm0
+		movdqa		xmm6, xmm0
+		movdqa		xmm7, xmm0
+		psrlq		xmm4,  8- 2
+		psrlq		xmm5, 16- 4
+		psrlq		xmm6, 24- 6
+		psrlq		xmm7, 32- 8
+		pand		xmm4, SIMD_SSE2_dword_color_bit_mask1
+		pand		xmm5, SIMD_SSE2_dword_color_bit_mask2
+		pand		xmm6, SIMD_SSE2_dword_color_bit_mask3
+		pand		xmm7, SIMD_SSE2_dword_color_bit_mask4
+		por			xmm5, xmm4
+		por			xmm7, xmm6
+		por			xmm7, xmm5
+		movdqa		xmm4, xmm0
+		movdqa		xmm5, xmm0
+		movdqa		xmm6, xmm0
+		psrlq		xmm4, 40-10
+		psrlq		xmm5, 48-12
+		psrlq		xmm6, 56-14
+		pand		xmm0, SIMD_SSE2_dword_color_bit_mask0
+		pand		xmm4, SIMD_SSE2_dword_color_bit_mask5
+		pand		xmm5, SIMD_SSE2_dword_color_bit_mask6
+		pand		xmm6, SIMD_SSE2_dword_color_bit_mask7
+		por			xmm4, xmm5
+		por			xmm0, xmm6
+		por			xmm7, xmm4
+		por			xmm7, xmm0
+		mov			esi, outPtr
+		pshufd		xmm7, xmm7, R_SHUFFLE_D( 0, 2, 1, 3 )
+		pshuflw		xmm7, xmm7, R_SHUFFLE_D( 0, 2, 1, 3 )
+		movd		[esi], xmm7
+	}
+
+	outData += 4;
+#elif defined ( ID_WIN_X86_SSE2_INTRIN )
 	__m128i block0 = *((__m128i *)(&block[ 0]));
 	__m128i block1 = *((__m128i *)(&block[16]));
 	__m128i block2 = *((__m128i *)(&block[32]));
@@ -1472,6 +2623,9 @@ void idDxtEncoder::EmitGreenIndices_SSE2( const byte *block, const int channelBi
 
 	int result = _mm_cvtsi128_si32( temp7 );
 	EmitUInt( result );
+#else
+	assert( false );
+#endif
 }
 
 /*
@@ -1480,6 +2634,46 @@ idDxtEncoder::InsetNormalsBBoxDXT5_SSE2
 ========================
 */
 void idDxtEncoder::InsetNormalsBBoxDXT5_SSE2( byte *minNormal, byte *maxNormal ) const {
+#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )
+	__asm {
+        mov         esi, minNormal
+        mov         edi, maxNormal
+        movd        xmm0, dword ptr [esi]							// xmm0 = minNormal
+        movd        xmm1, dword ptr [edi]							// xmm1 = maxNormal
+        punpcklbw   xmm0, SIMD_SSE2_byte_0
+        punpcklbw   xmm1, SIMD_SSE2_byte_0
+        movdqa      xmm2, xmm1
+        psubw       xmm2, xmm0
+        psubw       xmm2, SIMD_SSE2_word_insetNormalDXT5Round
+        pand        xmm2, SIMD_SSE2_word_insetNormalDXT5Mask		// xmm2 = inset (1 & 3)
+
+        pmullw      xmm0, SIMD_SSE2_word_insetNormalDXT5ShiftUp
+        pmullw      xmm1, SIMD_SSE2_word_insetNormalDXT5ShiftUp
+		paddw		xmm0, xmm2
+		psubw		xmm1, xmm2
+		pmulhw      xmm0, SIMD_SSE2_word_insetNormalDXT5ShiftDown	// xmm0 = mini
+        pmulhw      xmm1, SIMD_SSE2_word_insetNormalDXT5ShiftDown	// xmm1 = maxi
+
+		// mini and maxi must be >= 0 and <= 255
+        pmaxsw      xmm0, SIMD_SSE2_word_0
+        pmaxsw      xmm1, SIMD_SSE2_word_0
+        pminsw      xmm0, SIMD_SSE2_word_255
+        pminsw      xmm1, SIMD_SSE2_word_255
+
+        movdqa      xmm2, xmm0
+        movdqa      xmm3, xmm1
+        pand        xmm0, SIMD_SSE2_word_insetNormalDXT5QuantMask
+        pand        xmm1, SIMD_SSE2_word_insetNormalDXT5QuantMask
+        pmulhw      xmm2, SIMD_SSE2_word_insetNormalDXT5Rep
+        pmulhw      xmm3, SIMD_SSE2_word_insetNormalDXT5Rep
+        por         xmm0, xmm2
+        por         xmm1, xmm3
+        packuswb    xmm0, xmm0
+        packuswb    xmm1, xmm1
+        movd        dword ptr [esi], xmm0
+        movd        dword ptr [edi], xmm1
+    }
+#elif defined ( ID_WIN_X86_SSE2_INTRIN )
 	__m128i temp0, temp1, temp2, temp3;
 
 	temp0 = _mm_cvtsi32_si128( *(int *)minNormal );
@@ -1516,6 +2710,9 @@ void idDxtEncoder::InsetNormalsBBoxDXT5_SSE2( byte *minNormal, byte *maxNormal )
 
 	*(int *)minNormal = _mm_cvtsi128_si32( temp0 );
 	*(int *)maxNormal = _mm_cvtsi128_si32( temp1 );
+#else
+	assert( false );
+#endif
 }
 
 /*
@@ -1578,3 +2775,4 @@ void idDxtEncoder::CompressNormalMapDXT5Fast_SSE2( const byte *inBuf, byte *outB
 #endif
 }
 
+#endif
diff --git a/neo/renderer/GLMatrix.cpp b/neo/renderer/GLMatrix.cpp
index 9c188437..7408b2c9 100644
--- a/neo/renderer/GLMatrix.cpp
+++ b/neo/renderer/GLMatrix.cpp
@@ -72,6 +72,7 @@ R_MatrixMultiply
 ==========================
 */
 void R_MatrixMultiply( const float a[16], const float b[16], float out[16] ) {
+#ifdef ID_WIN_X86_SSE2_INTRIN
 
 	__m128 a0 = _mm_loadu_ps( a + 0*4 );
 	__m128 a1 = _mm_loadu_ps( a + 1*4 );
@@ -108,6 +109,41 @@ void R_MatrixMultiply( const float a[16], const float b[16], float out[16] ) {
 	_mm_storeu_ps( out + 2*4, t2 );
 	_mm_storeu_ps( out + 3*4, t3 );
 
+#else
+
+	/*
+	for ( int i = 0; i < 4; i++ ) {
+		for ( int j = 0; j < 4; j++ ) {
+			out[ i * 4 + j ] =
+				a[ i * 4 + 0 ] * b[ 0 * 4 + j ] +
+				a[ i * 4 + 1 ] * b[ 1 * 4 + j ] +
+				a[ i * 4 + 2 ] * b[ 2 * 4 + j ] +
+				a[ i * 4 + 3 ] * b[ 3 * 4 + j ];
+		}
+	}
+	*/
+
+	out[0*4+0] = a[0*4+0]*b[0*4+0] + a[0*4+1]*b[1*4+0] + a[0*4+2]*b[2*4+0] + a[0*4+3]*b[3*4+0];
+	out[0*4+1] = a[0*4+0]*b[0*4+1] + a[0*4+1]*b[1*4+1] + a[0*4+2]*b[2*4+1] + a[0*4+3]*b[3*4+1];
+	out[0*4+2] = a[0*4+0]*b[0*4+2] + a[0*4+1]*b[1*4+2] + a[0*4+2]*b[2*4+2] + a[0*4+3]*b[3*4+2];
+	out[0*4+3] = a[0*4+0]*b[0*4+3] + a[0*4+1]*b[1*4+3] + a[0*4+2]*b[2*4+3] + a[0*4+3]*b[3*4+3];
+
+	out[1*4+0] = a[1*4+0]*b[0*4+0] + a[1*4+1]*b[1*4+0] + a[1*4+2]*b[2*4+0] + a[1*4+3]*b[3*4+0];
+	out[1*4+1] = a[1*4+0]*b[0*4+1] + a[1*4+1]*b[1*4+1] + a[1*4+2]*b[2*4+1] + a[1*4+3]*b[3*4+1];
+	out[1*4+2] = a[1*4+0]*b[0*4+2] + a[1*4+1]*b[1*4+2] + a[1*4+2]*b[2*4+2] + a[1*4+3]*b[3*4+2];
+	out[1*4+3] = a[1*4+0]*b[0*4+3] + a[1*4+1]*b[1*4+3] + a[1*4+2]*b[2*4+3] + a[1*4+3]*b[3*4+3];
+
+	out[2*4+0] = a[2*4+0]*b[0*4+0] + a[2*4+1]*b[1*4+0] + a[2*4+2]*b[2*4+0] + a[2*4+3]*b[3*4+0];
+	out[2*4+1] = a[2*4+0]*b[0*4+1] + a[2*4+1]*b[1*4+1] + a[2*4+2]*b[2*4+1] + a[2*4+3]*b[3*4+1];
+	out[2*4+2] = a[2*4+0]*b[0*4+2] + a[2*4+1]*b[1*4+2] + a[2*4+2]*b[2*4+2] + a[2*4+3]*b[3*4+2];
+	out[2*4+3] = a[2*4+0]*b[0*4+3] + a[2*4+1]*b[1*4+3] + a[2*4+2]*b[2*4+3] + a[2*4+3]*b[3*4+3];
+
+	out[3*4+0] = a[3*4+0]*b[0*4+0] + a[3*4+1]*b[1*4+0] + a[3*4+2]*b[2*4+0] + a[3*4+3]*b[3*4+0];
+	out[3*4+1] = a[3*4+0]*b[0*4+1] + a[3*4+1]*b[1*4+1] + a[3*4+2]*b[2*4+1] + a[3*4+3]*b[3*4+1];
+	out[3*4+2] = a[3*4+0]*b[0*4+2] + a[3*4+1]*b[1*4+2] + a[3*4+2]*b[2*4+2] + a[3*4+3]*b[3*4+2];
+	out[3*4+3] = a[3*4+0]*b[0*4+3] + a[3*4+1]*b[1*4+3] + a[3*4+2]*b[2*4+3] + a[3*4+3]*b[3*4+3];
+
+#endif
 }
 
 /*
diff --git a/neo/renderer/ModelDecal.cpp b/neo/renderer/ModelDecal.cpp
index 596522dd..560819aa 100644
--- a/neo/renderer/ModelDecal.cpp
+++ b/neo/renderer/ModelDecal.cpp
@@ -274,6 +274,7 @@ static void R_DecalPointCullStatic( byte * cullBits, const idPlane * planes, con
 	assert_16_byte_aligned( cullBits );
 	assert_16_byte_aligned( verts );
 
+#ifdef ID_WIN_X86_SSE2_INTRIN
 
 	idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts );
 
@@ -376,6 +377,37 @@ static void R_DecalPointCullStatic( byte * cullBits, const idPlane * planes, con
 		}
 	}
 
+#else
+
+	idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 1 > vertsODS( verts, numVerts );
+
+	for ( int i = 0; i < numVerts; ) {
+
+		const int nextNumVerts = vertsODS.FetchNextBatch() - 1;
+
+		for ( ; i <= nextNumVerts; i++ ) {
+			const idVec3 & v = vertsODS[i].xyz;
+
+			const float d0 = planes[0].Distance( v );
+			const float d1 = planes[1].Distance( v );
+			const float d2 = planes[2].Distance( v );
+			const float d3 = planes[3].Distance( v );
+			const float d4 = planes[4].Distance( v );
+			const float d5 = planes[5].Distance( v );
+
+			byte bits;
+			bits  = IEEE_FLT_SIGNBITNOTSET( d0 ) << 0;
+			bits |= IEEE_FLT_SIGNBITNOTSET( d1 ) << 1;
+			bits |= IEEE_FLT_SIGNBITNOTSET( d2 ) << 2;
+			bits |= IEEE_FLT_SIGNBITNOTSET( d3 ) << 3;
+			bits |= IEEE_FLT_SIGNBITNOTSET( d4 ) << 4;
+			bits |= IEEE_FLT_SIGNBITNOTSET( d5 ) << 5;
+
+			cullBits[i] = bits;
+		}
+	}
+
+#endif
 }
 
 /*
@@ -573,6 +605,7 @@ static void R_CopyDecalSurface( idDrawVert * verts, int numVerts, triIndex_t * i
 	assert( ( ( decal->numIndexes * sizeof( triIndex_t ) ) & 15 ) == 0 );
 	assert_16_byte_aligned( fadeColor );
 
+#ifdef ID_WIN_X86_SSE2_INTRIN
 
 	const __m128i vector_int_num_verts = _mm_shuffle_epi32( _mm_cvtsi32_si128( numVerts ), 0 );
 	const __m128i vector_short_num_verts = _mm_packs_epi32( vector_int_num_verts, vector_int_num_verts );
@@ -612,6 +645,25 @@ static void R_CopyDecalSurface( idDrawVert * verts, int numVerts, triIndex_t * i
 
 	_mm_sfence();
 
+#else
+
+	// copy vertices and apply depth/time based fading
+	for ( int i = 0; i < decal->numVerts; i++ ) {
+		// NOTE: bad out-of-order write-combined write, SIMD code does the right thing
+		verts[numVerts + i] = decal->verts[i];
+		for ( int j = 0; j < 4; j++ ) {
+			verts[numVerts + i].color[j] = idMath::Ftob( fadeColor[j] * decal->vertDepthFade[i] );
+		}
+	}
+
+	// copy indices
+	assert( ( decal->numIndexes & 1 ) == 0 );
+	for ( int i = 0; i < decal->numIndexes; i += 2 ) {
+		assert( decal->indexes[i + 0] < decal->numVerts && decal->indexes[i + 1] < decal->numVerts );
+		WriteIndexPair( &indexes[numIndexes + i], numVerts + decal->indexes[i + 0], numVerts + decal->indexes[i + 1] );
+	}
+
+#endif
 }
 
 /*
diff --git a/neo/renderer/ModelOverlay.cpp b/neo/renderer/ModelOverlay.cpp
index fa2b1962..da4f369d 100644
--- a/neo/renderer/ModelOverlay.cpp
+++ b/neo/renderer/ModelOverlay.cpp
@@ -102,6 +102,7 @@ static void R_OverlayPointCullStatic( byte * cullBits, halfFloat_t * texCoordS,
 	assert_16_byte_aligned( texCoordT );
 	assert_16_byte_aligned( verts );
 
+#ifdef ID_WIN_X86_SSE2_INTRIN
 
 	idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts );
 
@@ -176,6 +177,39 @@ static void R_OverlayPointCullStatic( byte * cullBits, halfFloat_t * texCoordS,
 		}
 	}
 
+#else
+
+	idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 1 > vertsODS( verts, numVerts );
+
+	for ( int i = 0; i < numVerts; ) {
+
+		const int nextNumVerts = vertsODS.FetchNextBatch() - 1;
+
+		for ( ; i <= nextNumVerts; i++ ) {
+			const idVec3 & v = vertsODS[i].xyz;
+
+			const float d0 = planes[0].Distance( v );
+			const float d1 = planes[1].Distance( v );
+			const float d2 = 1.0f - d0;
+			const float d3 = 1.0f - d1;
+
+			halfFloat_t s = Scalar_FastF32toF16( d0 );
+			halfFloat_t t = Scalar_FastF32toF16( d1 );
+
+			texCoordS[i] = s;
+			texCoordT[i] = t;
+
+			byte bits;
+			bits  = IEEE_FLT_SIGNBITSET( d0 ) << 0;
+			bits |= IEEE_FLT_SIGNBITSET( d1 ) << 1;
+			bits |= IEEE_FLT_SIGNBITSET( d2 ) << 2;
+			bits |= IEEE_FLT_SIGNBITSET( d3 ) << 3;
+
+			cullBits[i] = bits;
+		}
+	}
+
+#endif
 }
 
 /*
@@ -189,6 +223,7 @@ static void R_OverlayPointCullSkinned( byte * cullBits, halfFloat_t * texCoordS,
 	assert_16_byte_aligned( texCoordT );
 	assert_16_byte_aligned( verts );
 
+#ifdef ID_WIN_X86_SSE2_INTRIN
 
 	idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts );
 
@@ -263,6 +298,39 @@ static void R_OverlayPointCullSkinned( byte * cullBits, halfFloat_t * texCoordS,
 		}
 	}
 
+#else
+
+	idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 1 > vertsODS( verts, numVerts );
+
+	for ( int i = 0; i < numVerts; ) {
+
+		const int nextNumVerts = vertsODS.FetchNextBatch() - 1;
+
+		for ( ; i <= nextNumVerts; i++ ) {
+			const idVec3 transformed = Scalar_LoadSkinnedDrawVertPosition( vertsODS[i], joints );
+
+			const float d0 = planes[0].Distance( transformed );
+			const float d1 = planes[1].Distance( transformed );
+			const float d2 = 1.0f - d0;
+			const float d3 = 1.0f - d1;
+
+			halfFloat_t s = Scalar_FastF32toF16( d0 );
+			halfFloat_t t = Scalar_FastF32toF16( d1 );
+
+			texCoordS[i] = s;
+			texCoordT[i] = t;
+
+			byte bits;
+			bits  = IEEE_FLT_SIGNBITSET( d0 ) << 0;
+			bits |= IEEE_FLT_SIGNBITSET( d1 ) << 1;
+			bits |= IEEE_FLT_SIGNBITSET( d2 ) << 2;
+			bits |= IEEE_FLT_SIGNBITSET( d3 ) << 3;
+
+			cullBits[i] = bits;
+		}
+	}
+
+#endif
 }
 
 /*
@@ -446,6 +514,7 @@ static void R_CopyOverlaySurface( idDrawVert * verts, int numVerts, triIndex_t *
 	assert( ( ( overlay->numVerts * sizeof( idDrawVert ) ) & 15 ) == 0 );
 	assert( ( ( overlay->numIndexes * sizeof( triIndex_t ) ) & 15 ) == 0 );
 
+#ifdef ID_WIN_X86_SSE2_INTRIN
 
 	const __m128i vector_int_clear_last = _mm_set_epi32( 0, -1, -1, -1 );
 	const __m128i vector_int_num_verts = _mm_shuffle_epi32( _mm_cvtsi32_si128( numVerts ), 0 );
@@ -482,6 +551,25 @@ static void R_CopyOverlaySurface( idDrawVert * verts, int numVerts, triIndex_t *
 
 	_mm_sfence();
 
+#else
+
+	// copy vertices
+	for ( int i = 0; i < overlay->numVerts; i++ ) {
+		const overlayVertex_t &overlayVert = overlay->verts[i];
+
+		// NOTE: bad out-of-order write-combined write, SIMD code does the right thing
+		verts[numVerts + i] = sourceVerts[overlayVert.vertexNum];
+		verts[numVerts + i].st[0] = overlayVert.st[0];
+		verts[numVerts + i].st[1] = overlayVert.st[1];
+	}
+
+	// copy indexes
+	for ( int i = 0; i < overlay->numIndexes; i += 2 ) {
+		assert( overlay->indexes[i + 0] < overlay->numVerts && overlay->indexes[i + 1] < overlay->numVerts );
+		WriteIndexPair( &indexes[numIndexes + i], numVerts + overlay->indexes[i + 0], numVerts + overlay->indexes[i + 1] );
+	}
+
+#endif
 }
 
 /*
diff --git a/neo/renderer/Model_md5.cpp b/neo/renderer/Model_md5.cpp
index 30865b41..e2c7f223 100644
--- a/neo/renderer/Model_md5.cpp
+++ b/neo/renderer/Model_md5.cpp
@@ -32,10 +32,12 @@ If you have questions concerning this license or the applicable additional terms
 #include "tr_local.h"
 #include "Model_local.h"
 
+#ifdef ID_WIN_X86_SSE2_INTRIN
 
 static const __m128 vector_float_posInfinity		= { idMath::INFINITY, idMath::INFINITY, idMath::INFINITY, idMath::INFINITY };
 static const __m128 vector_float_negInfinity		= { -idMath::INFINITY, -idMath::INFINITY, -idMath::INFINITY, -idMath::INFINITY };
 
+#endif
 
 static const char *MD5_SnapshotName = "_MD5_Snapshot_";
 
@@ -501,6 +503,7 @@ idMD5Mesh::CalculateBounds
 ====================
 */
 void idMD5Mesh::CalculateBounds( const idJointMat * entJoints, idBounds & bounds ) const {
+#ifdef ID_WIN_X86_SSE2_INTRIN
 
 	__m128 minX = vector_float_posInfinity;
 	__m128 minY = vector_float_posInfinity;
@@ -534,6 +537,16 @@ void idMD5Mesh::CalculateBounds( const idJointMat * entJoints, idBounds & bounds
 	_mm_store_ss( bounds.ToFloatPtr() + 4, _mm_splat_ps( maxY, 3 ) );
 	_mm_store_ss( bounds.ToFloatPtr() + 5, _mm_splat_ps( maxZ, 3 ) );
 
+#else
+
+	bounds.Clear();
+	for ( int i = 0; i < numMeshJoints; i++ ) {
+		const idJointMat & joint = entJoints[meshJoints[i]];
+		bounds.AddPoint( joint.GetTranslation() );
+	}
+	bounds.ExpandSelf( maxJointVertDist );
+
+#endif
 }
 
 /*
@@ -1085,6 +1098,7 @@ static void TransformJoints( idJointMat *__restrict outJoints, const int numJoin
 	assert_16_byte_aligned( inFloats1 );
 	assert_16_byte_aligned( inFloats2 );
 
+#ifdef ID_WIN_X86_SSE2_INTRIN
 
 	const __m128 mask_keep_last = __m128c( _mm_set_epi32( 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 ) );
 
@@ -1160,6 +1174,13 @@ static void TransformJoints( idJointMat *__restrict outJoints, const int numJoin
 		_mm_store_ps( outFloats + 1 * 12 + 8, ri1 );
 	}
 
+#else
+
+	for ( int i = 0; i < numJoints; i++ ) {
+		idJointMat::Multiply( outJoints[i], inJoints1[i], inJoints2[i] );
+	}
+
+#endif
 }
 
 /*
diff --git a/neo/renderer/jobs/ShadowShared.cpp b/neo/renderer/jobs/ShadowShared.cpp
index 1e9f082a..25009a9d 100644
--- a/neo/renderer/jobs/ShadowShared.cpp
+++ b/neo/renderer/jobs/ShadowShared.cpp
@@ -87,6 +87,7 @@ static void R_ShadowVolumeCullBits( byte *cullBits, byte &totalOr, const float r
 	assert_16_byte_aligned( cullBits );
 	assert_16_byte_aligned( verts );
 
+#ifdef ID_WIN_X86_SSE2_INTRIN
 
 	idODSStreamedArray< idShadowVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts );
 
@@ -208,6 +209,54 @@ static void R_ShadowVolumeCullBits( byte *cullBits, byte &totalOr, const float r
 
 	totalOr = (byte) _mm_cvtsi128_si32( vecTotalOrByte );
 
+#else
+
+	idODSStreamedArray< idShadowVert, 16, SBT_DOUBLE, 1 > vertsODS( verts, numVerts );
+
+	byte tOr = 0;
+	for ( int i = 0; i < numVerts; ) {
+
+		const int nextNumVerts = vertsODS.FetchNextBatch() - 1;
+
+		for ( ; i <= nextNumVerts; i++ ) {
+			const idVec3 & v = vertsODS[i].xyzw.ToVec3();
+
+			const float d0 = planes[0].Distance( v );
+			const float d1 = planes[1].Distance( v );
+			const float d2 = planes[2].Distance( v );
+			const float d3 = planes[3].Distance( v );
+
+			const float t0 = d0 + radius;
+			const float t1 = d1 + radius;
+			const float t2 = d2 + radius;
+			const float t3 = d3 + radius;
+
+			const float s0 = d0 - radius;
+			const float s1 = d1 - radius;
+			const float s2 = d2 - radius;
+			const float s3 = d3 - radius;
+
+			byte bits;
+			bits  = IEEE_FLT_SIGNBITSET( t0 ) << 0;
+			bits |= IEEE_FLT_SIGNBITSET( t1 ) << 1;
+			bits |= IEEE_FLT_SIGNBITSET( t2 ) << 2;
+			bits |= IEEE_FLT_SIGNBITSET( t3 ) << 3;
+
+			bits |= IEEE_FLT_SIGNBITSET( s0 ) << 4;
+			bits |= IEEE_FLT_SIGNBITSET( s1 ) << 5;
+			bits |= IEEE_FLT_SIGNBITSET( s2 ) << 6;
+			bits |= IEEE_FLT_SIGNBITSET( s3 ) << 7;
+
+			bits ^= 0x0F;		// flip lower four bits
+
+			tOr |= bits;
+			cullBits[i] = bits;
+		}
+	}
+
+	totalOr = tOr;
+
+#endif
 }
 
 /*
diff --git a/neo/renderer/jobs/dynamicshadowvolume/DynamicShadowVolume.cpp b/neo/renderer/jobs/dynamicshadowvolume/DynamicShadowVolume.cpp
index 241a6aad..0f443929 100644
--- a/neo/renderer/jobs/dynamicshadowvolume/DynamicShadowVolume.cpp
+++ b/neo/renderer/jobs/dynamicshadowvolume/DynamicShadowVolume.cpp
@@ -31,6 +31,7 @@ If you have questions concerning this license or the applicable additional terms
 #include "../../../idlib/sys/sys_intrinsics.h"
 #include "../../../idlib/geometry/DrawVert_intrinsics.h"
 
+#ifdef ID_WIN_X86_SSE2_INTRIN
 
 static const __m128i vector_int_neg_one		= _mm_set_epi32( -1, -1, -1, -1 );
 
@@ -126,6 +127,69 @@ static __forceinline __m128i TriangleCulled_SSE2(	const __m128 & vert0X, const _
 	return _mm_castps_si128( _mm_cmpeq_ps( b0, zero ) );
 }
 
+#else
+
+/*
+=====================
+TriangleFacing_Generic
+
+Scalar facing test. Returns 255 if 'lightOrigin' is strictly on the positive side of the plane through v1, v2, v3 (unnormalized normal = ( v3 - v1 ) x ( v2 - v1 )), otherwise returns 0.
+=====================
+*/
+static byte TriangleFacing_Generic( const idVec3 & v1, const idVec3 & v2, const idVec3 & v3, const idVec3 & lightOrigin ) {
+	const float sx = v2.x - v1.x;	// s = v2 - v1 (first edge)
+	const float sy = v2.y - v1.y;
+	const float sz = v2.z - v1.z;
+
+	const float tx = v3.x - v1.x;	// t = v3 - v1 (second edge)
+	const float ty = v3.y - v1.y;
+	const float tz = v3.z - v1.z;
+
+	const float normalX = ty * sz - tz * sy;	// normal = t x s, left unnormalized because only the sign of 'd' is used
+	const float normalY = tz * sx - tx * sz;
+	const float normalZ = tx * sy - ty * sx;
+	const float normalW = normalX * v1.x + normalY * v1.y + normalZ * v1.z;	// plane offset: normal . v1
+
+	const float d = lightOrigin.x * normalX + lightOrigin.y * normalY + lightOrigin.z * normalZ - normalW;	// normal . lightOrigin - normalW: unnormalized signed distance of the light from the plane
+	return ( d > 0.0f ) ? 255 : 0;	// strict test: d == 0 (light in the plane, or a zero normal from a degenerate triangle) yields 0
+}
+
+/*
+=====================
+TriangleCulled_Generic
+
+Scalar cull test. Returns 255 if all three vertices fail the same clip plane test of 'lightProject' (the triangle is entirely off one side), otherwise returns 0.
+Each vertex is transformed as ( x, y, z, 1 ) and counts as inside a plane only if 0 < coord < w, i.e. the clip space of the 'lightProject' is assumed to be in the range [0, 1] after the divide by w.
+=====================
+*/
+static byte TriangleCulled_Generic( const idVec3 & v1, const idVec3 & v2, const idVec3 & v3, const idRenderMatrix & lightProject ) {
+	// transform the triangle: c[k][i] = row i of 'lightProject' dotted with ( vertex k, 1 )
+	idVec4 c[3];
+	for ( int i = 0; i < 4; i++ ) {
+		c[0][i] = v1[0] * lightProject[i][0] + v1[1] * lightProject[i][1] + v1[2] * lightProject[i][2] + lightProject[i][3];
+		c[1][i] = v2[0] * lightProject[i][0] + v2[1] * lightProject[i][1] + v2[2] * lightProject[i][2] + lightProject[i][3];
+		c[2][i] = v3[0] * lightProject[i][0] + v3[1] * lightProject[i][1] + v3[2] * lightProject[i][2] + lightProject[i][3];
+	}
+
+	// calculate the culled bits: a bit gets set as soon as any one vertex is on the inner side of the corresponding plane
+	int bits = 0;
+	for ( int i = 0; i < 3; i++ ) {
+		const float minW = 0.0f;	// lower clip bound
+		const float maxW = c[i][3];	// upper clip bound is this vertex's own homogeneous w
+
+		if ( c[i][0] > minW ) { bits |= ( 1 << 0 ); }	// comparisons are strict: a vertex exactly on a plane does not set its bit
+		if ( c[i][0] < maxW ) { bits |= ( 1 << 1 ); }
+		if ( c[i][1] > minW ) { bits |= ( 1 << 2 ); }
+		if ( c[i][1] < maxW ) { bits |= ( 1 << 3 ); }
+		if ( c[i][2] > minW ) { bits |= ( 1 << 4 ); }
+		if ( c[i][2] < maxW ) { bits |= ( 1 << 5 ); }
+	}
+
+	// if any bits weren't set, the triangle is completely off one side of the frustum (63 == all six plane bits set)
+	return ( bits != 63 ) ? 255 : 0;
+}
+
+#endif
 
 /*
 =====================
@@ -155,6 +219,7 @@ static int CalculateTriangleFacingCulledStatic( byte * __restrict facing, byte *
 	const idVec3 lineDir = lineDelta * lineLengthRcp;
 	const float lineLength = lineLengthSqr * lineLengthRcp;
 
+#ifdef ID_WIN_X86_SSE2_INTRIN
 
 	idODSStreamedIndexedArray< idDrawVert, triIndex_t, 32, SBT_QUAD, 4 * 3 > indexedVertsODS( verts, numVerts, indexes, numIndexes );
 
@@ -261,6 +326,55 @@ static int CalculateTriangleFacingCulledStatic( byte * __restrict facing, byte *
 
 	return _mm_cvtsi128_si32( numFrontFacing );
 
+#else
+
+	idODSStreamedIndexedArray< idDrawVert, triIndex_t, 32, SBT_QUAD, 1 > indexedVertsODS( verts, numVerts, indexes, numIndexes );
+
+	const byte cullShadowTrianglesToLightMask = cullShadowTrianglesToLight ? 255 : 0;
+
+	int numFrontFacing = 0;
+
+	for ( int i = 0, j = 0; i < numIndexes; ) {
+
+		const int batchStart = i;
+		const int batchEnd = indexedVertsODS.FetchNextBatch();
+		const int indexStart = j;
+
+		for ( ; i <= batchEnd - 3; i += 3, j++ ) {
+			const idVec3 & v1 = indexedVertsODS[i + 0].xyz;
+			const idVec3 & v2 = indexedVertsODS[i + 1].xyz;
+			const idVec3 & v3 = indexedVertsODS[i + 2].xyz;
+
+			const byte triangleCulled = TriangleCulled_Generic( v1, v2, v3, lightProject );
+
+			byte triangleFacing = TriangleFacing_Generic( v1, v2, v3, lightOrigin );
+
+			// optionally make triangles that are outside the light frustum facing so they do not contribute to the shadow volume
+			triangleFacing |= ( triangleCulled & cullShadowTrianglesToLightMask );
+
+			culled[j] = triangleCulled;
+			facing[j] = triangleFacing;
+
+			// count the number of facing triangles
+			numFrontFacing += ( triangleFacing & 1 );
+		}
+
+		if ( insideShadowVolume != NULL ) {
+			for ( int k = batchStart, n = indexStart; k <= batchEnd - 3; k += 3, n++ ) {
+				if ( !facing[n] ) {
+					if ( R_LineIntersectsTriangleExpandedWithSphere( lineStart, lineEnd, lineDir, lineLength, radius, indexedVertsODS[k + 2].xyz, indexedVertsODS[k + 1].xyz, indexedVertsODS[k + 0].xyz ) ) {
+						*insideShadowVolume = true;
+						insideShadowVolume = NULL;
+						break;
+					}
+				}
+			}
+		}
+	}
+
+	return numFrontFacing;
+
+#endif
 }
 
 /*
@@ -291,6 +405,7 @@ static int CalculateTriangleFacingCulledSkinned( byte * __restrict facing, byte
 	const idVec3 lineDir = lineDelta * lineLengthRcp;
 	const float lineLength = lineLengthSqr * lineLengthRcp;
 
+#ifdef ID_WIN_X86_SSE2_INTRIN
 
 	idODSStreamedArray< idDrawVert, 32, SBT_DOUBLE, 1 > vertsODS( verts, numVerts );
 
@@ -428,6 +543,74 @@ static int CalculateTriangleFacingCulledSkinned( byte * __restrict facing, byte
 
 	return _mm_cvtsi128_si32( numFrontFacing );
 
+#else
+
+	idODSStreamedArray< idDrawVert, 32, SBT_DOUBLE, 1 > vertsODS( verts, numVerts );
+
+	for ( int i = 0; i < numVerts; ) {
+
+		const int nextNumVerts = vertsODS.FetchNextBatch() - 1;
+
+		for ( ; i <= nextNumVerts; i++ ) {
+			tempVerts[i].ToVec3() = Scalar_LoadSkinnedDrawVertPosition( vertsODS[i], joints );
+			tempVerts[i].w = 1.0f;
+		}
+	}
+
+	idODSStreamedArray< triIndex_t, 256, SBT_QUAD, 1 > indexesODS( indexes, numIndexes );
+
+	const byte cullShadowTrianglesToLightMask = cullShadowTrianglesToLight ? 255 : 0;
+
+	int numFrontFacing = 0;
+
+	for ( int i = 0, j = 0; i < numIndexes; ) {
+
+		const int batchStart = i;
+		const int batchEnd = indexesODS.FetchNextBatch();
+		const int indexStart = j;
+
+		for ( ; i <= batchEnd - 3; i += 3, j++ ) {
+			const int i0 = indexesODS[i + 0];
+			const int i1 = indexesODS[i + 1];
+			const int i2 = indexesODS[i + 2];
+
+			const idVec3 & v1 = tempVerts[i0].ToVec3();
+			const idVec3 & v2 = tempVerts[i1].ToVec3();
+			const idVec3 & v3 = tempVerts[i2].ToVec3();
+
+			const byte triangleCulled = TriangleCulled_Generic( v1, v2, v3, lightProject );
+
+			byte triangleFacing = TriangleFacing_Generic( v1, v2, v3, lightOrigin );
+
+			// optionally make triangles that are outside the light frustum facing so they do not contribute to the shadow volume
+			triangleFacing |= ( triangleCulled & cullShadowTrianglesToLightMask );
+
+			culled[j] = triangleCulled;
+			facing[j] = triangleFacing;
+
+			// count the number of facing triangles
+			numFrontFacing += ( triangleFacing & 1 );
+		}
+
+		if ( insideShadowVolume != NULL ) {
+			for ( int k = batchStart, n = indexStart; k <= batchEnd - 3; k += 3, n++ ) {
+				if ( !facing[n] ) {
+					const int i0 = indexesODS[k + 0];
+					const int i1 = indexesODS[k + 1];
+					const int i2 = indexesODS[k + 2];
+					if ( R_LineIntersectsTriangleExpandedWithSphere( lineStart, lineEnd, lineDir, lineLength, radius, tempVerts[i2].ToVec3(), tempVerts[i1].ToVec3(), tempVerts[i0].ToVec3() ) ) {
+						*insideShadowVolume = true;
+						insideShadowVolume = NULL;
+						break;
+					}
+				}
+			}
+		}
+	}
+
+	return numFrontFacing;
+
+#endif
 }
 
 /*
@@ -440,6 +623,7 @@ static void StreamOut( void * dst, const void * src, int numBytes ) {
 	assert_16_byte_aligned( dst );
 	assert_16_byte_aligned( src );
 
+#ifdef ID_WIN_X86_SSE2_INTRIN
 	int i = 0;
 	for ( ; i + 128 <= numBytes; i += 128 ) {
 		__m128i d0 = _mm_load_si128( (const __m128i *)( (byte *)src + i + 0*16 ) );
@@ -463,6 +647,9 @@ static void StreamOut( void * dst, const void * src, int numBytes ) {
 		__m128i d = _mm_load_si128( (__m128i *)( (byte *)src + i ) );
 		_mm_stream_si128( (__m128i *)( (byte *)dst + i ), d );
 	}
+#else
+	memcpy( dst, src, numBytes );
+#endif
 }
 
 /*
@@ -671,7 +858,9 @@ static void R_CreateShadowVolumeTriangles( triIndex_t *__restrict shadowIndices,
 
 	numShadowIndexesTotal = numShadowIndices;
 
+#if defined( ID_WIN_X86_SSE2_INTRIN )
 	_mm_sfence();
+#endif
 
 #else	// NOTE: this code will not work on the SPU because it tries to write directly to the destination
 
@@ -844,7 +1033,9 @@ void R_CreateLightTriangles( triIndex_t * __restrict lightIndices, triIndex_t *
 
 	numLightIndicesTotal = numLightIndices;
 
+#if defined( ID_WIN_X86_SSE2_INTRIN )
 	_mm_sfence();
+#endif
 
 #else	// NOTE: this code will not work on the SPU because it tries to write directly to the destination
 
diff --git a/neo/renderer/tr_trace.cpp b/neo/renderer/tr_trace.cpp
index a9d789aa..fe7595de 100644
--- a/neo/renderer/tr_trace.cpp
+++ b/neo/renderer/tr_trace.cpp
@@ -43,6 +43,7 @@ static void R_TracePointCullStatic( byte *cullBits, byte &totalOr, const float r
 	assert_16_byte_aligned( cullBits );
 	assert_16_byte_aligned( verts );
 
+#ifdef ID_WIN_X86_SSE2_INTRIN
 
 	idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts );
 
@@ -164,6 +165,54 @@ static void R_TracePointCullStatic( byte *cullBits, byte &totalOr, const float r
 
 	totalOr = (byte) _mm_cvtsi128_si32( vecTotalOrByte );
 
+#else
+	// Generic (non-SIMD) path, compiled when ID_WIN_X86_SSE2_INTRIN is not defined.
+	idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 1 > vertsODS( verts, numVerts );
+
+	byte tOr = 0;	// running OR of the cull bits of every vertex processed so far
+	for ( int i = 0; i < numVerts; ) {	// i is advanced by the inner loop only
+		// nextNumVerts is used as the inclusive last index of the current batch (hence the -1)
+		const int nextNumVerts = vertsODS.FetchNextBatch() - 1;
+
+		for ( ; i <= nextNumVerts; i++ ) {
+			const idVec3 & v = vertsODS[i].xyz;
+			// Distance() of the vertex position to each of the four planes
+			const float d0 = planes[0].Distance( v );
+			const float d1 = planes[1].Distance( v );
+			const float d2 = planes[2].Distance( v );
+			const float d3 = planes[3].Distance( v );
+			// t* = plane distance + radius
+			const float t0 = d0 + radius;
+			const float t1 = d1 + radius;
+			const float t2 = d2 + radius;
+			const float t3 = d3 + radius;
+			// s* = plane distance - radius
+			const float s0 = d0 - radius;
+			const float s1 = d1 - radius;
+			const float s2 = d2 - radius;
+			const float s3 = d3 - radius;
+			// bits 0-3 come from t0-t3, bits 4-7 from s0-s3 (IEEE_FLT_SIGNBITSET presumably yields 0 or 1 - confirm in Math.h)
+			byte bits;
+			bits  = IEEE_FLT_SIGNBITSET( t0 ) << 0;
+			bits |= IEEE_FLT_SIGNBITSET( t1 ) << 1;
+			bits |= IEEE_FLT_SIGNBITSET( t2 ) << 2;
+			bits |= IEEE_FLT_SIGNBITSET( t3 ) << 3;
+
+			bits |= IEEE_FLT_SIGNBITSET( s0 ) << 4;
+			bits |= IEEE_FLT_SIGNBITSET( s1 ) << 5;
+			bits |= IEEE_FLT_SIGNBITSET( s2 ) << 6;
+			bits |= IEEE_FLT_SIGNBITSET( s3 ) << 7;
+
+			bits ^= 0x0F;		// invert bits 0-3 only; bits 4-7 (from s0-s3) are left as computed
+
+			tOr |= bits;
+			cullBits[i] = bits;	// one cull byte is written per vertex, at the vertex index
+		}
+	}
+
+	totalOr = tOr;	// report the OR of all per-vertex cull bytes through the reference parameter
+
+#endif
 }
 
 /*
@@ -175,6 +224,7 @@ static void R_TracePointCullSkinned( byte *cullBits, byte &totalOr, const float
 	assert_16_byte_aligned( cullBits );
 	assert_16_byte_aligned( verts );
 
+#ifdef ID_WIN_X86_SSE2_INTRIN
 
 	idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts );
 
@@ -296,6 +346,54 @@ static void R_TracePointCullSkinned( byte *cullBits, byte &totalOr, const float
 
 	totalOr = (byte) _mm_cvtsi128_si32( vecTotalOrByte );
 
+#else
+	// Generic (non-SIMD) path, compiled when ID_WIN_X86_SSE2_INTRIN is not defined.
+	idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 1 > vertsODS( verts, numVerts );
+
+	byte tOr = 0;	// running OR of the cull bits of every vertex processed so far
+	for ( int i = 0; i < numVerts; ) {	// i is advanced by the inner loop only
+		// nextNumVerts is used as the inclusive last index of the current batch (hence the -1)
+		const int nextNumVerts = vertsODS.FetchNextBatch() - 1;
+
+		for ( ; i <= nextNumVerts; i++ ) {
+			const idVec3 v = Scalar_LoadSkinnedDrawVertPosition( vertsODS[i], joints );	// presumably the joint-transformed position - confirm against the helper
+			// Distance() of that position to each of the four planes
+			const float d0 = planes[0].Distance( v );
+			const float d1 = planes[1].Distance( v );
+			const float d2 = planes[2].Distance( v );
+			const float d3 = planes[3].Distance( v );
+			// t* = plane distance + radius
+			const float t0 = d0 + radius;
+			const float t1 = d1 + radius;
+			const float t2 = d2 + radius;
+			const float t3 = d3 + radius;
+			// s* = plane distance - radius
+			const float s0 = d0 - radius;
+			const float s1 = d1 - radius;
+			const float s2 = d2 - radius;
+			const float s3 = d3 - radius;
+			// bits 0-3 come from t0-t3, bits 4-7 from s0-s3 (IEEE_FLT_SIGNBITSET presumably yields 0 or 1 - confirm in Math.h)
+			byte bits;
+			bits  = IEEE_FLT_SIGNBITSET( t0 ) << 0;
+			bits |= IEEE_FLT_SIGNBITSET( t1 ) << 1;
+			bits |= IEEE_FLT_SIGNBITSET( t2 ) << 2;
+			bits |= IEEE_FLT_SIGNBITSET( t3 ) << 3;
+
+			bits |= IEEE_FLT_SIGNBITSET( s0 ) << 4;
+			bits |= IEEE_FLT_SIGNBITSET( s1 ) << 5;
+			bits |= IEEE_FLT_SIGNBITSET( s2 ) << 6;
+			bits |= IEEE_FLT_SIGNBITSET( s3 ) << 7;
+
+			bits ^= 0x0F;		// invert bits 0-3 only; bits 4-7 (from s0-s3) are left as computed
+
+			tOr |= bits;
+			cullBits[i] = bits;	// one cull byte is written per vertex, at the vertex index
+		}
+	}
+
+	totalOr = tOr;	// report the OR of all per-vertex cull bytes through the reference parameter
+
+#endif
 }
 
 /*