doom3-bfg/neo/renderer/DXT/DXTEncoder.cpp

/*
===========================================================================

Doom 3 BFG Edition GPL Source Code
Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company.
Copyright (C) 2014-2016 Kot in Action Creative Artel
Copyright (C) 2016-2017 Dustin Land
Copyright (C) 2014-2020 Robert Beckebans

This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code").

Doom 3 BFG Edition Source Code is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Doom 3 BFG Edition Source Code is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Doom 3 BFG Edition Source Code.  If not, see <http://www.gnu.org/licenses/>.

In addition, the Doom 3 BFG Edition Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 BFG Edition Source Code.  If not, please request a copy in writing from id Software at the address below.

If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.

===========================================================================
*/
#include "precompiled.h"
#pragma hdrstop

#include "DXTCodec_local.h"
#include "DXTCodec.h"

// Shift amounts used when insetting a color / alpha bounding box.
#define INSET_COLOR_SHIFT		4		// inset the bounding box with ( range >> shift )
#define INSET_ALPHA_SHIFT		5		// inset alpha channel

// Bit masks that keep the upper 5 or upper 6 bits of an 8-bit channel.
#define C565_5_MASK				0xF8	// 0xFF minus last three bits
#define C565_6_MASK				0xFC	// 0xFF minus last two bits

#define NVIDIA_7X_HARDWARE_BUG_FIX		// keep the DXT5 colors sorted as: max, min

// Short aliases for the 16-bit and 32-bit unsigned integer types.
typedef uint16	word;
typedef uint32	dword;

// LordHavoc: macros required by gimp-dds code:
// MIN is only defined here if nothing defined it earlier. The GNU variant uses a
// statement expression so each argument is evaluated exactly once; the portable
// variant may evaluate an argument twice.
#ifndef MIN
	#ifdef __GNUC__
		#define MIN(a, b)  ({decltype(a) _a=(a); decltype(b) _b=(b); _a < _b ? _a : _b;})
	#else
		#define MIN(a, b)  ((a) < (b) ? (a) : (b))
	#endif
#endif

// Stores the low 16 bits of s into buf[0..1], least significant byte first.
// NOTE(review): expands to two statements with a trailing semicolon and is not wrapped
// in do { } while( 0 ), so it is not safe as the sole body of an unbraced if / else.
#define PUTL16( buf, s ) \
	( buf )[0] = ( ( s )      ) & 0xff; \
	( buf )[1] = ( ( s ) >> 8 ) & 0xff;

// Stores the low 32 bits of l into buf[0..3], least significant byte first.
// NOTE(review): same multi-statement caveat as PUTL16.
#define PUTL32( buf, l ) \
	( buf )[0] = ( ( l )       ) & 0xff; \
	( buf )[1] = ( ( l ) >>  8 ) & 0xff; \
	( buf )[2] = ( ( l ) >> 16 ) & 0xff; \
	( buf )[3] = ( ( l ) >> 24 ) & 0xff;

// NOTE(review): same value as INSET_COLOR_SHIFT; presumably kept as a separate name for
// the gimp-dds derived code - confirm before merging the two.
#define INSET_SHIFT  4

// Offset of the 4x4 block that contains pixel ( x, y ): block row ( y >> 2 ) times the
// size of one row of blocks ( bs * number of blocks across, width rounded up to a
// multiple of 4 ) plus block column ( x >> 2 ) times bs.
// NOTE(review): bs is presumably the size of one compressed block in bytes - confirm.
#define BLOCK_OFFSET( x, y, w, bs )  ( ( ( y ) >> 2 ) * ( ( bs ) * ( ( ( w ) + 3 ) >> 2 ) ) + ( ( bs ) * ( ( x ) >> 2 ) ) )

/*
========================
idDxtEncoder::NV4XHardwareBugFix

Orders the two end points by their 5:6:5 quantized value. If the quantized minColor
is larger than the quantized maxColor, the first three bytes ( RGB ) of the two colors
are exchanged. Any byte after the first three is never touched.

The swap is done byte by byte: the colors are not required to be int aligned, only
three bytes of each color are accessed, and the result does not depend on the byte
order of the host.

params:	minColor	- at least 3 bytes, RGB may be exchanged with maxColor
params:	maxColor	- at least 3 bytes, RGB may be exchanged with minColor
========================
*/
void idDxtEncoder::NV4XHardwareBugFix( byte* minColor, byte* maxColor ) const
{
	// pack RGB into one integer and drop the bits that are lost by 5:6:5 quantization
	const int minq = ( ( minColor[0] << 16 ) | ( minColor[1] << 8 ) | minColor[2] ) & 0x00F8FCF8;
	const int maxq = ( ( maxColor[0] << 16 ) | ( maxColor[1] << 8 ) | maxColor[2] ) & 0x00F8FCF8;

	if( minq <= maxq )
	{
		return;		// already ordered
	}

	// exchange the RGB bytes only
	for( int i = 0; i < 3; i++ )
	{
		const byte tmp = minColor[i];
		minColor[i] = maxColor[i];
		maxColor[i] = tmp;
	}
}

/*
========================
idDxtEncoder::HasConstantValuePer4x4Block
========================
*/
bool idDxtEncoder::HasConstantValuePer4x4Block( const byte* inBuf, int width, int height, int channel ) const
{
	if( width < 4 || height < 4 )
	{
		byte value = inBuf[channel];
		for( int k = 0; k < height; k++ )
		{
			for( int l = 0; l < width; l++ )
			{
				if( inBuf[( k * width + l ) * 4 + channel] != value )
				{
					return false;
				}
			}
		}
		return true;
	}

	for( int j = 0; j < height; j += 4, inBuf += width * 4 * 4 )
	{
		for( int i = 0; i < width; i += 4 )
		{
			const byte* inPtr = inBuf + i * 4;
			byte value = inPtr[channel];
			for( int k = 0; k < 4; k++ )
			{
				for( int l = 0; l < 4; l++ )
				{
					if( inPtr[( k * width + l ) * 4 + channel] != value )
					{
						return false;
					}
				}
			}
		}
		inBuf += srcPadding;
	}
	return true;
}

/*
========================
idDxtEncoder::WriteTinyColorDXT1
========================
*/
void idDxtEncoder::WriteTinyColorDXT1( const byte* inBuf, int width, int height )
{
	int numBlocks = ( ( width + 3 ) / 4 ) * ( ( height + 3 ) / 4 );
	int stride = ( ( width * height ) / numBlocks ) * 4;	// number of bytes from one block to the next
	// example: 2x8 pixels
	// numBlocks = 2
	// stride = 32 bytes (8 pixels)

	for( int i = 0; i < numBlocks; i++ )
	{
		// FIXME: This just emits a fake block based on the color at position 0,0
		EmitUShort( ColorTo565( inBuf ) );
		EmitUShort( 0 );	// dummy, never used
		EmitUInt( 0 );		// 4 color index bytes all use the first color

		inBuf += stride;
	}
}

/*
========================
idDxtEncoder::WriteTinyColorDXT5
========================
*/
void idDxtEncoder::WriteTinyColorDXT5( const byte* inBuf, int width, int height )
{
	int numBlocks = ( ( width + 3 ) / 4 ) * ( ( height + 3 ) / 4 );
	int stride = ( ( width * height ) / numBlocks ) * 4;	// number of bytes from one block to the next
	// example: 2x8 pixels
	// numBlocks = 2
	// stride = 32 bytes (8 pixels)

	for( int i = 0; i < numBlocks; i++ )
	{
		// FIXME: This just emits a fake block based on the color at position 0,0
		EmitByte( inBuf[3] );
		EmitByte( 0 );		// dummy, never used
		EmitByte( 0 );		// 6 alpha index bytes all use the first alpha
		EmitByte( 0 );
		EmitByte( 0 );
		EmitByte( 0 );
		EmitByte( 0 );
		EmitByte( 0 );

		EmitUShort( ColorTo565( inBuf ) );
		EmitUShort( 0 );	// dummy, never used
		EmitUInt( 0 );		// 4 color index bytes all use the first color

		inBuf += stride;
	}
}

/*
========================
idDxtEncoder::WriteTinyColorCTX1DXT5A
========================
*/
void idDxtEncoder::WriteTinyColorCTX1DXT5A( const byte* inBuf, int width, int height )
{
	int numBlocks = ( ( width + 3 ) / 4 ) * ( ( height + 3 ) / 4 );
	int stride = ( ( width * height ) / numBlocks ) * 4;	// number of bytes from one block to the next
	// example: 2x8 pixels
	// numBlocks = 2
	// stride = 32 bytes (8 pixels)

	for( int i = 0; i < numBlocks; i++ )
	{
		// FIXME: This just emits a fake block based on the color at position 0,0
		EmitByte( inBuf[0] );
		EmitByte( inBuf[1] );
		EmitByte( inBuf[0] );
		EmitByte( inBuf[1] );
		EmitUInt( 0 );		// 4 color index bytes all use the first color

		EmitByte( inBuf[3] );
		EmitByte( 0 );		// dummy, never used
		EmitByte( 0 );		// 6 alpha index bytes all use the first alpha
		EmitByte( 0 );
		EmitByte( 0 );
		EmitByte( 0 );
		EmitByte( 0 );
		EmitByte( 0 );

		inBuf += stride;
	}
}

/*
========================
idDxtEncoder::WriteTinyNormalMapDXT5
========================
*/
void idDxtEncoder::WriteTinyNormalMapDXT5( const byte* inBuf, int width, int height )
{
	int numBlocks = ( ( width + 3 ) / 4 ) * ( ( height + 3 ) / 4 );
	int stride = ( ( width * height ) / numBlocks ) * 4;	// number of bytes from one block to the next
	// example: 2x8 pixels
	// numBlocks = 2
	// stride = 32 bytes (8 pixels)

	for( int i = 0; i < numBlocks; i++ )
	{
		// FIXME: This just emits a fake block based on the normal at position 0,0
		EmitByte( inBuf[3] );
		EmitByte( 0 );		// dummy, never used
		EmitByte( 0 );		// 6 alpha index bytes all use the first alpha
		EmitByte( 0 );
		EmitByte( 0 );
		EmitByte( 0 );
		EmitByte( 0 );
		EmitByte( 0 );

		EmitUShort( ColorTo565( inBuf[0], inBuf[1], inBuf[2] ) );
		EmitUShort( 0 );	// dummy, never used
		EmitUInt( 0 );		// 4 color index bytes all use the first color

		inBuf += stride;
	}
}

/*
========================
idDxtEncoder::WriteTinyNormalMapDXN
========================
*/
void idDxtEncoder::WriteTinyNormalMapDXN( const byte* inBuf, int width, int height )
{
	int numBlocks = ( ( width + 3 ) / 4 ) * ( ( height + 3 ) / 4 );
	int stride = ( ( width * height ) / numBlocks ) * 4;	// number of bytes from one block to the next
	// example: 2x8 pixels
	// numBlocks = 2
	// stride = 32 bytes (8 pixels)

	for( int i = 0; i < numBlocks; i++ )
	{
		// FIXME: This just emits a fake block based on the normal at position 0,0
		EmitByte( inBuf[0] );
		EmitByte( 0 );		// dummy, never used
		EmitByte( 0 );		// 6 alpha index bytes all use the first alpha
		EmitByte( 0 );
		EmitByte( 0 );
		EmitByte( 0 );
		EmitByte( 0 );
		EmitByte( 0 );

		EmitByte( inBuf[1] );
		EmitByte( 0 );		// dummy, never used
		EmitByte( 0 );		// 6 alpha index bytes all use the first alpha
		EmitByte( 0 );
		EmitByte( 0 );
		EmitByte( 0 );
		EmitByte( 0 );
		EmitByte( 0 );

		inBuf += stride;
	}
}

/*
========================
idDxtEncoder::WriteTinyDXT5A
========================
*/
void idDxtEncoder::WriteTinyDXT5A( const byte* inBuf, int width, int height )
{
	int numBlocks = ( ( width + 3 ) / 4 ) * ( ( height + 3 ) / 4 );
	int stride = ( ( width * height ) / numBlocks ) * 4;	// number of bytes from one block to the next
	// example: 2x8 pixels
	// numBlocks = 2
	// stride = 32 bytes (8 pixels)

	for( int i = 0; i < numBlocks; i++ )
	{
		// FIXME: This just emits a fake block based on the normal at position 0,0
		EmitByte( inBuf[0] );
		EmitByte( 0 );		// dummy, never used
		EmitByte( 0 );		// 6 alpha index bytes all use the first alpha
		EmitByte( 0 );
		EmitByte( 0 );
		EmitByte( 0 );
		EmitByte( 0 );
		EmitByte( 0 );

		inBuf += stride;
	}
}

/*
========================
idDxtEncoder::ExtractBlock

Copies a 4x4 pixel tile out of a larger image into a tightly packed 64 byte buffer.

params:	inPtr		- top-left pixel of the tile inside the input image, 4 bytes per pixel
params:	width		- width of the input image in pixels
paramO:	colorBlock	- 4*4 output tile, 4 bytes per pixel
========================
*/
ID_INLINE void idDxtEncoder::ExtractBlock( const byte* inPtr, int width, byte* colorBlock ) const
{
	const int tileRowBytes = 4 * 4;		// four pixels of four bytes each
	byte* dst = colorBlock;

	for( int row = 0; row < 4; row++, dst += tileRowBytes, inPtr += width * 4 )
	{
		memcpy( dst, inPtr, tileRowBytes );
	}
}

/*
========================
SwapColors

Exchanges the first three bytes of the two colors. Any following byte is not touched.
========================
*/
void SwapColors( byte* c1, byte* c2 )
{
	for( int i = 0; i < 3; i++ )
	{
		const byte held = c1[i];
		c1[i] = c2[i];
		c2[i] = held;
	}
}

/*
========================
idDxtEncoder::GetMinMaxColorsMaxDist

Finds the two RGB colors in a 4x4 block furthest apart. Also finds the two alpha values
furthest apart.

params: colorBlock	- 4*4 input tile, 4 bytes per pixel
paramO:	minColor	- 4 byte min color
paramO:	maxColor	- 4 byte max color
========================
*/
void idDxtEncoder::GetMinMaxColorsMaxDist( const byte* colorBlock, byte* minColor, byte* maxColor ) const
{
	int maxDistC = -1;
	int maxDistA = -1;

	for( int i = 0; i < 64 - 4; i += 4 )
	{
		for( int j = i + 4; j < 64; j += 4 )
		{
			int dc = ColorDistance( &colorBlock[i], &colorBlock[j] );
			if( dc > maxDistC )
			{
				maxDistC = dc;
				memcpy( minColor, colorBlock + i, 3 );
				memcpy( maxColor, colorBlock + j, 3 );
			}
			int da = AlphaDistance( colorBlock[i + 3], colorBlock[j + 3] );
			if( da > maxDistA )
			{
				maxDistA = da;
				minColor[3] = colorBlock[i + 3];
				maxColor[3] = colorBlock[j + 3];
			}
		}
	}
	if( maxColor[0] < minColor[0] )
	{
		SwapColors( minColor, maxColor );
	}
}

/*
========================
idDxtEncoder::GetMinMaxColorsLuminance

Finds the two RGB colors in a 4x4 block furthest apart based on luminance. Also finds the two
alpha values furthest apart.

params: colorBlock	- 4*4 input tile, 4 bytes per pixel
paramO:	minColor	- 4 byte min color
paramO:	maxColor	- 4 byte max color
========================
*/
void idDxtEncoder::GetMinMaxColorsLuminance( const byte* colorBlock, byte* minColor, byte* maxColor ) const
{
	int maxLumC = 0, minLumC = 256 * 4;
	int maxAlpha = 0, minAlpha = 256 * 4;

	for( int i = 0; i < 16; i++ )
	{
		int luminance = colorBlock[i * 4 + 0] + colorBlock[i * 4 + 1] * 2 + colorBlock[i * 4 + 2];
		if( luminance > maxLumC )
		{
			maxLumC = luminance;
			memcpy( maxColor, colorBlock + i * 4, 3 );
		}
		if( luminance < minLumC )
		{
			minLumC = luminance;
			memcpy( minColor, colorBlock + i * 4, 3 );
		}
		int alpha = colorBlock[i * 4 + 3];
		if( alpha > maxAlpha )
		{
			maxAlpha = alpha;
			maxColor[3] = ( byte )alpha;
		}
		if( alpha < minAlpha )
		{
			minAlpha = alpha;
			minColor[3] = ( byte )alpha;
		}
	}
	if( maxColor[0] < minColor[0] )
	{
		SwapColors( minColor, maxColor );
	}
}

/*
========================
idDxtEncoder::GetSquareAlphaError

params:	colorBlock	- 16 pixel block for which to find color indexes
paramO:	minAlpha	- Min alpha found
paramO:	maxAlpha	- Max alpha found
return: 4 byte color index block
========================
*/
int idDxtEncoder::GetSquareAlphaError( const byte* colorBlock, const int alphaOffset, const byte minAlpha, const byte maxAlpha, int lastError ) const
{
	int i, j;
	byte alphas[8];

	alphas[0] = maxAlpha;
	alphas[1] = minAlpha;

	if( maxAlpha > minAlpha )
	{
		alphas[2] = ( 6 * alphas[0] + 1 * alphas[1] ) / 7;
		alphas[3] = ( 5 * alphas[0] + 2 * alphas[1] ) / 7;
		alphas[4] = ( 4 * alphas[0] + 3 * alphas[1] ) / 7;
		alphas[5] = ( 3 * alphas[0] + 4 * alphas[1] ) / 7;
		alphas[6] = ( 2 * alphas[0] + 5 * alphas[1] ) / 7;
		alphas[7] = ( 1 * alphas[0] + 6 * alphas[1] ) / 7;
	}
	else
	{
		alphas[2] = ( 4 * alphas[0] + 1 * alphas[1] ) / 5;
		alphas[3] = ( 3 * alphas[0] + 2 * alphas[1] ) / 5;
		alphas[4] = ( 2 * alphas[0] + 3 * alphas[1] ) / 5;
		alphas[5] = ( 1 * alphas[0] + 4 * alphas[1] ) / 5;
		alphas[6] = 0;
		alphas[7] = 255;
	}

	int error = 0;
	for( i = 0; i < 16; i++ )
	{
		unsigned int minDist = MAX_UNSIGNED_TYPE( int );
		byte a = colorBlock[i * 4 + alphaOffset];
		for( j = 0; j < 8; j++ )
		{
			unsigned int dist = AlphaDistance( a, alphas[j] );
			if( dist < minDist )
			{
				minDist = dist;
			}
		}
		error += minDist;

		if( error >= lastError )
		{
			return error;
		}
	}

	return error;
}

/*
========================
idDxtEncoder::GetMinMaxAlphaHQ

params:	colorBlock	- 4*4 input tile, 4 bytes per pixel
paramO:	minColor		- 4 byte min color found
paramO:	maxColor		- 4 byte max color found
========================
*/
int idDxtEncoder::GetMinMaxAlphaHQ( const byte* colorBlock, const int alphaOffset, byte* minColor, byte* maxColor ) const
{
	int i, j;
	byte alphaMin, alphaMax;
	int error, bestError = MAX_TYPE( int );

	alphaMin = 255;
	alphaMax = 0;

	// get alpha min / max
	for( i = 0; i < 16; i++ )
	{
		if( colorBlock[i * 4 + alphaOffset] < alphaMin )
		{
			alphaMin = colorBlock[i * 4 + alphaOffset];
		}
		if( colorBlock[i * 4 + alphaOffset] > alphaMax )
		{
			alphaMax = colorBlock[i * 4 + alphaOffset];
		}
	}

	const int ALPHA_EXPAND = 32;

	alphaMin = ( alphaMin <= ALPHA_EXPAND ) ? 0 : alphaMin - ALPHA_EXPAND;
	alphaMax = ( alphaMax >= 255 - ALPHA_EXPAND ) ? 255 : alphaMax + ALPHA_EXPAND;

	for( i = alphaMin; i <= alphaMax; i++ )
	{
		for( j = alphaMax; j >= i; j-- )
		{

			error = GetSquareAlphaError( colorBlock, alphaOffset, ( byte )i, ( byte )j, bestError );
			if( error < bestError )
			{
				bestError = error;
				minColor[alphaOffset] = ( byte )i;
				maxColor[alphaOffset] = ( byte )j;
			}

			error = GetSquareAlphaError( colorBlock, alphaOffset, ( byte )j, ( byte )i, bestError );
			if( error < bestError )
			{
				bestError = error;
				minColor[alphaOffset] = ( byte )i;
				maxColor[alphaOffset] = ( byte )j;
			}
		}
	}

	return bestError;
}

/*
========================
idDxtEncoder::GetSquareColorsError

params:	colorBlock	- 16 pixel block for which to find color indexes
paramO:	color0		- 4 byte min color found
paramO:	color1		- 4 byte max color found
return: 4 byte color index block
========================
*/
int idDxtEncoder::GetSquareColorsError( const byte* colorBlock, const unsigned short color0, const unsigned short color1, int lastError ) const
{
	int i, j;
	byte colors[4][4];

	ColorFrom565( color0, colors[0] );
	ColorFrom565( color1, colors[1] );

	if( color0 > color1 )
	{
		colors[2][0] = ( 2 * colors[0][0] + 1 * colors[1][0] ) / 3;
		colors[2][1] = ( 2 * colors[0][1] + 1 * colors[1][1] ) / 3;
		colors[2][2] = ( 2 * colors[0][2] + 1 * colors[1][2] ) / 3;
		colors[3][0] = ( 1 * colors[0][0] + 2 * colors[1][0] ) / 3;
		colors[3][1] = ( 1 * colors[0][1] + 2 * colors[1][1] ) / 3;
		colors[3][2] = ( 1 * colors[0][2] + 2 * colors[1][2] ) / 3;
	}
	else
	{
		colors[2][0] = ( 1 * colors[0][0] + 1 * colors[1][0] ) / 2;
		colors[2][1] = ( 1 * colors[0][1] + 1 * colors[1][1] ) / 2;
		colors[2][2] = ( 1 * colors[0][2] + 1 * colors[1][2] ) / 2;
		colors[3][0] = 0;
		colors[3][1] = 0;
		colors[3][2] = 0;
	}

	int error = 0;
	for( i = 0; i < 16; i++ )
	{
		unsigned int minDist = MAX_UNSIGNED_TYPE( int );
		for( j = 0; j < 4; j++ )
		{
			unsigned int dist = ColorDistance( &colorBlock[i * 4], &colors[j][0] );
			if( dist < minDist )
			{
				minDist = dist;
			}
		}
		// accumulated error
		error += minDist;

		if( error > lastError )
		{
			return error;
		}
	}
	return error;
}

/*
========================
idDxtEncoder::GetSquareNormalYError

params:	colorBlock	- 16 pixel block for which to find color indexes
paramO:	color0		- 4 byte min color found
paramO:	color1		- 4 byte max color found
return: 4 byte color index block
========================
*/
int idDxtEncoder::GetSquareNormalYError( const byte* colorBlock, const unsigned short color0, const unsigned short color1, int lastError, int scale ) const
{
	int i, j;
	byte colors[4][4];

	ColorFrom565( color0, colors[0] );
	ColorFrom565( color1, colors[1] );

	if( color0 > color1 )
	{
		colors[2][0] = ( 2 * colors[0][0] + 1 * colors[1][0] ) / 3;
		colors[2][1] = ( 2 * colors[0][1] + 1 * colors[1][1] ) / 3;
		colors[2][2] = ( 2 * colors[0][2] + 1 * colors[1][2] ) / 3;
		colors[3][0] = ( 1 * colors[0][0] + 2 * colors[1][0] ) / 3;
		colors[3][1] = ( 1 * colors[0][1] + 2 * colors[1][1] ) / 3;
		colors[3][2] = ( 1 * colors[0][2] + 2 * colors[1][2] ) / 3;
	}
	else
	{
		colors[2][0] = ( 1 * colors[0][0] + 1 * colors[1][0] ) / 2;
		colors[2][1] = ( 1 * colors[0][1] + 1 * colors[1][1] ) / 2;
		colors[2][2] = ( 1 * colors[0][2] + 1 * colors[1][2] ) / 2;
		colors[3][0] = 0;
		colors[3][1] = 0;
		colors[3][2] = 0;
	}

	int error = 0;
	for( i = 0; i < 16; i++ )
	{
		unsigned int minDist = MAX_UNSIGNED_TYPE( int );
		for( j = 0; j < 4; j++ )
		{
			float r = ( float ) colorBlock[i * 4 + 1] / scale;
			float s = ( float ) colors[j][1] / scale;
			unsigned int dist = idMath::Ftoi( ( r - s ) * ( r - s ) );
			if( dist < minDist )
			{
				minDist = dist;
			}
		}
		// accumulated error
		error += minDist;

		if( error > lastError )
		{
			return error;
		}
	}
	return error;
}

/*
========================
idDxtEncoder::GetMinMaxColorsHQ

Uses an exhaustive search to find the two RGB colors that produce the least error when used to
compress the 4x4 block. The alpha channel is not examined by this function.

The search space is the 565 quantized bounding box of the block, expanded by
C565_BBOX_EXPAND in every direction. Pairs of end points that are too close together
along an axis ( see minAxisDist ) are skipped.

params:	colorBlock	- 4*4 input tile, 4 bytes per pixel
paramO:	minColor	- 4 byte min color found
paramO:	maxColor	- 4 byte max color found
params:	noBlack		- if true, a pair is only rated with color0 >= color1
return: error of the best pair found
========================
*/
int idDxtEncoder::GetMinMaxColorsHQ( const byte* colorBlock, byte* minColor, byte* maxColor, bool noBlack ) const
{
	int i;
	int i0, i1, i2, j0, j1, j2;
	unsigned short minColor565, maxColor565, bestMinColor565, bestMaxColor565;
	byte bboxMin[3], bboxMax[3], minAxisDist[3];
	int error, bestError = MAX_TYPE( int );

	bboxMin[0] = bboxMin[1] = bboxMin[2] = 255;
	bboxMax[0] = bboxMax[1] = bboxMax[2] = 0;

	// get color bbox
	for( i = 0; i < 16; i++ )
	{
		if( colorBlock[i * 4 + 0] < bboxMin[0] )
		{
			bboxMin[0] = colorBlock[i * 4 + 0];
		}
		if( colorBlock[i * 4 + 1] < bboxMin[1] )
		{
			bboxMin[1] = colorBlock[i * 4 + 1];
		}
		if( colorBlock[i * 4 + 2] < bboxMin[2] )
		{
			bboxMin[2] = colorBlock[i * 4 + 2];
		}
		if( colorBlock[i * 4 + 0] > bboxMax[0] )
		{
			bboxMax[0] = colorBlock[i * 4 + 0];
		}
		if( colorBlock[i * 4 + 1] > bboxMax[1] )
		{
			bboxMax[1] = colorBlock[i * 4 + 1];
		}
		if( colorBlock[i * 4 + 2] > bboxMax[2] )
		{
			bboxMax[2] = colorBlock[i * 4 + 2];
		}
	}

	// decrease range for 565 encoding
	// ( channels 0 and 2 keep 5 bits, channel 1 keeps 6 bits )
	bboxMin[0] >>= 3;
	bboxMin[1] >>= 2;
	bboxMin[2] >>= 3;
	bboxMax[0] >>= 3;
	bboxMax[1] >>= 2;
	bboxMax[2] >>= 3;

	// get the minimum distance the end points of the line must be apart along each axis
	// ( the wider the box, the larger the fraction of it the end points have to span;
	//   boxes narrower than 4 steps impose no restriction )
	for( i = 0; i < 3; i++ )
	{
		minAxisDist[i] = ( bboxMax[i] - bboxMin[i] );
		if( minAxisDist[i] >= 16 )
		{
			minAxisDist[i] = minAxisDist[i] * 3 / 4;
		}
		else if( minAxisDist[i] >= 8 )
		{
			minAxisDist[i] = minAxisDist[i] * 2 / 4;
		}
		else if( minAxisDist[i] >= 4 )
		{
			minAxisDist[i] = minAxisDist[i] * 1 / 4;
		}
		else
		{
			minAxisDist[i] = 0;
		}
	}

	// expand the bounding box
	// ( clamped to 0 at the low end and to the largest 5 / 6 bit value at the high end )
	const int C565_BBOX_EXPAND = 1;

	bboxMin[0] = ( bboxMin[0] <= C565_BBOX_EXPAND ) ? 0 : bboxMin[0] - C565_BBOX_EXPAND;
	bboxMin[1] = ( bboxMin[1] <= C565_BBOX_EXPAND ) ? 0 : bboxMin[1] - C565_BBOX_EXPAND;
	bboxMin[2] = ( bboxMin[2] <= C565_BBOX_EXPAND ) ? 0 : bboxMin[2] - C565_BBOX_EXPAND;
	bboxMax[0] = ( bboxMax[0] >= ( 255 >> 3 ) - C565_BBOX_EXPAND ) ? ( 255 >> 3 ) : bboxMax[0] + C565_BBOX_EXPAND;
	bboxMax[1] = ( bboxMax[1] >= ( 255 >> 2 ) - C565_BBOX_EXPAND ) ? ( 255 >> 2 ) : bboxMax[1] + C565_BBOX_EXPAND;
	bboxMax[2] = ( bboxMax[2] >= ( 255 >> 3 ) - C565_BBOX_EXPAND ) ? ( 255 >> 3 ) : bboxMax[2] + C565_BBOX_EXPAND;

	bestMinColor565 = 0;
	bestMaxColor565 = 0;

	// i0 / i1 / i2 are the three channels of the first end point and j0 / j1 / j2 the
	// channels of the second one. The "i" values run upwards from the low side of the box
	// and the "j" values downwards from the high side. Ties keep the first pair found, so
	// the result depends on this iteration order.
	for( i0 = bboxMin[0]; i0 <= bboxMax[0]; i0++ )
	{
		for( j0 = bboxMax[0]; j0 >= bboxMin[0]; j0-- )
		{
			if( abs( i0 - j0 ) < minAxisDist[0] )
			{
				continue;
			}

			for( i1 = bboxMin[1]; i1 <= bboxMax[1]; i1++ )
			{
				for( j1 = bboxMax[1]; j1 >= bboxMin[1]; j1-- )
				{
					if( abs( i1 - j1 ) < minAxisDist[1] )
					{
						continue;
					}

					for( i2 = bboxMin[2]; i2 <= bboxMax[2]; i2++ )
					{
						for( j2 = bboxMax[2]; j2 >= bboxMin[2]; j2-- )
						{
							if( abs( i2 - j2 ) < minAxisDist[2] )
							{
								continue;
							}

							// pack the candidate end points into 565 format
							minColor565 = ( unsigned short )( ( i0 << 11 ) | ( i1 << 5 ) | ( i2 << 0 ) );
							maxColor565 = ( unsigned short )( ( j0 << 11 ) | ( j1 << 5 ) | ( j2 << 0 ) );

							if( !noBlack )
							{
								// rate the pair in the reversed order as well; the pair is stored
								// as ( minColor565, maxColor565 ) no matter which order was rated
								error = GetSquareColorsError( colorBlock, maxColor565, minColor565, bestError );
								if( error < bestError )
								{
									bestError = error;
									bestMinColor565 = minColor565;
									bestMaxColor565 = maxColor565;
								}
							}
							else
							{
								// make sure the first value passed below is not the smaller one;
								// after the swap minColor565 holds the numerically larger value
								if( minColor565 <= maxColor565 )
								{
									SwapValues( minColor565, maxColor565 );
								}
							}

							error = GetSquareColorsError( colorBlock, minColor565, maxColor565, bestError );
							if( error < bestError )
							{
								bestError = error;
								bestMinColor565 = minColor565;
								bestMaxColor565 = maxColor565;
							}
						}
					}
				}
			}
		}
	}

	// convert the winning pair back from 565 format
	ColorFrom565( bestMinColor565, minColor );
	ColorFrom565( bestMaxColor565, maxColor );

	return bestError;
}

/*
========================
idDxtEncoder::GetSquareCTX1Error

params:	colorBlock	- 16 pixel block for which to find color indexes
paramO:	color0		- Min color found
paramO:	color1		- Max color found
return: 4 byte color index block
========================
*/
int idDxtEncoder::GetSquareCTX1Error( const byte* colorBlock, const byte* color0, const byte* color1, int lastError ) const
{
	int i, j;
	byte colors[4][4];

	colors[0][0] = color0[0];
	colors[0][1] = color0[1];
	colors[1][0] = color1[0];
	colors[1][1] = color1[1];

	colors[2][0] = ( 2 * colors[0][0] + 1 * colors[1][0] ) / 3;
	colors[2][1] = ( 2 * colors[0][1] + 1 * colors[1][1] ) / 3;
	colors[3][0] = ( 1 * colors[0][0] + 2 * colors[1][0] ) / 3;
	colors[3][1] = ( 1 * colors[0][1] + 2 * colors[1][1] ) / 3;

	int error = 0;
	for( i = 0; i < 16; i++ )
	{
		unsigned int minDist = MAX_UNSIGNED_TYPE( int );
		for( j = 0; j < 4; j++ )
		{
			unsigned int dist = CTX1Distance( &colorBlock[i * 4], &colors[j][0] );
			if( dist < minDist )
			{
				minDist = dist;
			}
		}
		// accumulated error
		error += minDist;

		if( error > lastError )
		{
			return error;
		}
	}
	return error;
}

/*
========================
idDxtEncoder::GetMinMaxCTX1HQ

Uses an exhaustive search to find the two RGB colors that produce the least error when used to
compress the 4x4 block. Also finds the minimum and maximum alpha values.

params:	colorBlock	- 4*4 input tile, 4 bytes per pixel
paramO:	minColor	- 4 byte Min color found
paramO:	maxColor	- 4 byte Max color found
========================
*/
int idDxtEncoder::GetMinMaxCTX1HQ( const byte* colorBlock, byte* minColor, byte* maxColor ) const
{
	int i;
	int i0, i1, j0, j1;
	byte curMinColor[2], curMaxColor[2];
	byte bboxMin[2], bboxMax[2], minAxisDist[2];
	int error, bestError = MAX_TYPE( int );

	bboxMin[0] = bboxMin[1] = 255;
	bboxMax[0] = bboxMax[1] = 0;

	// get color bbox
	for( i = 0; i < 16; i++ )
	{
		if( colorBlock[i * 4 + 0] < bboxMin[0] )
		{
			bboxMin[0] = colorBlock[i * 4 + 0];
		}
		if( colorBlock[i * 4 + 1] < bboxMin[1] )
		{
			bboxMin[1] = colorBlock[i * 4 + 1];
		}
		if( colorBlock[i * 4 + 0] > bboxMax[0] )
		{
			bboxMax[0] = colorBlock[i * 4 + 0];
		}
		if( colorBlock[i * 4 + 1] > bboxMax[1] )
		{
			bboxMax[1] = colorBlock[i * 4 + 1];
		}
	}

	// get the minimum distance the end points of the line must be apart along each axis
	for( i = 0; i < 2; i++ )
	{
		minAxisDist[i] = ( bboxMax[i] - bboxMin[i] );
		if( minAxisDist[i] >= 64 )
		{
			minAxisDist[i] = minAxisDist[i] * 3 / 4;
		}
		else if( minAxisDist[i] >= 32 )
		{
			minAxisDist[i] = minAxisDist[i] * 2 / 4;
		}
		else if( minAxisDist[i] >= 16 )
		{
			minAxisDist[i] = minAxisDist[i] * 1 / 4;
		}
		else
		{
			minAxisDist[i] = 0;
		}
	}

	// expand the bounding box
	const int CXT1_BBOX_EXPAND = 6;

	bboxMin[0] = ( bboxMin[0] <= CXT1_BBOX_EXPAND ) ? 0 : bboxMin[0] - CXT1_BBOX_EXPAND;
	bboxMin[1] = ( bboxMin[1] <= CXT1_BBOX_EXPAND ) ? 0 : bboxMin[1] - CXT1_BBOX_EXPAND;
	bboxMax[0] = ( bboxMax[0] >= 255 - CXT1_BBOX_EXPAND ) ? 255 : bboxMax[0] + CXT1_BBOX_EXPAND;
	bboxMax[1] = ( bboxMax[1] >= 255 - CXT1_BBOX_EXPAND ) ? 255 : bboxMax[1] + CXT1_BBOX_EXPAND;

	for( i0 = bboxMin[0]; i0 <= bboxMax[0]; i0++ )
	{
		for( j0 = bboxMax[0]; j0 >= bboxMin[0]; j0-- )
		{
			if( abs( i0 - j0 ) < minAxisDist[0] )
			{
				continue;
			}

			for( i1 = bboxMin[1]; i1 <= bboxMax[1]; i1++ )
			{
				for( j1 = bboxMax[1]; j1 >= bboxMin[1]; j1-- )
				{
					if( abs( i1 - j1 ) < minAxisDist[1] )
					{
						continue;
					}

					curMinColor[0] = ( byte )i0;
					curMinColor[1] = ( byte )i1;

					curMaxColor[0] = ( byte )j0;
					curMaxColor[1] = ( byte )j1;

					error = GetSquareCTX1Error( colorBlock, curMinColor, curMaxColor, bestError );
					if( error < bestError )
					{
						bestError = error;
						memcpy( minColor, curMinColor, 2 );
						memcpy( maxColor, curMaxColor, 2 );
					}
				}
			}
		}
	}

	return bestError;
}

/*
========================
idDxtEncoder::GetMinMaxNormalYHQ

Uses an exhaustive search to find the two RGB colors that produce the least error when used to
compress the 4x4 block. Also finds the minimum and maximum alpha values.

params:	colorBlock	- 4*4 input tile, 4 bytes per pixel
paramO:	minColor	- 4 byte Min color found
paramO:	maxColor	- 4 byte Max color found
========================
*/
int idDxtEncoder::GetMinMaxNormalYHQ( const byte* colorBlock, byte* minColor, byte* maxColor, bool noBlack, int scale ) const
{
	unsigned short bestMinColor565, bestMaxColor565;
	byte bboxMin[3], bboxMax[3];
	int error, bestError = MAX_TYPE( int );

	bboxMin[1] = 255;
	bboxMax[1] = 0;

	// get color bbox
	for( int i = 0; i < 16; i++ )
	{
		if( colorBlock[i * 4 + 1] < bboxMin[1] )
		{
			bboxMin[1] = colorBlock[i * 4 + 1];
		}
		if( colorBlock[i * 4 + 1] > bboxMax[1] )
		{
			bboxMax[1] = colorBlock[i * 4 + 1];
		}
	}

	// decrease range for 565 encoding
	bboxMin[1] >>= 2;
	bboxMax[1] >>= 2;

	// expand the bounding box
	const int C565_BBOX_EXPAND = 1;

	bboxMin[1] = ( bboxMin[1] <= C565_BBOX_EXPAND ) ? 0 : bboxMin[1] - C565_BBOX_EXPAND;
	bboxMax[1] = ( bboxMax[1] >= ( 255 >> 2 ) - C565_BBOX_EXPAND ) ? ( 255 >> 2 ) : bboxMax[1] + C565_BBOX_EXPAND;

	bestMinColor565 = 0;
	bestMaxColor565 = 0;

	for( int i1 = bboxMin[1]; i1 <= bboxMax[1]; i1++ )
	{
		for( int j1 = bboxMax[1]; j1 >= bboxMin[1]; j1-- )
		{
			if( abs( i1 - j1 ) < 0 )
			{
				continue;
			}

			unsigned short minColor565 = ( unsigned short )i1 << 5;
			unsigned short maxColor565 = ( unsigned short )j1 << 5;

			if( !noBlack )
			{
				error = GetSquareNormalYError( colorBlock, maxColor565, minColor565, bestError, scale );
				if( error < bestError )
				{
					bestError = error;
					bestMinColor565 = minColor565;
					bestMaxColor565 = maxColor565;
				}
			}
			else
			{
				if( minColor565 <= maxColor565 )
				{
					SwapValues( minColor565, maxColor565 );
				}
			}

			error = GetSquareNormalYError( colorBlock, minColor565, maxColor565, bestError, scale );
			if( error < bestError )
			{
				bestError = error;
				bestMinColor565 = minColor565;
				bestMaxColor565 = maxColor565;
			}
		}
	}

	ColorFrom565( bestMinColor565, minColor );
	ColorFrom565( bestMaxColor565, maxColor );

	int bias = colorBlock[0 * 4 + 0];
	int size = colorBlock[0 * 4 + 2];

	minColor[0] = maxColor[0] = ( byte )bias;
	minColor[2] = maxColor[2] = ( byte )size;

	return bestError;
}

// 16 byte aligned constants used as memory operands by the inline assembly versions of
// the NormalDistance functions in this file.

// 2 / 255 and its inverse: used together with the "one" constant to convert between
// byte values ( 0..255 ) and the -1..1 range
ALIGN16( static float SIMD_SSE2_float_scale[4] ) = { 2.0f / 255.0f, 2.0f / 255.0f, 2.0f / 255.0f, 2.0f / 255.0f };
ALIGN16( static float SIMD_SSE2_float_descale[4] ) = { 255.0f / 2.0f, 255.0f / 2.0f, 255.0f / 2.0f, 255.0f / 2.0f };
ALIGN16( static float SIMD_SSE2_float_zero[4] ) = { 0.0f, 0.0f, 0.0f, 0.0f };
ALIGN16( static float SIMD_SSE2_float_one[4] ) = { 1.0f, 1.0f, 1.0f, 1.0f };
ALIGN16( static float SIMD_SSE2_float_half[4] ) = { 0.5f, 0.5f, 0.5f, 0.5f };
ALIGN16( static float SIMD_SSE2_float_255[4] ) = { 255.0f, 255.0f, 255.0f, 255.0f };
// constants for refining the rsqrtps estimate y of 1 / sqrt( x ):  ( x * y * y - 3 ) * ( -0.5 * y )
ALIGN16( static float SIMD_SP_rsqrt_c0[4] ) = { 3.0f, 3.0f, 3.0f, 3.0f };
ALIGN16( static float SIMD_SP_rsqrt_c1[4] ) = { -0.5f, -0.5f, -0.5f, -0.5f };
// keeps the first three 32 bit lanes and clears the fourth
ALIGN16( static dword SIMD_SSE2_dword_maskFirstThree[4] ) = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 };
// keeps the low 16 bits of the first three lanes and clears everything else
ALIGN16( static dword SIMD_SSE2_dword_maskWords[4] ) = { 0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x00000000 };
// packs four 2 bit lane selectors into the 8 bit immediate of a shuffle instruction, x in the lowest bits
#define R_SHUFFLE_PS( x, y, z, w )	(( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))

/*
========================
NormalDistanceDXT1

Treats the first three components of vector as a byte encoded normal ( 0..255 mapped to
-1..1 ), normalizes it, converts it back to byte range and returns the sum of the
squared differences between the result and the first three components of normalized.

The 32 bit MSVC build uses SSE2 inline assembly, every other build the scalar code.
NOTE(review): the assembly loads four ints from each pointer with instructions that take
a 128 bit memory operand, so it presumably needs 16 byte aligned input with four
readable elements, while the scalar code only reads three - confirm against the callers.

params:	vector		- components of the vector to normalize, in byte range
params:	normalized	- components to compare the normalized vector against, in byte range
return: sum of the squared per component differences
========================
*/
int NormalDistanceDXT1( const int* vector, const int* normalized )
{
#if defined(_MSC_VER) && defined(_M_IX86)
	int result;
	__asm
	{
		mov			esi, vector
		mov			edi, normalized
		cvtdq2ps	xmm0, [esi]
		mulps		xmm0, SIMD_SSE2_float_scale
		subps		xmm0, SIMD_SSE2_float_one
		pand		xmm0, SIMD_SSE2_dword_maskFirstThree
		movaps		xmm1, xmm0
		mulps		xmm1, xmm1
		pshufd		xmm2, xmm1, R_SHUFFLE_PS( 2, 3, 0, 1 )
		addps		xmm2, xmm1
		pshufd		xmm1, xmm2, R_SHUFFLE_PS( 1, 0, 1, 0 )
		addps		xmm2, xmm1

		rsqrtps		xmm1, xmm2
		mulps		xmm2, xmm1
		mulps		xmm2, xmm1
		subps		xmm2, SIMD_SP_rsqrt_c0
		mulps		xmm1, SIMD_SP_rsqrt_c1
		mulps		xmm2, xmm1

		mulps		xmm0, xmm2
		addps		xmm0, SIMD_SSE2_float_one
		mulps		xmm0, SIMD_SSE2_float_descale
		addps		xmm0, SIMD_SSE2_float_half
		maxps		xmm0, SIMD_SSE2_float_zero
		minps		xmm0, SIMD_SSE2_float_255
		cvttps2dq	xmm0, xmm0
		psubd		xmm0, [edi]
		pand		xmm0, SIMD_SSE2_dword_maskWords
		pmullw		xmm0, xmm0
		pshufd		xmm1, xmm0, R_SHUFFLE_PS( 2, 3, 0, 1 )
		paddd		xmm0, xmm1
		pshufd		xmm1, xmm0, R_SHUFFLE_PS( 1, 0, 1, 0 )
		paddd		xmm0, xmm1
		movd		result, xmm0
	}
	return result;
#else
	float floatNormal[3];
	byte intNormal[4];
	// map 0..255 to -1..1
	floatNormal[0] = vector[0] * ( 2.0f / 255.0f ) - 1.0f;
	floatNormal[1] = vector[1] * ( 2.0f / 255.0f ) - 1.0f;
	floatNormal[2] = vector[2] * ( 2.0f / 255.0f ) - 1.0f;
	// scale to unit length
	// NOTE(review): a vector that maps to exactly zero length is not special cased here
	float rcplen = idMath::InvSqrt( floatNormal[0] * floatNormal[0] + floatNormal[1] * floatNormal[1] + floatNormal[2] * floatNormal[2] );
	floatNormal[0] *= rcplen;
	floatNormal[1] *= rcplen;
	floatNormal[2] *= rcplen;
	// map -1..1 back to 0..255, the 0.5 is added for rounding before the conversion to a byte
	intNormal[0] = idMath::Ftob( ( floatNormal[0] + 1.0f ) * ( 255.0f / 2.0f ) + 0.5f );
	intNormal[1] = idMath::Ftob( ( floatNormal[1] + 1.0f ) * ( 255.0f / 2.0f ) + 0.5f );
	intNormal[2] = idMath::Ftob( ( floatNormal[2] + 1.0f ) * ( 255.0f / 2.0f ) + 0.5f );
	// sum of the squared differences of the three components
	int result =	( ( intNormal[ 0 ] - normalized[ 0 ] ) * ( intNormal[ 0 ] - normalized[ 0 ] ) ) +
					( ( intNormal[ 1 ] - normalized[ 1 ] ) * ( intNormal[ 1 ] - normalized[ 1 ] ) ) +
					( ( intNormal[ 2 ] - normalized[ 2 ] ) * ( intNormal[ 2 ] - normalized[ 2 ] ) );
	return result;
#endif
}

/*
========================
NormalDistanceDXT5

Returns the squared distance between a re-normalized candidate normal and a reference normal.

Only three of the four channels take part; which three is selected at compile time by the
"object-space" switches below (channels 1, 2 and 3 as currently configured). In the portable
path the candidate is mapped from byte range to [-1, 1] ( 0 -> -1, 255 -> +1 ), scaled to unit
length, mapped back to byte range and then compared against the reference channel by channel.

params:	vector		- candidate normal, one int per channel
params:	normalized	- reference normal, one int per channel
return: sum of the squared per-channel differences
========================
*/
int NormalDistanceDXT5( const int* vector, const int* normalized )
{
#if _MSC_VER && defined(_M_IX86)
	// 32-bit MSVC only: SSE2 inline assembly.
	// NOTE(review): this appears to mirror the portable path in the #else branch (scale,
	// renormalize, requantize, squared difference) - not verified instruction by instruction.
	int result;
	__asm
	{
		mov			esi, vector
		mov			edi, normalized
#if 0	// object-space
		pshufd		xmm0, [esi], R_SHUFFLE_PS( 0, 1, 3, 2 )
#else
		pshufd		xmm0, [esi], R_SHUFFLE_PS( 1, 2, 3, 0 )
#endif
		cvtdq2ps	xmm0, xmm0
		mulps		xmm0, SIMD_SSE2_float_scale
		subps		xmm0, SIMD_SSE2_float_one
		pand		xmm0, SIMD_SSE2_dword_maskFirstThree
		movaps		xmm1, xmm0
		mulps		xmm1, xmm1
		pshufd		xmm2, xmm1, R_SHUFFLE_PS( 2, 3, 0, 1 )
		addps		xmm2, xmm1
		pshufd		xmm1, xmm2, R_SHUFFLE_PS( 1, 0, 1, 0 )
		addps		xmm2, xmm1

		rsqrtps		xmm1, xmm2
		mulps		xmm2, xmm1
		mulps		xmm2, xmm1
		subps		xmm2, SIMD_SP_rsqrt_c0
		mulps		xmm1, SIMD_SP_rsqrt_c1
		mulps		xmm2, xmm1

		mulps		xmm0, xmm2
		addps		xmm0, SIMD_SSE2_float_one
		mulps		xmm0, SIMD_SSE2_float_descale
		addps		xmm0, SIMD_SSE2_float_half
		maxps		xmm0, SIMD_SSE2_float_zero
		minps		xmm0, SIMD_SSE2_float_255
		cvttps2dq	xmm0, xmm0
#if 0	// object-space
		pshufd		xmm3, [edi], R_SHUFFLE_PS( 0, 1, 3, 2 )
#else
		pshufd		xmm3, [edi], R_SHUFFLE_PS( 1, 2, 3, 0 )
#endif
		psubd		xmm0, xmm3
		pand		xmm0, SIMD_SSE2_dword_maskWords
		pmullw		xmm0, xmm0
		pshufd		xmm1, xmm0, R_SHUFFLE_PS( 2, 3, 0, 1 )
		paddd		xmm0, xmm1
		pshufd		xmm1, xmm0, R_SHUFFLE_PS( 1, 0, 1, 0 )
		paddd		xmm0, xmm1
		movd		result, xmm0
	}
	return result;
#else
	// c0..c2 are the indices of the three channels that hold the normal
#if 0	// object-space
	const int c0 = 0;
	const int c1 = 1;
	const int c2 = 3;
#else
	const int c0 = 1;
	const int c1 = 2;
	const int c2 = 3;
#endif
	// floatNormal is indexed 0..2, intNormal is indexed by the channel numbers c0..c2
	float floatNormal[3];
	byte intNormal[4];
	// byte range -> [-1, 1]
	floatNormal[0] = vector[c0] / 255.0f * 2.0f - 1.0f;
	floatNormal[1] = vector[c1] / 255.0f * 2.0f - 1.0f;
	floatNormal[2] = vector[c2] / 255.0f * 2.0f - 1.0f;
	// scale to unit length
	float rcplen = idMath::InvSqrt( floatNormal[0] * floatNormal[0] + floatNormal[1] * floatNormal[1] + floatNormal[2] * floatNormal[2] );
	floatNormal[0] *= rcplen;
	floatNormal[1] *= rcplen;
	floatNormal[2] *= rcplen;
	// [-1, 1] -> byte range; the + 0.5f presumably rounds to nearest before idMath::Ftob converts to a byte
	intNormal[c0] = idMath::Ftob( ( floatNormal[0] + 1.0f ) / 2.0f * 255.0f + 0.5f );
	intNormal[c1] = idMath::Ftob( ( floatNormal[1] + 1.0f ) / 2.0f * 255.0f + 0.5f );
	intNormal[c2] = idMath::Ftob( ( floatNormal[2] + 1.0f ) / 2.0f * 255.0f + 0.5f );
	// squared distance between the requantized candidate and the reference
	int result =	( ( intNormal[ c0 ] - normalized[ c0 ] ) * ( intNormal[ c0 ] - normalized[ c0 ] ) ) +
					( ( intNormal[ c1 ] - normalized[ c1 ] ) * ( intNormal[ c1 ] - normalized[ c1 ] ) ) +
					( ( intNormal[ c2 ] - normalized[ c2 ] ) * ( intNormal[ c2 ] - normalized[ c2 ] ) );
	return result;
#endif
}

/*
========================
idDxtEncoder::GetSquareNormalsDXT1Error

params:	colorBlock	- 4*4 input tile, 4 bytes per pixel
paramO:	color0		- 4 byte Min color found
paramO:	color1		- 4 byte Max color found
return: 4 byte color index block
========================
*/
int idDxtEncoder::GetSquareNormalsDXT1Error( const int* colorBlock, const unsigned short color0, const unsigned short color1, int lastError, unsigned int& colorIndices ) const
{
	byte byteColors[2][4];
	ALIGN16( int colors[4][4] );

	ColorFrom565( color0, byteColors[0] );
	ColorFrom565( color1, byteColors[1] );

	for( int i = 0; i < 4; i++ )
	{
		colors[0][i] = byteColors[0][i];
		colors[1][i] = byteColors[1][i];
	}

	if( color0 > color1 )
	{
		colors[2][0] = ( 2 * colors[0][0] + 1 * colors[1][0] ) / 3;
		colors[2][1] = ( 2 * colors[0][1] + 1 * colors[1][1] ) / 3;
		colors[2][2] = ( 2 * colors[0][2] + 1 * colors[1][2] ) / 3;
		colors[3][0] = ( 1 * colors[0][0] + 2 * colors[1][0] ) / 3;
		colors[3][1] = ( 1 * colors[0][1] + 2 * colors[1][1] ) / 3;
		colors[3][2] = ( 1 * colors[0][2] + 2 * colors[1][2] ) / 3;
	}
	else
	{
		assert( color0 == color1 );
		colors[2][0] = ( 1 * colors[0][0] + 1 * colors[1][0] ) / 2;
		colors[2][1] = ( 1 * colors[0][1] + 1 * colors[1][1] ) / 2;
		colors[2][2] = ( 1 * colors[0][2] + 1 * colors[1][2] ) / 2;
		colors[3][0] = 0;
		colors[3][1] = 0;
		colors[3][2] = 0;
	}

	int error = 0;
	int tempColorIndices[16];
	for( int i = 0; i < 16; i++ )
	{
		unsigned int minDist = MAX_UNSIGNED_TYPE( int );

		for( int j = 0; j < 4; j++ )
		{
			unsigned int dist = NormalDistanceDXT1( &colors[j][0], &colorBlock[i * 4] );
			if( dist < minDist )
			{
				minDist = dist;
				tempColorIndices[i] = j;
			}
		}
		// accumulated error
		error += minDist;

		if( error > lastError )
		{
			return error;
		}
	}

	colorIndices = 0;
	for( int i = 0; i < 16; i++ )
	{
		colorIndices |= ( tempColorIndices[i] << ( unsigned int )( i << 1 ) );
	}

	return error;
}

/*
========================
idDxtEncoder::GetMinMaxNormalsDXT1HQ

Uses an exhaustive search to find the two RGB colors that produce the least error when used to
compress the 4x4 block. Also finds the minimum and maximum alpha values.

params:	colorBlock	- 4*4 input tile, 4 bytes per pixel
paramO:	minColor	- 4 byte Min color found
paramO:	maxColor	- 4 byte Max color found
========================
*/
int idDxtEncoder::GetMinMaxNormalsDXT1HQ( const byte* colorBlock, byte* minColor, byte* maxColor, unsigned int& colorIndices, bool noBlack ) const
{
	int i;
	int i0, i1, i2, j0, j1, j2;
	unsigned short bestMinColor565 = 0;
	unsigned short bestMaxColor565 = 0;
	byte bboxMin[3], bboxMax[3], minAxisDist[3];
	int error, bestError = MAX_TYPE( int );
	unsigned int tempColorIndices;
	ALIGN16( int intColorBlock[16 * 4] );

	bboxMin[0] = bboxMin[1] = bboxMin[2] = 128;
	bboxMax[0] = bboxMax[1] = bboxMax[2] = 128;

	// get color bbox
	for( i = 0; i < 16; i++ )
	{
		if( colorBlock[i * 4 + 0] < bboxMin[0] )
		{
			bboxMin[0] = colorBlock[i * 4 + 0];
		}
		if( colorBlock[i * 4 + 1] < bboxMin[1] )
		{
			bboxMin[1] = colorBlock[i * 4 + 1];
		}
		if( colorBlock[i * 4 + 2] < bboxMin[2] )
		{
			bboxMin[2] = colorBlock[i * 4 + 2];
		}
		if( colorBlock[i * 4 + 0] > bboxMax[0] )
		{
			bboxMax[0] = colorBlock[i * 4 + 0];
		}
		if( colorBlock[i * 4 + 1] > bboxMax[1] )
		{
			bboxMax[1] = colorBlock[i * 4 + 1];
		}
		if( colorBlock[i * 4 + 2] > bboxMax[2] )
		{
			bboxMax[2] = colorBlock[i * 4 + 2];
		}
	}

	for( int i = 0; i < 64; i++ )
	{
		intColorBlock[i] = colorBlock[i];
	}

	// decrease range for 565 encoding
	bboxMin[0] >>= 3;
	bboxMin[1] >>= 2;
	bboxMin[2] >>= 3;
	bboxMax[0] >>= 3;
	bboxMax[1] >>= 2;
	bboxMax[2] >>= 3;

	// get the minimum distance the end points of the line must be apart along each axis
	for( i = 0; i < 3; i++ )
	{
		minAxisDist[i] = 0;
	}

	// expand the bounding box
	const int C565_BBOX_EXPAND = 2;

	bboxMin[0] = ( bboxMin[0] <= C565_BBOX_EXPAND ) ? 0 : bboxMin[0] - C565_BBOX_EXPAND;
	bboxMin[1] = ( bboxMin[1] <= C565_BBOX_EXPAND ) ? 0 : bboxMin[1] - C565_BBOX_EXPAND;
	bboxMin[2] = ( bboxMin[2] <= C565_BBOX_EXPAND ) ? 0 : bboxMin[2] - C565_BBOX_EXPAND;
	bboxMax[0] = ( bboxMax[0] >= ( 255 >> 3 ) - C565_BBOX_EXPAND ) ? ( 255 >> 3 ) : bboxMax[0] + C565_BBOX_EXPAND;
	bboxMax[1] = ( bboxMax[1] >= ( 255 >> 2 ) - C565_BBOX_EXPAND ) ? ( 255 >> 2 ) : bboxMax[1] + C565_BBOX_EXPAND;
	bboxMax[2] = ( bboxMax[2] >= ( 255 >> 3 ) - C565_BBOX_EXPAND ) ? ( 255 >> 3 ) : bboxMax[2] + C565_BBOX_EXPAND;

	for( i0 = bboxMin[0]; i0 <= bboxMax[0]; i0++ )
	{
		for( j0 = bboxMax[0]; j0 >= bboxMin[0]; j0-- )
		{
			if( abs( i0 - j0 ) < minAxisDist[0] )
			{
				continue;
			}

			for( i1 = bboxMin[1]; i1 <= bboxMax[1]; i1++ )
			{
				for( j1 = bboxMax[1]; j1 >= bboxMin[1]; j1-- )
				{
					if( abs( i1 - j1 ) < minAxisDist[1] )
					{
						continue;
					}

					for( i2 = bboxMin[2]; i2 <= bboxMax[2]; i2++ )
					{
						for( j2 = bboxMax[2]; j2 >= bboxMin[2]; j2-- )
						{
							if( abs( i2 - j2 ) < minAxisDist[2] )
							{
								continue;
							}

							unsigned short minColor565 = ( unsigned short )( ( i0 << 11 ) | ( i1 << 5 ) | ( i2 << 0 ) );
							unsigned short maxColor565 = ( unsigned short )( ( j0 << 11 ) | ( j1 << 5 ) | ( j2 << 0 ) );

							if( !noBlack )
							{
								error = GetSquareNormalsDXT1Error( intColorBlock, maxColor565, minColor565, bestError, tempColorIndices );
								if( error < bestError )
								{
									bestError = error;
									bestMinColor565 = minColor565;
									bestMaxColor565 = maxColor565;
									colorIndices = tempColorIndices;
								}
							}
							else
							{
								if( minColor565 <= maxColor565 )
								{
									SwapValues( minColor565, maxColor565 );
								}
							}

							error = GetSquareNormalsDXT1Error( intColorBlock, minColor565, maxColor565, bestError, tempColorIndices );
							if( error < bestError )
							{
								bestError = error;
								bestMinColor565 = minColor565;
								bestMaxColor565 = maxColor565;
								colorIndices = tempColorIndices;
							}
						}
					}
				}
			}
		}
	}

	ColorFrom565( bestMinColor565, minColor );
	ColorFrom565( bestMaxColor565, maxColor );

	return bestError;
}

/*
========================
idDxtEncoder::GetSquareNormalsDXT5Error

params:	normalBlock	- 16 pixel block for which to find normal indexes
paramO:	minNormal	- Min normal found
paramO:	maxNormal	- Max normal found
========================
*/
int idDxtEncoder::GetSquareNormalsDXT5Error( const int* normalBlock, const byte* minNormal, const byte* maxNormal, int lastError, unsigned int& colorIndices, byte* alphaIndices ) const
{
	byte alphas[8];
	byte colors[4][4];

	unsigned short smin = ColorTo565( minNormal );
	unsigned short smax = ColorTo565( maxNormal );

	ColorFrom565( smax, colors[0] );
	ColorFrom565( smin, colors[1] );

	if( smax > smin )
	{
		colors[2][0] = ( 2 * colors[0][0] + 1 * colors[1][0] ) / 3;
		colors[2][1] = ( 2 * colors[0][1] + 1 * colors[1][1] ) / 3;
		colors[2][2] = ( 2 * colors[0][2] + 1 * colors[1][2] ) / 3;
		colors[3][0] = ( 1 * colors[0][0] + 2 * colors[1][0] ) / 3;
		colors[3][1] = ( 1 * colors[0][1] + 2 * colors[1][1] ) / 3;
		colors[3][2] = ( 1 * colors[0][2] + 2 * colors[1][2] ) / 3;
	}
	else
	{
		assert( smax == smin );
		colors[2][0] = ( 1 * colors[0][0] + 1 * colors[1][0] ) / 2;
		colors[2][1] = ( 1 * colors[0][1] + 1 * colors[1][1] ) / 2;
		colors[2][2] = ( 1 * colors[0][2] + 1 * colors[1][2] ) / 2;
		colors[3][0] = 0;
		colors[3][1] = 0;
		colors[3][2] = 0;
	}

	alphas[0] = maxNormal[3];
	alphas[1] = minNormal[3];

	if( maxNormal[3] > minNormal[3] )
	{
		alphas[2] = ( 6 * alphas[0] + 1 * alphas[1] ) / 7;
		alphas[3] = ( 5 * alphas[0] + 2 * alphas[1] ) / 7;
		alphas[4] = ( 4 * alphas[0] + 3 * alphas[1] ) / 7;
		alphas[5] = ( 3 * alphas[0] + 4 * alphas[1] ) / 7;
		alphas[6] = ( 2 * alphas[0] + 5 * alphas[1] ) / 7;
		alphas[7] = ( 1 * alphas[0] + 6 * alphas[1] ) / 7;
	}
	else
	{
		alphas[2] = ( 4 * alphas[0] + 1 * alphas[1] ) / 5;
		alphas[3] = ( 3 * alphas[0] + 2 * alphas[1] ) / 5;
		alphas[4] = ( 2 * alphas[0] + 3 * alphas[1] ) / 5;
		alphas[5] = ( 1 * alphas[0] + 4 * alphas[1] ) / 5;
		alphas[6] = 0;
		alphas[7] = 255;
	}

	int error = 0;
	int tempColorIndices[16];
	int tempAlphaIndices[16];
	for( int i = 0; i < 16; i++ )
	{
		ALIGN16( int normal[4] );
		unsigned int minDist = MAX_UNSIGNED_TYPE( int );

		for( int j = 0; j < 4; j++ )
		{
			normal[0] = colors[j][0];
			normal[1] = colors[j][1];
			normal[2] = colors[j][2];

			for( int k = 0; k < 8; k++ )
			{
				normal[3] = alphas[k];
				unsigned int dist = NormalDistanceDXT5( normal, &normalBlock[i * 4] );
				if( dist < minDist )
				{
					minDist = dist;
					tempColorIndices[i] = j;
					tempAlphaIndices[i] = k;
				}
			}
		}
		error += minDist;

		if( error >= lastError )
		{
			return error;
		}
	}

	alphaIndices[0] = byte( ( tempAlphaIndices[ 0] >> 0 ) | ( tempAlphaIndices[ 1] << 3 ) | ( tempAlphaIndices[ 2] << 6 ) );
	alphaIndices[1] = byte( ( tempAlphaIndices[ 2] >> 2 ) | ( tempAlphaIndices[ 3] << 1 ) | ( tempAlphaIndices[ 4] << 4 ) | ( tempAlphaIndices[ 5] << 7 ) );
	alphaIndices[2] = byte( ( tempAlphaIndices[ 5] >> 1 ) | ( tempAlphaIndices[ 6] << 2 ) | ( tempAlphaIndices[ 7] << 5 ) );

	alphaIndices[3] = byte( ( tempAlphaIndices[ 8] >> 0 ) | ( tempAlphaIndices[ 9] << 3 ) | ( tempAlphaIndices[10] << 6 ) );
	alphaIndices[4] = byte( ( tempAlphaIndices[10] >> 2 ) | ( tempAlphaIndices[11] << 1 ) | ( tempAlphaIndices[12] << 4 ) | ( tempAlphaIndices[13] << 7 ) );
	alphaIndices[5] = byte( ( tempAlphaIndices[13] >> 1 ) | ( tempAlphaIndices[14] << 2 ) | ( tempAlphaIndices[15] << 5 ) );

	colorIndices = 0;
	for( int i = 0; i < 16; i++ )
	{
		colorIndices |= ( tempColorIndices[i] << ( unsigned int )( i << 1 ) );
	}

	return error;
}

/*
========================
idDxtEncoder::GetMinMaxNormalsDXT5HQ

Uses an exhaustive search to find the two RGB colors that produce the least error when used to
compress the 4x4 block. Also finds the minimum and maximum alpha values.

params:	colorBlock	- 4*4 input tile, 4 bytes per pixel
paramO:	minColor	- 4 byte Min color found
paramO:	maxColor	- 4 byte Max color found
========================
*/
int idDxtEncoder::GetMinMaxNormalsDXT5HQ( const byte* colorBlock, byte* minColor, byte* maxColor, unsigned int& colorIndices, byte* alphaIndices ) const
{
	int i;
	int i0, i1, i3, j0, j1, j3;
	byte bboxMin[4], bboxMax[4], minAxisDist[4];
	byte tmin[4], tmax[4];
	int error, bestError = MAX_TYPE( int );
	unsigned int tempColorIndices;
	byte tempAlphaIndices[6];
	ALIGN16( int intColorBlock[16 * 4] );

	bboxMin[0] = bboxMin[1] = bboxMin[2] = bboxMin[3] = 255;
	bboxMax[0] = bboxMax[1] = bboxMax[2] = bboxMax[3] = 0;

	// get color bbox
	for( i = 0; i < 16; i++ )
	{
		if( colorBlock[i * 4 + 0] < bboxMin[0] )
		{
			bboxMin[0] = colorBlock[i * 4 + 0];
		}
		if( colorBlock[i * 4 + 1] < bboxMin[1] )
		{
			bboxMin[1] = colorBlock[i * 4 + 1];
		}
		if( colorBlock[i * 4 + 2] < bboxMin[2] )
		{
			bboxMin[2] = colorBlock[i * 4 + 2];
		}
		if( colorBlock[i * 4 + 3] < bboxMin[3] )
		{
			bboxMin[3] = colorBlock[i * 4 + 3];
		}
		if( colorBlock[i * 4 + 0] > bboxMax[0] )
		{
			bboxMax[0] = colorBlock[i * 4 + 0];
		}
		if( colorBlock[i * 4 + 1] > bboxMax[1] )
		{
			bboxMax[1] = colorBlock[i * 4 + 1];
		}
		if( colorBlock[i * 4 + 2] > bboxMax[2] )
		{
			bboxMax[2] = colorBlock[i * 4 + 2];
		}
		if( colorBlock[i * 4 + 3] > bboxMax[3] )
		{
			bboxMax[3] = colorBlock[i * 4 + 3];
		}
	}

	for( int i = 0; i < 64; i++ )
	{
		intColorBlock[i] = colorBlock[i];
	}

	// decrease range for 565 encoding
	bboxMin[0] >>= 3;
	bboxMin[1] >>= 2;
	bboxMax[0] >>= 3;
	bboxMax[1] >>= 2;

	// get the minimum distance the end points of the line must be apart along each axis
	for( i = 0; i < 4; i++ )
	{
		minAxisDist[i] = 0;
	}

	// expand the bounding box
	const int C565_BBOX_EXPAND = 2;
	const int ALPHA_BBOX_EXPAND = 32;

	bboxMin[0] = ( bboxMin[0] <= C565_BBOX_EXPAND ) ? 0 : bboxMin[0] - C565_BBOX_EXPAND;
	bboxMin[1] = ( bboxMin[1] <= C565_BBOX_EXPAND ) ? 0 : bboxMin[1] - C565_BBOX_EXPAND;
	bboxMin[3] = ( bboxMin[3] <= ALPHA_BBOX_EXPAND ) ? 0 : bboxMin[3] - ALPHA_BBOX_EXPAND;
	bboxMax[0] = ( bboxMax[0] >= ( 255 >> 3 ) - C565_BBOX_EXPAND ) ? ( 255 >> 3 ) : bboxMax[0] + C565_BBOX_EXPAND;
	bboxMax[1] = ( bboxMax[1] >= ( 255 >> 2 ) - C565_BBOX_EXPAND ) ? ( 255 >> 2 ) : bboxMax[1] + C565_BBOX_EXPAND;
	bboxMax[3] = ( bboxMax[3] >= ( 255 ) - ALPHA_BBOX_EXPAND ) ? ( 255 ) : bboxMax[3] + ALPHA_BBOX_EXPAND;

	for( i0 = bboxMin[0]; i0 <= bboxMax[0]; i0++ )
	{
		for( j0 = bboxMax[0]; j0 >= bboxMin[0]; j0-- )
		{
			if( abs( i0 - j0 ) < minAxisDist[0] )
			{
				continue;
			}

			for( i1 = bboxMin[1]; i1 <= bboxMax[1]; i1++ )
			{
				for( j1 = bboxMax[1]; j1 >= bboxMin[1]; j1-- )
				{
					if( abs( i1 - j1 ) < minAxisDist[1] )
					{
						continue;
					}

					tmin[0] = ( byte )j0 << 3;
					tmin[1] = ( byte )j1 << 2;
					tmin[2] = 0;

					tmax[0] = ( byte )i0 << 3;
					tmax[1] = ( byte )i1 << 2;
					tmax[2] = 0;

					for( i3 = bboxMin[3]; i3 <= bboxMax[3]; i3++ )
					{
						for( j3 = bboxMax[3]; j3 >= bboxMin[3]; j3-- )
						{
							if( abs( i3 - j3 ) < minAxisDist[3] )
							{
								continue;
							}

							tmin[3] = ( byte )j3;
							tmax[3] = ( byte )i3;

							error = GetSquareNormalsDXT5Error( intColorBlock, tmin, tmax, bestError, tempColorIndices, tempAlphaIndices );
							if( error < bestError )
							{
								bestError = error;
								memcpy( minColor, tmin, 4 );
								memcpy( maxColor, tmax, 4 );
								colorIndices = tempColorIndices;
								memcpy( alphaIndices, tempAlphaIndices, 6 );
							}

							tmin[3] = ( byte )i3;
							tmax[3] = ( byte )j3;

							error = GetSquareNormalsDXT5Error( intColorBlock, tmin, tmax, bestError, tempColorIndices, tempAlphaIndices );
							if( error < bestError )
							{
								bestError = error;
								memcpy( minColor, tmin, 4 );
								memcpy( maxColor, tmax, 4 );
								colorIndices = tempColorIndices;
								memcpy( alphaIndices, tempAlphaIndices, 6 );
							}
						}
					}
				}
			}
		}
	}

	return bestError;
}

/*
========================
idDxtEncoder::GetMinMaxNormalsDXT5HQFast

Uses an exhaustive search to find the two RGB colors that produce the least error when used to
compress the 4x4 block. Also finds the minimum and maximum alpha values.

params:	colorBlock	- 4*4 input tile, 4 bytes per pixel
paramO:	minColor	- 4 byte Min color found
paramO:	maxColor	- 4 byte Max color found
========================
*/
int idDxtEncoder::GetMinMaxNormalsDXT5HQFast( const byte* colorBlock, byte* minColor, byte* maxColor, unsigned int& colorIndices, byte* alphaIndices ) const
{
	int i0, i1, i2, i3, j0, j1, j2, j3;
	byte bboxMin[4], bboxMax[4], minAxisDist[4];
	byte tmin[4], tmax[4];
	int error, bestError = MAX_TYPE( int );
	unsigned int tempColorIndices;
	byte tempAlphaIndices[6];
	ALIGN16( int intColorBlock[16 * 4] );

	bboxMin[0] = bboxMin[1] = bboxMin[2] = bboxMin[3] = 255;
	bboxMax[0] = bboxMax[1] = bboxMax[2] = bboxMax[3] = 0;

	// get color bbox
	for( int i = 0; i < 16; i++ )
	{
		if( colorBlock[i * 4 + 0] < bboxMin[0] )
		{
			bboxMin[0] = colorBlock[i * 4 + 0];
		}
		if( colorBlock[i * 4 + 1] < bboxMin[1] )
		{
			bboxMin[1] = colorBlock[i * 4 + 1];
		}
		if( colorBlock[i * 4 + 2] < bboxMin[2] )
		{
			bboxMin[2] = colorBlock[i * 4 + 2];
		}
		if( colorBlock[i * 4 + 3] < bboxMin[3] )
		{
			bboxMin[3] = colorBlock[i * 4 + 3];
		}
		if( colorBlock[i * 4 + 0] > bboxMax[0] )
		{
			bboxMax[0] = colorBlock[i * 4 + 0];
		}
		if( colorBlock[i * 4 + 1] > bboxMax[1] )
		{
			bboxMax[1] = colorBlock[i * 4 + 1];
		}
		if( colorBlock[i * 4 + 2] > bboxMax[2] )
		{
			bboxMax[2] = colorBlock[i * 4 + 2];
		}
		if( colorBlock[i * 4 + 3] > bboxMax[3] )
		{
			bboxMax[3] = colorBlock[i * 4 + 3];
		}
	}

	for( int i = 0; i < 64; i++ )
	{
		intColorBlock[i] = colorBlock[i];
	}

	// decrease range for 565 encoding
	bboxMin[0] >>= 3;
	bboxMin[1] >>= 2;
	bboxMin[2] >>= 3;
	bboxMax[0] >>= 3;
	bboxMax[1] >>= 2;
	bboxMax[2] >>= 3;

	bboxMin[3] = 0;
	bboxMax[3] = 255;

	// get the minimum distance the end points of the line must be apart along each axis
	for( int i = 0; i < 4; i++ )
	{
		minAxisDist[i] = 0;
	}

	// expand the bounding box
	const int C565_BBOX_EXPAND = 1;
	const int ALPHA_BBOX_EXPAND = 128;

#if 0 // object-space
	bboxMin[0] = ( bboxMin[0] <= C565_BBOX_EXPAND ) ? 0 : bboxMin[0] - C565_BBOX_EXPAND;
	bboxMax[0] = ( bboxMax[0] >= ( 255 >> 3 ) - C565_BBOX_EXPAND ) ? ( 255 >> 3 ) : bboxMax[0] + C565_BBOX_EXPAND;
	bboxMin[2] = 0;
	bboxMax[2] = 0;
#else
	bboxMin[0] = 0;
	bboxMax[0] = 0;
	bboxMin[2] = ( bboxMin[2] <= C565_BBOX_EXPAND ) ? 0 : bboxMin[2] - C565_BBOX_EXPAND;
	bboxMax[2] = ( bboxMax[2] >= ( 255 >> 2 ) - C565_BBOX_EXPAND ) ? ( 255 >> 2 ) : bboxMax[2] + C565_BBOX_EXPAND;
#endif

	bboxMin[1] = ( bboxMin[1] <= C565_BBOX_EXPAND ) ? 0 : bboxMin[1] - C565_BBOX_EXPAND;
	bboxMax[1] = ( bboxMax[1] >= ( 255 >> 2 ) - C565_BBOX_EXPAND ) ? ( 255 >> 2 ) : bboxMax[1] + C565_BBOX_EXPAND;

	bboxMin[3] = ( bboxMin[3] <= ALPHA_BBOX_EXPAND ) ? 0 : bboxMin[3] - ALPHA_BBOX_EXPAND;
	bboxMax[3] = ( bboxMax[3] >= ( 255 ) - ALPHA_BBOX_EXPAND ) ? ( 255 ) : bboxMax[3] + ALPHA_BBOX_EXPAND;

	for( i0 = bboxMin[0]; i0 <= bboxMax[0]; i0++ )
	{
		for( j0 = bboxMax[0]; j0 >= bboxMin[0]; j0-- )
		{
			if( abs( i0 - j0 ) < minAxisDist[0] )
			{
				continue;
			}

			for( i1 = bboxMin[1]; i1 <= bboxMax[1]; i1++ )
			{
				for( j1 = bboxMax[1]; j1 >= bboxMin[1]; j1-- )
				{
					if( abs( i1 - j1 ) < minAxisDist[1] )
					{
						continue;
					}

					for( i2 = bboxMin[2]; i2 <= bboxMax[2]; i2++ )
					{
						for( j2 = bboxMax[2]; j2 >= bboxMin[2]; j2-- )
						{
							if( abs( i2 - j2 ) < minAxisDist[2] )
							{
								continue;
							}

							unsigned short minColor565 = ( unsigned short )( ( i0 << 11 ) | ( i1 << 5 ) | i2 );
							unsigned short maxColor565 = ( unsigned short )( ( j0 << 11 ) | ( j1 << 5 ) | j2 );

							if( minColor565 > maxColor565 )
							{
								SwapValues( minColor565, maxColor565 );
							}

							error = GetSquareNormalsDXT1Error( intColorBlock, maxColor565, minColor565, bestError, tempColorIndices );
							if( error < bestError )
							{
								bestError = error;
								ColorFrom565( minColor565, minColor );
								ColorFrom565( maxColor565, maxColor );
								colorIndices = tempColorIndices;
							}
						}
					}
				}
			}
		}
	}

	bestError = MAX_TYPE( int );

	memcpy( tmin, minColor, 4 );
	memcpy( tmax, maxColor, 4 );

	for( i3 = bboxMin[3]; i3 <= bboxMax[3]; i3++ )
	{
		for( j3 = bboxMax[3]; j3 >= bboxMin[3]; j3-- )
		{
			if( abs( i3 - j3 ) < minAxisDist[3] )
			{
				continue;
			}

			tmin[3] = ( byte )j3;
			tmax[3] = ( byte )i3;

			error = GetSquareNormalsDXT5Error( intColorBlock, tmin, tmax, bestError, tempColorIndices, tempAlphaIndices );
			if( error < bestError )
			{
				bestError = error;
				memcpy( minColor, tmin, 4 );
				memcpy( maxColor, tmax, 4 );
				colorIndices = tempColorIndices;
				memcpy( alphaIndices, tempAlphaIndices, 6 );
			}

			tmin[3] = ( byte )i3;
			tmax[3] = ( byte )j3;

			error = GetSquareNormalsDXT5Error( intColorBlock, tmin, tmax, bestError, tempColorIndices, tempAlphaIndices );
			if( error < bestError )
			{
				bestError = error;
				memcpy( minColor, tmin, 4 );
				memcpy( maxColor, tmax, 4 );
				colorIndices = tempColorIndices;
				memcpy( alphaIndices, tempAlphaIndices, 6 );
			}
		}
	}

	return bestError;
}

/*
========================
idDxtEncoder::FindColorIndices

params:	colorBlock	- 16 pixel block for which find color indexes
paramO:	color0		- Min color found
paramO:	color1		- Max color found
return: 4 byte color index block
========================
*/
int idDxtEncoder::FindColorIndices( const byte* colorBlock, const unsigned short color0, const unsigned short color1, unsigned int& result ) const
{
	int i, j;
	unsigned int indexes[16];
	byte colors[4][4];

	ColorFrom565( color0, colors[0] );
	ColorFrom565( color1, colors[1] );

	if( color0 > color1 )
	{
		colors[2][0] = ( 2 * colors[0][0] + 1 * colors[1][0] ) / 3;
		colors[2][1] = ( 2 * colors[0][1] + 1 * colors[1][1] ) / 3;
		colors[2][2] = ( 2 * colors[0][2] + 1 * colors[1][2] ) / 3;
		colors[3][0] = ( 1 * colors[0][0] + 2 * colors[1][0] ) / 3;
		colors[3][1] = ( 1 * colors[0][1] + 2 * colors[1][1] ) / 3;
		colors[3][2] = ( 1 * colors[0][2] + 2 * colors[1][2] ) / 3;
	}
	else
	{
		colors[2][0] = ( 1 * colors[0][0] + 1 * colors[1][0] ) / 2;
		colors[2][1] = ( 1 * colors[0][1] + 1 * colors[1][1] ) / 2;
		colors[2][2] = ( 1 * colors[0][2] + 1 * colors[1][2] ) / 2;
		colors[3][0] = 0;
		colors[3][1] = 0;
		colors[3][2] = 0;
	}

	int error = 0;
	for( i = 0; i < 16; i++ )
	{
		unsigned int minDist = MAX_UNSIGNED_TYPE( int );
		for( j = 0; j < 4; j++ )
		{
			unsigned int dist = ColorDistance( &colorBlock[i * 4], &colors[j][0] );
			if( dist < minDist )
			{
				minDist = dist;
				indexes[i] = j;
			}
		}
		// accumulated error
		error += minDist;
	}

	result = 0;
	for( i = 0; i < 16; i++ )
	{
		result |= ( indexes[i] << ( unsigned int )( i << 1 ) );
	}

	return error;
}

/*
========================
idDxtEncoder::FindAlphaIndices

params:	colorBlock	- 16 pixel block for which find alpha indexes
paramO:	alpha0		- Min alpha found
paramO:	alpha1		- Max alpha found
params:	rindexes	- 6 byte alpha index block
return: error metric for this compression
========================
*/
int idDxtEncoder::FindAlphaIndices( const byte* colorBlock, const int alphaOffset, const byte alpha0, const byte alpha1, byte* rindexes ) const
{
	int i, j;
	unsigned int indexes[16];
	byte alphas[8];

	alphas[0] = alpha0;
	alphas[1] = alpha1;
	if( alpha0 > alpha1 )
	{
		alphas[2] = ( 6 * alpha0 + 1 * alpha1 ) / 7;
		alphas[3] = ( 5 * alpha0 + 2 * alpha1 ) / 7;
		alphas[4] = ( 4 * alpha0 + 3 * alpha1 ) / 7;
		alphas[5] = ( 3 * alpha0 + 4 * alpha1 ) / 7;
		alphas[6] = ( 2 * alpha0 + 5 * alpha1 ) / 7;
		alphas[7] = ( 1 * alpha0 + 6 * alpha1 ) / 7;
	}
	else
	{
		alphas[2] = ( 4 * alpha0 + 1 * alpha1 ) / 5;
		alphas[3] = ( 3 * alpha0 + 2 * alpha1 ) / 5;
		alphas[4] = ( 2 * alpha0 + 3 * alpha1 ) / 5;
		alphas[5] = ( 1 * alpha0 + 4 * alpha1 ) / 5;
		alphas[6] = 0;
		alphas[7] = 255;
	}

	int error = 0;
	for( i = 0; i < 16; i++ )
	{
		unsigned int minDist = MAX_UNSIGNED_TYPE( int );
		byte a = colorBlock[i * 4 + alphaOffset];
		for( j = 0; j < 8; j++ )
		{
			unsigned int dist = AlphaDistance( a, alphas[j] );
			if( dist < minDist )
			{
				minDist = dist;
				indexes[i] = j;
			}
		}
		error += minDist;
	}

	rindexes[0] = byte( ( indexes[ 0] >> 0 ) | ( indexes[ 1] << 3 ) | ( indexes[ 2] << 6 ) );
	rindexes[1] = byte( ( indexes[ 2] >> 2 ) | ( indexes[ 3] << 1 ) | ( indexes[ 4] << 4 ) | ( indexes[ 5] << 7 ) );
	rindexes[2] = byte( ( indexes[ 5] >> 1 ) | ( indexes[ 6] << 2 ) | ( indexes[ 7] << 5 ) );

	rindexes[3] = byte( ( indexes[ 8] >> 0 ) | ( indexes[ 9] << 3 ) | ( indexes[10] << 6 ) );
	rindexes[4] = byte( ( indexes[10] >> 2 ) | ( indexes[11] << 1 ) | ( indexes[12] << 4 ) | ( indexes[13] << 7 ) );
	rindexes[5] = byte( ( indexes[13] >> 1 ) | ( indexes[14] << 2 ) | ( indexes[15] << 5 ) );

	return error;
}

/*
========================
idDxtEncoder::FindCTX1Indices

params:	colorBlock	- 16 pixel block for which find color indexes
paramO:	color0		- Min color found
paramO:	color1		- Max color found
return: 4 byte color index block
========================
*/
int idDxtEncoder::FindCTX1Indices( const byte* colorBlock, const byte* color0, const byte* color1, unsigned int& result ) const
{
	int i, j;
	unsigned int indexes[16];
	byte colors[4][4];

	colors[0][0] = color1[0];
	colors[0][1] = color1[1];
	colors[1][0] = color0[0];
	colors[1][1] = color0[1];

	colors[2][0] = ( 2 * colors[0][0] + 1 * colors[1][0] ) / 3;
	colors[2][1] = ( 2 * colors[0][1] + 1 * colors[1][1] ) / 3;
	colors[3][0] = ( 1 * colors[0][0] + 2 * colors[1][0] ) / 3;
	colors[3][1] = ( 1 * colors[0][1] + 2 * colors[1][1] ) / 3;

	int error = 0;
	for( i = 0; i < 16; i++ )
	{
		unsigned int minDist = MAX_UNSIGNED_TYPE( int );
		for( j = 0; j < 4; j++ )
		{
			unsigned int dist = CTX1Distance( &colorBlock[i * 4], &colors[j][0] );
			if( dist < minDist )
			{
				minDist = dist;
				indexes[i] = j;
			}
		}
		// accumulated error
		error += minDist;
	}

	result = 0;
	for( i = 0; i < 16; i++ )
	{
		result |= ( indexes[i] << ( unsigned int )( i << 1 ) );
	}

	return error;
}

/*
========================
idDxtEncoder::CompressImageDXT1HQ

params:	inBuf		- image to compress
paramO:	outBuf		- result of compression
params:	width		- width of image
params:	height		- height of image
========================
*/
void idDxtEncoder::CompressImageDXT1HQ( const byte* inBuf, byte* outBuf, int width, int height )
{
	ALIGN16( byte block[64] );
	unsigned int colorIndices1;
	unsigned int colorIndices2;
	byte col1[4];
	byte col2[4];
	int error1;
	int error2;

	this->width = width;
	this->height = height;
	this->outData = outBuf;

	if( width > 4 && ( width & 3 ) != 0 )
	{
		return;
	}
	if( height > 4 && ( height & 3 ) != 0 )
	{
		return;
	}

	if( width < 4 || height < 4 )
	{
		WriteTinyColorDXT1( inBuf, width, height );
		return;
	}

	for( int j = 0; j < height; j += 4, inBuf += width * 4 * 4 )
	{
		for( int i = 0; i < width; i += 4 )
		{
			common->LoadPacifierBinarizeProgressIncrement( 16 );

			ExtractBlock( inBuf + i * 4, width, block );

			GetMinMaxColorsHQ( block, col1, col2, false );

			// Write out color data. Try and find minimum error for the two encoding methods.
			unsigned short scol1 = ColorTo565( col1 );
			unsigned short scol2 = ColorTo565( col2 );

			error1 = FindColorIndices( block, scol1, scol2, colorIndices1 );
			error2 = FindColorIndices( block, scol2, scol1, colorIndices2 );

			if( error1 < error2 )
			{

				EmitUShort( scol1 );
				EmitUShort( scol2 );
				EmitUInt( colorIndices1 );

			}
			else
			{

				EmitUShort( scol2 );
				EmitUShort( scol1 );
				EmitUInt( colorIndices2 );
			}

			//idLib::Printf( "\r%3d%%", ( j * width + i ) * 100 / ( width * height ) );
		}
		outData += dstPadding;
		inBuf += srcPadding;
	}

	//idLib::Printf( "\r100%%\n" );
}

/*
========================
idDxtEncoder::CompressImageDXT5HQ

params:	inBuf		- image to compress
paramO:	outBuf		- result of compression
params:	width		- width of image
params:	height		- height of image
========================
*/
void idDxtEncoder::CompressImageDXT5HQ( const byte* inBuf, byte* outBuf, int width, int height )
{
	ALIGN16( byte block[64] );
	byte alphaIndices1[6];
	byte alphaIndices2[6];
	unsigned int colorIndices;
	byte col1[4];
	byte col2[4];
	int error1;
	int error2;

	this->width = width;
	this->height = height;
	this->outData = outBuf;

	if( width > 4 && ( width & 3 ) != 0 )
	{
		return;
	}
	if( height > 4 && ( height & 3 ) != 0 )
	{
		return;
	}

	if( width < 4 || height < 4 )
	{
		WriteTinyColorDXT5( inBuf, width, height );
		return;
	}

	for( int j = 0; j < height; j += 4, inBuf += width * 4 * 4 )
	{
		for( int i = 0; i < width; i += 4 )
		{
			common->LoadPacifierBinarizeProgressIncrement( 16 );

			ExtractBlock( inBuf + i * 4, width, block );

			GetMinMaxColorsHQ( block, col1, col2, true );
			GetMinMaxAlphaHQ( block, 3, col1, col2 );

			// Write out alpha data. Try and find minimum error for the two encoding methods.
			error1 = FindAlphaIndices( block, 3, col1[3], col2[3], alphaIndices1 );
			error2 = FindAlphaIndices( block, 3, col2[3], col1[3], alphaIndices2 );

			if( error1 < error2 )
			{

				EmitByte( col1[3] );
				EmitByte( col2[3] );
				EmitByte( alphaIndices1[0] );
				EmitByte( alphaIndices1[1] );
				EmitByte( alphaIndices1[2] );
				EmitByte( alphaIndices1[3] );
				EmitByte( alphaIndices1[4] );
				EmitByte( alphaIndices1[5] );

			}
			else
			{

				EmitByte( col2[3] );
				EmitByte( col1[3] );
				EmitByte( alphaIndices2[0] );
				EmitByte( alphaIndices2[1] );
				EmitByte( alphaIndices2[2] );
				EmitByte( alphaIndices2[3] );
				EmitByte( alphaIndices2[4] );
				EmitByte( alphaIndices2[5] );
			}

#ifdef NVIDIA_7X_HARDWARE_BUG_FIX
			NV4XHardwareBugFix( col2, col1 );
#endif

			// Write out color data. Always take the path with 4 interpolated values.
			unsigned short scol1 = ColorTo565( col1 );
			unsigned short scol2 = ColorTo565( col2 );

			EmitUShort( scol1 );
			EmitUShort( scol2 );

			FindColorIndices( block, scol1, scol2, colorIndices );
			EmitUInt( colorIndices );

			//idLib::Printf( "\r%3d%%", ( j * width + i ) * 100 / ( width * height ) );
		}
		outData += dstPadding;
		inBuf += srcPadding;
	}

	//idLib::Printf( "\r100%%\n" );
}

/*
========================
idDxtEncoder::CompressImageCTX1HQ

params:	inBuf		- image to compress
paramO:	outBuf		- result of compression
params:	width		- width of image
params:	height		- height of image
========================
*/
void idDxtEncoder::CompressImageCTX1HQ( const byte* inBuf, byte* outBuf, int width, int height )
{
	ALIGN16( byte block[64] );
	unsigned int colorIndices;
	byte col1[4];
	byte col2[4];

	this->width = width;
	this->height = height;
	this->outData = outBuf;

	if( width > 4 && ( width & 3 ) != 0 )
	{
		return;
	}
	if( height > 4 && ( height & 3 ) != 0 )
	{
		return;
	}

	if( width < 4 || height < 4 )
	{
		WriteTinyColorCTX1DXT5A( inBuf, width, height );
		return;
	}

	for( int j = 0; j < height; j += 4, inBuf += width * 4 * 4 )
	{
		for( int i = 0; i < width; i += 4 )
		{
			common->LoadPacifierBinarizeProgressIncrement( 16 );

			ExtractBlock( inBuf + i * 4, width, block );

			GetMinMaxCTX1HQ( block, col1, col2 );

			EmitByte( col2[0] );
			EmitByte( col2[1] );
			EmitByte( col1[0] );
			EmitByte( col1[1] );

			FindCTX1Indices( block, col1, col2, colorIndices );
			EmitUInt( colorIndices );

			//idLib::Printf( "\r%3d%%", ( j * width + i ) * 100 / ( width * height ) );
		}
		outData += dstPadding;
		inBuf += srcPadding;
	}

	//idLib::Printf( "\r100%%\n" );
}

/*
========================
idDxtEncoder::ScaleYCoCg

params:	colorBlock	- 16 pixel block for which find color indexes
========================
*/
void idDxtEncoder::ScaleYCoCg( byte* colorBlock ) const
{
	// Largest distance of any channel 0 / channel 1 value from the 128 midpoint.
	// This equals the maximum of | min - 128 | and | max - 128 | over both channels.
	int extent = 0;
	for( int pixel = 0; pixel < 16; pixel++ )
	{
		const byte* texel = colorBlock + pixel * 4;
		for( int channel = 0; channel < 2; channel++ )
		{
			const int distance = abs( texel[channel] - 128 );
			if( distance > extent )
			{
				extent = distance;
			}
		}
	}

	const int halfRange = 128 / 2 - 1;
	const int quarterRange = 128 / 4 - 1;

	// 1x by default, 2x when everything fits in half the range, 4x for a quarter of the range
	int scale = 1;
	if( extent <= halfRange )
	{
		scale += 1;
	}
	if( extent <= quarterRange )
	{
		scale += 2;
	}

	// the chosen scale is stored in channel 2 as ( scale - 1 ) << 3
	const byte scaleCode = byte( ( scale - 1 ) << 3 );

	for( int pixel = 0; pixel < 16; pixel++ )
	{
		byte* texel = colorBlock + pixel * 4;
		texel[0] = byte( ( texel[0] - 128 ) * scale + 128 );
		texel[1] = byte( ( texel[1] - 128 ) * scale + 128 );
		texel[2] = scaleCode;
	}
}

// LordHavoc begin

/*
========================
idDxtEncoder::ExtractBlockGimpDDS

Extract 4x4 BGRA block
========================
*/
void idDxtEncoder::ExtractBlockGimpDDS( const byte* src, int x, int y, int w, int h, byte* block )
{
	int i, j;
	int bw = MIN( w - x, 4 );
	int bh = MIN( h - y, 4 );
	int bx, by;
	const int rem[] =
	{
		0, 0, 0, 0,
		0, 1, 0, 1,
		0, 1, 2, 0,
		0, 1, 2, 3
	};

	for( i = 0; i < 4; i++ )
	{
		by = rem[( bh - 1 ) * 4 + i] + y;
		for( j = 0; j < 4; j++ )
		{
			bx = rem[( bw - 1 ) * 4 + j] + x;
			block[( i * 4 * 4 ) + ( j * 4 ) + 0] = src[( by * ( w * 4 ) ) + ( bx * 4 ) + 0];
			block[( i * 4 * 4 ) + ( j * 4 ) + 1] = src[( by * ( w * 4 ) ) + ( bx * 4 ) + 1];
			block[( i * 4 * 4 ) + ( j * 4 ) + 2] = src[( by * ( w * 4 ) ) + ( bx * 4 ) + 2];
			block[( i * 4 * 4 ) + ( j * 4 ) + 3] = src[( by * ( w * 4 ) ) + ( bx * 4 ) + 3];
		}
	}
}

/*
========================
idDxtEncoder::EncodeAlphaBlockBC3GimpDDS

Write DXT5 alpha block
========================
*/
void idDxtEncoder::EncodeAlphaBlockBC3GimpDDS( byte* dst, const byte* block, const int offset )
{
	// Writes 8 bytes to dst: the max and min end points followed by 16 three-bit
	// indices packed into 6 bytes. The encoded value of each texel is the byte at
	// block[offset + 3 + 4 * texel].
	int i, v, mn, mx;
	int dist, bias, dist2, dist4, bits, mask;
	int a, idx, t;

	// advance to byte 3 of the first texel, starting from the caller supplied offset
	block += offset;
	block += 3;
	// find min/max alpha pair
	mn = mx = block[0];
	for( i = 0; i < 16; i++ )
	{
		v = block[4 * i];
		if( v > mx )
		{
			mx = v;
		}
		if( v < mn )
		{
			mn = v;
		}
	}
	// encode them
	// (the maximum is stored first, then the minimum)
	*dst++ = mx;
	*dst++ = mn;
	// determine bias and emit indices
	// given the choice of mx/mn, these indices are optimal:
	// http://fgiesen.wordpress.com/2009/12/15/dxt5-alpha-block-index-determination/
	dist = mx - mn;
	dist4 = dist * 4;
	dist2 = dist * 2;
	bias = ( dist < 8 ) ? ( dist - 1 ) : ( dist / 2 + 2 );
	// folding -mn * 7 into the bias makes 'a' below relative to the minimum
	bias -= mn * 7;
	bits = 0;
	mask = 0;
	for( i = 0; i < 16; i++ )
	{
		a = block[4 * i] * 7 + bias;
		// Select index. This is a "linear scale" lerp factor between 0 (val=min) and 7 (val=max).
		// t is all ones when the comparison holds, so the & below act as branch-free selects
		t = ( a >= dist4 ) ? -1 : 0;
		idx =  t & 4;
		a -= dist4 & t;
		t = ( a >= dist2 ) ? -1 : 0;
		idx += t & 2;
		a -= dist2 & t;
		idx += ( a >= dist );
		// turn linear scale into DXT index (0/1 are extremal pts)
		idx = -idx & 7;
		idx ^= ( 2 > idx );
		// write index
		// indices accumulate three bits at a time; a byte is flushed once eight or more bits are pending
		mask |= idx << bits;
		if( ( bits += 3 ) >= 8 )
		{
			*dst++ = mask;
			mask >>= 8;
			bits -= 8;
		}
	}
}

/*
========================
idDxtEncoder::GetMinMaxYCoCgGimpDDS
========================
*/
void idDxtEncoder::GetMinMaxYCoCgGimpDDS( const byte* block, byte* mincolor, byte* maxcolor )
{
	int i;

	mincolor[2] = mincolor[1] = 255;
	maxcolor[2] = maxcolor[1] = 0;
	for( i = 0; i < 16; i++ )
	{
		if( block[4 * i + 2] < mincolor[2] )
		{
			mincolor[2] = block[4 * i + 2];
		}
		if( block[4 * i + 1] < mincolor[1] )
		{
			mincolor[1] = block[4 * i + 1];
		}
		if( block[4 * i + 2] > maxcolor[2] )
		{
			maxcolor[2] = block[4 * i + 2];
		}
		if( block[4 * i + 1] > maxcolor[1] )
		{
			maxcolor[1] = block[4 * i + 1];
		}
	}
}

/*
========================
idDxtEncoder::ScaleYCoCgGimpDDS
========================
*/
void idDxtEncoder::ScaleYCoCgGimpDDS( byte* block, byte* mincolor, byte* maxcolor )
{
	const int halfRange = 128 / 2 - 1;
	const int quarterRange = 128 / 4 - 1;

	// largest distance of the channel 2 / channel 1 extremes from the 128 midpoint
	const int distances[4] =
	{
		abs( mincolor[2] - 128 ),
		abs( mincolor[1] - 128 ),
		abs( maxcolor[2] - 128 ),
		abs( maxcolor[1] - 128 )
	};
	int extent = distances[0];
	for( int k = 1; k < 4; k++ )
	{
		if( distances[k] > extent )
		{
			extent = distances[k];
		}
	}

	// 1x, 2x or 4x depending on how much of the range is actually used
	int scale = 1;
	if( extent <= halfRange )
	{
		scale += 1;
	}
	if( extent <= quarterRange )
	{
		scale += 2;
	}

	// rescale both end points around 128 and store the scale code in channel 0
	byte* const endPoints[2] = { mincolor, maxcolor };
	for( int k = 0; k < 2; k++ )
	{
		byte* color = endPoints[k];
		color[2] = ( color[2] - 128 ) * scale + 128;
		color[1] = ( color[1] - 128 ) * scale + 128;
		color[0] = ( scale - 1 ) << 3;
	}

	// rescale channels 2 and 1 of every texel the same way
	for( int texel = 0; texel < 16; texel++ )
	{
		byte* pixel = block + texel * 4;
		pixel[2] = ( pixel[2] - 128 ) * scale + 128;
		pixel[1] = ( pixel[1] - 128 ) * scale + 128;
	}
}

/*
========================
idDxtEncoder::InsetBBoxYCoCgGimpDDS
========================
*/
void idDxtEncoder::InsetBBoxYCoCgGimpDDS( byte* mincolor, byte* maxcolor )
{
	const int roundingBias = ( 1 << ( INSET_SHIFT - 1 ) ) - 1;

	// channels 2 and 1 are inset independently of each other
	for( int channel = 2; channel >= 1; channel-- )
	{
		const int inset = ( maxcolor[channel] - mincolor[channel] ) - roundingBias;

		// move each end point towards the other in INSET_SHIFT fixed point
		int low = ( ( mincolor[channel] << INSET_SHIFT ) + inset ) >> INSET_SHIFT;
		int high = ( ( maxcolor[channel] << INSET_SHIFT ) - inset ) >> INSET_SHIFT;

		if( low < 0 )
		{
			low = 0;
		}
		if( high > 255 )
		{
			high = 255;
		}

		// channel 2 keeps its top 5 bits, channel 1 its top 6;
		// the highest bits are copied into the cleared low bits
		const int keepMask = ( channel == 2 ) ? 0xf8 : 0xfc;
		const int replicateShift = ( channel == 2 ) ? 5 : 6;

		mincolor[channel] = ( low & keepMask ) | ( low >> replicateShift );
		maxcolor[channel] = ( high & keepMask ) | ( high >> replicateShift );
	}
}

/*
========================
idDxtEncoder::SelectDiagonalYCoCgGimpDDS
========================
*/
void idDxtEncoder::SelectDiagonalYCoCgGimpDDS( const byte* block, byte* mincolor, byte* maxcolor )
{
	byte mid0, mid1, side, mask, b0, b1, c0, c1;
	int i;

	mid0 = ( ( int )mincolor[2] + maxcolor[2] + 1 ) >> 1;
	mid1 = ( ( int )mincolor[1] + maxcolor[1] + 1 ) >> 1;

	side = 0;
	for( i = 0; i < 16; i++ )
	{
		b0 = block[i * 4 + 2] >= mid0;
		b1 = block[i * 4 + 1] >= mid1;
		side += ( b0 ^ b1 );
	}

	mask = -( side > 8 );
	mask &= -( mincolor[2] != maxcolor[2] );

	c0 = mincolor[1];
	c1 = maxcolor[1];

	c0 ^= c1;
	c1 ^= c0 & mask;
	c0 ^= c1;

	mincolor[1] = c0;
	maxcolor[1] = c1;
}

/*
========================
idDxtEncoder::LerpRGB13GimpDDS

Linear interpolation at 1/3 point between a and b
========================
*/
void idDxtEncoder::LerpRGB13GimpDDS( byte* dst, byte* a, byte* b )
{
	// according to the S3TC/DX10 specs, this is the correct way to do the
	// interpolation (with no rounding bias):
	//
	// dst = ( 2 * a + b ) / 3;
	for( int channel = 0; channel < 3; channel++ )
	{
		const int weighted = 2 * a[channel] + b[channel];
		dst[channel] = weighted / 3;
	}
}

/*
========================
idDxtEncoder::Mul8BitGimpDDS
========================
*/
inline int idDxtEncoder::Mul8BitGimpDDS( int a, int b )
{
	// division-free ( a * b ) / 255 with rounding: add half, then fold the high byte back in
	const int biased = a * b + 128;
	const int folded = biased + ( biased >> 8 );
	return folded >> 8;
}

/*
========================
idDxtEncoder::PackRGB565GimpDDS

Pack BGR8 to RGB565
========================
*/
inline unsigned short idDxtEncoder::PackRGB565GimpDDS( const byte* c )
{
	// c[2] -> 5 bits at 11..15, c[1] -> 6 bits at 5..10, c[0] -> 5 bits at 0..4
	const int high5 = Mul8BitGimpDDS( c[2], 31 );
	const int mid6 = Mul8BitGimpDDS( c[1], 63 );
	const int low5 = Mul8BitGimpDDS( c[0], 31 );

	return ( high5 << 11 ) | ( mid6 << 5 ) | low5;
}

/*
========================
idDxtEncoder::EncodeYCoCgBlockGimpDDS
========================
*/
void idDxtEncoder::EncodeYCoCgBlockGimpDDS( byte* dst, byte* block )
{
	// Writes 8 bytes to dst: two 16 bit packed end points followed by a 32 bit
	// word holding one 2 bit index per texel. The block is modified in place by
	// ScaleYCoCgGimpDDS.
	//
	// colors[0] = max end point, colors[1] = min end point,
	// colors[2] / colors[3] = the two interpolated colors
	byte colors[4][3], *maxcolor, *mincolor;
	unsigned int mask;
	int c0, c1, d0, d1, d2, d3;
	int b0, b1, b2, b3, b4;
	int x0, x1, x2;
	int i, idx;

	maxcolor = &colors[0][0];
	mincolor = &colors[1][0];

	GetMinMaxYCoCgGimpDDS( block, mincolor, maxcolor );
	ScaleYCoCgGimpDDS( block, mincolor, maxcolor );
	InsetBBoxYCoCgGimpDDS( mincolor, maxcolor );
	SelectDiagonalYCoCgGimpDDS( block, mincolor, maxcolor );

	LerpRGB13GimpDDS( &colors[2][0], maxcolor, mincolor );
	LerpRGB13GimpDDS( &colors[3][0], mincolor, maxcolor );

	mask = 0;

	for( i = 0; i < 16; i++ )
	{
		c0 = block[4 * i + 2];
		c1 = block[4 * i + 1];

		// sum of absolute differences to each palette entry, over channels 2 and 1 only
		d0 = abs( colors[0][2] - c0 ) + abs( colors[0][1] - c1 );
		d1 = abs( colors[1][2] - c0 ) + abs( colors[1][1] - c1 );
		d2 = abs( colors[2][2] - c0 ) + abs( colors[2][1] - c1 );
		d3 = abs( colors[3][2] - c0 ) + abs( colors[3][1] - c1 );

		// pairwise distance comparisons combined into a 2 bit index without branching
		b0 = d0 > d3;
		b1 = d1 > d2;
		b2 = d0 > d2;
		b3 = d1 > d3;
		b4 = d2 > d3;

		x0 = b1 & b2;
		x1 = b0 & b3;
		x2 = b0 & b4;

		idx = ( x2 | ( ( x0 | x1 ) << 1 ) );

		// shift as unsigned: for the last texel an index of 3 is shifted by 30,
		// which does not fit in a signed int
		mask |= ( unsigned int )idx << ( 2 * i );
	}

	PUTL16( dst + 0, PackRGB565GimpDDS( maxcolor ) );
	PUTL16( dst + 2, PackRGB565GimpDDS( mincolor ) );
	PUTL32( dst + 4, mask );
}

// LordHavoc end

/*
========================
idDxtEncoder::CompressYCoCgDXT5HQ

params:	inBuf		- image to compress
paramO:	outBuf		- result of compression
params:	width		- width of image
params:	height		- height of image
========================
*/
void idDxtEncoder::CompressYCoCgDXT5HQ( const byte* inBuf, byte* outBuf, int width, int height )
{
	ALIGN16( byte block[64] );
	byte alphaIndices1[6];
	byte alphaIndices2[6];
	unsigned int colorIndices;
	byte col1[4];
	byte col2[4];
	int error1;
	int error2;

	assert( HasConstantValuePer4x4Block( inBuf, width, height, 2 ) );

	this->width = width;
	this->height = height;
	this->outData = outBuf;

	if( width > 4 && ( width & 3 ) != 0 )
	{
		return;
	}
	if( height > 4 && ( height & 3 ) != 0 )
	{
		return;
	}

	if( width < 4 || height < 4 )
	{
		WriteTinyColorDXT5( inBuf, width, height );
		return;
	}

	for( int j = 0; j < height; j += 4, inBuf += width * 4 * 4 )
	{
		for( int i = 0; i < width; i += 4 )
		{
			common->LoadPacifierBinarizeProgressIncrement( 16 );

			ExtractBlock( inBuf + i * 4, width, block );
			ScaleYCoCg( block );

			GetMinMaxColorsHQ( block, col1, col2, true );
			GetMinMaxAlphaHQ( block, 3, col1, col2 );

			// Write out alpha data. Try and find minimum error for the two encoding methods.
			error1 = FindAlphaIndices( block, 3, col1[3], col2[3], alphaIndices1 );
			error2 = FindAlphaIndices( block, 3, col2[3], col1[3], alphaIndices2 );

			if( error1 < error2 )
			{

				EmitByte( col1[3] );
				EmitByte( col2[3] );
				EmitByte( alphaIndices1[0] );
				EmitByte( alphaIndices1[1] );
				EmitByte( alphaIndices1[2] );
				EmitByte( alphaIndices1[3] );
				EmitByte( alphaIndices1[4] );
				EmitByte( alphaIndices1[5] );

			}
			else
			{

				EmitByte( col2[3] );
				EmitByte( col1[3] );
				EmitByte( alphaIndices2[0] );
				EmitByte( alphaIndices2[1] );
				EmitByte( alphaIndices2[2] );
				EmitByte( alphaIndices2[3] );
				EmitByte( alphaIndices2[4] );
				EmitByte( alphaIndices2[5] );
			}

#ifdef NVIDIA_7X_HARDWARE_BUG_FIX
			NV4XHardwareBugFix( col2, col1 );
#endif

			// Write out color data. Always take the path with 4 interpolated values.
			unsigned short scol1 = ColorTo565( col1 );
			unsigned short scol2 = ColorTo565( col2 );

			EmitUShort( scol1 );
			EmitUShort( scol2 );

			FindColorIndices( block, scol1, scol2, colorIndices );
			EmitUInt( colorIndices );

			//idLib::Printf( "\r%3d%%", ( j * width + i ) * 100 / ( width * height ) );
		}
		outData += dstPadding;
		inBuf += srcPadding;
	}

	//idLib::Printf( "\r100%%\n" );
}

/*
========================
idDxtEncoder::CompressYCoCgCTX1DXT5AHQ

params:	inBuf		- image to compress
paramO:	outBuf		- result of compression
params:	width		- width of image
params:	height		- height of image
========================
*/
void idDxtEncoder::CompressYCoCgCTX1DXT5AHQ( const byte* inBuf, byte* outBuf, int width, int height )
{
	ALIGN16( byte block[64] );
	byte alphaIndices1[6];
	byte alphaIndices2[6];
	unsigned int colorIndices;
	byte col1[4];
	byte col2[4];
	int error1;
	int error2;

	assert( HasConstantValuePer4x4Block( inBuf, width, height, 2 ) );

	this->width = width;
	this->height = height;
	this->outData = outBuf;

	if( width > 4 && ( width & 3 ) != 0 )
	{
		return;
	}
	if( height > 4 && ( height & 3 ) != 0 )
	{
		return;
	}

	if( width < 4 || height < 4 )
	{
		WriteTinyColorCTX1DXT5A( inBuf, width, height );
		return;
	}

	for( int j = 0; j < height; j += 4, inBuf += width * 4 * 4 )
	{
		for( int i = 0; i < width; i += 4 )
		{
			common->LoadPacifierBinarizeProgressIncrement( 16 );

			ExtractBlock( inBuf + i * 4, width, block );

			GetMinMaxAlphaHQ( block, 3, col1, col2 );

			// Write out alpha data. Try and find minimum error for the two encoding methods.
			error1 = FindAlphaIndices( block, 3, col1[3], col2[3], alphaIndices1 );
			error2 = FindAlphaIndices( block, 3, col2[3], col1[3], alphaIndices2 );

			if( error1 < error2 )
			{

				EmitByte( col1[3] );
				EmitByte( col2[3] );
				EmitByte( alphaIndices1[0] );
				EmitByte( alphaIndices1[1] );
				EmitByte( alphaIndices1[2] );
				EmitByte( alphaIndices1[3] );
				EmitByte( alphaIndices1[4] );
				EmitByte( alphaIndices1[5] );

			}
			else
			{

				EmitByte( col2[3] );
				EmitByte( col1[3] );
				EmitByte( alphaIndices2[0] );
				EmitByte( alphaIndices2[1] );
				EmitByte( alphaIndices2[2] );
				EmitByte( alphaIndices2[3] );
				EmitByte( alphaIndices2[4] );
				EmitByte( alphaIndices2[5] );
			}

			GetMinMaxCTX1HQ( block, col1, col2 );

			EmitByte( col2[0] );
			EmitByte( col2[1] );
			EmitByte( col1[0] );
			EmitByte( col1[1] );

			FindCTX1Indices( block, col1, col2, colorIndices );
			EmitUInt( colorIndices );

			//idLib::Printf( "\r%3d%%", ( j * width + i ) * 100 / ( width * height ) );
		}
		outData += dstPadding;
		inBuf += srcPadding;
	}

	//idLib::Printf( "\r100%%\n" );
}

/*
========================
idDxtEncoder::RotateNormalsDXT1
========================
*/
void idDxtEncoder::RotateNormalsDXT1( byte* block ) const
{
	// Tries 32 rotation angles for the ( channel 0, channel 1 ) pair of every texel,
	// keeps the angle with the smallest error reported by GetMinMaxColorsHQ, then
	// rewrites the block with that rotation applied and the rotation value stored
	// in channel 2.
	byte rotatedBlock[64];
	byte col1[4];
	byte col2[4];
	int bestError = MAX_TYPE( int );
	int bestRotation = 0;

	for( int i = 0; i < 32; i += 1 )
	{
		// expand the 5 bit step to 8 bits by replicating its top bits
		int r = ( i << 3 ) | ( i >> 2 );
		float angle = ( r / 255.0f ) * idMath::PI;
		float s = sin( angle );
		float c = cos( angle );

		for( int j = 0; j < 16; j++ )
		{
			// map the bytes to [-1, 1], rotate, and map back to bytes
			float x = block[j * 4 + 0] / 255.0f * 2.0f - 1.0f;
			float y = block[j * 4 + 1] / 255.0f * 2.0f - 1.0f;
			float rx = c * x - s * y;
			float ry = s * x + c * y;
			rotatedBlock[j * 4 + 0] = idMath::Ftob( ( rx + 1.0f ) / 2.0f * 255.0f );
			rotatedBlock[j * 4 + 1] = idMath::Ftob( ( ry + 1.0f ) / 2.0f * 255.0f );

			// Fill the remaining two bytes so the whole candidate block is defined
			// before it is handed to the error function. The values mirror what the
			// final pass below leaves in the block for this rotation.
			rotatedBlock[j * 4 + 2] = ( byte )r;
			rotatedBlock[j * 4 + 3] = block[j * 4 + 3];
		}

		int error = GetMinMaxColorsHQ( rotatedBlock, col1, col2, true );
		if( error < bestError )
		{
			bestError = error;
			bestRotation = r;
		}
	}

	// apply the winning rotation to the caller's block
	float angle = ( bestRotation / 255.0f ) * idMath::PI;
	float s = sin( angle );
	float c = cos( angle );

	for( int j = 0; j < 16; j++ )
	{
		float x = block[j * 4 + 0] / 255.0f * 2.0f - 1.0f;
		float y = block[j * 4 + 1] / 255.0f * 2.0f - 1.0f;
		float rx = c * x - s * y;
		float ry = s * x + c * y;
		block[j * 4 + 0] = idMath::Ftob( ( rx + 1.0f ) / 2.0f * 255.0f );
		block[j * 4 + 1] = idMath::Ftob( ( ry + 1.0f ) / 2.0f * 255.0f );
		block[j * 4 + 2] = ( byte )bestRotation;
	}
}

/*
========================
idDxtEncoder::CompressNormalMapDXT1HQ

params:	inBuf		- image to compress
paramO:	outBuf		- result of compression
params:	width		- width of image
params:	height		- height of image
========================
*/
void idDxtEncoder::CompressNormalMapDXT1HQ( const byte* inBuf, byte* outBuf, int width, int height )
{
	ALIGN16( byte block[64] );
	unsigned int colorIndices;
	byte col1[4];
	byte col2[4];

	this->width = width;
	this->height = height;
	this->outData = outBuf;

	if( width > 4 && ( width & 3 ) != 0 )
	{
		return;
	}
	if( height > 4 && ( height & 3 ) != 0 )
	{
		return;
	}

	if( width < 4 || height < 4 )
	{
		WriteTinyColorDXT1( inBuf, width, height );
		return;
	}

	for( int j = 0; j < height; j += 4, inBuf += width * 4 * 4 )
	{
		for( int i = 0; i < width; i += 4 )
		{
			common->LoadPacifierBinarizeProgressIncrement( 16 );

			ExtractBlock( inBuf + i * 4, width, block );

			for( int k = 0; k < 16; k++ )
			{
				block[k * 4 + 2] = 0;
			}

			GetMinMaxColorsHQ( block, col1, col2, true );

			// Write out color data. Always take the path with 4 interpolated values.
			unsigned short scol1 = ColorTo565( col1 );
			unsigned short scol2 = ColorTo565( col2 );

			EmitUShort( scol1 );
			EmitUShort( scol2 );

			FindColorIndices( block, scol1, scol2, colorIndices );
			EmitUInt( colorIndices );

			//idLib::Printf( "\r%3d%%", ( j * width + i * 4 ) * 100 / ( width * height ) );
		}
		outData += dstPadding;
		inBuf += srcPadding;
	}

	//idLib::Printf( "\r100%%\n" );
}

/*
========================
idDxtEncoder::CompressNormalMapDXT1RenormalizeHQ

params:	inBuf		- image to compress
paramO:	outBuf		- result of compression
params:	width		- width of image
params:	height		- height of image
========================
*/
void idDxtEncoder::CompressNormalMapDXT1RenormalizeHQ( const byte* inBuf, byte* outBuf, int width, int height )
{
	ALIGN16( byte block[64] );
	unsigned int colorIndices;
	byte col1[4];
	byte col2[4];

	this->width = width;
	this->height = height;
	this->outData = outBuf;

	if( width > 4 && ( width & 3 ) != 0 )
	{
		return;
	}
	if( height > 4 && ( height & 3 ) != 0 )
	{
		return;
	}

	if( width < 4 || height < 4 )
	{
		WriteTinyColorDXT1( inBuf, width, height );
		return;
	}

	for( int j = 0; j < height; j += 4, inBuf += width * 4 * 4 )
	{
		for( int i = 0; i < width; i += 4 )
		{
			common->LoadPacifierBinarizeProgressIncrement( 16 );

			ExtractBlock( inBuf + i * 4, width, block );

			// clear alpha channel
			for( int k = 0; k < 16; k++ )
			{
				block[k * 4 + 3] = 0;
			}

			GetMinMaxNormalsDXT1HQ( block, col1, col2, colorIndices, true );

			// Write out color data. Always take the path with 4 interpolated values.
			unsigned short scol1 = ColorTo565( col1 );
			unsigned short scol2 = ColorTo565( col2 );

			EmitUShort( scol1 );
			EmitUShort( scol2 );
			EmitUInt( colorIndices );

			////idLib::Printf( "\r%3d%%", ( j * width + i * 4 ) * 100 / ( width * height ) );
		}
		outData += dstPadding;
		inBuf += srcPadding;
	}

	////idLib::Printf( "\r100%%\n" );
}

// USE_SCALE enables the 1x / 2x / 4x scale selection in BiasScaleNormalY and RotateNormalsDXT5
#define USE_SCALE		1
// USE_BIAS enables the search for a bias other than 128 in BiasScaleNormalY
#define USE_BIAS		1

// statistics counters, incremented by BiasScaleNormalY
// total number of blocks processed
static int c_blocks;
// blocks that ended up with a scale other than 1
static int c_scaled;
// blocks scaled by 2
static int c_scaled2x;
// blocks scaled by 4
static int c_scaled4x;
// scaled blocks whose chosen bias is not 128
static int c_differentBias;
// scaled blocks where the chosen bias allowed a larger scale than a bias of 128 would have
static int c_biasHelped;

/*
========================
idDxtEncoder::BiasScaleNormalY

	* scale2x = 33%
	* scale4x = 23%
	* bias + scale2x = 30%
	* bias + scale4x = 55%
========================
*/
void idDxtEncoder::BiasScaleNormalY( byte* colorBlock ) const
{
	// range of channel 1 over the 16 texels
	byte lowest = 255;
	byte highest = 0;

	for( int texel = 0; texel < 16; texel++ )
	{
		const byte value = colorBlock[texel * 4 + 1];
		if( value < lowest )
		{
			lowest = value;
		}
		if( value > highest )
		{
			highest = value;
		}
	}

	// start from the neutral bias and look for one that leaves a smaller range
	int bestBias = 128;
	int bestRange = Max( abs( lowest - bestBias ), abs( highest - bestBias ) );
#if USE_BIAS
	for( int step = 0; step < 32; step++ )
	{
		// 5 bit step expanded to 8 bits, lowered by 4
		const int candidate = ( ( step << 3 ) | ( step >> 2 ) ) - 4;
		const int candidateRange = Max( abs( lowest - candidate ), abs( highest - candidate ) );
		if( candidateRange < bestRange )
		{
			bestRange = candidateRange;
			bestBias = candidate;
		}
	}
#endif

	const int halfRange = 128 / 2 - 1;
	const int quarterRange = 128 / 4 - 1;

#if USE_SCALE
	int scale = 1 + ( bestRange <= halfRange ) + 2 * ( bestRange <= quarterRange );
#else
	int scale = 1;
#endif

	if( scale == 1 )
	{
		// a bias is only kept when it comes together with a scale
		bestBias = 128;
	}
	else
	{
		// bookkeeping only; none of this influences the output
		c_scaled++;
		switch( scale )
		{
			case 2:
				c_scaled2x++;
				break;
			case 4:
				c_scaled4x++;
				break;
			default:
				break;
		}
		if( bestBias != 128 )
		{
			c_differentBias++;
			const int neutralRange = Max( abs( lowest - 128 ), abs( highest - 128 ) );
			const int neutralScale = 1 + ( neutralRange <= halfRange ) + 2 * ( neutralRange <= quarterRange );
			if( scale > neutralScale )
			{
				c_biasHelped++;
			}
		}
	}

	c_blocks++;

	// channel 0 receives the bias ( plus 4 ), channel 1 the biased and scaled value,
	// channel 2 the scale code
	const byte biasCode = byte( bestBias + 4 );
	const byte scaleCode = byte( ( scale - 1 ) << 3 );

	for( int texel = 0; texel < 16; texel++ )
	{
		byte* pixel = colorBlock + texel * 4;
		pixel[0] = biasCode;
		pixel[1] = byte( ( pixel[1] - bestBias ) * scale + 128 );
		pixel[2] = scaleCode;
	}
}

/*
========================
idDxtEncoder::RotateNormalsDXT5
========================
*/
void idDxtEncoder::RotateNormalsDXT5( byte* block ) const
{
	// Tries 32 rotation angles for the ( channel 3, channel 1 ) pair of every texel,
	// optionally rescaling channel 1, and keeps the rotation with the smallest
	// combined error from GetMinMaxNormalYHQ and GetMinMaxAlphaHQ. The block is then
	// rewritten with that rotation applied; the rotation is stored in channel 0 and,
	// with USE_SCALE, the scale code in channel 2.
	byte rotatedBlock[64];
	byte col1[4];
	byte col2[4];
	int bestError = MAX_TYPE( int );
	int bestRotation = 0;
	int bestScale = 1;

	for( int i = 0; i < 32; i += 1 )
	{
		// expand the 5 bit step to 8 bits by replicating its top bits
		int r = ( i << 3 ) | ( i >> 2 );
		float angle = ( r / 255.0f ) * idMath::PI;
		float s = sin( angle );
		float c = cos( angle );

		for( int j = 0; j < 16; j++ )
		{
			// map the bytes to [-1, 1], rotate, and map back to bytes
			float x = block[j * 4 + 3] / 255.0f * 2.0f - 1.0f;
			float y = block[j * 4 + 1] / 255.0f * 2.0f - 1.0f;
			float rx = c * x - s * y;
			float ry = s * x + c * y;
			rotatedBlock[j * 4 + 3] = idMath::Ftob( ( rx + 1.0f ) / 2.0f * 255.0f );
			rotatedBlock[j * 4 + 1] = idMath::Ftob( ( ry + 1.0f ) / 2.0f * 255.0f );

			// Fill the remaining two bytes so the whole candidate block is defined
			// before it is handed to the error functions. The values mirror what the
			// final pass below leaves in the block for this rotation.
			rotatedBlock[j * 4 + 0] = ( byte )r;
			rotatedBlock[j * 4 + 2] = block[j * 4 + 2];
		}

		// declared outside the conditional section so the code below also builds
		// when USE_SCALE is 0
		int scale = 1;

#if USE_SCALE
		byte minColor = 255;
		byte maxColor = 0;

		for( int j = 0; j < 16; j++ )
		{
			if( rotatedBlock[j * 4 + 1] < minColor )
			{
				minColor = rotatedBlock[j * 4 + 1];
			}
			if( rotatedBlock[j * 4 + 1] > maxColor )
			{
				maxColor = rotatedBlock[j * 4 + 1];
			}
		}

		const int s0 = 128 / 2 - 1;
		const int s1 = 128 / 4 - 1;

		// 1x, 2x or 4x depending on how far channel 1 strays from 128
		int range = Max( abs( minColor - 128 ), abs( maxColor - 128 ) );
		scale = 1 + ( range <= s0 ) + 2 * ( range <= s1 );

		for( int j = 0; j < 16; j++ )
		{
			rotatedBlock[j * 4 + 1] = byte( ( rotatedBlock[j * 4 + 1] - 128 ) * scale + 128 );
			rotatedBlock[j * 4 + 2] = byte( ( scale - 1 ) << 3 );
		}
#endif

		int errorY = GetMinMaxNormalYHQ( rotatedBlock, col1, col2, true, scale );
		int errorX = GetMinMaxAlphaHQ( rotatedBlock, 3, col1, col2 );
		int error = errorX + errorY;
		if( error < bestError )
		{
			bestError = error;
			bestRotation = r;
			bestScale = scale;
		}
	}

	// apply the winning rotation ( and scale ) to the caller's block
	float angle = ( bestRotation / 255.0f ) * idMath::PI;
	float s = sin( angle );
	float c = cos( angle );

	for( int j = 0; j < 16; j++ )
	{
		float x = block[j * 4 + 3] / 255.0f * 2.0f - 1.0f;
		float y = block[j * 4 + 1] / 255.0f * 2.0f - 1.0f;
		float rx = c * x - s * y;
		float ry = s * x + c * y;
		block[j * 4 + 0] = ( byte )bestRotation;
		block[j * 4 + 1] = idMath::Ftob( ( ry + 1.0f ) / 2.0f * 255.0f );
		block[j * 4 + 3] = idMath::Ftob( ( rx + 1.0f ) / 2.0f * 255.0f );

#if USE_SCALE
		block[j * 4 + 1] = byte( ( block[j * 4 + 1] - 128 ) * bestScale + 128 );
		block[j * 4 + 2] = byte( ( bestScale - 1 ) << 3 );
#endif
	}
}

/*
========================
idDxtEncoder::CompressNormalMapDXT5HQ

params:	inBuf		- image to compress
paramO:	outBuf		- result of compression
params:	width		- width of image
params:	height		- height of image
========================
*/
void idDxtEncoder::CompressNormalMapDXT5HQ( const byte* inBuf, byte* outBuf, int width, int height )
{
	ALIGN16( byte block[64] );
	byte alphaIndices1[6];
	byte alphaIndices2[6];
	unsigned int colorIndices;
	byte col1[4];
	byte col2[4];
	int error1;
	int error2;

	this->width = width;
	this->height = height;
	this->outData = outBuf;

	if( width > 4 && ( width & 3 ) != 0 )
	{
		return;
	}
	if( height > 4 && ( height & 3 ) != 0 )
	{
		return;
	}

	if( width < 4 || height < 4 )
	{
		WriteTinyColorDXT5( inBuf, width, height );
		return;
	}

	for( int j = 0; j < height; j += 4, inBuf += width * 4 * 4 )
	{
		for( int i = 0; i < width; i += 4 )
		{
			common->LoadPacifierBinarizeProgressIncrement( 16 );

			ExtractBlock( inBuf + i * 4, width, block );

			// swizzle components
			for( int k = 0; k < 16; k++ )
			{
				block[k * 4 + 3] = block[k * 4 + 0];
				block[k * 4 + 0] = 0;
				block[k * 4 + 2] = 0;
			}

			//BiasScaleNormalY( block );
			//RotateNormalsDXT5( block );

			GetMinMaxNormalYHQ( block, col1, col2, true, 1 );
			GetMinMaxAlphaHQ( block, 3, col1, col2 );

			// Write out alpha data. Try and find minimum error for the two encoding methods.
			error1 = FindAlphaIndices( block, 3, col1[3], col2[3], alphaIndices1 );
			error2 = FindAlphaIndices( block, 3, col2[3], col1[3], alphaIndices2 );

			if( error1 < error2 )
			{

				EmitByte( col1[3] );
				EmitByte( col2[3] );
				EmitByte( alphaIndices1[0] );
				EmitByte( alphaIndices1[1] );
				EmitByte( alphaIndices1[2] );
				EmitByte( alphaIndices1[3] );
				EmitByte( alphaIndices1[4] );
				EmitByte( alphaIndices1[5] );

			}
			else
			{

				EmitByte( col2[3] );
				EmitByte( col1[3] );
				EmitByte( alphaIndices2[0] );
				EmitByte( alphaIndices2[1] );
				EmitByte( alphaIndices2[2] );
				EmitByte( alphaIndices2[3] );
				EmitByte( alphaIndices2[4] );
				EmitByte( alphaIndices2[5] );
			}

#ifdef NVIDIA_7X_HARDWARE_BUG_FIX
			NV4XHardwareBugFix( col2, col1 );
#endif

			// Write out color data. Always take the path with 4 interpolated values.
			unsigned short scol1 = ColorTo565( col1 );
			unsigned short scol2 = ColorTo565( col2 );

			EmitUShort( scol1 );
			EmitUShort( scol2 );

			FindColorIndices( block, scol1, scol2, colorIndices );
			EmitUInt( colorIndices );

			//idLib::Printf( "\r%3d%%", ( j * width + i ) * 100 / ( width * height ) );
		}
		outData += dstPadding;
		inBuf += srcPadding;
	}

	//idLib::Printf( "\r100%%\n" );
}

/*
========================
idDxtEncoder::CompressNormalMapDXT5RenormalizeHQ

params:	inBuf		- image to compress
paramO:	outBuf		- result of compression
params:	width		- width of image
params:	height		- height of image
========================
*/
void idDxtEncoder::CompressNormalMapDXT5RenormalizeHQ( const byte* inBuf, byte* outBuf, int width, int height )
{
	ALIGN16( byte block[64] );
	unsigned int colorIndices;
	byte alphaIndices[6];
	byte col1[4];
	byte col2[4];

	this->width = width;
	this->height = height;
	this->outData = outBuf;

	if( width > 4 && ( width & 3 ) != 0 )
	{
		return;
	}
	if( height > 4 && ( height & 3 ) != 0 )
	{
		return;
	}

	if( width < 4 || height < 4 )
	{
		WriteTinyColorDXT5( inBuf, width, height );
		return;
	}

	for( int j = 0; j < height; j += 4, inBuf += width * 4 * 4 )
	{
		for( int i = 0; i < width; i += 4 )
		{
			common->LoadPacifierBinarizeProgressIncrement( 16 );

			ExtractBlock( inBuf + i * 4, width, block );

			// swizzle components
			for( int k = 0; k < 16; k++ )
			{
#if 0 // object-space
				block[k * 4 + 3] = block[k * 4 + 2];
				block[k * 4 + 2] = 0;
#else
				block[k * 4 + 3] = block[k * 4 + 0];
				block[k * 4 + 0] = 0;
#endif
			}

			GetMinMaxNormalsDXT5HQFast( block, col1, col2, colorIndices, alphaIndices );

			EmitByte( col2[3] );
			EmitByte( col1[3] );
			EmitByte( alphaIndices[0] );
			EmitByte( alphaIndices[1] );
			EmitByte( alphaIndices[2] );
			EmitByte( alphaIndices[3] );
			EmitByte( alphaIndices[4] );
			EmitByte( alphaIndices[5] );

			unsigned short scol1 = ColorTo565( col1 );
			unsigned short scol2 = ColorTo565( col2 );

			EmitUShort( scol2 );
			EmitUShort( scol1 );
			EmitUInt( colorIndices );

			////idLib::Printf( "\r%3d%%", ( j * width + i ) * 100 / ( width * height ) );
		}
		outData += dstPadding;
		inBuf += srcPadding;

	}

	////idLib::Printf( "\r100%%\n" );
}

/*
========================
idDxtEncoder::CompressNormalMapDXN2HQ

params:	inBuf		- image to compress
paramO:	outBuf		- result of compression
params:	width		- width of image
params:	height		- height of image
========================
*/
void idDxtEncoder::CompressNormalMapDXN2HQ( const byte* inBuf, byte* outBuf, int width, int height )
{
	ALIGN16( byte block[64] );
	byte alphaIndices1[6];
	byte alphaIndices2[6];
	byte col1[4];
	byte col2[4];
	int error1;
	int error2;

	this->width = width;
	this->height = height;
	this->outData = outBuf;

	if( width > 4 && ( width & 3 ) != 0 )
	{
		return;
	}
	if( height > 4 && ( height & 3 ) != 0 )
	{
		return;
	}

	if( width < 4 || height < 4 )
	{
		WriteTinyColorDXT5( inBuf, width, height );
		return;
	}

	for( int j = 0; j < height; j += 4, inBuf += width * 4 * 4 )
	{
		for( int i = 0; i < width; i += 4 )
		{
			common->LoadPacifierBinarizeProgressIncrement( 16 );

			ExtractBlock( inBuf + i * 4, width, block );

			for( int k = 0; k < 2; k++ )
			{
				GetMinMaxAlphaHQ( block, k, col1, col2 );

				// Write out alpha data. Try and find minimum error for the two encoding methods.
				error1 = FindAlphaIndices( block, k, col1[k], col2[k], alphaIndices1 );
				error2 = FindAlphaIndices( block, k, col2[k], col1[k], alphaIndices2 );

				if( error1 < error2 )
				{

					EmitByte( col1[k] );
					EmitByte( col2[k] );
					EmitByte( alphaIndices1[0] );
					EmitByte( alphaIndices1[1] );
					EmitByte( alphaIndices1[2] );
					EmitByte( alphaIndices1[3] );
					EmitByte( alphaIndices1[4] );
					EmitByte( alphaIndices1[5] );

				}
				else
				{

					EmitByte( col2[k] );
					EmitByte( col1[k] );
					EmitByte( alphaIndices2[0] );
					EmitByte( alphaIndices2[1] );
					EmitByte( alphaIndices2[2] );
					EmitByte( alphaIndices2[3] );
					EmitByte( alphaIndices2[4] );
					EmitByte( alphaIndices2[5] );
				}
			}

			//idLib::Printf( "\r%3d%%", ( j * width + i ) * 100 / ( width * height ) );
		}
		outData += dstPadding;
		inBuf += srcPadding;
	}

	//idLib::Printf( "\r100%%\n" );
}

/*
========================
idDxtEncoder::GetMinMaxBBox

Takes the extents of the bounding box of the colors in the 4x4 block in RGB space.
Also finds the minimum and maximum alpha values.

params:	colorBlock	- 4*4 input tile, 4 bytes per pixel
paramO:	minColor	- 4 byte Min color found
paramO:	maxColor	- 4 byte Max color found
========================
*/
ID_INLINE void idDxtEncoder::GetMinMaxBBox( const byte* colorBlock, byte* minColor, byte* maxColor ) const
{
	// start with an empty ( inverted ) box on all four channels
	for( int channel = 0; channel < 4; channel++ )
	{
		minColor[channel] = 255;
		maxColor[channel] = 0;
	}

	// grow the box with each of the 16 texels
	for( int texel = 0; texel < 16; texel++ )
	{
		const byte* pixel = colorBlock + texel * 4;
		for( int channel = 0; channel < 4; channel++ )
		{
			const byte value = pixel[channel];
			if( value < minColor[channel] )
			{
				minColor[channel] = value;
			}
			if( value > maxColor[channel] )
			{
				maxColor[channel] = value;
			}
		}
	}
}

/*
========================
idDxtEncoder::InsetColorsBBox
========================
*/
ID_INLINE void idDxtEncoder::InsetColorsBBox( byte* minColor, byte* maxColor ) const
{
	// Pull both end points towards each other by ( range >> shift ) on every channel.
	// Channels 0..2 use the color shift, channel 3 the alpha shift.
	for( int channel = 0; channel < 4; channel++ )
	{
		const int shift = ( channel == 3 ) ? INSET_ALPHA_SHIFT : INSET_COLOR_SHIFT;
		const byte inset = ( maxColor[channel] - minColor[channel] ) >> shift;

		// raise the minimum, saturating at 255
		const int raised = minColor[channel] + inset;
		minColor[channel] = ( raised <= 255 ) ? raised : 255;

		// lower the maximum, saturating at 0
		if( maxColor[channel] >= inset )
		{
			maxColor[channel] = maxColor[channel] - inset;
		}
		else
		{
			maxColor[channel] = 0;
		}
	}
}

/*
========================
idDxtEncoder::SelectColorsDiagonal
========================
*/
void idDxtEncoder::SelectColorsDiagonal( const byte* colorBlock, byte* minColor, byte* maxColor ) const
{

	byte mid0 = byte( ( ( int ) minColor[0] + maxColor[0] + 1 ) >> 1 );
	byte mid1 = byte( ( ( int ) minColor[1] + maxColor[1] + 1 ) >> 1 );
	byte mid2 = byte( ( ( int ) minColor[2] + maxColor[2] + 1 ) >> 1 );

#if 0

	// using the covariance is the best way to select the diagonal
	int side0 = 0;
	int side1 = 0;
	for( int i = 0; i < 16; i++ )
	{
		int b0 = colorBlock[i * 4 + 0] - mid0;
		int b1 = colorBlock[i * 4 + 1] - mid1;
		int b2 = colorBlock[i * 4 + 2] - mid2;
		side0 += ( b0 * b1 );
		side1 += ( b1 * b2 );
	}
	byte mask0 = -( side0 < 0 );
	byte mask1 = -( side1 < 0 );

#else

	// calculating the covariance of just the sign bits is much faster and gives almost the same result
	int side0 = 0;
	int side1 = 0;
	for( int i = 0; i < 16; i++ )
	{
		byte b0 = colorBlock[i * 4 + 0] >= mid0;
		byte b1 = colorBlock[i * 4 + 1] >= mid1;
		byte b2 = colorBlock[i * 4 + 2] >= mid2;
		side0 += ( b0 ^ b1 );
		side1 += ( b1 ^ b2 );
	}
	byte mask0 = -( side0 > 8 );
	byte mask1 = -( side1 > 8 );

#endif

	byte c0 = minColor[0];
	byte c1 = maxColor[0];
	byte c2 = minColor[2];
	byte c3 = maxColor[2];

	c0 ^= c1;
	mask0 &= c0;
	c1 ^= mask0;
	c0 ^= c1;

	c2 ^= c3;
	mask1 &= c2;
	c3 ^= mask1;
	c2 ^= c3;

	minColor[0] = c0;
	maxColor[0] = c1;
	minColor[2] = c2;
	maxColor[2] = c3;

	if( ColorTo565( minColor ) > ColorTo565( maxColor ) )
	{
		SwapValues( minColor[0], maxColor[0] );
		SwapValues( minColor[1], maxColor[1] );
		SwapValues( minColor[2], maxColor[2] );
	}
}

/*
========================
idDxtEncoder::EmitColorIndices

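params:	colorBlock	- 16 pixel block for which to find the color indices
params:	minColor	- min color of the block (input, not modified)
params:	maxColor	- max color of the block (input, not modified)
emits:	4 byte color index block (2 bits per pixel); nothing is returned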
========================
*/
void idDxtEncoder::EmitColorIndices( const byte* colorBlock, const byte* minColor, const byte* maxColor )
{
#if 1

	ALIGN16( uint16 colors[4][4] );
	unsigned int result = 0;

	colors[0][0] = ( maxColor[0] & C565_5_MASK ) | ( maxColor[0] >> 5 );
	colors[0][1] = ( maxColor[1] & C565_6_MASK ) | ( maxColor[1] >> 6 );
	colors[0][2] = ( maxColor[2] & C565_5_MASK ) | ( maxColor[2] >> 5 );
	colors[0][3] = 0;
	colors[1][0] = ( minColor[0] & C565_5_MASK ) | ( minColor[0] >> 5 );
	colors[1][1] = ( minColor[1] & C565_6_MASK ) | ( minColor[1] >> 6 );
	colors[1][2] = ( minColor[2] & C565_5_MASK ) | ( minColor[2] >> 5 );
	colors[1][3] = 0;
	colors[2][0] = ( 2 * colors[0][0] + 1 * colors[1][0] ) / 3;
	colors[2][1] = ( 2 * colors[0][1] + 1 * colors[1][1] ) / 3;
	colors[2][2] = ( 2 * colors[0][2] + 1 * colors[1][2] ) / 3;
	colors[2][3] = 0;
	colors[3][0] = ( 1 * colors[0][0] + 2 * colors[1][0] ) / 3;
	colors[3][1] = ( 1 * colors[0][1] + 2 * colors[1][1] ) / 3;
	colors[3][2] = ( 1 * colors[0][2] + 2 * colors[1][2] ) / 3;
	colors[3][3] = 0;

	// uses sum of absolute differences instead of squared distance to find the best match
	for( int i = 15; i >= 0; i-- )
	{
		int c0, c1, c2, c3, m, d0, d1, d2, d3;

		c0 = colorBlock[i * 4 + 0];
		c1 = colorBlock[i * 4 + 1];
		c2 = colorBlock[i * 4 + 2];
		c3 = colorBlock[i * 4 + 3];

		m = colors[0][0] - c0;
		d0 = abs( m );
		m = colors[1][0] - c0;
		d1 = abs( m );
		m = colors[2][0] - c0;
		d2 = abs( m );
		m = colors[3][0] - c0;
		d3 = abs( m );

		m = colors[0][1] - c1;
		d0 += abs( m );
		m = colors[1][1] - c1;
		d1 += abs( m );
		m = colors[2][1] - c1;
		d2 += abs( m );
		m = colors[3][1] - c1;
		d3 += abs( m );

		m = colors[0][2] - c2;
		d0 += abs( m );
		m = colors[1][2] - c2;
		d1 += abs( m );
		m = colors[2][2] - c2;
		d2 += abs( m );
		m = colors[3][2] - c2;
		d3 += abs( m );

#if 0
		int b0 = d0 > d2;
		int b1 = d1 > d3;
		int b2 = d0 > d3;
		int b3 = d1 > d2;
		int b4 = d0 > d1;
		int b5 = d2 > d3;

		result |= ( ( !b3 & b4 ) | ( b2 & b5 ) | ( ( ( b0 & b3 ) | ( b1 & b2 ) ) << 1 ) ) << ( i << 1 );
#else
		bool b0 = d0 > d3;
		bool b1 = d1 > d2;
		bool b2 = d0 > d2;
		bool b3 = d1 > d3;
		bool b4 = d2 > d3;

		int x0 = b1 & b2;
		int x1 = b0 & b3;
		int x2 = b0 & b4;

		result |= ( x2 | ( ( x0 | x1 ) << 1 ) ) << ( i << 1 );
#endif
	}

	EmitUInt( result );

#elif 1

	byte colors[4][4];
	unsigned int indexes[16];

	colors[0][0] = ( maxColor[0] & C565_5_MASK ) | ( maxColor[0] >> 6 );
	colors[0][1] = ( maxColor[1] & C565_6_MASK ) | ( maxColor[1] >> 5 );
	colors[0][2] = ( maxColor[2] & C565_5_MASK ) | ( maxColor[2] >> 6 );
	colors[0][3] = 0;
	colors[1][0] = ( minColor[0] & C565_5_MASK ) | ( minColor[0] >> 6 );
	colors[1][1] = ( minColor[1] & C565_6_MASK ) | ( minColor[1] >> 5 );
	colors[1][2] = ( minColor[2] & C565_5_MASK ) | ( minColor[2] >> 6 );
	colors[1][3] = 0;
	colors[2][0] = ( 2 * colors[0][0] + 1 * colors[1][0] ) / 3;
	colors[2][1] = ( 2 * colors[0][1] + 1 * colors[1][1] ) / 3;
	colors[2][2] = ( 2 * colors[0][2] + 1 * colors[1][2] ) / 3;
	colors[2][3] = 0;
	colors[3][0] = ( 1 * colors[0][0] + 2 * colors[1][0] ) / 3;
	colors[3][1] = ( 1 * colors[0][1] + 2 * colors[1][1] ) / 3;
	colors[3][2] = ( 1 * colors[0][2] + 2 * colors[1][2] ) / 3;
	colors[3][3] = 0;

	for( int i = 0; i < 16; i++ )
	{
		int c0, c1, c2, m, d, minDist;

		c0 = colorBlock[i * 4 + 0];
		c1 = colorBlock[i * 4 + 1];
		c2 = colorBlock[i * 4 + 2];

		m = colors[0][0] - c0;
		d = m * m;
		m = colors[0][1] - c1;
		d += m * m;
		m = colors[0][2] - c2;
		d += m * m;

		minDist = d;
		indexes[i] = 0;

		m = colors[1][0] - c0;
		d = m * m;
		m = colors[1][1] - c1;
		d += m * m;
		m = colors[1][2] - c2;
		d += m * m;

		if( d < minDist )
		{
			minDist = d;
			indexes[i] = 1;
		}

		m = colors[2][0] - c0;
		d = m * m;
		m = colors[2][1] - c1;
		d += m * m;
		m = colors[2][2] - c2;
		d += m * m;

		if( d < minDist )
		{
			minDist = d;
			indexes[i] = 2;
		}

		m = colors[3][0] - c0;
		d = m * m;
		m = colors[3][1] - c1;
		d += m * m;
		m = colors[3][2] - c2;
		d += m * m;

		if( d < minDist )
		{
			minDist = d;
			indexes[i] = 3;
		}
	}

	unsigned int result = 0;
	for( int i = 0; i < 16; i++ )
	{
		result |= ( indexes[i] << ( unsigned int )( i << 1 ) );
	}

	EmitUInt( result );

#else

	byte colors[4][4];
	unsigned int indexes[16];

	colors[0][0] = ( maxColor[0] & C565_5_MASK ) | ( maxColor[0] >> 6 );
	colors[0][1] = ( maxColor[1] & C565_6_MASK ) | ( maxColor[1] >> 5 );
	colors[0][2] = ( maxColor[2] & C565_5_MASK ) | ( maxColor[2] >> 6 );
	colors[0][3] = 0;
	colors[1][0] = ( minColor[0] & C565_5_MASK ) | ( minColor[0] >> 6 );
	colors[1][1] = ( minColor[1] & C565_6_MASK ) | ( minColor[1] >> 5 );
	colors[1][2] = ( minColor[2] & C565_5_MASK ) | ( minColor[2] >> 6 );
	colors[1][3] = 0;
	colors[2][0] = ( 2 * colors[0][0] + 1 * colors[1][0] ) / 3;
	colors[2][1] = ( 2 * colors[0][1] + 1 * colors[1][1] ) / 3;
	colors[2][2] = ( 2 * colors[0][2] + 1 * colors[1][2] ) / 3;
	colors[2][3] = 0;
	colors[3][0] = ( 1 * colors[0][0] + 2 * colors[1][0] ) / 3;
	colors[3][1] = ( 1 * colors[0][1] + 2 * colors[1][1] ) / 3;
	colors[3][2] = ( 1 * colors[0][2] + 2 * colors[1][2] ) / 3;
	colors[3][3] = 0;

	for( int i = 0; i < 16; i++ )
	{
		unsigned int minDist = ( 255 * 255 ) * 4;
		for( int j = 0; j < 4; j++ )
		{
			unsigned int dist = ColorDistance( &colorBlock[i * 4], &colors[j][0] );
			if( dist < minDist )
			{
				minDist = dist;
				indexes[i] = j;
			}
		}
	}

	unsigned int result = 0;
	for( int i = 0; i < 16; i++ )
	{
		result |= ( indexes[i] << ( unsigned int )( i << 1 ) );
	}

	EmitUInt( result );

#endif
}

/*
========================
idDxtEncoder::EmitColorAlphaIndices

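params:	colorBlock	- 16 pixel block for which to find the color indices
params:	minColor	- min color of the block (input, not modified)
params:	maxColor	- max color of the block (input, not modified)
emits:	4 byte color index block (2 bits per pixel); nothing is returned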
========================
*/
void idDxtEncoder::EmitColorAlphaIndices( const byte* colorBlock, const byte* minColor, const byte* maxColor )
{
	ALIGN16( uint16 colors[4][4] );
	unsigned int result = 0;

	colors[0][0] = ( minColor[0] & C565_5_MASK ) | ( minColor[0] >> 5 );
	colors[0][1] = ( minColor[1] & C565_6_MASK ) | ( minColor[1] >> 6 );
	colors[0][2] = ( minColor[2] & C565_5_MASK ) | ( minColor[2] >> 5 );
	colors[0][3] = 255;
	colors[1][0] = ( maxColor[0] & C565_5_MASK ) | ( maxColor[0] >> 5 );
	colors[1][1] = ( maxColor[1] & C565_6_MASK ) | ( maxColor[1] >> 6 );
	colors[1][2] = ( maxColor[2] & C565_5_MASK ) | ( maxColor[2] >> 5 );
	colors[1][3] = 255;
	colors[2][0] = ( colors[0][0] + colors[1][0] ) / 2;
	colors[2][1] = ( colors[0][1] + colors[1][1] ) / 2;
	colors[2][2] = ( colors[0][2] + colors[1][2] ) / 2;
	colors[2][3] = 255;
	colors[3][0] = 0;
	colors[3][1] = 0;
	colors[3][2] = 0;
	colors[3][3] = 0;

	// uses sum of absolute differences instead of squared distance to find the best match
	for( int i = 15; i >= 0; i-- )
	{
		int c0, c1, c2, c3, m, d0, d1, d2;

		c0 = colorBlock[i * 4 + 0];
		c1 = colorBlock[i * 4 + 1];
		c2 = colorBlock[i * 4 + 2];
		c3 = colorBlock[i * 4 + 3];

		m = colors[0][0] - c0;
		d0 = abs( m );
		m = colors[1][0] - c0;
		d1 = abs( m );
		m = colors[2][0] - c0;
		d2 = abs( m );

		m = colors[0][1] - c1;
		d0 += abs( m );
		m = colors[1][1] - c1;
		d1 += abs( m );
		m = colors[2][1] - c1;
		d2 += abs( m );

		m = colors[0][2] - c2;
		d0 += abs( m );
		m = colors[1][2] - c2;
		d1 += abs( m );
		m = colors[2][2] - c2;
		d2 += abs( m );

		unsigned int b0 = d2 > d0;
		unsigned int b1 = d2 > d1;
		unsigned int b2 = d1 > d0;
		unsigned int b3 = c3 < 128;

		// DG: add some parenthesis to appease (often rightly) warning compiler
		result |= ( ( ( ( b0 & b1 ) | b3 ) << 1 ) | ( ( b2 ^ b1 ) | b3 ) ) << ( i << 1 );
		// DG end
	}

	EmitUInt( result );
}

/*
========================
idDxtEncoder::EmitCTX1Indices

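params:	colorBlock	- 16 pixel block for which to find the color indices
params:	minColor	- min color of the block (input, only channels 0 and 1 are used)
params:	maxColor	- max color of the block (input, only channels 0 and 1 are used)
emits:	4 byte color index block (2 bits per pixel); nothing is returned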
========================
*/
void idDxtEncoder::EmitCTX1Indices( const byte* colorBlock, const byte* minColor, const byte* maxColor )
{
	ALIGN16( uint16 colors[4][2] );
	unsigned int result = 0;

	colors[0][0] = maxColor[0];
	colors[0][1] = maxColor[1];
	colors[1][0] = minColor[0];
	colors[1][1] = minColor[1];

	colors[2][0] = ( 2 * colors[0][0] + 1 * colors[1][0] ) / 3;
	colors[2][1] = ( 2 * colors[0][1] + 1 * colors[1][1] ) / 3;
	colors[3][0] = ( 1 * colors[0][0] + 2 * colors[1][0] ) / 3;
	colors[3][1] = ( 1 * colors[0][1] + 2 * colors[1][1] ) / 3;

	for( int i = 15; i >= 0; i-- )
	{
		int c0, c1, m, d0, d1, d2, d3;

		c0 = colorBlock[i * 4 + 0];
		c1 = colorBlock[i * 4 + 1];

		m = colors[0][0] - c0;
		d0 = abs( m );
		m = colors[1][0] - c0;
		d1 = abs( m );
		m = colors[2][0] - c0;
		d2 = abs( m );
		m = colors[3][0] - c0;
		d3 = abs( m );

		m = colors[0][1] - c1;
		d0 += abs( m );
		m = colors[1][1] - c1;
		d1 += abs( m );
		m = colors[2][1] - c1;
		d2 += abs( m );
		m = colors[3][1] - c1;
		d3 += abs( m );

		bool b0 = d0 > d3;
		bool b1 = d1 > d2;
		bool b2 = d0 > d2;
		bool b3 = d1 > d3;
		bool b4 = d2 > d3;

		int x0 = b1 & b2;
		int x1 = b0 & b3;
		int x2 = b0 & b4;

		result |= ( x2 | ( ( x0 | x1 ) << 1 ) ) << ( i << 1 );
	}

	EmitUInt( result );
}

/*
========================
idDxtEncoder::EmitAlphaIndices

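params:	colorBlock	- 16 pixel block for which to find the alpha indices
params:	offset		- byte offset of the channel to encode within each 4 byte pixel
params:	minAlpha	- min alpha of the block (input, must not exceed maxAlpha)
params:	maxAlpha	- max alpha of the block (input)
emits:	6 bytes holding sixteen 3 bit indices; nothing is returned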
========================
*/
void idDxtEncoder::EmitAlphaIndices( const byte* colorBlock, const int offset, const byte minAlpha, const byte maxAlpha )
{

	assert( maxAlpha >= minAlpha );

	const int ALPHA_RANGE = 7;

#if 1

	byte ab1, ab2, ab3, ab4, ab5, ab6, ab7;
	ALIGN16( byte indexes[16] );

	ab1 = ( 13 * maxAlpha +  1 * minAlpha + ALPHA_RANGE ) / ( ALPHA_RANGE * 2 );
	ab2 = ( 11 * maxAlpha +  3 * minAlpha + ALPHA_RANGE ) / ( ALPHA_RANGE * 2 );
	ab3 = ( 9 * maxAlpha +  5 * minAlpha + ALPHA_RANGE ) / ( ALPHA_RANGE * 2 );
	ab4 = ( 7 * maxAlpha +  7 * minAlpha + ALPHA_RANGE ) / ( ALPHA_RANGE * 2 );
	ab5 = ( 5 * maxAlpha +  9 * minAlpha + ALPHA_RANGE ) / ( ALPHA_RANGE * 2 );
	ab6 = ( 3 * maxAlpha + 11 * minAlpha + ALPHA_RANGE ) / ( ALPHA_RANGE * 2 );
	ab7 = ( 1 * maxAlpha + 13 * minAlpha + ALPHA_RANGE ) / ( ALPHA_RANGE * 2 );

	colorBlock += offset;

	for( int i = 0; i < 16; i++ )
	{
		byte a = colorBlock[i * 4];
		int b1 = ( a >= ab1 );
		int b2 = ( a >= ab2 );
		int b3 = ( a >= ab3 );
		int b4 = ( a >= ab4 );
		int b5 = ( a >= ab5 );
		int b6 = ( a >= ab6 );
		int b7 = ( a >= ab7 );
		int index = ( 8 - b1 - b2 - b3 - b4 - b5 - b6 - b7 ) & 7;
		indexes[i] = byte( index ^ ( 2 > index ) );
	}

	EmitByte( ( indexes[ 0] >> 0 ) | ( indexes[ 1] << 3 ) | ( indexes[ 2] << 6 ) );
	EmitByte( ( indexes[ 2] >> 2 ) | ( indexes[ 3] << 1 ) | ( indexes[ 4] << 4 ) | ( indexes[ 5] << 7 ) );
	EmitByte( ( indexes[ 5] >> 1 ) | ( indexes[ 6] << 2 ) | ( indexes[ 7] << 5 ) );

	EmitByte( ( indexes[ 8] >> 0 ) | ( indexes[ 9] << 3 ) | ( indexes[10] << 6 ) );
	EmitByte( ( indexes[10] >> 2 ) | ( indexes[11] << 1 ) | ( indexes[12] << 4 ) | ( indexes[13] << 7 ) );
	EmitByte( ( indexes[13] >> 1 ) | ( indexes[14] << 2 ) | ( indexes[15] << 5 ) );

#elif 0

	ALIGN16( byte indexes[16] );
	byte delta = maxAlpha - minAlpha;
	byte half = delta >> 1;
	byte bias = delta / ( 2 * ALPHA_RANGE );
	byte bottom = minAlpha + bias;
	byte top = maxAlpha - bias;

	colorBlock += offset;

	for( int i = 0; i < 16; i++ )
	{
		byte a = colorBlock[i * 4];
		if( a <= bottom )
		{
			indexes[i] = 1;
		}
		else if( a >= top )
		{
			indexes[i] = 0;
		}
		else
		{
			indexes[i] = ( ALPHA_RANGE + 1 ) + ( ( minAlpha - a ) * ALPHA_RANGE - half ) / delta;
		}
	}

	EmitByte( ( indexes[ 0] >> 0 ) | ( indexes[ 1] << 3 ) | ( indexes[ 2] << 6 ) );
	EmitByte( ( indexes[ 2] >> 2 ) | ( indexes[ 3] << 1 ) | ( indexes[ 4] << 4 ) | ( indexes[ 5] << 7 ) );
	EmitByte( ( indexes[ 5] >> 1 ) | ( indexes[ 6] << 2 ) | ( indexes[ 7] << 5 ) );

	EmitByte( ( indexes[ 8] >> 0 ) | ( indexes[ 9] << 3 ) | ( indexes[10] << 6 ) );
	EmitByte( ( indexes[10] >> 2 ) | ( indexes[11] << 1 ) | ( indexes[12] << 4 ) | ( indexes[13] << 7 ) );
	EmitByte( ( indexes[13] >> 1 ) | ( indexes[14] << 2 ) | ( indexes[15] << 5 ) );

#elif 0

	ALIGN16( byte indexes[16] );
	byte delta = maxAlpha - minAlpha;
	byte half = delta >> 1;
	byte bias = delta / ( 2 * ALPHA_RANGE );
	byte bottom = minAlpha + bias;
	byte top = maxAlpha - bias;

	colorBlock += offset;

	for( int i = 0; i < 16; i++ )
	{
		byte a = colorBlock[i * 4];
		int index = ( ALPHA_RANGE + 1 ) + ( ( minAlpha - a ) * ALPHA_RANGE - half ) / delta;
		int c0 = a > bottom;
		int c1 = a < top;
		indexes[i] = ( index & -( c0 & c1 ) ) | ( c0 ^ 1 );
	}

	EmitByte( ( indexes[ 0] >> 0 ) | ( indexes[ 1] << 3 ) | ( indexes[ 2] << 6 ) );
	EmitByte( ( indexes[ 2] >> 2 ) | ( indexes[ 3] << 1 ) | ( indexes[ 4] << 4 ) | ( indexes[ 5] << 7 ) );
	EmitByte( ( indexes[ 5] >> 1 ) | ( indexes[ 6] << 2 ) | ( indexes[ 7] << 5 ) );

	EmitByte( ( indexes[ 8] >> 0 ) | ( indexes[ 9] << 3 ) | ( indexes[10] << 6 ) );
	EmitByte( ( indexes[10] >> 2 ) | ( indexes[11] << 1 ) | ( indexes[12] << 4 ) | ( indexes[13] << 7 ) );
	EmitByte( ( indexes[13] >> 1 ) | ( indexes[14] << 2 ) | ( indexes[15] << 5 ) );

#else

	ALIGN16( byte indexes[16] );
	ALIGN16( byte alphas[8] );

	alphas[0] = maxAlpha;
	alphas[1] = minAlpha;
	alphas[2] = ( 6 * maxAlpha + 1 * minAlpha ) / ALPHA_RANGE;
	alphas[3] = ( 5 * maxAlpha + 2 * minAlpha ) / ALPHA_RANGE;
	alphas[4] = ( 4 * maxAlpha + 3 * minAlpha ) / ALPHA_RANGE;
	alphas[5] = ( 3 * maxAlpha + 4 * minAlpha ) / ALPHA_RANGE;
	alphas[6] = ( 2 * maxAlpha + 5 * minAlpha ) / ALPHA_RANGE;
	alphas[7] = ( 1 * maxAlpha + 6 * minAlpha ) / ALPHA_RANGE;

	colorBlock += offset;

	for( int i = 0; i < 16; i++ )
	{
		int minDist = INT_MAX;
		byte a = colorBlock[i * 4];
		for( int j = 0; j < 8; j++ )
		{
			int dist = abs( a - alphas[j] );
			if( dist < minDist )
			{
				minDist = dist;
				indexes[i] = j;
			}
		}
	}

	EmitByte( ( indexes[ 0] >> 0 ) | ( indexes[ 1] << 3 ) | ( indexes[ 2] << 6 ) );
	EmitByte( ( indexes[ 2] >> 2 ) | ( indexes[ 3] << 1 ) | ( indexes[ 4] << 4 ) | ( indexes[ 5] << 7 ) );
	EmitByte( ( indexes[ 5] >> 1 ) | ( indexes[ 6] << 2 ) | ( indexes[ 7] << 5 ) );

	EmitByte( ( indexes[ 8] >> 0 ) | ( indexes[ 9] << 3 ) | ( indexes[10] << 6 ) );
	EmitByte( ( indexes[10] >> 2 ) | ( indexes[11] << 1 ) | ( indexes[12] << 4 ) | ( indexes[13] << 7 ) );
	EmitByte( ( indexes[13] >> 1 ) | ( indexes[14] << 2 ) | ( indexes[15] << 5 ) );

#endif
}

/*
========================
idDxtEncoder::CompressImageDXT1Fast_Generic

params:	inBuf		- image to compress
paramO:	outBuf		- result of compression
params:	width		- width of image
params:	height		- height of image
========================
*/
void idDxtEncoder::CompressImageDXT1Fast_Generic( const byte* inBuf, byte* outBuf, int width, int height )
{
	ALIGN16( byte block[64] );
	ALIGN16( byte minColor[4] );
	ALIGN16( byte maxColor[4] );

	assert( width >= 4 && ( width & 3 ) == 0 );
	assert( height >= 4 && ( height & 3 ) == 0 );

	this->width = width;
	this->height = height;
	this->outData = outBuf;

	for( int j = 0; j < height; j += 4, inBuf += width * 4 * 4 )
	{
		for( int i = 0; i < width; i += 4 )
		{
			common->LoadPacifierBinarizeProgressIncrement( 16 );

			ExtractBlock( inBuf + i * 4, width, block );

			GetMinMaxBBox( block, minColor, maxColor );
			//SelectColorsDiagonal( block, minColor, maxColor );
			InsetColorsBBox( minColor, maxColor );

			EmitUShort( ColorTo565( maxColor ) );
			EmitUShort( ColorTo565( minColor ) );

			EmitColorIndices( block, minColor, maxColor );
		}
		outData += dstPadding;
		inBuf += srcPadding;
	}
}

/*
========================
idDxtEncoder::CompressImageDXT1AlphaFast_Generic

params:	inBuf		- image to compress
paramO:	outBuf		- result of compression
params:	width		- width of image
params:	height		- height of image
========================
*/
void idDxtEncoder::CompressImageDXT1AlphaFast_Generic( const byte* inBuf, byte* outBuf, int width, int height )
{
	ALIGN16( byte block[64] );
	ALIGN16( byte minColor[4] );
	ALIGN16( byte maxColor[4] );

	assert( width >= 4 && ( width & 3 ) == 0 );
	assert( height >= 4 && ( height & 3 ) == 0 );

	this->width = width;
	this->height = height;
	this->outData = outBuf;

	for( int j = 0; j < height; j += 4, inBuf += width * 4 * 4 )
	{
		for( int i = 0; i < width; i += 4 )
		{
			common->LoadPacifierBinarizeProgressIncrement( 16 );

			ExtractBlock( inBuf + i * 4, width, block );

			GetMinMaxBBox( block, minColor, maxColor );
			byte minAlpha = minColor[3];
			//SelectColorsDiagonal( block, minColor, maxColor );
			InsetColorsBBox( minColor, maxColor );

			if( minAlpha >= 128 )
			{
				EmitUShort( ColorTo565( maxColor ) );
				EmitUShort( ColorTo565( minColor ) );
				EmitColorIndices( block, minColor, maxColor );
			}
			else
			{
				EmitUShort( ColorTo565( minColor ) );
				EmitUShort( ColorTo565( maxColor ) );
				EmitColorAlphaIndices( block, minColor, maxColor );
			}
		}
		outData += dstPadding;
		inBuf += srcPadding;
	}
}

/*
========================
idDxtEncoder::CompressImageDXT5Fast_Generic

params:	inBuf		- image to compress
paramO:	outBuf		- result of compression
params:	width		- width of image
params:	height		- height of image
========================
*/
void idDxtEncoder::CompressImageDXT5Fast_Generic( const byte* inBuf, byte* outBuf, int width, int height )
{
	ALIGN16( byte block[64] );
	ALIGN16( byte minColor[4] );
	ALIGN16( byte maxColor[4] );

	assert( width >= 4 && ( width & 3 ) == 0 );
	assert( height >= 4 && ( height & 3 ) == 0 );

	this->width = width;
	this->height = height;
	this->outData = outBuf;

	for( int j = 0; j < height; j += 4, inBuf += width * 4 * 4 )
	{
		for( int i = 0; i < width; i += 4 )
		{
			common->LoadPacifierBinarizeProgressIncrement( 16 );

			ExtractBlock( inBuf + i * 4, width, block );

			GetMinMaxBBox( block, minColor, maxColor );
			//SelectColorsDiagonal( block, minColor, maxColor );
			InsetColorsBBox( minColor, maxColor );

			EmitByte( maxColor[3] );
			EmitByte( minColor[3] );

			EmitAlphaIndices( block, 3, minColor[3], maxColor[3] );

#ifdef NVIDIA_7X_HARDWARE_BUG_FIX
			// the colors are already always guaranteed to be sorted properly
#endif

			EmitUShort( ColorTo565( maxColor ) );
			EmitUShort( ColorTo565( minColor ) );

			EmitColorIndices( block, minColor, maxColor );
		}
		outData += dstPadding;
		inBuf += srcPadding;
	}
}

/*
========================
idDxtEncoder::ScaleYCoCg
========================
*/
void idDxtEncoder::ScaleYCoCg( byte* colorBlock, byte* minColor, byte* maxColor ) const
{
	// Stretches channels 0 and 1 around the neutral value 128 by a factor of 1, 2 or 4,
	// depending on how far the bounding box reaches from 128, and records the factor
	// in channel 2 of both corners as ( scale - 1 ) << 3.

	// largest distance from 128 over channels 0 and 1 of both corners
	const int reach[4] =
	{
		abs( minColor[0] - 128 ),
		abs( minColor[1] - 128 ),
		abs( maxColor[0] - 128 ),
		abs( maxColor[1] - 128 )
	};

	int extent = reach[0];
	for( int n = 1; n < 4; n++ )
	{
		if( reach[n] > extent )
		{
			extent = reach[n];
		}
	}

	// the values still fit after doubling / quadrupling when the extent stays below these limits
	const int limit2x = 128 / 2 - 1;
	const int limit4x = 128 / 4 - 1;

	int scale = 1;
	if( extent <= limit2x )
	{
		scale = 2;
	}
	if( extent <= limit4x )
	{
		scale = 4;
	}

	const byte scaleCode = byte( ( scale - 1 ) << 3 );

	for( int ch = 0; ch < 2; ch++ )
	{
		minColor[ch] = byte( ( minColor[ch] - 128 ) * scale + 128 );
		maxColor[ch] = byte( ( maxColor[ch] - 128 ) * scale + 128 );
	}
	minColor[2] = scaleCode;
	maxColor[2] = scaleCode;

	// apply the same stretch to every pixel of the block
	for( int texel = 0; texel < 16; texel++ )
	{
		byte* px = colorBlock + texel * 4;
		px[0] = byte( ( px[0] - 128 ) * scale + 128 );
		px[1] = byte( ( px[1] - 128 ) * scale + 128 );
	}
}

/*
========================
idDxtEncoder::InsetYCoCgBBox
========================
*/
ID_INLINE void idDxtEncoder::InsetYCoCgBBox( byte* minColor, byte* maxColor ) const
{
	// Insets the bounding box on channels 0, 1 and 3 using fixed point math with
	// 'shift' fractional bits, then rewrites channels 0 and 1 as 5- and 6-bit values
	// expanded back to 8 bits. Channel 2 is left untouched.
	const int channels[3] = { 0, 1, 3 };
	const int shifts[3] = { INSET_COLOR_SHIFT, INSET_COLOR_SHIFT, INSET_ALPHA_SHIFT };

	int mini[4];
	int maxi[4];

	for( int n = 0; n < 3; n++ )
	{
		const int ch = channels[n];
		const int shift = shifts[n];

		// range minus a bias of 2^( shift - 1 ) - 1; this goes negative for very narrow ranges
		const int inset = ( maxColor[ch] - minColor[ch] ) - ( ( 1 << ( shift - 1 ) ) - 1 );

		mini[ch] = ( ( minColor[ch] << shift ) + inset ) >> shift;
		maxi[ch] = ( ( maxColor[ch] << shift ) - inset ) >> shift;

		// a negative inset can push the corners outside of the byte range
		if( mini[ch] < 0 )
		{
			mini[ch] = 0;
		}
		if( maxi[ch] > 255 )
		{
			maxi[ch] = 255;
		}
	}

	// keep the bits selected by the 5/6-bit masks and copy the top bits into the cleared low bits
	minColor[0] = byte( ( mini[0] & C565_5_MASK ) | ( mini[0] >> 5 ) );
	minColor[1] = byte( ( mini[1] & C565_6_MASK ) | ( mini[1] >> 6 ) );
	minColor[3] = byte( mini[3] );

	maxColor[0] = byte( ( maxi[0] & C565_5_MASK ) | ( maxi[0] >> 5 ) );
	maxColor[1] = byte( ( maxi[1] & C565_6_MASK ) | ( maxi[1] >> 6 ) );
	maxColor[3] = byte( maxi[3] );
}

/*
========================
idDxtEncoder::InsetYCoCgAlpaBBox
========================
*/
ID_INLINE void idDxtEncoder::InsetYCoCgAlpaBBox( byte* minColor, byte* maxColor ) const
{
	// Insets the bounding box on all four channels using fixed point math with 'shift'
	// fractional bits, then rewrites channels 0..2 as 5/6/5-bit values expanded back to
	// 8 bits. Channel 3 is stored as is.
	const int shifts[4] = { INSET_COLOR_SHIFT, INSET_COLOR_SHIFT, INSET_COLOR_SHIFT, INSET_ALPHA_SHIFT };

	int lower[4];
	int upper[4];

	for( int ch = 0; ch < 4; ch++ )
	{
		const int shift = shifts[ch];

		// range minus a bias of 2^( shift - 1 ) - 1; this goes negative for very narrow ranges
		const int inset = ( maxColor[ch] - minColor[ch] ) - ( ( 1 << ( shift - 1 ) ) - 1 );

		lower[ch] = ( ( minColor[ch] << shift ) + inset ) >> shift;
		upper[ch] = ( ( maxColor[ch] << shift ) - inset ) >> shift;

		// a negative inset can push the corners outside of the byte range
		if( lower[ch] < 0 )
		{
			lower[ch] = 0;
		}
		if( upper[ch] > 255 )
		{
			upper[ch] = 255;
		}
	}

	// keep the bits selected by the 5/6-bit masks and copy the top bits into the cleared low bits
	minColor[0] = byte( ( lower[0] & C565_5_MASK ) | ( lower[0] >> 5 ) );
	maxColor[0] = byte( ( upper[0] & C565_5_MASK ) | ( upper[0] >> 5 ) );

	minColor[1] = byte( ( lower[1] & C565_6_MASK ) | ( lower[1] >> 6 ) );
	maxColor[1] = byte( ( upper[1] & C565_6_MASK ) | ( upper[1] >> 6 ) );

	minColor[2] = byte( ( lower[2] & C565_5_MASK ) | ( lower[2] >> 5 ) );
	maxColor[2] = byte( ( upper[2] & C565_5_MASK ) | ( upper[2] >> 5 ) );

	minColor[3] = byte( lower[3] );
	maxColor[3] = byte( upper[3] );
}

/*
========================
idDxtEncoder::SelectYCoCgDiagonal
========================
*/
void idDxtEncoder::SelectYCoCgDiagonal( const byte* colorBlock, byte* minColor, byte* maxColor ) const
{
	byte side = 0;

	byte mid0 = byte( ( ( int ) minColor[0] + maxColor[0] + 1 ) >> 1 );
	byte mid1 = byte( ( ( int ) minColor[1] + maxColor[1] + 1 ) >> 1 );

	for( int i = 0; i < 16; i++ )
	{
		byte b0 = colorBlock[i * 4 + 0] >= mid0;
		byte b1 = colorBlock[i * 4 + 1] >= mid1;
		side += ( b0 ^ b1 );
	}

	byte mask = -( side > 8 );

#if defined NVIDIA_7X_HARDWARE_BUG_FIX
	mask &= -( minColor[0] != maxColor[0] );
#endif

	byte c0 = minColor[1];
	byte c1 = maxColor[1];

	c0 ^= c1;
	mask &= c0;
	c1 ^= mask;
	c0 ^= c1;

	minColor[1] = c0;
	maxColor[1] = c1;
}

/*
========================
idDxtEncoder::CompressYCoCgDXT5Fast_Generic

params:	inBuf		- image to compress
paramO:	outBuf		- result of compression
params:	width		- width of image
params:	height		- height of image
========================
*/
void idDxtEncoder::CompressYCoCgDXT5Fast_Generic( const byte* inBuf, byte* outBuf, int width, int height )
{
	ALIGN16( byte block[64] );
	ALIGN16( byte minColor[4] );
	ALIGN16( byte maxColor[4] );

	//assert( HasConstantValuePer4x4Block( inBuf, width, height, 2 ) );

	assert( width >= 4 && ( width & 3 ) == 0 );
	assert( height >= 4 && ( height & 3 ) == 0 );

	this->width = width;
	this->height = height;
	this->outData = outBuf;

	for( int j = 0; j < height; j += 4, inBuf += width * 4 * 4 )
	{
		for( int i = 0; i < width; i += 4 )
		{
			common->LoadPacifierBinarizeProgressIncrement( 16 );

			ExtractBlock( inBuf + i * 4, width, block );

			GetMinMaxBBox( block, minColor, maxColor );
			ScaleYCoCg( block, minColor, maxColor );
			InsetYCoCgBBox( minColor, maxColor );
			SelectYCoCgDiagonal( block, minColor, maxColor );

			EmitByte( maxColor[3] );
			EmitByte( minColor[3] );

			EmitAlphaIndices( block, 3, minColor[3], maxColor[3] );

#ifdef NVIDIA_7X_HARDWARE_BUG_FIX
			// the colors are already sorted when selecting the diagonal
#endif

			EmitUShort( ColorTo565( maxColor ) );
			EmitUShort( ColorTo565( minColor ) );

			EmitColorIndices( block, minColor, maxColor );
		}
		outData += dstPadding;
		inBuf += srcPadding;
	}
}

/*
========================
idDxtEncoder::CompressYCoCgAlphaDXT5Fast

params:	inBuf		- image to compress
paramO:	outBuf		- result of compression
params:	width		- width of image
params:	height		- height of image
========================
*/
void idDxtEncoder::CompressYCoCgAlphaDXT5Fast( const byte* inBuf, byte* outBuf, int width, int height )
{
	ALIGN16( byte block[64] );
	ALIGN16( byte minColor[4] );
	ALIGN16( byte maxColor[4] );

	assert( width >= 4 && ( width & 3 ) == 0 );
	assert( height >= 4 && ( height & 3 ) == 0 );

	this->width = width;
	this->height = height;
	this->outData = outBuf;

	for( int j = 0; j < height; j += 4, inBuf += width * 4 * 4 )
	{
		for( int i = 0; i < width; i += 4 )
		{
			common->LoadPacifierBinarizeProgressIncrement( 16 );

			ExtractBlock( inBuf + i * 4, width, block );

			// scale down the chroma of texels that are close to gray with low luminance
			for( int k = 0; k < 16; k++ )
			{
				if( abs( block[k * 4 + 0] - 132 ) <= 8 &&
						abs( block[k * 4 + 2] - 132 ) <= 8 &&
						block[k * 4 + 3] < 96 )
				{
					block[k * 4 + 0] = ( block[k * 4 + 0] - 132 ) / 2 + 132;
					block[k * 4 + 2] = ( block[k * 4 + 2] - 132 ) / 2 + 132;
				}
			}

			GetMinMaxBBox( block, minColor, maxColor );
			InsetYCoCgAlpaBBox( minColor, maxColor );
			SelectColorsDiagonal( block, minColor, maxColor );

			EmitByte( maxColor[3] );
			EmitByte( minColor[3] );

			EmitAlphaIndices( block, 3, minColor[3], maxColor[3] );

#ifdef NVIDIA_7X_HARDWARE_BUG_FIX
			// the colors are already sorted when selecting the diagonal
#endif

			EmitUShort( ColorTo565( maxColor ) );
			EmitUShort( ColorTo565( minColor ) );

			EmitColorIndices( block, minColor, maxColor );
		}
		outData += dstPadding;
		inBuf += srcPadding;
	}
}

/*
========================
idDxtEncoder::CompressYCoCgCTX1DXT5AFast_Generic

Walks the image in 4x4 texel blocks (4 bytes per texel) and emits, per block,
an alpha block built from byte 3 of every texel followed by a two channel
block: bytes 0 and 1 of both end points plus the CTX1 indices.
Asserts that channel 2 is constant within every 4x4 block.

params:	inBuf		- image to compress
paramO:	outBuf		- result of compression
params:	width		- width of image (asserted to be a multiple of 4, at least 4)
params:	height		- height of image (asserted to be a multiple of 4, at least 4)
========================
*/
void idDxtEncoder::CompressYCoCgCTX1DXT5AFast_Generic( const byte* inBuf, byte* outBuf, int width, int height )
{
	ALIGN16( byte block[64] );
	ALIGN16( byte minColor[4] );
	ALIGN16( byte maxColor[4] );

	assert( HasConstantValuePer4x4Block( inBuf, width, height, 2 ) );

	assert( width >= 4 && ( width & 3 ) == 0 );
	assert( height >= 4 && ( height & 3 ) == 0 );

	this->width = width;
	this->height = height;
	this->outData = outBuf;

	// one row of blocks consumes four texel rows of four bytes per texel
	const int blockRowBytes = width * 4 * 4;

	for( int row = 0; row < height; row += 4 )
	{
		for( int col = 0; col < width; col += 4 )
		{
			common->LoadPacifierBinarizeProgressIncrement( 16 );

			ExtractBlock( inBuf + col * 4, width, block );

			GetMinMaxBBox( block, minColor, maxColor );
			SelectYCoCgDiagonal( block, minColor, maxColor );
			InsetColorsBBox( minColor, maxColor );

			// alpha block: both end points first, then the per-texel indices of byte 3
			EmitByte( maxColor[3] );
			EmitByte( minColor[3] );
			EmitAlphaIndices( block, 3, minColor[3], maxColor[3] );

			// two channel block: channels 0 and 1 of the max end point, then of the min end point
			EmitByte( maxColor[0] );
			EmitByte( maxColor[1] );
			EmitByte( minColor[0] );
			EmitByte( minColor[1] );
			EmitCTX1Indices( block, minColor, maxColor );
		}

		outData += dstPadding;
		inBuf += srcPadding;
		inBuf += blockRowBytes;
	}
}

/*
========================
idDxtEncoder::EmitGreenIndices

params:	block		- block for which to find green indices
paramO:	minGreen	- Min green found
paramO:	maxGreen	- Max green found
========================
*/
void idDxtEncoder::EmitGreenIndices( const byte* block, const int offset, const byte minGreen, const byte maxGreen )
{

	assert( maxGreen >= minGreen );

	const int COLOR_RANGE = 3;

#if 1

	byte yb1 = ( 5 * maxGreen + 1 * minGreen + COLOR_RANGE ) / ( 2 * COLOR_RANGE );
	byte yb2 = ( 3 * maxGreen + 3 * minGreen + COLOR_RANGE ) / ( 2 * COLOR_RANGE );
	byte yb3 = ( 1 * maxGreen + 5 * minGreen + COLOR_RANGE ) / ( 2 * COLOR_RANGE );

	unsigned int result = 0;

	block += offset;

	for( int i = 15; i >= 0; i-- )
	{
		result <<= 2;
		byte y = block[i * 4];
		int b1 = ( y >= yb1 );
		int b2 = ( y >= yb2 );
		int b3 = ( y >= yb3 );
		int index = ( 4 - b1 - b2 - b3 ) & 3;
		index ^= ( 2 > index );
		result |= index;
	}

	EmitUInt( result );

#else

	byte green[4];

	green[0] = maxGreen;
	green[1] = minGreen;
	green[2] = ( 2 * green[0] + 1 * green[1] ) / 3;
	green[3] = ( 1 * green[0] + 2 * green[1] ) / 3;

	unsigned int result = 0;

	block += offset;

	for( int i = 15; i >= 0; i-- )
	{
		result <<= 2;
		byte y = block[i * 4];
		int minDist = INT_MAX;
		int index;
		for( int j = 0; j < 4; j++ )
		{
			int dist = abs( y - green[j] );
			if( dist < minDist )
			{
				minDist = dist;
				index = j;
			}
		}
		result |= index;
	}

	EmitUInt( result );

#endif
}

/*
========================
idDxtEncoder::InsetNormalsBBoxDXT5

Pulls the min/max of channel 3 (alpha) and channel 1 (green) towards each other
by a fraction of their range, ( range >> INSET_ALPHA_SHIFT ) and
( range >> INSET_COLOR_SHIFT ) respectively, clamps the result to 0..255 and
writes it back in place. The other channels are left untouched.
========================
*/
void idDxtEncoder::InsetNormalsBBoxDXT5( byte* minNormal, byte* maxNormal ) const
{
	// bias taken off the range before it is used as the inset amount
	const int alphaBias = ( 1 << ( INSET_ALPHA_SHIFT - 1 ) ) - 1;
	const int greenBias = ( 1 << ( INSET_COLOR_SHIFT - 1 ) ) - 1;

	const int alphaInset = ( maxNormal[3] - minNormal[3] ) - alphaBias;
	const int greenInset = ( maxNormal[1] - minNormal[1] ) - greenBias;

	// the arithmetic is done in fixed point, scaled up by the inset shift
	int alphaLo = ( ( minNormal[3] << INSET_ALPHA_SHIFT ) + alphaInset ) >> INSET_ALPHA_SHIFT;
	int greenLo = ( ( minNormal[1] << INSET_COLOR_SHIFT ) + greenInset ) >> INSET_COLOR_SHIFT;

	int alphaHi = ( ( maxNormal[3] << INSET_ALPHA_SHIFT ) - alphaInset ) >> INSET_ALPHA_SHIFT;
	int greenHi = ( ( maxNormal[1] << INSET_COLOR_SHIFT ) - greenInset ) >> INSET_COLOR_SHIFT;

	// only the lower bound can underflow and only the upper bound is capped
	if( alphaLo < 0 )
	{
		alphaLo = 0;
	}
	if( greenLo < 0 )
	{
		greenLo = 0;
	}
	if( alphaHi > 255 )
	{
		alphaHi = 255;
	}
	if( greenHi > 255 )
	{
		greenHi = 255;
	}

	minNormal[3] = byte( alphaLo );
	maxNormal[3] = byte( alphaHi );

	// keep the bits selected by C565_6_MASK and fill the low bits from the top two bits
	minNormal[1] = byte( ( greenLo & C565_6_MASK ) | ( greenLo >> 6 ) );
	maxNormal[1] = byte( ( greenHi & C565_6_MASK ) | ( greenHi >> 6 ) );
}

/*
========================
idDxtEncoder::InsetNormalsBBox3Dc

Pulls the min/max of channels 0 and 1 towards each other by a fraction of
their range ( range >> INSET_ALPHA_SHIFT ), clamps the result to 0..255 and
writes it back in place.
========================
*/
void idDxtEncoder::InsetNormalsBBox3Dc( byte* minNormal, byte* maxNormal ) const
{
	// bias taken off the range before it is used as the inset amount
	const int bias = ( 1 << ( INSET_ALPHA_SHIFT - 1 ) ) - 1;

	int insetLo[2];
	int insetHi[2];

	for( int channel = 0; channel < 2; channel++ )
	{
		const int lo = minNormal[channel];
		const int hi = maxNormal[channel];
		const int inset = ( hi - lo ) - bias;

		// fixed point arithmetic, scaled up by the inset shift
		const int movedLo = ( ( lo << INSET_ALPHA_SHIFT ) + inset ) >> INSET_ALPHA_SHIFT;
		const int movedHi = ( ( hi << INSET_ALPHA_SHIFT ) - inset ) >> INSET_ALPHA_SHIFT;

		insetLo[channel] = ( movedLo >= 0 ) ? movedLo : 0;
		insetHi[channel] = ( movedHi <= 255 ) ? movedHi : 255;
	}

	// write back only after both channels have been read
	for( int channel = 0; channel < 2; channel++ )
	{
		minNormal[channel] = ( byte )insetLo[channel];
	}
	for( int channel = 0; channel < 2; channel++ )
	{
		maxNormal[channel] = ( byte )insetHi[channel];
	}
}

/*
========================
idDxtEncoder::CompressNormalMapDXT5Fast_Generic

Walks the image in 4x4 texel blocks (4 bytes per texel) and emits, per block,
an alpha block built from byte 3 of every texel followed by a color block whose
green end points come from byte 1; red and blue of both end points are taken
from the first texel of the block.

params:	inBuf		- image to compress
paramO:	outBuf		- result of compression
params:	width		- width of image (asserted to be a multiple of 4, at least 4)
params:	height		- height of image (asserted to be a multiple of 4, at least 4)
========================
*/
void idDxtEncoder::CompressNormalMapDXT5Fast_Generic( const byte* inBuf, byte* outBuf, int width, int height )
{
	ALIGN16( byte block[64] );
	ALIGN16( byte normal1[4] );
	ALIGN16( byte normal2[4] );

	assert( width >= 4 && ( width & 3 ) == 0 );
	assert( height >= 4 && ( height & 3 ) == 0 );

	this->width = width;
	this->height = height;
	this->outData = outBuf;

	// one row of blocks consumes four texel rows of four bytes per texel
	const int blockRowBytes = width * 4 * 4;

	for( int row = 0; row < height; row += 4 )
	{
		for( int col = 0; col < width; col += 4 )
		{
			common->LoadPacifierBinarizeProgressIncrement( 16 );

			ExtractBlock( inBuf + col * 4, width, block );

			// normal1 receives the per-channel minimum, normal2 the maximum
			GetMinMaxBBox( block, normal1, normal2 );
			InsetNormalsBBoxDXT5( normal1, normal2 );

			// Write out Nx into alpha channel.
			EmitByte( normal2[3] );
			EmitByte( normal1[3] );
			EmitAlphaIndices( block, 3, normal1[3], normal2[3] );

			// Write out Ny into green channel.
			EmitUShort( ColorTo565( block[0], normal2[1], block[2] ) );
			EmitUShort( ColorTo565( block[0], normal1[1], block[2] ) );
			EmitGreenIndices( block, 1, normal1[1], normal2[1] );
		}

		outData += dstPadding;
		inBuf += srcPadding;
		inBuf += blockRowBytes;
	}
}

/*
========================
idDxtEncoder::CompressImageDXN1Fast_Generic

Walks the image in 4x4 texel blocks (4 bytes per texel) and emits, per block,
a single alpha style block built from byte 0 of every texel.

params:	inBuf		- image to compress
paramO:	outBuf		- result of compression
params:	width		- width of image (asserted to be a multiple of 4, at least 4)
params:	height		- height of image (asserted to be a multiple of 4, at least 4)
========================
*/
void idDxtEncoder::CompressImageDXN1Fast_Generic( const byte* inBuf, byte* outBuf, int width, int height )
{
	ALIGN16( byte block[64] );
	ALIGN16( byte min[4] );
	ALIGN16( byte max[4] );

	assert( width >= 4 && ( width & 3 ) == 0 );
	assert( height >= 4 && ( height & 3 ) == 0 );

	this->width = width;
	this->height = height;
	this->outData = outBuf;

	// one row of blocks consumes four texel rows of four bytes per texel
	const int blockRowBytes = width * 4 * 4;

	for( int row = 0; row < height; row += 4 )
	{
		for( int col = 0; col < width; col += 4 )
		{
			common->LoadPacifierBinarizeProgressIncrement( 16 );

			ExtractBlock( inBuf + col * 4, width, block );

			GetMinMaxBBox( block, min, max );
			InsetNormalsBBox3Dc( min, max );

			// Write out an alpha channel.
			EmitByte( max[0] );
			EmitByte( min[0] );
			EmitAlphaIndices( block, 0, min[0], max[0] );
		}

		outData += dstPadding;
		inBuf += srcPadding;
		inBuf += blockRowBytes;
	}
}

/*
========================
idDxtEncoder::CompressNormalMapDXN2Fast_Generic

Walks the image in 4x4 texel blocks (4 bytes per texel) and emits, per block,
two alpha style blocks: the first built from byte 0, the second from byte 1
of every texel.

params:	inBuf		- image to compress
paramO:	outBuf		- result of compression
params:	width		- width of image (asserted to be a multiple of 4, at least 4)
params:	height		- height of image (asserted to be a multiple of 4, at least 4)
========================
*/
void idDxtEncoder::CompressNormalMapDXN2Fast_Generic( const byte* inBuf, byte* outBuf, int width, int height )
{
	ALIGN16( byte block[64] );
	ALIGN16( byte normal1[4] );
	ALIGN16( byte normal2[4] );

	assert( width >= 4 && ( width & 3 ) == 0 );
	assert( height >= 4 && ( height & 3 ) == 0 );

	this->width = width;
	this->height = height;
	this->outData = outBuf;

	// one row of blocks consumes four texel rows of four bytes per texel
	const int blockRowBytes = width * 4 * 4;

	for( int row = 0; row < height; row += 4 )
	{
		for( int col = 0; col < width; col += 4 )
		{
			common->LoadPacifierBinarizeProgressIncrement( 16 );

			ExtractBlock( inBuf + col * 4, width, block );

			// normal1 receives the per-channel minimum, normal2 the maximum
			GetMinMaxBBox( block, normal1, normal2 );
			InsetNormalsBBox3Dc( normal1, normal2 );

			// Write out Nx as an alpha channel.
			EmitByte( normal2[0] );
			EmitByte( normal1[0] );
			EmitAlphaIndices( block, 0, normal1[0], normal2[0] );

			// Write out Ny as an alpha channel.
			EmitByte( normal2[1] );
			EmitByte( normal1[1] );
			EmitAlphaIndices( block, 1, normal1[1], normal2[1] );
		}

		outData += dstPadding;
		inBuf += srcPadding;
		inBuf += blockRowBytes;
	}
}

/*
========================
idDxtEncoder::DecodeDXNAlphaValues
========================
*/
void idDxtEncoder::DecodeDXNAlphaValues( const byte* inBuf, byte* values )
{
	int i;
	unsigned int indices;
	byte alphas[8];

	if( inBuf[0] <= inBuf[1] )
	{
		alphas[0] = inBuf[0];
		alphas[1] = inBuf[1];
		alphas[2] = ( 4 * alphas[0] + 1 * alphas[1] ) / 5;
		alphas[3] = ( 3 * alphas[0] + 2 * alphas[1] ) / 5;
		alphas[4] = ( 2 * alphas[0] + 3 * alphas[1] ) / 5;
		alphas[5] = ( 1 * alphas[0] + 4 * alphas[1] ) / 5;
		alphas[6] = 0;
		alphas[7] = 255;
	}
	else
	{
		alphas[0] = inBuf[0];
		alphas[1] = inBuf[1];
		alphas[2] = ( 6 * alphas[0] + 1 * alphas[1] ) / 7;
		alphas[3] = ( 5 * alphas[0] + 2 * alphas[1] ) / 7;
		alphas[4] = ( 4 * alphas[0] + 3 * alphas[1] ) / 7;
		alphas[5] = ( 3 * alphas[0] + 4 * alphas[1] ) / 7;
		alphas[6] = ( 2 * alphas[0] + 5 * alphas[1] ) / 7;
		alphas[7] = ( 1 * alphas[0] + 6 * alphas[1] ) / 7;
	}

	indices = ( int )inBuf[2] | ( ( int )inBuf[3] << 8 ) | ( ( int )inBuf[4] << 16 );
	for( i = 0; i < 8; i++ )
	{
		values[i] = alphas[indices & 7];
		indices >>= 3;
	}

	indices = ( int )inBuf[5] | ( ( int )inBuf[6] << 8 ) | ( ( int )inBuf[7] << 16 );
	for( i = 8; i < 16; i++ )
	{
		values[i] = alphas[indices & 7];
		indices >>= 3;
	}
}

/*
========================
idDxtEncoder::EncodeNormalRGBIndices

Writes one 8-byte color block: two 16-bit end points (max first) that carry
the same value in all three channels, followed by 16 two-bit indices.

paramO:	outBuf	- receives the 8 byte color block
params:	min		- Min grayscale value
params:	max		- Max grayscale value
params:	values	- 16 normal block for which to find normal Y indices
========================
*/
void idDxtEncoder::EncodeNormalRGBIndices( byte* outBuf, const byte min, const byte max, const byte* values )
{
	const int COLOR_RANGE = 3;

	// end points reduced with C565_6_MASK, i.e. as the middle channel will store them
	const byte greenMax = max & C565_6_MASK;
	const byte greenMin = min & C565_6_MASK;

	// half the distance between two neighbouring palette entries
	const byte halfStep = ( greenMax - greenMin ) / ( 2 * COLOR_RANGE );

	// decision bounds between the four palette entries, highest first
	const byte bound1 = greenMax - halfStep;
	const byte bound2 = ( 2 * greenMax + 1 * greenMin ) / COLOR_RANGE - halfStep;
	const byte bound3 = ( 1 * greenMax + 2 * greenMin ) / COLOR_RANGE - halfStep;

	// palette index selected by the number of bounds a value reaches:
	// none -> min (1), one -> 3, two -> 2, all three -> max (0)
	const unsigned int indexForHits[4] = { 1, 3, 2, 0 };

	unsigned int packed = 0;
	for( int texel = 0; texel < 16; texel++ )
	{
		const byte y = values[texel];
		const int hits = ( y >= bound1 ) + ( y >= bound2 ) + ( y >= bound3 );
		packed |= indexForHits[hits] << ( 2 * texel );
	}

	// build the two 16-bit end points: 5 bits, 6 bits, 5 bits
	const unsigned short max5 = ( max & C565_5_MASK ) >> 3;
	const unsigned short min5 = ( min & C565_5_MASK ) >> 3;

	const unsigned short color0 = ( max5 << 11 ) | ( greenMax << 3 ) | max5;
	const unsigned short color1 = ( min5 << 11 ) | ( greenMin << 3 ) | min5;

	// everything is stored little endian
	outBuf[0] = byte( ( color0 >> 0 ) & 0xFF );
	outBuf[1] = byte( ( color0 >> 8 ) & 0xFF );
	outBuf[2] = byte( ( color1 >> 0 ) & 0xFF );
	outBuf[3] = byte( ( color1 >> 8 ) & 0xFF );

	for( int k = 0; k < 4; k++ )
	{
		outBuf[4 + k] = byte( ( packed >> ( 8 * k ) ) & 0xFF );
	}
}

/*
========================
idDxtEncoder::ConvertNormalMapDXN2_DXT5

Re-packs 16-byte DXN2 blocks into 16-byte DXT5 blocks without going through
an uncompressed image: the second 8-byte input block is copied verbatim into
the DXT5 alpha block, the first one is decoded and re-encoded into the DXT5
color block.

Returns without writing anything when a dimension is not a multiple of 4 or
is smaller than 4.

params:	inBuf		- normal map compressed in DXN2 format
paramO:	outBuf		- result of compression in DXT5 format
params:	width		- width of image
params:	height		- height of image
========================
*/
void idDxtEncoder::ConvertNormalMapDXN2_DXT5( const byte* inBuf, byte* outBuf, int width, int height )
{
	ALIGN16( byte values[16] );

	this->width = width;
	this->height = height;
	this->outData = outBuf;

	if( width > 4 && ( width & 3 ) != 0 )
	{
		return;
	}
	if( height > 4 && ( height & 3 ) != 0 )
	{
		return;
	}

	if( width < 4 || height < 4 )
	{
		assert( 0 );
		return;
	}

	for( int j = 0; j < height; j += 4 )
	{
		for( int i = 0; i < width; i += 4, inBuf += 16, outBuf += 16 )
		{
			common->LoadPacifierBinarizeProgressIncrement( 16 );

			// decode normal Y stored as a DXT5 alpha channel
			DecodeDXNAlphaValues( inBuf + 0, values );

			// copy normal X
			memcpy( outBuf + 0, inBuf + 8, 8 );

			// get the min/max Y
			// (uses its own counter so the block column counter 'i' is not shadowed)
			byte minNormalY = 255;
			byte maxNormalY = 0;
			for( int k = 0; k < 16; k++ )
			{
				if( values[k] < minNormalY )
				{
					minNormalY = values[k];
				}
				if( values[k] > maxNormalY )
				{
					maxNormalY = values[k];
				}
			}

			// encode normal Y into DXT5 color channels
			EncodeNormalRGBIndices( outBuf + 8, minNormalY, maxNormalY, values );
		}
		// NOTE(review): blocks are written through outBuf, so advancing outData does not
		// move the write position - confirm whether dstPadding is meant to apply here
		outData += dstPadding;
		inBuf += srcPadding;
	}
}

/*
========================
idDxtEncoder::DecodeNormalYValues
========================
*/
void idDxtEncoder::DecodeNormalYValues( const byte* inBuf, byte& min, byte& max, byte* values )
{
	int i;
	unsigned int indexes;
	unsigned short normal0, normal1;
	byte normalsY[4];

	normal0 = inBuf[0] | ( inBuf[1] << 8 );
	normal1 = inBuf[2] | ( inBuf[3] << 8 );

	assert( normal0 >= normal1 );

	normalsY[0] = GreenFrom565( normal0 );
	normalsY[1] = GreenFrom565( normal1 );
	normalsY[2] = ( 2 * normalsY[0] + 1 * normalsY[1] ) / 3;
	normalsY[3] = ( 1 * normalsY[0] + 2 * normalsY[1] ) / 3;

	indexes = ( unsigned int )inBuf[4] | ( ( unsigned int )inBuf[5] << 8 ) | ( ( unsigned int )inBuf[6] << 16 ) | ( ( unsigned int )inBuf[7] << 24 );
	for( i = 0; i < 16; i++ )
	{
		values[i] = normalsY[indexes & 3];
		indexes >>= 2;
	}

	max = normalsY[0];
	min = normalsY[1];
}

/*
========================
idDxtEncoder::EncodeDXNAlphaValues

Writes one 8-byte alpha style block: the two end points (max first) followed
by 16 three-bit indices. For every value the palette entry with the smallest
AlphaDistance is selected; on a tie the lowest index wins.

paramO:	outBuf	- receives the 8 byte block
params:	min		- low end point
params:	max		- high end point
params:	values	- 16 values to encode
========================
*/
void idDxtEncoder::EncodeDXNAlphaValues( byte* outBuf, const byte min, const byte max, const byte* values )
{
	byte alphas[8];
	unsigned int indexes[16] = { 0 };

	// eight entry palette: the two end points and six interpolated values
	alphas[0] = max;
	alphas[1] = min;
	alphas[2] = ( 6 * alphas[0] + 1 * alphas[1] ) / 7;
	alphas[3] = ( 5 * alphas[0] + 2 * alphas[1] ) / 7;
	alphas[4] = ( 4 * alphas[0] + 3 * alphas[1] ) / 7;
	alphas[5] = ( 3 * alphas[0] + 4 * alphas[1] ) / 7;
	alphas[6] = ( 2 * alphas[0] + 5 * alphas[1] ) / 7;
	alphas[7] = ( 1 * alphas[0] + 6 * alphas[1] ) / 7;

	// exhaustive search for the closest palette entry of every value
	for( int i = 0; i < 16; i++ )
	{
		int minDist = MAX_TYPE( int );
		const byte a = values[i];
		for( int j = 0; j < 8; j++ )
		{
			const int dist = AlphaDistance( a, alphas[j] );
			if( dist < minDist )
			{
				minDist = dist;
				indexes[i] = j;
			}
		}
	}

	outBuf[0] = max;
	outBuf[1] = min;

	// pack the 3-bit indices, eight of them per 24-bit little endian group
	outBuf[2] = byte( ( indexes[ 0] >> 0 ) | ( indexes[ 1] << 3 ) | ( indexes[ 2] << 6 ) );
	outBuf[3] = byte( ( indexes[ 2] >> 2 ) | ( indexes[ 3] << 1 ) | ( indexes[ 4] << 4 ) | ( indexes[ 5] << 7 ) );
	outBuf[4] = byte( ( indexes[ 5] >> 1 ) | ( indexes[ 6] << 2 ) | ( indexes[ 7] << 5 ) );

	outBuf[5] = byte( ( indexes[ 8] >> 0 ) | ( indexes[ 9] << 3 ) | ( indexes[10] << 6 ) );
	outBuf[6] = byte( ( indexes[10] >> 2 ) | ( indexes[11] << 1 ) | ( indexes[12] << 4 ) | ( indexes[13] << 7 ) );
	outBuf[7] = byte( ( indexes[13] >> 1 ) | ( indexes[14] << 2 ) | ( indexes[15] << 5 ) );
}

/*
========================
idDxtEncoder::ConvertNormalMapDXT5_DXN2

Re-packs 16-byte DXT5 blocks into 16-byte DXN2 blocks without going through
an uncompressed image: the DXT5 alpha block is copied verbatim into the second
8-byte output block, the green channel of the DXT5 color block is decoded and
re-encoded as the first 8-byte output block.

Returns without writing anything when a dimension is not a multiple of 4 or
is smaller than 4.

params:	inBuf		- normal map compressed in DXT5 format
paramO:	outBuf		- result of the conversion in DXN2 format
params:	width		- width of image
params:	height		- height of image
========================
*/
void idDxtEncoder::ConvertNormalMapDXT5_DXN2( const byte* inBuf, byte* outBuf, int width, int height )
{
	ALIGN16( byte values[16] );
	byte minNormalY, maxNormalY;

	this->width = width;
	this->height = height;
	this->outData = outBuf;

	// dimensions larger than one block have to be block aligned
	const bool widthMisaligned = width > 4 && ( width & 3 ) != 0;
	const bool heightMisaligned = height > 4 && ( height & 3 ) != 0;
	if( widthMisaligned || heightMisaligned )
	{
		return;
	}

	if( width < 4 || height < 4 )
	{
		assert( 0 );
		return;
	}

	for( int row = 0; row < height; row += 4 )
	{
		for( int col = 0; col < width; col += 4 )
		{
			common->LoadPacifierBinarizeProgressIncrement( 16 );

			// decode normal Y stored in the green channel of the DXT5 color block
			DecodeNormalYValues( inBuf + 8, minNormalY, maxNormalY, values );

			// the DXT5 alpha block is carried over unchanged
			memcpy( outBuf + 8, inBuf + 0, 8 );

			// encode normal Y as an alpha style block
			EncodeDXNAlphaValues( outBuf + 0, minNormalY, maxNormalY, values );

			inBuf += 16;
			outBuf += 16;
		}

		outData += dstPadding;
		inBuf += srcPadding;
	}
}

/*
========================
idDxtEncoder::ConvertImageDXN1_DXT1

params:	inBuf		- normal map compressed in DXN1 format
paramO:	outBuf		- result of compression in DXT1 format
params:	width		- width of image
params:	height		- height of image
========================
*/
void idDxtEncoder::ConvertImageDXN1_DXT1( const byte* inBuf, byte* outBuf, int width, int height )
{
	ALIGN16( byte values[16] );

	this->width = width;
	this->height = height;
	this->outData = outBuf;

	if( width > 4 && ( width & 3 ) != 0 )
	{
		return;
	}
	if( height > 4 && ( height & 3 ) != 0 )
	{
		return;
	}

	if( width < 4 || height < 4 )
	{
		assert( 0 );
		return;
	}

	for( int j = 0; j < height; j += 4 )
	{
		for( int i = 0; i < width; i += 4, inBuf += 8, outBuf += 8 )
		{
			common->LoadPacifierBinarizeProgressIncrement( 16 );

			// decode single channel stored as a DXT5 alpha channel
			DecodeDXNAlphaValues( inBuf + 0, values );

			// get the min/max
			byte min = 255;
			byte max = 0;

			// Dustin: corrected iteration
			for( int k = 0; k < 16; k++ )
			{
				if( values[k] < min )
				{
					min = values[k];
				}
				if( values[k] > max )
				{
					max = values[k];
				}
			}

			// encode single channel into DXT1
			EncodeNormalRGBIndices( outBuf + 0, min, max, values );
		}
		outData += dstPadding;
		inBuf += srcPadding;
	}
}


// RB begin
#include <cmath>
#include <algorithm>

#include "../../libs/mesa/format_r11g11b10f.h"

// gives access to the four bytes of a 32-bit word, in memory order
union convert_uint32_t
{
	uint32	i;		// the whole word
	byte	b[4];	// its individual bytes
};

// largest finite value of a 16-bit half float; upper end of the range mapped by Quantize10 / Unquantize10
static const float HALF_MAX = 65504.0f;

// helper function: squared logarithmic error between two RGB triples.
// Returns the sum over the three channels of log2( ( b + 1 ) / ( a + 1 ) )^2;
// the sum is not divided by the channel count and luminance is not weighted.
static float CalcMSLE( const float* a, const float* b )
{
	float sum = 0.0f;
	for( int channel = 0; channel < 3; ++channel )
	{
		const float ratio = ( b[channel] + 1.0f ) / ( a[channel] + 1.0f );
		const float logDelta = log2f( ratio );
		sum += logDelta * logDelta;
	}
	return sum;
}

// Quantize to 10 Bit: maps each of the three channels linearly from
// 0..HALF_MAX to 0..1023, rounds down and clamps to that range.
// The result is stored as whole numbers in floats.
static void Quantize10( float* out, const float* in )
{
	const float scale = 1023.0f; // 10 Bit

	for( int channel = 0; channel < 3; ++channel )
	{
		const float level = std::floor( in[channel] * scale / HALF_MAX ); // scale to 0-1023
		out[channel] = std::max( 0.0f, std::min( level, scale ) );
	}
}

// Dequantize from 10 Bit: maps each of the three channels linearly from
// 0..1023 back to 0..HALF_MAX.
static void Unquantize10( float* out, const float* in )
{
	const float scale = 1023.0f;

	out[0] = in[0] * HALF_MAX / scale;
	out[1] = in[1] * HALF_MAX / scale;
	out[2] = in[2] * HALF_MAX / scale;
}

// Computes the 4-bit palette index (0-15) of a texel from its position
// relative to the two end point positions along the block direction.
// Positions outside the end points are clamped to 0 / 15.
// When both end points coincide (e.g. a uniformly colored block) every texel
// maps to index 0; without this guard the division would yield NaN or
// infinity and the conversion to an integer would be undefined.
static uint32_t ComputeIndex4( float texelPos, float endPoint0Pos, float endPoint1Pos )
{
	const float range = endPoint1Pos - endPoint0Pos;
	if( range == 0.0f )
	{
		return 0;
	}

	const float r = ( texelPos - endPoint0Pos ) / range;
	if( std::isnan( r ) )
	{
		// non-finite input, there is no meaningful position to quantize
		return 0;
	}

	return ( uint32_t )std::clamp( r * 14.93333f + 0.03333f + 0.5f, 0.0f, 15.0f );
}

/*
========================
idDxtEncoder::EncodeBC6HMode11
Mode 11: 10-bit end points, 4-bit indices, no partitioning

Encodes one 4x4 block of RGB floats into a 16 byte block. The end points are
the per-channel minimum and maximum of the block, every texel is projected
onto the direction between them to obtain its index.

params:	block		- 16 texels, 3 floats (RGB) each
paramO:	outBlock	- receives the 16 byte compressed block
paramO:	msle		- receives the error summed over all 16 texels (see CalcMSLE)
========================
*/
void idDxtEncoder::EncodeBC6HMode11( const float* block, byte* outBlock, float& msle )
{
	// find min/max endpoints
	float blockMin[3] = { block[0], block[1], block[2] };
	float blockMax[3] = { block[0], block[1], block[2] };
	for( int i = 1; i < 16; ++i )
	{
		for( int c = 0; c < 3; ++c )
		{
			blockMin[c] = std::min( blockMin[c], block[i * 3 + c] );
			blockMax[c] = std::max( blockMax[c], block[i * 3 + c] );
		}
	}

	// calc direction: the bounding box diagonal, scaled by the sum of its components
	float blockDir[3];
	float dirLen = 0.0f;
	for( int c = 0; c < 3; ++c )
	{
		blockDir[c] = blockMax[c] - blockMin[c];
		dirLen += blockDir[c];
	}
	for( int c = 0; c < 3; ++c )
	{
		// a block without any extent keeps a zero direction
		blockDir[c] /= dirLen ? dirLen : 1.0f;
	}

	// end points projected onto the direction
	float endPoint0Pos = 0.0f, endPoint1Pos = 0.0f;
	for( int c = 0; c < 3; ++c )
	{
		endPoint0Pos += blockMin[c] * blockDir[c];
		endPoint1Pos += blockMax[c] * blockDir[c];
	}

	// test if end points need to be swapped (Fixup-Index):
	// the index of texel 0 is packed with only 3 bits below, so it has to stay below 8
	float fixupTexelPos = 0.0f;
	for( int c = 0; c < 3; ++c )
	{
		fixupTexelPos += block[c] * blockDir[c];
	}
	uint32_t fixupIndex = ComputeIndex4( fixupTexelPos, endPoint0Pos, endPoint1Pos );
	if( fixupIndex > 7 )
	{
		std::swap( endPoint0Pos, endPoint1Pos );
		std::swap( blockMin[0], blockMax[0] );
		std::swap( blockMin[1], blockMax[1] );
		std::swap( blockMin[2], blockMax[2] );
	}

	// quantize end points
	float endpoint0[3], endpoint1[3];
	Quantize10( endpoint0, blockMin );
	Quantize10( endpoint1, blockMax );

	// calc indices
	uint32_t indices[16];
	for( int i = 0; i < 16; ++i )
	{
		float texelPos = 0.0f;
		for( int c = 0; c < 3; ++c )
		{
			texelPos += block[i * 3 + c] * blockDir[c];
		}
		indices[i] = ComputeIndex4( texelPos, endPoint0Pos, endPoint1Pos );
	}

	// calc error metric (MSLE)
	float endpoint0Unq[3], endpoint1Unq[3];
	Unquantize10( endpoint0Unq, endpoint0 );
	Unquantize10( endpoint1Unq, endpoint1 );
	msle = 0.0f;
	for( int i = 0; i < 16; ++i )
	{
		float weight = ( float )indices[i] * 64.0f / 15.0f;
		float texelUnc[3];
		for( int c = 0; c < 3; ++c )
		{
			// NOTE(review): the end points were already scaled back to 0..HALF_MAX by Unquantize10;
			// confirm that the additional 32 / ( 31 / 4096 ) scaling is intended for this error estimate
			texelUnc[c] = ( endpoint0Unq[c] * ( 64.0f - weight ) + endpoint1Unq[c] * weight + 32.0f ) * ( 31.0f / 4096.0f );
		}
		msle += CalcMSLE( block + i * 3, texelUnc );
	}

	// Bitpacking for Mode 11 (128 Bit):
	// 5 mode bits, six 10-bit end point components, a 3-bit index for texel 0
	// and 4-bit indices for the remaining 15 texels
	uint32_t blockWords[4] = {0};
	blockWords[0] = 0x03; // Mode 11: 00011
	blockWords[0] |= ( ( uint32_t )endpoint0[0] << 5 );
	blockWords[0] |= ( ( uint32_t )endpoint0[1] << 15 );
	blockWords[0] |= ( ( uint32_t )endpoint0[2] << 25 );	// low 7 bits, the rest continues in the next word
	blockWords[1] |= ( ( uint32_t )endpoint0[2] >> 7 );
	blockWords[1] |= ( ( uint32_t )endpoint1[0] << 3 );
	blockWords[1] |= ( ( uint32_t )endpoint1[1] << 13 );
	blockWords[1] |= ( ( uint32_t )endpoint1[2] << 23 );	// low 9 bits, the rest continues in the next word
	blockWords[2] |= ( ( uint32_t )endpoint1[2] >> 9 );
	blockWords[2] |= ( indices[0] << 1 ) | ( indices[1] << 4 ) | ( indices[2] << 8 ) | ( indices[3] << 12 ) |
					 ( indices[4] << 16 ) | ( indices[5] << 20 ) | ( indices[6] << 24 ) | ( indices[7] << 28 );
	blockWords[3] = ( indices[8] << 0 ) | ( indices[9] << 4 ) | ( indices[10] << 8 ) | ( indices[11] << 12 ) |
					( indices[12] << 16 ) | ( indices[13] << 20 ) | ( indices[14] << 24 ) | ( indices[15] << 28 );

	// move to output
	memcpy( outBlock, blockWords, 16 );
}

/*
========================
idDxtEncoder::CompressImageR11G11B10_BC6Fast_Generic
Fallback without SIMD, uses mode 11

Unpacks every 4x4 block of packed R11G11B10 texels (4 bytes each) to RGB
floats and encodes it with EncodeBC6HMode11, 16 output bytes per block.
Emits a warning and returns without writing anything when a dimension is
smaller than 4 or not a multiple of 4.

params:	inBuf		- image to compress, one packed 32-bit word per texel
paramO:	outBuf		- result of compression
params:	width		- width of image
params:	height		- height of image
========================
*/
void idDxtEncoder::CompressImageR11G11B10_BC6Fast_Generic( const byte* inBuf, byte* outBuf, int width, int height )
{
	if( width < 4 || height < 4 || ( width & 3 ) != 0 || ( height & 3 ) != 0 )
	{
		idLib::Warning( "Invalid dimensions for BC6H-compression: %dx%d", width, height );
		return;
	}

	this->width = width;
	this->height = height;
	this->outData = outBuf;

	// tmp 4x4 block, RGB per texel; small and fixed in size, so it lives on the stack
	float block[16 * 3];

	for( int j = 0; j < height; j += 4 )
	{
		for( int i = 0; i < width; i += 4 )
		{
			// extract and convert 4x4 block
			for( int y = 0; y < 4; ++y )
			{
				for( int x = 0; x < 4; ++x )
				{
					const byte* pixel = inBuf + ( ( j + y ) * width + ( i + x ) ) * 4;

					// assemble the packed word in memory byte order
					uint32_t packed;
					memcpy( &packed, pixel, sizeof( packed ) );

					float rgb[3];
					r11g11b10f_to_float3( packed, rgb );

					const int idx = ( y * 4 + x ) * 3;
					block[idx + 0] = rgb[0];
					block[idx + 1] = rgb[1];
					block[idx + 2] = rgb[2];
				}
			}

			// compress using mode 11; the error estimate is not used here
			float msle;
			EncodeBC6HMode11( block, outData, msle );
			outData += 16;
		}
		outData += dstPadding;
		// NOTE(review): texels are addressed with absolute rows from inBuf, so this shifts
		// all following block rows - confirm the intended meaning of srcPadding here
		inBuf += srcPadding * 4; // srcPadding per row, 4 rows per block
	}
}


#if !defined( DMAP )

#if 1

#if defined(USE_INTRINSICS_SSE) || defined(USE_INTRINSICS_NEON)
#include "../../libs/ispc_texcomp/ispc_texcomp.h"

/*
========================
ConvertR11G11B10ImageToFP16
Converts the entire image from R11G11B10 to FP16
========================
*/
static void ConvertR11G11B10ImageToFP16( const byte* inBuf, int width, int height, halfFloat_t* outBuf )
{
	for( int y = 0; y < height; ++y )
	{
		for( int x = 0; x < width; ++x )
		{
			const byte* pixel = inBuf + ( y * width + x ) * 4;

			convert_uint32_t p;
			p.b[0] = pixel[0];
			p.b[1] = pixel[1];
			p.b[2] = pixel[2];
			p.b[3] = pixel[3];
			uint32_t packed = p.i;
			//uint32_t packed = ( pixel[0] << 24 ) | ( pixel[1] << 16 ) | ( pixel[2] << 8 ) | pixel[3];

			float rgb[3];
			r11g11b10f_to_float3( packed, rgb );

			int idx = ( y * width + x ) * 4;
			outBuf[idx + 0] = F32toF16( rgb[0] );
			outBuf[idx + 1] = F32toF16( rgb[1] );
			outBuf[idx + 2] = F32toF16( rgb[2] );
			outBuf[idx + 3] = F32toF16( 1.0f ); // Alpha
		}
	}
}

/*
========================
idDxtEncoder::CompressImageR11G11B10_BC6Fast_SIMD
ISPC-Variant with ISPCTextureCompressor for BC6H
========================
*/
void idDxtEncoder::CompressImageR11G11B10_BC6Fast_SIMD( const byte* inBuf, byte* outBuf, int width, int height )
{
	if( width < 4 || height < 4 || ( width & 3 ) != 0 || ( height & 3 ) != 0 )
	{
		idLib::Warning( "Invalid dimensions for BC6H compression: %dx%d", width, height );
		return;
	}

	this->width = width;
	this->height = height;
	this->outData = outBuf;

	bc6h_enc_settings settings;
	GetProfile_bc6h_basic( &settings );

	halfFloat_t* fp16Buf = nullptr;
	try
	{
		fp16Buf = new halfFloat_t[width * height * 4];
	}
	catch( const std::bad_alloc& e )
	{
		idLib::Error( "Couldn't alloc FP16 buffer for BC6H compression: %s", e.what() );
		return;
	}

	ConvertR11G11B10ImageToFP16( inBuf, width, height, fp16Buf );

	rgba_surface surface;
	surface.ptr = reinterpret_cast<unsigned char*>( fp16Buf );
	surface.width = width;
	surface.height = height;
	surface.stride = width * 4 * sizeof( halfFloat_t );

	CompressBlocksBC6H( &surface, outBuf, &settings );

	delete[] fp16Buf;
}
#endif // #if defined(USE_INTRINSICS_SSE) || defined(USE_INTRINSICS_NEON)

#else

#if defined(USE_INTRINSICS_SSE)
#include "../../libs/compressonator/include/compressonator.h"

/*
========================
ConvertR11G11B10ToFP32
Converts the entire image from R11G11B10 to FP32
========================
*/
static void ConvertR11G11B10ToFP32( const byte* inBuf, int width, int height, float* fp32Buf )
{
	for( int y = 0; y < height; ++y )
	{
		for( int x = 0; x < width; ++x )
		{
			const byte* pixel = inBuf + ( y * width + x ) * 4;

			convert_uint32_t p;
			p.b[0] = pixel[0];
			p.b[1] = pixel[1];
			p.b[2] = pixel[2];
			p.b[3] = pixel[3];
			uint32_t packed = p.i;

			//uint32_t packed = ( pixel[0] << 24 ) | ( pixel[1] << 16 ) | ( pixel[2] << 8 ) | pixel[3];
			float rgb[3];
			r11g11b10f_to_float3( packed, rgb );

			//int idx = ( y * width + x ) * 3; // RGB, no Alpha
			int idx = ( y * width + x ) * 4;
			fp32Buf[idx + 0] = rgb[0];
			fp32Buf[idx + 1] = rgb[1];
			fp32Buf[idx + 2] = rgb[2];
			fp32Buf[idx + 3] = 1.0f;
		}
	}
}

/*
========================
idDxtEncoder::CompressImageR11G11B10_BC6Fast_SIMD
Compressonator-based variant for BC6H compression with CMP_FORMAT_RGBA_32F, single call, multi-threaded
========================
*/
void idDxtEncoder::CompressImageR11G11B10_BC6Fast_SIMD( const byte* inBuf, byte* outBuf, int width, int height )
{
	// Validation
	if( width < 4 || height < 4 || ( width & 3 ) != 0 || ( height & 3 ) != 0 )
	{
		idLib::Warning( "Invalid dimensions for BC6H compression: %dx%d", width, height );
		return;
	}

	this->width = width;
	this->height = height;
	this->outData = outBuf;

	// Calculate block count and output size
	int numBlocksX = width / 4;
	int numBlocksY = height / 4;
	int expectedOutputSize = numBlocksX * numBlocksY * 16; // 16 bytes per 4x4 block

	// Temporary FP32 buffer for the entire image
	float* fp32Buf = nullptr;
	try
	{
		fp32Buf = new float[width * height * 4]; // RGBA, 4 channels
	}
	catch( const std::bad_alloc& e )
	{
		idLib::Error( "Failed to allocate FP32 buffer for BC6H compression: %s", e.what() );
		return;
	}

	// Convert R11G11B10 to FP32
	ConvertR11G11B10ToFP32( inBuf, width, height, fp32Buf );

	// Compressonator configuration
	CMP_Texture srcTexture = {0};
	srcTexture.dwSize = sizeof( CMP_Texture );
	srcTexture.dwWidth = width;
	srcTexture.dwHeight = height;
	srcTexture.format = CMP_FORMAT_RGBA_32F; // FP32 RGBA
	srcTexture.dwPitch = width * 4 * sizeof( float ); // Bytes per row
	srcTexture.dwDataSize = width * height * 4 * sizeof( float ); // Total image data size
	srcTexture.pData = reinterpret_cast<CMP_BYTE*>( fp32Buf );

	CMP_Texture destTexture = {0};
	destTexture.dwSize = sizeof( CMP_Texture );
	destTexture.dwWidth = width;
	destTexture.dwHeight = height;
	destTexture.format = CMP_FORMAT_BC6H;
	destTexture.dwDataSize = expectedOutputSize; // Total output size
	destTexture.pData = outBuf;

	CMP_CompressOptions options = {0};
	options.dwSize = sizeof( CMP_CompressOptions );
	options.fquality = 0.5f; // Medium quality for speed (0.0 to 1.0)
	options.dwnumThreads = 0; // 0 = auto-detect number of threads based on CPU cores
	options.nEncodeWith = CMP_GPU_OCL;

	// Execute compression (single call)
	CMP_ERROR cmp_status = CMP_ConvertTexture( &srcTexture, &destTexture, &options, nullptr );
	if( cmp_status != CMP_OK )
	{
		idLib::Warning( "BC6H compression failed with error code %d", cmp_status );
		memset( outBuf, 0, expectedOutputSize ); // Fill with zeros on error
	}
	else
	{
		// Add padding
		if( dstPadding > 0 )
		{
			byte* endPtr = outBuf + expectedOutputSize;
			for( int y = 0; y < numBlocksY; ++y )
			{
				byte* rowEnd = outBuf + ( y * numBlocksX * 16 );
				byte* rowNext = rowEnd + numBlocksX * 16;
				if( rowNext <= endPtr )
				{
					memset( rowEnd, 0, dstPadding );
				}
			}
		}
	}

	delete[] fp32Buf;
}
#endif // #if defined(USE_INTRINSICS_SSE)

#endif

#endif // #if !defined( DMAP )

/*
========================
idDxtEncoder::CompressImageR11G11B10_BC6HQ

Placeholder: no encoder is implemented yet, calling it raises a fatal error.
None of the parameters is used.
========================
*/
void idDxtEncoder::CompressImageR11G11B10_BC6HQ( const byte* inBuf, byte* outBuf, int width, int height )
{
	// TODO
	( void )inBuf;
	( void )outBuf;
	( void )width;
	( void )height;

	idLib::FatalError( "idDxtEncoder::CompressImageR11G11B10_BC6HQ not implemented" );
}
// RB end