// doom3-bfg/neo/renderer/DXT/DXTEncoder_SSE2.cpp
/*
===========================================================================
Doom 3 BFG Edition GPL Source Code
Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company.
This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code").
Doom 3 BFG Edition Source Code is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Doom 3 BFG Edition Source Code is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Doom 3 BFG Edition Source Code. If not, see <http://www.gnu.org/licenses/>.
In addition, the Doom 3 BFG Edition Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 BFG Edition Source Code. If not, please request a copy in writing from id Software at the address below.
If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
===========================================================================
*/
/*
================================================================================================
Contains the DxtEncoder implementation for SSE2.
================================================================================================
*/
#pragma hdrstop
#include "DXTCodec_local.h"
#include "DXTCodec.h"
#if defined( ID_WIN_X86_SSE2_INTRIN ) || ( ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) ) )
//#define TEST_COMPRESSION
#ifdef TEST_COMPRESSION
#include <malloc.h>
#endif
#define INSET_COLOR_SHIFT 4 // inset the bounding box with ( range >> shift )
#define INSET_ALPHA_SHIFT 5 // inset alpha channel
#define C565_5_MASK 0xF8 // 0xFF minus last three bits
#define C565_6_MASK 0xFC // 0xFF minus last two bits
#define NVIDIA_7X_HARDWARE_BUG_FIX // keep the DXT5 colors sorted as: max, min
#if !defined( R_SHUFFLE_D )
#define R_SHUFFLE_D( x, y, z, w ) (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
#endif
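// R_SHUFFLE_D builds the 8-bit immediate for pshufd / _mm_shuffle_epi32, listing the source
// lane for each destination dword from lowest to highest (the reverse argument order of the
// standard _MM_SHUFFLE macro). For example, R_SHUFFLE_D( 0, 1, 2, 3 ) == 0xE4 is the identity
// shuffle, and R_SHUFFLE_D( 2, 3, 2, 3 ) == 0xEE copies the upper two dwords into the lower two.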
typedef uint16 word;
typedef uint32 dword;
ALIGN16( static __m128i SIMD_SSE2_zero ) = { 0, 0, 0, 0 };
ALIGN16( static dword SIMD_SSE2_dword_byte_mask[4] ) = { 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF };
ALIGN16( static dword SIMD_SSE2_dword_word_mask[4] ) = { 0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x0000FFFF };
ALIGN16( static dword SIMD_SSE2_dword_red_mask[4] ) = { 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF };
ALIGN16( static dword SIMD_SSE2_dword_green_mask[4] ) = { 0x0000FF00, 0x0000FF00, 0x0000FF00, 0x0000FF00 };
ALIGN16( static dword SIMD_SSE2_dword_blue_mask[4] ) = { 0x00FF0000, 0x00FF0000, 0x00FF0000, 0x00FF0000 };
ALIGN16( static dword SIMD_SSE2_dword_colorMask_1010[4] ) = { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 };
ALIGN16( static dword SIMD_SSE2_dword_colorMask_0100[4] ) = { 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000 };
ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask0[4] ) = { 7<<0, 0, 7<<0, 0 };
ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask1[4] ) = { 7<<3, 0, 7<<3, 0 };
ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask2[4] ) = { 7<<6, 0, 7<<6, 0 };
ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask3[4] ) = { 7<<9, 0, 7<<9, 0 };
ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask4[4] ) = { 7<<12, 0, 7<<12, 0 };
ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask5[4] ) = { 7<<15, 0, 7<<15, 0 };
ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask6[4] ) = { 7<<18, 0, 7<<18, 0 };
ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask7[4] ) = { 7<<21, 0, 7<<21, 0 };
ALIGN16( static dword SIMD_SSE2_dword_color_bit_mask0[4] ) = { 3<<0, 0, 3<<0, 0 };
ALIGN16( static dword SIMD_SSE2_dword_color_bit_mask1[4] ) = { 3<<2, 0, 3<<2, 0 };
ALIGN16( static dword SIMD_SSE2_dword_color_bit_mask2[4] ) = { 3<<4, 0, 3<<4, 0 };
ALIGN16( static dword SIMD_SSE2_dword_color_bit_mask3[4] ) = { 3<<6, 0, 3<<6, 0 };
ALIGN16( static dword SIMD_SSE2_dword_color_bit_mask4[4] ) = { 3<<8, 0, 3<<8, 0 };
ALIGN16( static dword SIMD_SSE2_dword_color_bit_mask5[4] ) = { 3<<10, 0, 3<<10, 0 };
ALIGN16( static dword SIMD_SSE2_dword_color_bit_mask6[4] ) = { 3<<12, 0, 3<<12, 0 };
ALIGN16( static dword SIMD_SSE2_dword_color_bit_mask7[4] ) = { 3<<14, 0, 3<<14, 0 };
ALIGN16( static word SIMD_SSE2_word_0[8] ) = { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 };
ALIGN16( static word SIMD_SSE2_word_1[8] ) = { 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001 };
ALIGN16( static word SIMD_SSE2_word_2[8] ) = { 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002 };
ALIGN16( static word SIMD_SSE2_word_3[8] ) = { 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003 };
ALIGN16( static word SIMD_SSE2_word_7[8] ) = { 0x0007, 0x0007, 0x0007, 0x0007, 0x0007, 0x0007, 0x0007, 0x0007 };
ALIGN16( static word SIMD_SSE2_word_8[8] ) = { 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008 };
ALIGN16( static word SIMD_SSE2_word_31[8] ) = { 31, 31, 31, 31, 31, 31, 31, 31 };
ALIGN16( static word SIMD_SSE2_word_63[8] ) = { 63, 63, 63, 63, 63, 63, 63, 63 };
ALIGN16( static word SIMD_SSE2_word_127[8] ) = { 127, 127, 127, 127, 127, 127, 127, 127 };
ALIGN16( static word SIMD_SSE2_word_255[8] ) = { 255, 255, 255, 255, 255, 255, 255, 255 };
ALIGN16( static word SIMD_SSE2_word_center_128[8] ) = { 128, 128, 0, 0, 0, 0, 0, 0 };
ALIGN16( static word SIMD_SSE2_word_div_by_3[8] ) = { (1<<16)/3+1, (1<<16)/3+1, (1<<16)/3+1, (1<<16)/3+1, (1<<16)/3+1, (1<<16)/3+1, (1<<16)/3+1, (1<<16)/3+1 };
ALIGN16( static word SIMD_SSE2_word_div_by_6[8] ) = { (1<<16)/6+1, (1<<16)/6+1, (1<<16)/6+1, (1<<16)/6+1, (1<<16)/6+1, (1<<16)/6+1, (1<<16)/6+1, (1<<16)/6+1 };
ALIGN16( static word SIMD_SSE2_word_div_by_14[8] ) = { (1<<16)/14+1, (1<<16)/14+1, (1<<16)/14+1, (1<<16)/14+1, (1<<16)/14+1, (1<<16)/14+1, (1<<16)/14+1, (1<<16)/14+1 };
ALIGN16( static word SIMD_SSE2_word_scale_7_9_11_13[8] ) = { 7, 7, 9, 9, 11, 11, 13, 13 };
ALIGN16( static word SIMD_SSE2_word_scale_7_5_3_1[8] ) = { 7, 7, 5, 5, 3, 3, 1, 1 };
ALIGN16( static word SIMD_SSE2_word_scale_5_3_1[8] ) = { 5, 3, 1, 0, 5, 3, 1, 0 };
ALIGN16( static word SIMD_SSE2_word_scale_1_3_5[8] ) = { 1, 3, 5, 0, 1, 3, 5, 0 };
ALIGN16( static word SIMD_SSE2_word_insetShift[8] ) = { 1 << ( 16 - INSET_COLOR_SHIFT ), 1 << ( 16 - INSET_COLOR_SHIFT ), 1 << ( 16 - INSET_COLOR_SHIFT ), 1 << ( 16 - INSET_ALPHA_SHIFT ), 0, 0, 0, 0 };
ALIGN16( static word SIMD_SSE2_word_insetYCoCgRound[8] ) = { ((1<<(INSET_COLOR_SHIFT-1))-1), ((1<<(INSET_COLOR_SHIFT-1))-1), ((1<<(INSET_COLOR_SHIFT-1))-1), ((1<<(INSET_ALPHA_SHIFT-1))-1), 0, 0, 0, 0 };
ALIGN16( static word SIMD_SSE2_word_insetYCoCgMask[8] ) = { 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF };
ALIGN16( static word SIMD_SSE2_word_insetYCoCgShiftUp[8] ) = { 1 << INSET_COLOR_SHIFT, 1 << INSET_COLOR_SHIFT, 1 << INSET_COLOR_SHIFT, 1 << INSET_ALPHA_SHIFT, 0, 0, 0, 0 };
ALIGN16( static word SIMD_SSE2_word_insetYCoCgShiftDown[8] ) = { 1 << ( 16 - INSET_COLOR_SHIFT ), 1 << ( 16 - INSET_COLOR_SHIFT ), 1 << ( 16 - INSET_COLOR_SHIFT ), 1 << ( 16 - INSET_ALPHA_SHIFT ), 0, 0, 0, 0 };
ALIGN16( static word SIMD_SSE2_word_insetYCoCgQuantMask[8] ) = { C565_5_MASK, C565_6_MASK, C565_5_MASK, 0xFF, C565_5_MASK, C565_6_MASK, C565_5_MASK, 0xFF };
ALIGN16( static word SIMD_SSE2_word_insetYCoCgRep[8] ) = { 1 << ( 16 - 5 ), 1 << ( 16 - 6 ), 1 << ( 16 - 5 ), 0, 1 << ( 16 - 5 ), 1 << ( 16 - 6 ), 1 << ( 16 - 5 ), 0 };
ALIGN16( static word SIMD_SSE2_word_insetNormalDXT5Round[8] ) = { 0, ((1<<(INSET_COLOR_SHIFT-1))-1), 0, ((1<<(INSET_ALPHA_SHIFT-1))-1), 0, 0, 0, 0 };
ALIGN16( static word SIMD_SSE2_word_insetNormalDXT5Mask[8] ) = { 0x0000, 0xFFFF, 0x0000, 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000 };
ALIGN16( static word SIMD_SSE2_word_insetNormalDXT5ShiftUp[8] ) = { 1, 1 << INSET_COLOR_SHIFT, 1, 1 << INSET_ALPHA_SHIFT, 1, 1, 1, 1 };
ALIGN16( static word SIMD_SSE2_word_insetNormalDXT5ShiftDown[8] ) = { 0, 1 << ( 16 - INSET_COLOR_SHIFT ), 0, 1 << ( 16 - INSET_ALPHA_SHIFT ), 0, 0, 0, 0 };
ALIGN16( static word SIMD_SSE2_word_insetNormalDXT5QuantMask[8] ) = { 0xFF, C565_6_MASK, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
ALIGN16( static word SIMD_SSE2_word_insetNormalDXT5Rep[8] ) = { 0, 1 << ( 16 - 6 ), 0, 0, 0, 0, 0, 0 };
ALIGN16( static word SIMD_SSE2_word_insetNormal3DcRound[8] ) = { ((1<<(INSET_ALPHA_SHIFT-1))-1), ((1<<(INSET_ALPHA_SHIFT-1))-1), 0, 0, 0, 0, 0, 0 };
ALIGN16( static word SIMD_SSE2_word_insetNormal3DcMask[8] ) = { 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 };
ALIGN16( static word SIMD_SSE2_word_insetNormal3DcShiftUp[8] ) = { 1 << INSET_ALPHA_SHIFT, 1 << INSET_ALPHA_SHIFT, 1, 1, 1, 1, 1, 1 };
ALIGN16( static word SIMD_SSE2_word_insetNormal3DcShiftDown[8] ) = { 1 << ( 16 - INSET_ALPHA_SHIFT ), 1 << ( 16 - INSET_ALPHA_SHIFT ), 0, 0, 0, 0, 0, 0 };
ALIGN16( static byte SIMD_SSE2_byte_0[16] ) = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
ALIGN16( static byte SIMD_SSE2_byte_1[16] ) = { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 };
ALIGN16( static byte SIMD_SSE2_byte_2[16] ) = { 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02 };
ALIGN16( static byte SIMD_SSE2_byte_3[16] ) = { 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 };
ALIGN16( static byte SIMD_SSE2_byte_4[16] ) = { 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04 };
ALIGN16( static byte SIMD_SSE2_byte_7[16] ) = { 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07 };
ALIGN16( static byte SIMD_SSE2_byte_8[16] ) = { 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08 };
ALIGN16( static byte SIMD_SSE2_byte_not[16] ) = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
ALIGN16( static byte SIMD_SSE2_byte_colorMask[16] ) = { C565_5_MASK, C565_6_MASK, C565_5_MASK, 0x00, 0x00, 0x00, 0x00, 0x00, C565_5_MASK, C565_6_MASK, C565_5_MASK, 0x00, 0x00, 0x00, 0x00, 0x00 };
ALIGN16( static byte SIMD_SSE2_byte_colorMask2[16] ) = { 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00 };
ALIGN16( static byte SIMD_SSE2_byte_ctx1Mask[16] ) = { 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
ALIGN16( static byte SIMD_SSE2_byte_diagonalMask[16] ) = { 0x00, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
ALIGN16( static byte SIMD_SSE2_byte_scale_mask0[16] ) = { 0x00, 0x00, 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0xFF };
ALIGN16( static byte SIMD_SSE2_byte_scale_mask1[16] ) = { 0xFF, 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0x00 };
ALIGN16( static byte SIMD_SSE2_byte_scale_mask2[16] ) = { 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00 };
ALIGN16( static byte SIMD_SSE2_byte_scale_mask3[16] ) = { 0xFF, 0xFF, 0x00, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0x00, 0xFF, 0x00, 0x00, 0x00, 0x00 };
ALIGN16( static byte SIMD_SSE2_byte_scale_mask4[16] ) = { 0x00, 0x00, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00 };
ALIGN16( static byte SIMD_SSE2_byte_minus_128_0[16] ) = { (byte)-128, (byte)-128, 0, 0, (byte)-128, (byte)-128, 0, 0, (byte)-128, (byte)-128, 0, 0, (byte)-128, (byte)-128, 0, 0 };
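// The SIMD_SSE2_word_div_by_N constants implement division by a small constant N via pmulhw:
// for a non-negative 16-bit x, ( x * ( ( 1 << 16 ) / N + 1 ) ) >> 16 equals x / N for the
// input ranges used here (at most a few thousand). For example, with N = 3 and x = 765 the
// product is 765 * 21846 = 16712190, and 16712190 >> 16 == 255 == 765 / 3.
// SIMD_SSE2_word_insetShift uses the same trick as a per-lane right shift: multiplying by
// 1 << ( 16 - s ) and keeping the high 16 bits equals x >> s for non-negative x.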
/*
========================
idDxtEncoder::ExtractBlock_SSE2
params: inPtr - input image, 4 bytes per pixel
paramO: colorBlock - 4*4 output tile, 4 bytes per pixel
========================
*/
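// Note: the four rows are copied with aligned 16-byte loads and stores (movdqa / _mm_load_si128),
// so both inPtr and colorBlock must be 16-byte aligned and width must be a multiple of 4 pixels.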
ID_INLINE void idDxtEncoder::ExtractBlock_SSE2( const byte * inPtr, int width, byte * colorBlock ) const {
#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )
__asm {
mov esi, inPtr
mov edi, colorBlock
mov eax, width
shl eax, 2
movdqa xmm0, xmmword ptr [esi]
movdqa xmmword ptr [edi+ 0], xmm0
movdqa xmm1, xmmword ptr [esi+eax] // + 4 * width
movdqa xmmword ptr [edi+16], xmm1
movdqa xmm2, xmmword ptr [esi+eax*2] // + 8 * width
add esi, eax
movdqa xmmword ptr [edi+32], xmm2
movdqa xmm3, xmmword ptr [esi+eax*2] // + 12 * width
movdqa xmmword ptr [edi+48], xmm3
}
#elif defined ( ID_WIN_X86_SSE2_INTRIN )
*((__m128i *)(&colorBlock[ 0])) = _mm_load_si128( (__m128i *)( inPtr + width * 4 * 0 ) );
*((__m128i *)(&colorBlock[16])) = _mm_load_si128( (__m128i *)( inPtr + width * 4 * 1 ) );
*((__m128i *)(&colorBlock[32])) = _mm_load_si128( (__m128i *)( inPtr + width * 4 * 2 ) );
*((__m128i *)(&colorBlock[48])) = _mm_load_si128( (__m128i *)( inPtr + width * 4 * 3 ) );
#else
assert( false );
#endif
}
/*
========================
idDxtEncoder::GetMinMaxBBox_SSE2
Takes the extents of the bounding box of the colors in the 4x4 block.
params: colorBlock - 4*4 input tile, 4 bytes per pixel
paramO: minColor - Min 4 byte output color
paramO: maxColor - Max 4 byte output color
========================
*/
ID_INLINE void idDxtEncoder::GetMinMaxBBox_SSE2( const byte * colorBlock, byte * minColor, byte * maxColor ) const {
#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )
__asm {
mov eax, colorBlock
mov esi, minColor
mov edi, maxColor
movdqa xmm0, xmmword ptr [eax+ 0]
movdqa xmm1, xmmword ptr [eax+ 0]
pminub xmm0, xmmword ptr [eax+16]
pmaxub xmm1, xmmword ptr [eax+16]
pminub xmm0, xmmword ptr [eax+32]
pmaxub xmm1, xmmword ptr [eax+32]
pminub xmm0, xmmword ptr [eax+48]
pmaxub xmm1, xmmword ptr [eax+48]
pshufd xmm3, xmm0, R_SHUFFLE_D( 2, 3, 2, 3 )
pshufd xmm4, xmm1, R_SHUFFLE_D( 2, 3, 2, 3 )
pminub xmm0, xmm3
pmaxub xmm1, xmm4
pshuflw xmm6, xmm0, R_SHUFFLE_D( 2, 3, 2, 3 )
pshuflw xmm7, xmm1, R_SHUFFLE_D( 2, 3, 2, 3 )
pminub xmm0, xmm6
pmaxub xmm1, xmm7
movd dword ptr [esi], xmm0
movd dword ptr [edi], xmm1
}
#elif defined ( ID_WIN_X86_SSE2_INTRIN )
__m128i block0 = *((__m128i *)(&colorBlock[ 0]));
__m128i block1 = *((__m128i *)(&colorBlock[16]));
__m128i block2 = *((__m128i *)(&colorBlock[32]));
__m128i block3 = *((__m128i *)(&colorBlock[48]));
__m128i max1 = _mm_max_epu8( block0, block1 );
__m128i min1 = _mm_min_epu8( block0, block1 );
__m128i max2 = _mm_max_epu8( block2, block3 );
__m128i min2 = _mm_min_epu8( block2, block3 );
__m128i max3 = _mm_max_epu8( max1, max2 );
__m128i min3 = _mm_min_epu8( min1, min2 );
__m128i max4 = _mm_shuffle_epi32( max3, R_SHUFFLE_D( 2, 3, 2, 3 ) );
__m128i min4 = _mm_shuffle_epi32( min3, R_SHUFFLE_D( 2, 3, 2, 3 ) );
__m128i max5 = _mm_max_epu8( max3, max4 );
__m128i min5 = _mm_min_epu8( min3, min4 );
__m128i max6 = _mm_shufflelo_epi16( max5, R_SHUFFLE_D( 2, 3, 2, 3 ) );
__m128i min6 = _mm_shufflelo_epi16( min5, R_SHUFFLE_D( 2, 3, 2, 3 ) );
max6 = _mm_max_epu8( max5, max6 );
min6 = _mm_min_epu8( min5, min6 );
*((int *)maxColor) = _mm_cvtsi128_si32( max6 );
*((int *)minColor) = _mm_cvtsi128_si32( min6 );
#else
assert( false );
#endif
}
/*
========================
idDxtEncoder::InsetColorsBBox_SSE2
========================
*/
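// Pulls the bounding box in towards its center to reduce the influence of outlying values on
// the endpoints. Scalar sketch (illustrative only), per channel c:
//
//	inset = ( maxColor[c] - minColor[c] ) >> INSET_COLOR_SHIFT;	// INSET_ALPHA_SHIFT for the alpha channel
//	minColor[c] += inset;
//	maxColor[c] -= inset;
//
// The pmulhw with SIMD_SSE2_word_insetShift performs the per-lane right shift.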
ID_INLINE void idDxtEncoder::InsetColorsBBox_SSE2( byte * minColor, byte * maxColor ) const {
#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )
__asm {
mov esi, minColor
mov edi, maxColor
movd xmm0, dword ptr [esi]
movd xmm1, dword ptr [edi]
punpcklbw xmm0, SIMD_SSE2_byte_0
punpcklbw xmm1, SIMD_SSE2_byte_0
movdqa xmm2, xmm1
psubw xmm2, xmm0
pmulhw xmm2, SIMD_SSE2_word_insetShift
paddw xmm0, xmm2
psubw xmm1, xmm2
packuswb xmm0, xmm0
packuswb xmm1, xmm1
movd dword ptr [esi], xmm0
movd dword ptr [edi], xmm1
}
#elif defined ( ID_WIN_X86_SSE2_INTRIN )
__m128i min = _mm_cvtsi32_si128( *(int *)minColor );
__m128i max = _mm_cvtsi32_si128( *(int *)maxColor );
__m128i xmm0 = _mm_unpacklo_epi8( min, *(__m128i *)SIMD_SSE2_byte_0 );
__m128i xmm1 = _mm_unpacklo_epi8( max, *(__m128i *)SIMD_SSE2_byte_0 );
__m128i xmm2 = _mm_sub_epi16( xmm1, xmm0 );
xmm2 = _mm_mulhi_epi16( xmm2, *(__m128i *)SIMD_SSE2_word_insetShift );
xmm0 = _mm_add_epi16( xmm0, xmm2 );
xmm1 = _mm_sub_epi16( xmm1, xmm2 );
xmm0 = _mm_packus_epi16( xmm0, xmm0 );
xmm1 = _mm_packus_epi16( xmm1, xmm1 );
*((int *)minColor) = _mm_cvtsi128_si32( xmm0 );
*((int *)maxColor) = _mm_cvtsi128_si32( xmm1 );
#else
assert( false );
#endif
}
/*
========================
idDxtEncoder::EmitColorIndices_SSE2
params: colorBlock - 16 pixel block for which to find color indices
paramO: minColor - Min color found
paramO: maxColor - Max color found
return: 4 byte color index block
========================
*/
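// A scalar sketch of the index selection below (illustrative only; the branchless SSE2
// comparisons may resolve ties between equal distances differently):
//
//	byte colors[4][4];
//	// colors[0] = maxColor and colors[1] = minColor, quantized to 5:6:5 and expanded back
//	// to 8:8:8 by replicating the top bits into the low bits
//	// colors[2] = ( 2 * colors[0] + 1 * colors[1] ) / 3
//	// colors[3] = ( 1 * colors[0] + 2 * colors[1] ) / 3
//	unsigned int indices = 0;
//	for ( int i = 15; i >= 0; i-- ) {
//		int best = 0, bestDist = INT_MAX;
//		for ( int j = 0; j < 4; j++ ) {
//			int dist = abs( colorBlock[i*4+0] - colors[j][0] ) +
//				   abs( colorBlock[i*4+1] - colors[j][1] ) +
//				   abs( colorBlock[i*4+2] - colors[j][2] );
//			if ( dist < bestDist ) {
//				bestDist = dist;
//				best = j;
//			}
//		}
//		indices = ( indices << 2 ) | best;	// pixel 0 ends up in the least significant bits
//	}
//	EmitUInt( indices );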
void idDxtEncoder::EmitColorIndices_SSE2( const byte * colorBlock, const byte * minColor_, const byte * maxColor_ ) {
#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )
ALIGN16( byte color0[16] );
ALIGN16( byte color1[16] );
ALIGN16( byte color2[16] );
ALIGN16( byte color3[16] );
ALIGN16( byte result[16] );
byte *outPtr = outData;
__asm {
mov esi, maxColor_
mov edi, minColor_
pxor xmm7, xmm7
movdqa result, xmm7
movd xmm0, dword ptr [esi]
pand xmm0, SIMD_SSE2_byte_colorMask
punpcklbw xmm0, xmm7
pshuflw xmm4, xmm0, R_SHUFFLE_D( 0, 3, 2, 3 )
pshuflw xmm5, xmm0, R_SHUFFLE_D( 3, 1, 3, 3 )
psrlw xmm4, 5
psrlw xmm5, 6
por xmm0, xmm4
por xmm0, xmm5
movd xmm1, dword ptr [edi]
pand xmm1, SIMD_SSE2_byte_colorMask
punpcklbw xmm1, xmm7
pshuflw xmm4, xmm1, R_SHUFFLE_D( 0, 3, 2, 3 )
pshuflw xmm5, xmm1, R_SHUFFLE_D( 3, 1, 3, 3 )
psrlw xmm4, 5
psrlw xmm5, 6
por xmm1, xmm4
por xmm1, xmm5
movdqa xmm2, xmm0
packuswb xmm2, xmm7
pshufd xmm2, xmm2, R_SHUFFLE_D( 0, 1, 0, 1 )
movdqa color0, xmm2
movdqa xmm6, xmm0
paddw xmm6, xmm0
paddw xmm6, xmm1
pmulhw xmm6, SIMD_SSE2_word_div_by_3 // * ( ( 1 << 16 ) / 3 + 1 ) ) >> 16
packuswb xmm6, xmm7
pshufd xmm6, xmm6, R_SHUFFLE_D( 0, 1, 0, 1 )
movdqa color2, xmm6
movdqa xmm3, xmm1
packuswb xmm3, xmm7
pshufd xmm3, xmm3, R_SHUFFLE_D( 0, 1, 0, 1 )
movdqa color1, xmm3
paddw xmm1, xmm1
paddw xmm0, xmm1
pmulhw xmm0, SIMD_SSE2_word_div_by_3 // * ( ( 1 << 16 ) / 3 + 1 ) ) >> 16
packuswb xmm0, xmm7
pshufd xmm0, xmm0, R_SHUFFLE_D( 0, 1, 0, 1 )
movdqa color3, xmm0
mov eax, 32
mov esi, colorBlock
loop1: // iterates 2 times
movq xmm3, qword ptr [esi+eax+0]
pshufd xmm3, xmm3, R_SHUFFLE_D( 0, 2, 1, 3 ) // punpckldq xmm4, SIMD_SSE2_dword_0
movq xmm5, qword ptr [esi+eax+8]
pshufd xmm5, xmm5, R_SHUFFLE_D( 0, 2, 1, 3 ) // punpckldq xmm5, SIMD_SSE2_dword_0
movdqa xmm0, xmm3
movdqa xmm6, xmm5
psadbw xmm0, color0
psadbw xmm6, color0
packssdw xmm0, xmm6
movdqa xmm1, xmm3
movdqa xmm6, xmm5
psadbw xmm1, color1
psadbw xmm6, color1
packssdw xmm1, xmm6
movdqa xmm2, xmm3
movdqa xmm6, xmm5
psadbw xmm2, color2
psadbw xmm6, color2
packssdw xmm2, xmm6
psadbw xmm3, color3
psadbw xmm5, color3
packssdw xmm3, xmm5
movq xmm4, qword ptr [esi+eax+16]
pshufd xmm4, xmm4, R_SHUFFLE_D( 0, 2, 1, 3 )
movq xmm5, qword ptr [esi+eax+24]
pshufd xmm5, xmm5, R_SHUFFLE_D( 0, 2, 1, 3 )
movdqa xmm6, xmm4
movdqa xmm7, xmm5
psadbw xmm6, color0
psadbw xmm7, color0
packssdw xmm6, xmm7
packssdw xmm0, xmm6 // d0
movdqa xmm6, xmm4
movdqa xmm7, xmm5
psadbw xmm6, color1
psadbw xmm7, color1
packssdw xmm6, xmm7
packssdw xmm1, xmm6 // d1
movdqa xmm6, xmm4
movdqa xmm7, xmm5
psadbw xmm6, color2
psadbw xmm7, color2
packssdw xmm6, xmm7
packssdw xmm2, xmm6 // d2
psadbw xmm4, color3
psadbw xmm5, color3
packssdw xmm4, xmm5
packssdw xmm3, xmm4 // d3
movdqa xmm7, result
pslld xmm7, 16
movdqa xmm4, xmm0
movdqa xmm5, xmm1
pcmpgtw xmm0, xmm3 // b0
pcmpgtw xmm1, xmm2 // b1
pcmpgtw xmm4, xmm2 // b2
pcmpgtw xmm5, xmm3 // b3
pcmpgtw xmm2, xmm3 // b4
pand xmm4, xmm1 // x0
pand xmm5, xmm0 // x1
pand xmm2, xmm0 // x2
por xmm4, xmm5
pand xmm2, SIMD_SSE2_word_1
pand xmm4, SIMD_SSE2_word_2
por xmm2, xmm4
pshufd xmm5, xmm2, R_SHUFFLE_D( 2, 3, 0, 1 )
punpcklwd xmm2, SIMD_SSE2_word_0
punpcklwd xmm5, SIMD_SSE2_word_0
pslld xmm5, 8
por xmm7, xmm5
por xmm7, xmm2
movdqa result, xmm7
sub eax, 32
jge loop1
mov esi, outPtr
pshufd xmm4, xmm7, R_SHUFFLE_D( 1, 2, 3, 0 )
pshufd xmm5, xmm7, R_SHUFFLE_D( 2, 3, 0, 1 )
pshufd xmm6, xmm7, R_SHUFFLE_D( 3, 0, 1, 2 )
pslld xmm4, 2
pslld xmm5, 4
pslld xmm6, 6
por xmm7, xmm4
por xmm7, xmm5
por xmm7, xmm6
movd dword ptr [esi], xmm7
}
outData += 4;
#elif defined ( ID_WIN_X86_SSE2_INTRIN )
__m128c zero = SIMD_SSE2_zero;
__m128c result = SIMD_SSE2_zero;
__m128c color0, color1, color2, color3;
__m128c temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
__m128c minColor = _mm_cvtsi32_si128( *(int *)minColor_ );
__m128c maxColor = _mm_cvtsi32_si128( *(int *)maxColor_ );
__m128c blocka[2], blockb[2];
blocka[0] = *((__m128i *)(&colorBlock[ 0]));
blocka[1] = *((__m128i *)(&colorBlock[32]));
blockb[0] = *((__m128i *)(&colorBlock[16]));
blockb[1] = *((__m128i *)(&colorBlock[48]));
temp0 = _mm_and_si128( maxColor, (const __m128i &)SIMD_SSE2_byte_colorMask );
temp0 = _mm_unpacklo_epi8( temp0, zero );
temp4 = _mm_shufflelo_epi16( temp0, R_SHUFFLE_D( 0, 3, 2, 3 ) );
temp5 = _mm_shufflelo_epi16( temp0, R_SHUFFLE_D( 3, 1, 3, 3 ) );
temp4 = _mm_srli_epi16( temp4, 5 );
temp5 = _mm_srli_epi16( temp5, 6 );
temp0 = _mm_or_si128( temp0, temp4 );
temp0 = _mm_or_si128( temp0, temp5 );
temp1 = _mm_and_si128( minColor, (const __m128i &)SIMD_SSE2_byte_colorMask );
temp1 = _mm_unpacklo_epi8( temp1, zero );
temp4 = _mm_shufflelo_epi16( temp1, R_SHUFFLE_D( 0, 3, 2, 3 ) );
temp5 = _mm_shufflelo_epi16( temp1, R_SHUFFLE_D( 3, 1, 3, 3 ) );
temp4 = _mm_srli_epi16( temp4, 5 );
temp5 = _mm_srli_epi16( temp5, 6 );
temp1 = _mm_or_si128( temp1, temp4 );
temp1 = _mm_or_si128( temp1, temp5 );
temp2 = _mm_packus_epi16( temp0, zero );
color0 = _mm_shuffle_epi32( temp2, R_SHUFFLE_D( 0, 1, 0, 1 ) );
temp6 = _mm_add_epi16( temp0, temp0 );
temp6 = _mm_add_epi16( temp6, temp1 );
temp6 = _mm_mulhi_epi16( temp6, (const __m128i &)SIMD_SSE2_word_div_by_3 ); // * ( ( 1 << 16 ) / 3 + 1 ) ) >> 16
temp6 = _mm_packus_epi16( temp6, zero );
color2 = _mm_shuffle_epi32( temp6, R_SHUFFLE_D( 0, 1, 0, 1 ) );
temp3 = _mm_packus_epi16( temp1, zero );
color1 = _mm_shuffle_epi32( temp3, R_SHUFFLE_D( 0, 1, 0, 1 ) );
temp1 = _mm_add_epi16( temp1, temp1 );
temp0 = _mm_add_epi16( temp0, temp1 );
temp0 = _mm_mulhi_epi16( temp0, (const __m128i &)SIMD_SSE2_word_div_by_3 ); // * ( ( 1 << 16 ) / 3 + 1 ) ) >> 16
temp0 = _mm_packus_epi16( temp0, zero );
color3 = _mm_shuffle_epi32( temp0, R_SHUFFLE_D( 0, 1, 0, 1 ) );
for ( int i = 1; i >= 0; i-- ) {
// Load block
temp3 = _mm_shuffle_epi32( blocka[i], R_SHUFFLE_D( 0, 2, 1, 3 ) );
temp5 = _mm_shuffle_ps( blocka[i], zero, R_SHUFFLE_D( 2, 3, 0, 1 ) );
temp5 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 0, 2, 1, 3 ) );
temp0 = _mm_sad_epu8( temp3, color0 );
temp6 = _mm_sad_epu8( temp5, color0 );
temp0 = _mm_packs_epi32( temp0, temp6 );
temp1 = _mm_sad_epu8( temp3, color1 );
temp6 = _mm_sad_epu8( temp5, color1 );
temp1 = _mm_packs_epi32( temp1, temp6 );
temp2 = _mm_sad_epu8( temp3, color2 );
temp6 = _mm_sad_epu8( temp5, color2 );
temp2 = _mm_packs_epi32( temp2, temp6 );
temp3 = _mm_sad_epu8( temp3, color3 );
temp5 = _mm_sad_epu8( temp5, color3 );
temp3 = _mm_packs_epi32( temp3, temp5 );
// Load block
temp4 = _mm_shuffle_epi32( blockb[i], R_SHUFFLE_D( 0, 2, 1, 3 ) );
temp5 = _mm_shuffle_ps( blockb[i], zero, R_SHUFFLE_D( 2, 3, 0, 1 ) );
temp5 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 0, 2, 1, 3 ) );
temp6 = _mm_sad_epu8( temp4, color0 );
temp7 = _mm_sad_epu8( temp5, color0 );
temp6 = _mm_packs_epi32( temp6, temp7 );
temp0 = _mm_packs_epi32( temp0, temp6 ); // d0
temp6 = _mm_sad_epu8( temp4, color1 );
temp7 = _mm_sad_epu8( temp5, color1 );
temp6 = _mm_packs_epi32( temp6, temp7 );
temp1 = _mm_packs_epi32( temp1, temp6 ); // d1
temp6 = _mm_sad_epu8( temp4, color2 );
temp7 = _mm_sad_epu8( temp5, color2 );
temp6 = _mm_packs_epi32( temp6, temp7 );
temp2 = _mm_packs_epi32( temp2, temp6 ); // d2
temp4 = _mm_sad_epu8( temp4, color3 );
temp5 = _mm_sad_epu8( temp5, color3 );
temp4 = _mm_packs_epi32( temp4, temp5 );
temp3 = _mm_packs_epi32( temp3, temp4 ); // d3
temp7 = _mm_slli_epi32( result, 16 );
temp4 = _mm_cmpgt_epi16( temp0, temp2 ); // b2
temp5 = _mm_cmpgt_epi16( temp1, temp3 ); // b3
temp0 = _mm_cmpgt_epi16( temp0, temp3 ); // b0
temp1 = _mm_cmpgt_epi16( temp1, temp2 ); // b1
temp2 = _mm_cmpgt_epi16( temp2, temp3 ); // b4
temp4 = _mm_and_si128( temp4, temp1 ); // x0
temp5 = _mm_and_si128( temp5, temp0 ); // x1
temp2 = _mm_and_si128( temp2, temp0 ); // x2
temp4 = _mm_or_si128( temp4, temp5 );
temp2 = _mm_and_si128( temp2, (const __m128i &)SIMD_SSE2_word_1 );
temp4 = _mm_and_si128( temp4, (const __m128i &)SIMD_SSE2_word_2 );
temp2 = _mm_or_si128( temp2, temp4 );
temp5 = _mm_shuffle_epi32( temp2, R_SHUFFLE_D( 2, 3, 0, 1 ) );
temp2 = _mm_unpacklo_epi16( temp2, (const __m128i &)SIMD_SSE2_word_0 );
temp5 = _mm_unpacklo_epi16( temp5, (const __m128i &)SIMD_SSE2_word_0 );
temp5 = _mm_slli_epi32( temp5, 8 );
temp7 = _mm_or_si128( temp7, temp5 );
result = _mm_or_si128( temp7, temp2 );
}
temp4 = _mm_shuffle_epi32( result, R_SHUFFLE_D( 1, 2, 3, 0 ) );
temp5 = _mm_shuffle_epi32( result, R_SHUFFLE_D( 2, 3, 0, 1 ) );
temp6 = _mm_shuffle_epi32( result, R_SHUFFLE_D( 3, 0, 1, 2 ) );
temp4 = _mm_slli_epi32( temp4, 2 );
temp5 = _mm_slli_epi32( temp5, 4 );
temp6 = _mm_slli_epi32( temp6, 6 );
temp7 = _mm_or_si128( result, temp4 );
temp7 = _mm_or_si128( temp7, temp5 );
temp7 = _mm_or_si128( temp7, temp6 );
unsigned int out = _mm_cvtsi128_si32( temp7 );
EmitUInt( out );
#else
assert( false );
#endif
}
/*
========================
idDxtEncoder::EmitColorAlphaIndices_SSE2
params: colorBlock - 16 pixel block for which to find color indices
paramO: minColor - Min color found
paramO: maxColor - Max color found
return: 4 byte color index block
========================
*/
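// DXT1 block with punch-through alpha (illustrative summary): color0 = maxColor and
// color1 = minColor are 5:6:5-quantized and expanded as in EmitColorIndices_SSE2,
// color2 = ( color0 + color1 ) / 2, and index 3 means transparent. A pixel gets index 3
// when its alpha is below 128, otherwise the index of the nearest of color0/color1/color2
// by absolute RGB difference. The sixteen 2-bit indices are emitted with pixel 0 in the
// least significant bits.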
void idDxtEncoder::EmitColorAlphaIndices_SSE2( const byte *colorBlock, const byte *minColor_, const byte *maxColor_ ) {
#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )
ALIGN16( byte color0[16] );
ALIGN16( byte color1[16] );
ALIGN16( byte color2[16] );
ALIGN16( byte color3[16] );
ALIGN16( byte result[16] );
byte *outPtr = outData;
__asm {
mov esi, maxColor_
mov edi, minColor_
pxor xmm7, xmm7
movdqa result, xmm7
movd xmm0, dword ptr [esi]
pand xmm0, SIMD_SSE2_byte_colorMask
punpcklbw xmm0, xmm7
pshuflw xmm4, xmm0, R_SHUFFLE_D( 0, 3, 2, 3 )
pshuflw xmm5, xmm0, R_SHUFFLE_D( 3, 1, 3, 3 )
psrlw xmm4, 5
psrlw xmm5, 6
por xmm0, xmm4
por xmm0, xmm5
movd xmm1, dword ptr [edi]
pand xmm1, SIMD_SSE2_byte_colorMask
punpcklbw xmm1, xmm7
pshuflw xmm4, xmm1, R_SHUFFLE_D( 0, 3, 2, 3 )
pshuflw xmm5, xmm1, R_SHUFFLE_D( 3, 1, 3, 3 )
psrlw xmm4, 5
psrlw xmm5, 6
por xmm1, xmm4
por xmm1, xmm5
movdqa xmm2, xmm0
packuswb xmm2, xmm7
pshufd xmm2, xmm2, R_SHUFFLE_D( 0, 1, 0, 1 )
movdqa color0, xmm2
movdqa xmm6, xmm0
paddw xmm6, xmm1
psrlw xmm6, 1
packuswb xmm6, xmm7
pshufd xmm6, xmm6, R_SHUFFLE_D( 0, 1, 0, 1 )
movdqa color2, xmm6
movdqa xmm3, xmm1
packuswb xmm3, xmm7
pshufd xmm3, xmm3, R_SHUFFLE_D( 0, 1, 0, 1 )
movdqa color1, xmm3
movdqa color3, xmm7
mov eax, 32
mov esi, colorBlock
loop1: // iterates 2 times
movq xmm3, qword ptr [esi+eax+0]
pshufd xmm3, xmm3, R_SHUFFLE_D( 0, 2, 1, 3 )
movq xmm5, qword ptr [esi+eax+8]
pshufd xmm5, xmm5, R_SHUFFLE_D( 0, 2, 1, 3 )
movdqa xmm0, xmm3
movdqa xmm6, xmm5
psadbw xmm0, color0
psadbw xmm6, color0
packssdw xmm0, xmm6
movdqa xmm1, xmm3
movdqa xmm6, xmm5
psadbw xmm1, color1
psadbw xmm6, color1
packssdw xmm1, xmm6
movdqa xmm2, xmm3
movdqa xmm6, xmm5
psadbw xmm2, color2
psadbw xmm6, color2
packssdw xmm2, xmm6
shufps xmm3, xmm5, R_SHUFFLE_D( 0, 2, 0, 2 )
psrld xmm3, 24
packssdw xmm3, xmm3
movq xmm4, qword ptr [esi+eax+16]
pshufd xmm4, xmm4, R_SHUFFLE_D( 0, 2, 1, 3 )
movq xmm5, qword ptr [esi+eax+24]
pshufd xmm5, xmm5, R_SHUFFLE_D( 0, 2, 1, 3 )
movdqa xmm6, xmm4
movdqa xmm7, xmm5
psadbw xmm6, color0
psadbw xmm7, color0
packssdw xmm6, xmm7
packssdw xmm0, xmm6 // d0
movdqa xmm6, xmm4
movdqa xmm7, xmm5
psadbw xmm6, color1
psadbw xmm7, color1
packssdw xmm6, xmm7
packssdw xmm1, xmm6 // d1
movdqa xmm6, xmm4
movdqa xmm7, xmm5
psadbw xmm6, color2
psadbw xmm7, color2
packssdw xmm6, xmm7
packssdw xmm2, xmm6 // d2
shufps xmm4, xmm5, R_SHUFFLE_D( 0, 2, 0, 2 )
psrld xmm4, 24
packssdw xmm4, xmm4
punpcklqdq xmm3, xmm4 // c3
movdqa xmm7, result
pslld xmm7, 16
movdqa xmm4, xmm2
pcmpgtw xmm2, xmm0 // b0
pcmpgtw xmm4, xmm1 // b1
pcmpgtw xmm1, xmm0 // b2
pmaxsw xmm3, SIMD_SSE2_word_127 // b3
pcmpeqw xmm3, SIMD_SSE2_word_127
pand xmm2, xmm4
por xmm2, xmm3 // b0 & b1 | b3
pxor xmm1, xmm4
por xmm1, xmm3 // b2 ^ b1 | b3
pand xmm2, SIMD_SSE2_word_2
pand xmm1, SIMD_SSE2_word_1
por xmm2, xmm1
pshufd xmm5, xmm2, R_SHUFFLE_D( 2, 3, 0, 1 )
punpcklwd xmm2, SIMD_SSE2_word_0
punpcklwd xmm5, SIMD_SSE2_word_0
pslld xmm5, 8
por xmm7, xmm5
por xmm7, xmm2
movdqa result, xmm7
sub eax, 32
jge loop1
mov esi, outPtr
pshufd xmm4, xmm7, R_SHUFFLE_D( 1, 2, 3, 0 )
pshufd xmm5, xmm7, R_SHUFFLE_D( 2, 3, 0, 1 )
pshufd xmm6, xmm7, R_SHUFFLE_D( 3, 0, 1, 2 )
pslld xmm4, 2
pslld xmm5, 4
pslld xmm6, 6
por xmm7, xmm4
por xmm7, xmm5
por xmm7, xmm6
movd dword ptr [esi], xmm7
}
outData += 4;
#elif defined ( ID_WIN_X86_SSE2_INTRIN )
__m128c zero = SIMD_SSE2_zero;
__m128c result = SIMD_SSE2_zero;
__m128c color0, color1, color2;
__m128c temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
__m128c minColor = _mm_cvtsi32_si128( *(int *)minColor_ );
__m128c maxColor = _mm_cvtsi32_si128( *(int *)maxColor_ );
__m128c blocka[2], blockb[2];
blocka[0] = *((__m128i *)(&colorBlock[ 0]));
blocka[1] = *((__m128i *)(&colorBlock[32]));
blockb[0] = *((__m128i *)(&colorBlock[16]));
blockb[1] = *((__m128i *)(&colorBlock[48]));
temp0 = _mm_and_si128( maxColor, *(__m128c*)SIMD_SSE2_byte_colorMask );
temp0 = _mm_unpacklo_epi8( temp0, zero );
temp4 = _mm_shufflelo_epi16( temp0, R_SHUFFLE_D( 0, 3, 2, 3 ) );
temp5 = _mm_shufflelo_epi16( temp0, R_SHUFFLE_D( 3, 1, 3, 3 ) );
temp4 = _mm_srli_epi16( temp4, 5 );
temp5 = _mm_srli_epi16( temp5, 6 );
temp0 = _mm_or_si128( temp0, temp4 );
temp0 = _mm_or_si128( temp0, temp5 );
temp1 = _mm_and_si128( minColor, *(__m128c*)SIMD_SSE2_byte_colorMask );
temp1 = _mm_unpacklo_epi8( temp1, zero );
temp4 = _mm_shufflelo_epi16( temp1, R_SHUFFLE_D( 0, 3, 2, 3 ) );
temp5 = _mm_shufflelo_epi16( temp1, R_SHUFFLE_D( 3, 1, 3, 3 ) );
temp4 = _mm_srli_epi16( temp4, 5 );
temp5 = _mm_srli_epi16( temp5, 6 );
temp1 = _mm_or_si128( temp1, temp4 );
temp1 = _mm_or_si128( temp1, temp5 );
temp2 = _mm_packus_epi16( temp0, zero );
color0 = _mm_shuffle_epi32( temp2, R_SHUFFLE_D( 0, 1, 0, 1 ) );
temp6 = _mm_add_epi16( temp0, temp0 );
temp6 = _mm_srli_epi16( temp6, 1 ); // diff from color
temp6 = _mm_packus_epi16( temp6, zero );
color2 = _mm_shuffle_epi32( temp6, R_SHUFFLE_D( 0, 1, 0, 1 ) );
temp3 = _mm_packus_epi16( temp1, zero );
color1 = _mm_shuffle_epi32( temp3, R_SHUFFLE_D( 0, 1, 0, 1 ) );
// not used
//color3 = zero;
for ( int i = 1; i >= 0; i-- ) {
// Load block
temp3 = _mm_shuffle_epi32( blocka[i], R_SHUFFLE_D( 0, 2, 1, 3 ) );
temp5 = _mm_shuffle_ps( blocka[i], zero, R_SHUFFLE_D( 2, 3, 0, 1 ) );
temp5 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 0, 2, 1, 3 ) );
temp0 = _mm_sad_epu8( temp3, color0 );
temp6 = _mm_sad_epu8( temp5, color0 );
temp0 = _mm_packs_epi32( temp0, temp6 );
temp1 = _mm_sad_epu8( temp3, color1 );
temp6 = _mm_sad_epu8( temp5, color1 );
temp1 = _mm_packs_epi32( temp1, temp6 );
temp2 = _mm_sad_epu8( temp3, color2 );
temp6 = _mm_sad_epu8( temp5, color2 );
temp2 = _mm_packs_epi32( temp2, temp6 );
// diff from color
temp3 = _mm_shuffle_ps( temp3, temp5, R_SHUFFLE_D( 0, 2, 0, 2 ) );
temp3 = _mm_srli_epi32( temp3, 24 );
temp3 = _mm_packs_epi32( temp3, temp3 );
// Load block
temp4 = _mm_shuffle_epi32( blockb[i], R_SHUFFLE_D( 0, 2, 1, 3 ) );
temp5 = _mm_shuffle_ps( blockb[i], zero, R_SHUFFLE_D( 2, 3, 0, 1 ) );
temp5 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 0, 2, 1, 3 ) );
temp6 = _mm_sad_epu8( temp4, color0 );
temp7 = _mm_sad_epu8( temp5, color0 );
temp6 = _mm_packs_epi32( temp6, temp7 );
temp0 = _mm_packs_epi32( temp0, temp6 ); // d0
temp6 = _mm_sad_epu8( temp4, color1 );
temp7 = _mm_sad_epu8( temp5, color1 );
temp6 = _mm_packs_epi32( temp6, temp7 );
temp1 = _mm_packs_epi32( temp1, temp6 ); // d1
temp6 = _mm_sad_epu8( temp4, color2 );
temp7 = _mm_sad_epu8( temp5, color2 );
temp6 = _mm_packs_epi32( temp6, temp7 );
temp2 = _mm_packs_epi32( temp2, temp6 ); // d2
// diff from color
temp4 = _mm_shuffle_ps( temp4, temp5, R_SHUFFLE_D( 0, 2, 0, 2 ) ); // c3
temp4 = _mm_srli_epi32( temp4, 24 );
temp4 = _mm_packs_epi32( temp4, temp4 );
temp3 = _mm_unpacklo_epi64( temp3, temp4 );
temp7 = _mm_slli_epi32( result, 16 );
// diff from color
temp4 = _mm_cmpgt_epi16( temp2, temp1 ); // b1
temp2 = _mm_cmpgt_epi16( temp2, temp0 ); // b0
temp1 = _mm_cmpgt_epi16( temp1, temp0 ); // b2
temp3 = _mm_max_epi16( temp3, (const __m128i &)SIMD_SSE2_word_127 ); // b3
temp3 = _mm_cmpeq_epi16( temp3, (const __m128i &)SIMD_SSE2_word_127 );
temp2 = _mm_and_si128( temp2, temp4 );
temp2 = _mm_or_si128( temp2, temp3 ); // b0 & b1 | b3
temp1 = _mm_xor_si128( temp1, temp4 );
temp1 = _mm_or_si128( temp1, temp3 ); // b2 ^ b1 | b3
temp2 = _mm_and_si128( temp2, (const __m128i &)SIMD_SSE2_word_2 );
temp1 = _mm_and_si128( temp1, (const __m128i &)SIMD_SSE2_word_1 );
temp2 = _mm_or_si128( temp2, temp1 );
temp5 = _mm_shuffle_epi32( temp2, R_SHUFFLE_D( 2, 3, 0, 1 ) );
temp2 = _mm_unpacklo_epi16( temp2, (const __m128i &)SIMD_SSE2_word_0 );
temp5 = _mm_unpacklo_epi16( temp5, (const __m128i &)SIMD_SSE2_word_0 );
temp5 = _mm_slli_epi32( temp5, 8 );
temp7 = _mm_or_si128( temp7, temp5 );
result = _mm_or_si128( temp7, temp2 );
}
temp4 = _mm_shuffle_epi32( result, R_SHUFFLE_D( 1, 2, 3, 0 ) );
temp5 = _mm_shuffle_epi32( result, R_SHUFFLE_D( 2, 3, 0, 1 ) );
temp6 = _mm_shuffle_epi32( result, R_SHUFFLE_D( 3, 0, 1, 2 ) );
temp4 = _mm_slli_epi32( temp4, 2 );
temp5 = _mm_slli_epi32( temp5, 4 );
temp6 = _mm_slli_epi32( temp6, 6 );
temp7 = _mm_or_si128( result, temp4 );
temp7 = _mm_or_si128( temp7, temp5 );
temp7 = _mm_or_si128( temp7, temp6 );
unsigned int out = _mm_cvtsi128_si32( temp7 );
EmitUInt( out );
#else
assert( false );
#endif
}
/*
========================
idDxtEncoder::EmitCoCgIndices_SSE2
params: colorBlock - 16 pixel block for which to find color indices
paramO: minColor - Min color found
paramO: maxColor - Max color found
return: 4 byte color index block
========================
*/
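// Used for the chroma (CoCg) channels of YCoCg-DXT5 blocks. The index selection is the same
// nearest-of-four search as in EmitColorIndices_SSE2, but the endpoints keep their full 8-bit
// precision for the distance test: SIMD_SSE2_byte_colorMask2 only clears the alpha byte
// instead of quantizing to 5:6:5.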
void idDxtEncoder::EmitCoCgIndices_SSE2( const byte *colorBlock, const byte *minColor_, const byte *maxColor_ ) {
#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )
ALIGN16( byte color0[16] );
ALIGN16( byte color1[16] );
ALIGN16( byte color2[16] );
ALIGN16( byte color3[16] );
ALIGN16( byte result[16] );
byte *outPtr = outData;
__asm {
mov esi, maxColor_
mov edi, minColor_
pxor xmm7, xmm7
movdqa result, xmm7
movd xmm0, dword ptr [esi]
pand xmm0, SIMD_SSE2_byte_colorMask2
pshufd xmm0, xmm0, R_SHUFFLE_D( 0, 1, 0, 1 )
movdqa color0, xmm0
movd xmm1, dword ptr [edi]
pand xmm1, SIMD_SSE2_byte_colorMask2
pshufd xmm1, xmm1, R_SHUFFLE_D( 0, 1, 0, 1 )
movdqa color1, xmm1
punpcklbw xmm0, xmm7
punpcklbw xmm1, xmm7
movdqa xmm6, xmm1
paddw xmm1, xmm0
paddw xmm0, xmm1
pmulhw xmm0, SIMD_SSE2_word_div_by_3 // * ( ( 1 << 16 ) / 3 + 1 ) ) >> 16
packuswb xmm0, xmm7
pshufd xmm0, xmm0, R_SHUFFLE_D( 0, 1, 0, 1 )
movdqa color2, xmm0
paddw xmm1, xmm6
pmulhw xmm1, SIMD_SSE2_word_div_by_3 // * ( ( 1 << 16 ) / 3 + 1 ) ) >> 16
packuswb xmm1, xmm7
pshufd xmm1, xmm1, R_SHUFFLE_D( 0, 1, 0, 1 )
movdqa color3, xmm1
mov eax, 32
mov esi, colorBlock
loop1: // iterates 2 times
movq xmm3, qword ptr [esi+eax+0]
pshufd xmm3, xmm3, R_SHUFFLE_D( 0, 2, 1, 3 ) // punpckldq xmm4, SIMD_SSE2_dword_0
movq xmm5, qword ptr [esi+eax+8]
pshufd xmm5, xmm5, R_SHUFFLE_D( 0, 2, 1, 3 ) // punpckldq xmm5, SIMD_SSE2_dword_0
movdqa xmm0, xmm3
movdqa xmm6, xmm5
psadbw xmm0, color0
psadbw xmm6, color0
packssdw xmm0, xmm6
movdqa xmm1, xmm3
movdqa xmm6, xmm5
psadbw xmm1, color1
psadbw xmm6, color1
packssdw xmm1, xmm6
movdqa xmm2, xmm3
movdqa xmm6, xmm5
psadbw xmm2, color2
psadbw xmm6, color2
packssdw xmm2, xmm6
psadbw xmm3, color3
psadbw xmm5, color3
packssdw xmm3, xmm5
movq xmm4, qword ptr [esi+eax+16]
pshufd xmm4, xmm4, R_SHUFFLE_D( 0, 2, 1, 3 )
movq xmm5, qword ptr [esi+eax+24]
pshufd xmm5, xmm5, R_SHUFFLE_D( 0, 2, 1, 3 )
movdqa xmm6, xmm4
movdqa xmm7, xmm5
psadbw xmm6, color0
psadbw xmm7, color0
packssdw xmm6, xmm7
packssdw xmm0, xmm6 // d0
movdqa xmm6, xmm4
movdqa xmm7, xmm5
psadbw xmm6, color1
psadbw xmm7, color1
packssdw xmm6, xmm7
packssdw xmm1, xmm6 // d1
movdqa xmm6, xmm4
movdqa xmm7, xmm5
psadbw xmm6, color2
psadbw xmm7, color2
packssdw xmm6, xmm7
packssdw xmm2, xmm6 // d2
psadbw xmm4, color3
psadbw xmm5, color3
packssdw xmm4, xmm5
packssdw xmm3, xmm4 // d3
movdqa xmm7, result
pslld xmm7, 16
movdqa xmm4, xmm0
movdqa xmm5, xmm1
pcmpgtw xmm0, xmm3 // b0
pcmpgtw xmm1, xmm2 // b1
pcmpgtw xmm4, xmm2 // b2
pcmpgtw xmm5, xmm3 // b3
pcmpgtw xmm2, xmm3 // b4
pand xmm4, xmm1 // x0
pand xmm5, xmm0 // x1
pand xmm2, xmm0 // x2
por xmm4, xmm5
pand xmm2, SIMD_SSE2_word_1
pand xmm4, SIMD_SSE2_word_2
por xmm2, xmm4
pshufd xmm5, xmm2, R_SHUFFLE_D( 2, 3, 0, 1 )
punpcklwd xmm2, SIMD_SSE2_word_0
punpcklwd xmm5, SIMD_SSE2_word_0
pslld xmm5, 8
por xmm7, xmm5
por xmm7, xmm2
movdqa result, xmm7
sub eax, 32
jge loop1
mov esi, outPtr
pshufd xmm4, xmm7, R_SHUFFLE_D( 1, 2, 3, 0 )
pshufd xmm5, xmm7, R_SHUFFLE_D( 2, 3, 0, 1 )
pshufd xmm6, xmm7, R_SHUFFLE_D( 3, 0, 1, 2 )
pslld xmm4, 2
pslld xmm5, 4
pslld xmm6, 6
por xmm7, xmm4
por xmm7, xmm5
por xmm7, xmm6
movd dword ptr [esi], xmm7
}
outData += 4;
#elif defined ( ID_WIN_X86_SSE2_INTRIN )
__m128c zero = SIMD_SSE2_zero;
__m128c result = SIMD_SSE2_zero;
__m128c color0, color1, color2, color3;
__m128c temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
__m128c minColor = _mm_cvtsi32_si128( *(int *)minColor_ );
__m128c maxColor = _mm_cvtsi32_si128( *(int *)maxColor_ );
__m128c blocka[2], blockb[2];
blocka[0] = *((__m128i *)(&colorBlock[ 0]));
blocka[1] = *((__m128i *)(&colorBlock[32]));
blockb[0] = *((__m128i *)(&colorBlock[16]));
blockb[1] = *((__m128i *)(&colorBlock[48]));
temp7 = zero;
temp0 = maxColor;
temp0 = _mm_and_si128( temp0, *(__m128c*)SIMD_SSE2_byte_colorMask2 );
color0 = _mm_shuffle_epi32( temp0, R_SHUFFLE_D( 0, 1, 0, 1 ) );
temp1 = minColor;
temp1 = _mm_and_si128( temp1, *(__m128c*)SIMD_SSE2_byte_colorMask2 );
color1 = _mm_shuffle_epi32( temp1, R_SHUFFLE_D( 0, 1, 0, 1 ) );
temp0 = _mm_unpacklo_epi8( color0, zero );
temp1 = _mm_unpacklo_epi8( color1, zero );
temp6 = _mm_add_epi16( temp1, temp0 );
temp0 = _mm_add_epi16( temp0, temp6 );
temp0 = _mm_mulhi_epi16( temp0, (const __m128i &)SIMD_SSE2_word_div_by_3 ); // * ( ( 1 << 16 ) / 3 + 1 ) ) >> 16
temp0 = _mm_packus_epi16( temp0, zero );
color2 = _mm_shuffle_epi32( temp0, R_SHUFFLE_D( 0, 1, 0, 1 ) );
temp1 = _mm_add_epi16( temp1, temp6 );
temp1 = _mm_mulhi_epi16( temp1, (const __m128i &)SIMD_SSE2_word_div_by_3 ); // * ( ( 1 << 16 ) / 3 + 1 ) ) >> 16
temp1 = _mm_packus_epi16( temp1, zero );
color3 = _mm_shuffle_epi32( temp1, R_SHUFFLE_D( 0, 1, 0, 1 ) );
for ( int i = 1; i >= 0; i-- ) {
// Load block
temp3 = _mm_shuffle_epi32( blocka[i], R_SHUFFLE_D( 0, 2, 1, 3 ) );
temp5 = _mm_shuffle_ps( blocka[i], zero, R_SHUFFLE_D( 2, 3, 0, 1 ) );
temp5 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 0, 2, 1, 3 ) );
temp0 = _mm_sad_epu8( temp3, color0 );
temp6 = _mm_sad_epu8( temp5, color0 );
temp0 = _mm_packs_epi32( temp0, temp6 );
temp1 = _mm_sad_epu8( temp3, color1 );
temp6 = _mm_sad_epu8( temp5, color1 );
temp1 = _mm_packs_epi32( temp1, temp6 );
temp2 = _mm_sad_epu8( temp3, color2 );
temp6 = _mm_sad_epu8( temp5, color2 );
temp2 = _mm_packs_epi32( temp2, temp6 );
temp3 = _mm_sad_epu8( temp3, color3 );
temp5 = _mm_sad_epu8( temp5, color3 );
temp3 = _mm_packs_epi32( temp3, temp5 );
// Load block
temp4 = _mm_shuffle_epi32( blockb[i], R_SHUFFLE_D( 0, 2, 1, 3 ) );
temp5 = _mm_shuffle_ps( blockb[i], zero, R_SHUFFLE_D( 2, 3, 0, 1 ) );
temp5 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 0, 2, 1, 3 ) );
temp6 = _mm_sad_epu8( temp4, color0 );
temp7 = _mm_sad_epu8( temp5, color0 );
temp6 = _mm_packs_epi32( temp6, temp7 );
temp0 = _mm_packs_epi32( temp0, temp6 ); // d0
temp6 = _mm_sad_epu8( temp4, color1 );
temp7 = _mm_sad_epu8( temp5, color1 );
temp6 = _mm_packs_epi32( temp6, temp7 );
temp1 = _mm_packs_epi32( temp1, temp6 ); // d1
temp6 = _mm_sad_epu8( temp4, color2 );
temp7 = _mm_sad_epu8( temp5, color2 );
temp6 = _mm_packs_epi32( temp6, temp7 );
temp2 = _mm_packs_epi32( temp2, temp6 ); // d2
temp4 = _mm_sad_epu8( temp4, color3 );
temp5 = _mm_sad_epu8( temp5, color3 );
temp4 = _mm_packs_epi32( temp4, temp5 );
temp3 = _mm_packs_epi32( temp3, temp4 ); // d3
temp7 = _mm_slli_epi32( result, 16 );
temp4 = _mm_cmpgt_epi16( temp0, temp2 ); // b2
temp5 = _mm_cmpgt_epi16( temp1, temp3 ); // b3
temp0 = _mm_cmpgt_epi16( temp0, temp3 ); // b0
temp1 = _mm_cmpgt_epi16( temp1, temp2 ); // b1
temp2 = _mm_cmpgt_epi16( temp2, temp3 ); // b4
temp4 = _mm_and_si128( temp4, temp1 ); // x0
temp5 = _mm_and_si128( temp5, temp0 ); // x1
temp2 = _mm_and_si128( temp2, temp0 ); // x2
temp4 = _mm_or_si128( temp4, temp5 );
temp2 = _mm_and_si128( temp2, (const __m128i &)SIMD_SSE2_word_1 );
temp4 = _mm_and_si128( temp4, (const __m128i &)SIMD_SSE2_word_2 );
temp2 = _mm_or_si128( temp2, temp4 );
temp5 = _mm_shuffle_epi32( temp2, R_SHUFFLE_D( 2, 3, 0, 1 ) );
temp2 = _mm_unpacklo_epi16( temp2, (const __m128i &)SIMD_SSE2_word_0 );
temp5 = _mm_unpacklo_epi16( temp5, (const __m128i &)SIMD_SSE2_word_0 );
temp5 = _mm_slli_epi32( temp5, 8 );
temp7 = _mm_or_si128( temp7, temp5 );
result = _mm_or_si128( temp7, temp2 );
}
temp4 = _mm_shuffle_epi32( result, R_SHUFFLE_D( 1, 2, 3, 0 ) );
temp5 = _mm_shuffle_epi32( result, R_SHUFFLE_D( 2, 3, 0, 1 ) );
temp6 = _mm_shuffle_epi32( result, R_SHUFFLE_D( 3, 0, 1, 2 ) );
temp4 = _mm_slli_epi32( temp4, 2 );
temp5 = _mm_slli_epi32( temp5, 4 );
temp6 = _mm_slli_epi32( temp6, 6 );
temp7 = _mm_or_si128( result, temp4 );
temp7 = _mm_or_si128( temp7, temp5 );
temp7 = _mm_or_si128( temp7, temp6 );
unsigned int out = _mm_cvtsi128_si32( temp7 );
EmitUInt( out );
#else
assert( false );
#endif
}
/*
========================
idDxtEncoder::EmitAlphaIndices_SSE2
params: block - 16 pixel block for which to find alpha indices
paramO: minAlpha - Min alpha found
paramO: maxAlpha - Max alpha found
========================
*/
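// A scalar sketch of the 3-bit DXT5 alpha index selection below (illustrative only; the SSE2
// code compares against the midpoints ab1..ab7 between consecutive palette entries instead of
// searching for the nearest entry explicitly):
//
//	byte palette[8];
//	palette[0] = (byte)maxAlpha;
//	palette[1] = (byte)minAlpha;
//	for ( int j = 1; j < 7; j++ ) {
//		palette[j + 1] = (byte)( ( ( 7 - j ) * maxAlpha + j * minAlpha ) / 7 );
//	}
//	// for each of the 16 pixels, pick the palette index closest to its alpha value and
//	// pack the sixteen 3-bit indices into 6 bytes, pixel 0 in the least significant bits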
void idDxtEncoder::EmitAlphaIndices_SSE2( const byte *block, const int minAlpha_, const int maxAlpha_ ) {
#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )
assert( maxAlpha_ >= minAlpha_ );
byte *outPtr = outData;
__asm {
mov esi, block
movdqa xmm0, xmmword ptr [esi+ 0]
movdqa xmm5, xmmword ptr [esi+16]
movdqa xmm6, xmmword ptr [esi+32]
movdqa xmm4, xmmword ptr [esi+48]
psrld xmm0, 24
psrld xmm5, 24
psrld xmm6, 24
psrld xmm4, 24
packuswb xmm0, xmm5
packuswb xmm6, xmm4
//---------------------
// ab0 = ( 7 * maxAlpha + 7 * minAlpha + ALPHA_RANGE ) / 14
// ab3 = ( 9 * maxAlpha + 5 * minAlpha + ALPHA_RANGE ) / 14
// ab2 = ( 11 * maxAlpha + 3 * minAlpha + ALPHA_RANGE ) / 14
// ab1 = ( 13 * maxAlpha + 1 * minAlpha + ALPHA_RANGE ) / 14
// ab4 = ( 7 * maxAlpha + 7 * minAlpha + ALPHA_RANGE ) / 14
// ab5 = ( 5 * maxAlpha + 9 * minAlpha + ALPHA_RANGE ) / 14
// ab6 = ( 3 * maxAlpha + 11 * minAlpha + ALPHA_RANGE ) / 14
// ab7 = ( 1 * maxAlpha + 13 * minAlpha + ALPHA_RANGE ) / 14
movd xmm5, maxAlpha_
pshuflw xmm5, xmm5, R_SHUFFLE_D( 0, 0, 0, 0 )
pshufd xmm5, xmm5, R_SHUFFLE_D( 0, 0, 0, 0 )
movdqa xmm7, xmm5
movd xmm2, minAlpha_
pshuflw xmm2, xmm2, R_SHUFFLE_D( 0, 0, 0, 0 )
pshufd xmm2, xmm2, R_SHUFFLE_D( 0, 0, 0, 0 )
movdqa xmm3, xmm2
pmullw xmm5, SIMD_SSE2_word_scale_7_9_11_13
pmullw xmm7, SIMD_SSE2_word_scale_7_5_3_1
pmullw xmm2, SIMD_SSE2_word_scale_7_5_3_1
pmullw xmm3, SIMD_SSE2_word_scale_7_9_11_13
paddw xmm5, xmm2
paddw xmm7, xmm3
paddw xmm5, SIMD_SSE2_word_7
paddw xmm7, SIMD_SSE2_word_7
pmulhw xmm5, SIMD_SSE2_word_div_by_14 // * ( ( 1 << 16 ) / 14 + 1 ) ) >> 16
pmulhw xmm7, SIMD_SSE2_word_div_by_14 // * ( ( 1 << 16 ) / 14 + 1 ) ) >> 16
pshufd xmm1, xmm5, R_SHUFFLE_D( 3, 3, 3, 3 )
pshufd xmm2, xmm5, R_SHUFFLE_D( 2, 2, 2, 2 )
pshufd xmm3, xmm5, R_SHUFFLE_D( 1, 1, 1, 1 )
packuswb xmm1, xmm1 // ab1
packuswb xmm2, xmm2 // ab2
packuswb xmm3, xmm3 // ab3
packuswb xmm0, xmm6 // alpha block
pshufd xmm4, xmm7, R_SHUFFLE_D( 0, 0, 0, 0 )
pshufd xmm5, xmm7, R_SHUFFLE_D( 1, 1, 1, 1 )
pshufd xmm6, xmm7, R_SHUFFLE_D( 2, 2, 2, 2 )
pshufd xmm7, xmm7, R_SHUFFLE_D( 3, 3, 3, 3 )
packuswb xmm4, xmm4 // ab4
packuswb xmm5, xmm5 // ab5
packuswb xmm6, xmm6 // ab6
packuswb xmm7, xmm7 // ab7
pmaxub xmm1, xmm0
pmaxub xmm2, xmm0
pmaxub xmm3, xmm0
pcmpeqb xmm1, xmm0
pcmpeqb xmm2, xmm0
pcmpeqb xmm3, xmm0
pmaxub xmm4, xmm0
pmaxub xmm5, xmm0
pmaxub xmm6, xmm0
pmaxub xmm7, xmm0
pcmpeqb xmm4, xmm0
pcmpeqb xmm5, xmm0
pcmpeqb xmm6, xmm0
pcmpeqb xmm7, xmm0
movdqa xmm0, SIMD_SSE2_byte_8
paddsb xmm0, xmm1
paddsb xmm2, xmm3
paddsb xmm4, xmm5
paddsb xmm6, xmm7
paddsb xmm0, xmm2
paddsb xmm4, xmm6
paddsb xmm0, xmm4
pand xmm0, SIMD_SSE2_byte_7
movdqa xmm1, SIMD_SSE2_byte_2
pcmpgtb xmm1, xmm0
pand xmm1, SIMD_SSE2_byte_1
pxor xmm0, xmm1
movdqa xmm1, xmm0
movdqa xmm2, xmm0
movdqa xmm3, xmm0
movdqa xmm4, xmm0
movdqa xmm5, xmm0
movdqa xmm6, xmm0
movdqa xmm7, xmm0
psrlq xmm1, 8- 3
psrlq xmm2, 16- 6
psrlq xmm3, 24- 9
psrlq xmm4, 32-12
psrlq xmm5, 40-15
psrlq xmm6, 48-18
psrlq xmm7, 56-21
pand xmm0, SIMD_SSE2_dword_alpha_bit_mask0
pand xmm1, SIMD_SSE2_dword_alpha_bit_mask1
pand xmm2, SIMD_SSE2_dword_alpha_bit_mask2
pand xmm3, SIMD_SSE2_dword_alpha_bit_mask3
pand xmm4, SIMD_SSE2_dword_alpha_bit_mask4
pand xmm5, SIMD_SSE2_dword_alpha_bit_mask5
pand xmm6, SIMD_SSE2_dword_alpha_bit_mask6
pand xmm7, SIMD_SSE2_dword_alpha_bit_mask7
por xmm0, xmm1
por xmm2, xmm3
por xmm4, xmm5
por xmm6, xmm7
por xmm0, xmm2
por xmm4, xmm6
por xmm0, xmm4
mov esi, outPtr
movd [esi+0], xmm0
pshufd xmm1, xmm0, R_SHUFFLE_D( 2, 3, 0, 1 )
movd [esi+3], xmm1
}
outData += 6;
#elif defined ( ID_WIN_X86_SSE2_INTRIN )
__m128i block0 = *((__m128i *)(&block[ 0]));
__m128i block1 = *((__m128i *)(&block[16]));
__m128i block2 = *((__m128i *)(&block[32]));
__m128i block3 = *((__m128i *)(&block[48]));
__m128c temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
temp0 = _mm_srli_epi32( block0, 24 );
temp5 = _mm_srli_epi32( block1, 24 );
temp6 = _mm_srli_epi32( block2, 24 );
temp4 = _mm_srli_epi32( block3, 24 );
temp0 = _mm_packus_epi16( temp0, temp5 );
temp6 = _mm_packus_epi16( temp6, temp4 );
//---------------------
// ab0 = ( 7 * maxAlpha + 7 * minAlpha + ALPHA_RANGE ) / 14
// ab3 = ( 9 * maxAlpha + 5 * minAlpha + ALPHA_RANGE ) / 14
// ab2 = ( 11 * maxAlpha + 3 * minAlpha + ALPHA_RANGE ) / 14
// ab1 = ( 13 * maxAlpha + 1 * minAlpha + ALPHA_RANGE ) / 14
// ab4 = ( 7 * maxAlpha + 7 * minAlpha + ALPHA_RANGE ) / 14
// ab5 = ( 5 * maxAlpha + 9 * minAlpha + ALPHA_RANGE ) / 14
// ab6 = ( 3 * maxAlpha + 11 * minAlpha + ALPHA_RANGE ) / 14
// ab7 = ( 1 * maxAlpha + 13 * minAlpha + ALPHA_RANGE ) / 14
temp5 = _mm_cvtsi32_si128( maxAlpha_ );
temp5 = _mm_shufflelo_epi16( temp5, R_SHUFFLE_D( 0, 0, 0, 0 ) );
temp5 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 0, 0, 0, 0 ) );
temp2 = _mm_cvtsi32_si128( minAlpha_ );
temp2 = _mm_shufflelo_epi16( temp2, R_SHUFFLE_D( 0, 0, 0, 0 ) );
temp2 = _mm_shuffle_epi32( temp2, R_SHUFFLE_D( 0, 0, 0, 0 ) );
temp7 = _mm_mullo_epi16( temp5, (const __m128i &)SIMD_SSE2_word_scale_7_5_3_1 );
temp5 = _mm_mullo_epi16( temp5, (const __m128i &)SIMD_SSE2_word_scale_7_9_11_13 );
temp3 = _mm_mullo_epi16( temp2, (const __m128i &)SIMD_SSE2_word_scale_7_9_11_13 );
temp2 = _mm_mullo_epi16( temp2, (const __m128i &)SIMD_SSE2_word_scale_7_5_3_1 );
temp5 = _mm_add_epi16( temp5, temp2 );
temp7 = _mm_add_epi16( temp7, temp3 );
temp5 = _mm_add_epi16( temp5, (const __m128i &)SIMD_SSE2_word_7 );
temp7 = _mm_add_epi16( temp7, (const __m128i &)SIMD_SSE2_word_7 );
temp5 = _mm_mulhi_epi16( temp5, (const __m128i &)SIMD_SSE2_word_div_by_14 );
temp7 = _mm_mulhi_epi16( temp7, (const __m128i &)SIMD_SSE2_word_div_by_14 );
temp1 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 3, 3, 3, 3 ) );
temp2 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 2, 2, 2, 2 ) );
temp3 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 1, 1, 1, 1 ) );
temp1 = _mm_packus_epi16( temp1, temp1 );
temp2 = _mm_packus_epi16( temp2, temp2 );
temp3 = _mm_packus_epi16( temp3, temp3 );
temp0 = _mm_packus_epi16( temp0, temp6 );
temp4 = _mm_shuffle_epi32( temp7, R_SHUFFLE_D( 0, 0, 0, 0 ) );
temp5 = _mm_shuffle_epi32( temp7, R_SHUFFLE_D( 1, 1, 1, 1 ) );
temp6 = _mm_shuffle_epi32( temp7, R_SHUFFLE_D( 2, 2, 2, 2 ) );
temp7 = _mm_shuffle_epi32( temp7, R_SHUFFLE_D( 3, 3, 3, 3 ) );
temp4 = _mm_packus_epi16( temp4, temp4 );
temp5 = _mm_packus_epi16( temp5, temp5 );
temp6 = _mm_packus_epi16( temp6, temp6 );
temp7 = _mm_packus_epi16( temp7, temp7 );
temp1 = _mm_max_epu8( temp1, temp0 );
temp2 = _mm_max_epu8( temp2, temp0 );
temp3 = _mm_max_epu8( temp3, temp0 );
temp1 = _mm_cmpeq_epi8( temp1, temp0 );
temp2 = _mm_cmpeq_epi8( temp2, temp0 );
temp3 = _mm_cmpeq_epi8( temp3, temp0 );
temp4 = _mm_max_epu8( temp4, temp0 );
temp5 = _mm_max_epu8( temp5, temp0 );
temp6 = _mm_max_epu8( temp6, temp0 );
temp7 = _mm_max_epu8( temp7, temp0 );
temp4 = _mm_cmpeq_epi8( temp4, temp0 );
temp5 = _mm_cmpeq_epi8( temp5, temp0 );
temp6 = _mm_cmpeq_epi8( temp6, temp0 );
temp7 = _mm_cmpeq_epi8( temp7, temp0 );
temp0 = _mm_adds_epi8( (const __m128i &)SIMD_SSE2_byte_8, temp1 );
temp2 = _mm_adds_epi8( temp2, temp3 );
temp4 = _mm_adds_epi8( temp4, temp5 );
temp6 = _mm_adds_epi8( temp6, temp7 );
temp0 = _mm_adds_epi8( temp0, temp2 );
temp4 = _mm_adds_epi8( temp4, temp6 );
temp0 = _mm_adds_epi8( temp0, temp4 );
temp0 = _mm_and_si128( temp0, (const __m128i &)SIMD_SSE2_byte_7 );
temp1 = _mm_cmpgt_epi8( (const __m128i &)SIMD_SSE2_byte_2, temp0 );
temp1 = _mm_and_si128( temp1, (const __m128i &)SIMD_SSE2_byte_1 );
temp0 = _mm_xor_si128( temp0, temp1 );
temp1 = _mm_srli_epi64( temp0, 8 - 3 );
temp2 = _mm_srli_epi64( temp0, 16 - 6 );
temp3 = _mm_srli_epi64( temp0, 24 - 9 );
temp4 = _mm_srli_epi64( temp0, 32 - 12 );
temp5 = _mm_srli_epi64( temp0, 40 - 15 );
temp6 = _mm_srli_epi64( temp0, 48 - 18 );
temp7 = _mm_srli_epi64( temp0, 56 - 21 );
temp0 = _mm_and_si128( temp0, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask0 );
temp1 = _mm_and_si128( temp1, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask1 );
temp2 = _mm_and_si128( temp2, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask2 );
temp3 = _mm_and_si128( temp3, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask3 );
temp4 = _mm_and_si128( temp4, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask4 );
temp5 = _mm_and_si128( temp5, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask5 );
temp6 = _mm_and_si128( temp6, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask6 );
temp7 = _mm_and_si128( temp7, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask7 );
temp0 = _mm_or_si128( temp0, temp1 );
temp2 = _mm_or_si128( temp2, temp3 );
temp4 = _mm_or_si128( temp4, temp5 );
temp6 = _mm_or_si128( temp6, temp7 );
temp0 = _mm_or_si128( temp0, temp2 );
temp4 = _mm_or_si128( temp4, temp6 );
temp0 = _mm_or_si128( temp0, temp4 );
int out = _mm_cvtsi128_si32( temp0 );
EmitUInt( out );
outData--;
temp1 = _mm_shuffle_epi32( temp0, R_SHUFFLE_D( 2, 3, 0, 1 ) );
out = _mm_cvtsi128_si32( temp1 );
EmitUInt( out );
outData--;
#else
assert( false );
#endif
}
/*
========================
idDxtEncoder::EmitAlphaIndices_SSE2
========================
*/
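// Same as the alpha-channel version above, except channelBitOffset selects which byte of each
// RGBA pixel is treated as the alpha value (e.g. 24 selects the alpha byte, 0 the red byte).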
void idDxtEncoder::EmitAlphaIndices_SSE2( const byte *block, const int channelBitOffset, const int minAlpha_, const int maxAlpha_ ) {
#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )
assert( maxAlpha_ >= minAlpha_ );
byte *outPtr = outData;
__asm {
movd xmm7, channelBitOffset
mov esi, block
movdqa xmm0, xmmword ptr [esi+ 0]
movdqa xmm5, xmmword ptr [esi+16]
movdqa xmm6, xmmword ptr [esi+32]
movdqa xmm4, xmmword ptr [esi+48]
psrld xmm0, xmm7
psrld xmm5, xmm7
psrld xmm6, xmm7
psrld xmm4, xmm7
pand xmm0, SIMD_SSE2_dword_byte_mask
pand xmm5, SIMD_SSE2_dword_byte_mask
pand xmm6, SIMD_SSE2_dword_byte_mask
pand xmm4, SIMD_SSE2_dword_byte_mask
packuswb xmm0, xmm5
packuswb xmm6, xmm4
//---------------------
// ab0 = ( 7 * maxAlpha + 7 * minAlpha + ALPHA_RANGE ) / 14
// ab3 = ( 9 * maxAlpha + 5 * minAlpha + ALPHA_RANGE ) / 14
// ab2 = ( 11 * maxAlpha + 3 * minAlpha + ALPHA_RANGE ) / 14
// ab1 = ( 13 * maxAlpha + 1 * minAlpha + ALPHA_RANGE ) / 14
// ab4 = ( 7 * maxAlpha + 7 * minAlpha + ALPHA_RANGE ) / 14
// ab5 = ( 5 * maxAlpha + 9 * minAlpha + ALPHA_RANGE ) / 14
// ab6 = ( 3 * maxAlpha + 11 * minAlpha + ALPHA_RANGE ) / 14
// ab7 = ( 1 * maxAlpha + 13 * minAlpha + ALPHA_RANGE ) / 14
movd xmm5, maxAlpha_
pshuflw xmm5, xmm5, R_SHUFFLE_D( 0, 0, 0, 0 )
pshufd xmm5, xmm5, R_SHUFFLE_D( 0, 0, 0, 0 )
movdqa xmm7, xmm5
movd xmm2, minAlpha_
pshuflw xmm2, xmm2, R_SHUFFLE_D( 0, 0, 0, 0 )
pshufd xmm2, xmm2, R_SHUFFLE_D( 0, 0, 0, 0 )
movdqa xmm3, xmm2
pmullw xmm5, SIMD_SSE2_word_scale_7_9_11_13
pmullw xmm7, SIMD_SSE2_word_scale_7_5_3_1
pmullw xmm2, SIMD_SSE2_word_scale_7_5_3_1
pmullw xmm3, SIMD_SSE2_word_scale_7_9_11_13
paddw xmm5, xmm2
paddw xmm7, xmm3
paddw xmm5, SIMD_SSE2_word_7
paddw xmm7, SIMD_SSE2_word_7
pmulhw xmm5, SIMD_SSE2_word_div_by_14 // * ( ( 1 << 16 ) / 14 + 1 ) ) >> 16
pmulhw xmm7, SIMD_SSE2_word_div_by_14 // * ( ( 1 << 16 ) / 14 + 1 ) ) >> 16
pshufd xmm1, xmm5, R_SHUFFLE_D( 3, 3, 3, 3 )
pshufd xmm2, xmm5, R_SHUFFLE_D( 2, 2, 2, 2 )
pshufd xmm3, xmm5, R_SHUFFLE_D( 1, 1, 1, 1 )
packuswb xmm1, xmm1 // ab1
packuswb xmm2, xmm2 // ab2
packuswb xmm3, xmm3 // ab3
packuswb xmm0, xmm6 // alpha block
pshufd xmm4, xmm7, R_SHUFFLE_D( 0, 0, 0, 0 )
pshufd xmm5, xmm7, R_SHUFFLE_D( 1, 1, 1, 1 )
pshufd xmm6, xmm7, R_SHUFFLE_D( 2, 2, 2, 2 )
pshufd xmm7, xmm7, R_SHUFFLE_D( 3, 3, 3, 3 )
packuswb xmm4, xmm4 // ab4
packuswb xmm5, xmm5 // ab5
packuswb xmm6, xmm6 // ab6
packuswb xmm7, xmm7 // ab7
pmaxub xmm1, xmm0
pmaxub xmm2, xmm0
pmaxub xmm3, xmm0
pcmpeqb xmm1, xmm0
pcmpeqb xmm2, xmm0
pcmpeqb xmm3, xmm0
pmaxub xmm4, xmm0
pmaxub xmm5, xmm0
pmaxub xmm6, xmm0
pmaxub xmm7, xmm0
pcmpeqb xmm4, xmm0
pcmpeqb xmm5, xmm0
pcmpeqb xmm6, xmm0
pcmpeqb xmm7, xmm0
movdqa xmm0, SIMD_SSE2_byte_8
paddsb xmm0, xmm1
paddsb xmm2, xmm3
paddsb xmm4, xmm5
paddsb xmm6, xmm7
paddsb xmm0, xmm2
paddsb xmm4, xmm6
paddsb xmm0, xmm4
pand xmm0, SIMD_SSE2_byte_7
movdqa xmm1, SIMD_SSE2_byte_2
pcmpgtb xmm1, xmm0
pand xmm1, SIMD_SSE2_byte_1
pxor xmm0, xmm1
movdqa xmm1, xmm0
movdqa xmm2, xmm0
movdqa xmm3, xmm0
movdqa xmm4, xmm0
movdqa xmm5, xmm0
movdqa xmm6, xmm0
movdqa xmm7, xmm0
psrlq xmm1, 8- 3
psrlq xmm2, 16- 6
psrlq xmm3, 24- 9
psrlq xmm4, 32-12
psrlq xmm5, 40-15
psrlq xmm6, 48-18
psrlq xmm7, 56-21
pand xmm0, SIMD_SSE2_dword_alpha_bit_mask0
pand xmm1, SIMD_SSE2_dword_alpha_bit_mask1
pand xmm2, SIMD_SSE2_dword_alpha_bit_mask2
pand xmm3, SIMD_SSE2_dword_alpha_bit_mask3
pand xmm4, SIMD_SSE2_dword_alpha_bit_mask4
pand xmm5, SIMD_SSE2_dword_alpha_bit_mask5
pand xmm6, SIMD_SSE2_dword_alpha_bit_mask6
pand xmm7, SIMD_SSE2_dword_alpha_bit_mask7
por xmm0, xmm1
por xmm2, xmm3
por xmm4, xmm5
por xmm6, xmm7
por xmm0, xmm2
por xmm4, xmm6
por xmm0, xmm4
mov esi, outPtr
movd [esi+0], xmm0
pshufd xmm1, xmm0, R_SHUFFLE_D( 2, 3, 0, 1 )
movd [esi+3], xmm1
}
outData += 6;
#elif defined ( ID_WIN_X86_SSE2_INTRIN )
__m128i block0 = *((__m128i *)(&block[ 0]));
__m128i block1 = *((__m128i *)(&block[16]));
__m128i block2 = *((__m128i *)(&block[32]));
__m128i block3 = *((__m128i *)(&block[48]));
__m128c temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
temp7 = _mm_cvtsi32_si128( channelBitOffset );
temp0 = _mm_srl_epi32( block0, temp7 );
temp5 = _mm_srl_epi32( block1, temp7 );
temp6 = _mm_srl_epi32( block2, temp7 );
temp4 = _mm_srl_epi32( block3, temp7 );
temp0 = _mm_and_si128( temp0, (const __m128i &)SIMD_SSE2_dword_byte_mask );
temp5 = _mm_and_si128( temp5, (const __m128i &)SIMD_SSE2_dword_byte_mask );
temp6 = _mm_and_si128( temp6, (const __m128i &)SIMD_SSE2_dword_byte_mask );
temp4 = _mm_and_si128( temp4, (const __m128i &)SIMD_SSE2_dword_byte_mask );
temp0 = _mm_packus_epi16( temp0, temp5 );
temp6 = _mm_packus_epi16( temp6, temp4 );
//---------------------
// ab0 = ( 7 * maxAlpha + 7 * minAlpha + ALPHA_RANGE ) / 14
// ab3 = ( 9 * maxAlpha + 5 * minAlpha + ALPHA_RANGE ) / 14
// ab2 = ( 11 * maxAlpha + 3 * minAlpha + ALPHA_RANGE ) / 14
// ab1 = ( 13 * maxAlpha + 1 * minAlpha + ALPHA_RANGE ) / 14
// ab4 = ( 7 * maxAlpha + 7 * minAlpha + ALPHA_RANGE ) / 14
// ab5 = ( 5 * maxAlpha + 9 * minAlpha + ALPHA_RANGE ) / 14
// ab6 = ( 3 * maxAlpha + 11 * minAlpha + ALPHA_RANGE ) / 14
// ab7 = ( 1 * maxAlpha + 13 * minAlpha + ALPHA_RANGE ) / 14
temp5 = _mm_cvtsi32_si128( maxAlpha_ );
temp5 = _mm_shufflelo_epi16( temp5, R_SHUFFLE_D( 0, 0, 0, 0 ) );
temp5 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 0, 0, 0, 0 ) );
temp2 = _mm_cvtsi32_si128( minAlpha_ );
temp2 = _mm_shufflelo_epi16( temp2, R_SHUFFLE_D( 0, 0, 0, 0 ) );
temp2 = _mm_shuffle_epi32( temp2, R_SHUFFLE_D( 0, 0, 0, 0 ) );
temp7 = _mm_mullo_epi16( temp5, (const __m128i &)SIMD_SSE2_word_scale_7_5_3_1 );
temp5 = _mm_mullo_epi16( temp5, (const __m128i &)SIMD_SSE2_word_scale_7_9_11_13 );
temp3 = _mm_mullo_epi16( temp2, (const __m128i &)SIMD_SSE2_word_scale_7_9_11_13 );
temp2 = _mm_mullo_epi16( temp2, (const __m128i &)SIMD_SSE2_word_scale_7_5_3_1 );
temp5 = _mm_add_epi16( temp5, temp2 );
temp7 = _mm_add_epi16( temp7, temp3 );
temp5 = _mm_add_epi16( temp5, (const __m128i &)SIMD_SSE2_word_7 );
temp7 = _mm_add_epi16( temp7, (const __m128i &)SIMD_SSE2_word_7 );
temp5 = _mm_mulhi_epi16( temp5, (const __m128i &)SIMD_SSE2_word_div_by_14 );
temp7 = _mm_mulhi_epi16( temp7, (const __m128i &)SIMD_SSE2_word_div_by_14 );
temp1 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 3, 3, 3, 3 ) );
temp2 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 2, 2, 2, 2 ) );
temp3 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 1, 1, 1, 1 ) );
temp1 = _mm_packus_epi16( temp1, temp1 );
temp2 = _mm_packus_epi16( temp2, temp2 );
temp3 = _mm_packus_epi16( temp3, temp3 );
temp0 = _mm_packus_epi16( temp0, temp6 );
temp4 = _mm_shuffle_epi32( temp7, R_SHUFFLE_D( 0, 0, 0, 0 ) );
temp5 = _mm_shuffle_epi32( temp7, R_SHUFFLE_D( 1, 1, 1, 1 ) );
temp6 = _mm_shuffle_epi32( temp7, R_SHUFFLE_D( 2, 2, 2, 2 ) );
temp7 = _mm_shuffle_epi32( temp7, R_SHUFFLE_D( 3, 3, 3, 3 ) );
temp4 = _mm_packus_epi16( temp4, temp4 );
temp5 = _mm_packus_epi16( temp5, temp5 );
temp6 = _mm_packus_epi16( temp6, temp6 );
temp7 = _mm_packus_epi16( temp7, temp7 );
temp1 = _mm_max_epu8( temp1, temp0 );
temp2 = _mm_max_epu8( temp2, temp0 );
temp3 = _mm_max_epu8( temp3, temp0 );
temp1 = _mm_cmpeq_epi8( temp1, temp0 );
temp2 = _mm_cmpeq_epi8( temp2, temp0 );
temp3 = _mm_cmpeq_epi8( temp3, temp0 );
temp4 = _mm_max_epu8( temp4, temp0 );
temp5 = _mm_max_epu8( temp5, temp0 );
temp6 = _mm_max_epu8( temp6, temp0 );
temp7 = _mm_max_epu8( temp7, temp0 );
temp4 = _mm_cmpeq_epi8( temp4, temp0 );
temp5 = _mm_cmpeq_epi8( temp5, temp0 );
temp6 = _mm_cmpeq_epi8( temp6, temp0 );
temp7 = _mm_cmpeq_epi8( temp7, temp0 );
temp0 = _mm_adds_epi8( (const __m128i &)SIMD_SSE2_byte_8, temp1 );
temp2 = _mm_adds_epi8( temp2, temp3 );
temp4 = _mm_adds_epi8( temp4, temp5 );
temp6 = _mm_adds_epi8( temp6, temp7 );
temp0 = _mm_adds_epi8( temp0, temp2 );
temp4 = _mm_adds_epi8( temp4, temp6 );
temp0 = _mm_adds_epi8( temp0, temp4 );
temp0 = _mm_and_si128( temp0, (const __m128i &)SIMD_SSE2_byte_7 );
temp1 = _mm_cmpgt_epi8( (const __m128i &)SIMD_SSE2_byte_2, temp0 );
temp1 = _mm_and_si128( temp1, (const __m128i &)SIMD_SSE2_byte_1 );
temp0 = _mm_xor_si128( temp0, temp1 );
temp1 = _mm_srli_epi64( temp0, 8 - 3 );
temp2 = _mm_srli_epi64( temp0, 16 - 6 );
temp3 = _mm_srli_epi64( temp0, 24 - 9 );
temp4 = _mm_srli_epi64( temp0, 32 - 12 );
temp5 = _mm_srli_epi64( temp0, 40 - 15 );
temp6 = _mm_srli_epi64( temp0, 48 - 18 );
temp7 = _mm_srli_epi64( temp0, 56 - 21 );
temp0 = _mm_and_si128( temp0, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask0 );
temp1 = _mm_and_si128( temp1, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask1 );
temp2 = _mm_and_si128( temp2, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask2 );
temp3 = _mm_and_si128( temp3, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask3 );
temp4 = _mm_and_si128( temp4, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask4 );
temp5 = _mm_and_si128( temp5, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask5 );
temp6 = _mm_and_si128( temp6, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask6 );
temp7 = _mm_and_si128( temp7, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask7 );
temp0 = _mm_or_si128( temp0, temp1 );
temp2 = _mm_or_si128( temp2, temp3 );
temp4 = _mm_or_si128( temp4, temp5 );
temp6 = _mm_or_si128( temp6, temp7 );
temp0 = _mm_or_si128( temp0, temp2 );
temp4 = _mm_or_si128( temp4, temp6 );
temp0 = _mm_or_si128( temp0, temp4 );
int out = _mm_cvtsi128_si32( temp0 );
EmitUInt( out );
outData--;
temp1 = _mm_shuffle_epi32( temp0, R_SHUFFLE_D( 2, 3, 0, 1 ) );
out = _mm_cvtsi128_si32( temp1 );
EmitUInt( out );
outData--;
#else
assert( false );
#endif
}
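// A scalar reference for the SIMD alpha-index emit above (illustrative sketch only, kept out
// of the build with #if 0; the helper name and the out[] parameter are not part of the
// encoder). It builds the 8-entry DXT5 alpha ramp from maxAlpha/minAlpha, picks the closest
// ramp entry per texel and packs the sixteen 3-bit indices LSB-first into 6 bytes, which is
// what the two overlapping 4-byte stores at the end of the assembly version produce. Ties at
// the mid-points may resolve differently than the threshold compares used above.
#if 0
static void EmitAlphaIndicesSketch( const byte *block, const int channelBitOffset,
									const int minAlpha, const int maxAlpha, byte out[6] ) {
	const int channelByteOffset = channelBitOffset >> 3;	// e.g. 3*8 selects the alpha byte of each RGBA texel
	int ramp[8];
	ramp[0] = maxAlpha;		// index 0 = alpha0 = max, emitted first to select the 8-alpha mode
	ramp[1] = minAlpha;		// index 1 = alpha1 = min
	for ( int i = 2; i < 8; i++ ) {
		ramp[i] = ( ( 8 - i ) * maxAlpha + ( i - 1 ) * minAlpha ) / 7;
	}
	uint64 indexBits = 0;
	for ( int i = 0; i < 16; i++ ) {
		const int alpha = block[i * 4 + channelByteOffset];
		int bestIndex = 0;
		int bestDist = 256;
		for ( int j = 0; j < 8; j++ ) {
			int dist = alpha - ramp[j];
			if ( dist < 0 ) {
				dist = -dist;
			}
			if ( dist < bestDist ) {
				bestDist = dist;
				bestIndex = j;
			}
		}
		indexBits |= (uint64)bestIndex << ( i * 3 );
	}
	for ( int i = 0; i < 6; i++ ) {
		out[i] = (byte)( indexBits >> ( i * 8 ) );
	}
}
#endif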
/*
========================
idDxtEncoder::CompressImageDXT1Fast_SSE2
params: inBuf - image to compress
paramO: outBuf - result of compression
params: width - width of image
params: height - height of image
========================
*/
void idDxtEncoder::CompressImageDXT1Fast_SSE2( const byte *inBuf, byte *outBuf, int width, int height ) {
ALIGN16( byte block[64] );
ALIGN16( byte minColor[4] );
ALIGN16( byte maxColor[4] );
assert( width >= 4 && ( width & 3 ) == 0 );
assert( height >= 4 && ( height & 3 ) == 0 );
this->width = width;
this->height = height;
this->outData = outBuf;
for ( int j = 0; j < height; j += 4, inBuf += width * 4*4 ) {
for ( int i = 0; i < width; i += 4 ) {
ExtractBlock_SSE2( inBuf + i * 4, width, block );
GetMinMaxBBox_SSE2( block, minColor, maxColor );
InsetColorsBBox_SSE2( minColor, maxColor );
EmitUShort( ColorTo565( maxColor ) );
EmitUShort( ColorTo565( minColor ) );
EmitColorIndices_SSE2( block, minColor, maxColor );
}
outData += dstPadding;
inBuf += srcPadding;
}
#ifdef TEST_COMPRESSION
int tmpDstPadding = dstPadding;
dstPadding = 0;
byte * testOutBuf = (byte *) _alloca16( width * height / 2 );
inBuf -= ( height / 4 ) * ( width * 4*4 + srcPadding );	// the loops above advanced inBuf past the image; rewind before re-compressing
CompressImageDXT1Fast_Generic( inBuf, testOutBuf, width, height );
for ( int j = 0; j < height/4; j++ ) {
for ( int i = 0; i < width/4; i++ ) {
byte * ptr1 = outBuf + ( j * width/4 + i ) * 8 + j * tmpDstPadding;
byte * ptr2 = testOutBuf + ( j * width/4 + i ) * 8;
for ( int k = 0; k < 8; k++ ) {
assert( ptr1[k] == ptr2[k] );
}
}
}
dstPadding = tmpDstPadding;
#endif
}
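// A reference sketch (not a type used by the encoder) of the 8-byte DXT1 block that the loop
// above emits for every 4x4 pixel tile; a full mip level therefore needs width * height / 2
// bytes of output plus any dstPadding per block row.
struct dxt1BlockSketch_t {
	word	color0;			// 565 max color, EmitUShort( ColorTo565( maxColor ) )
	word	color1;			// 565 min color, EmitUShort( ColorTo565( minColor ) )
	dword	colorIndices;	// sixteen 2-bit palette indices from EmitColorIndices_SSE2
};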
/*
========================
idDxtEncoder::CompressImageDXT1AlphaFast_SSE2
params: inBuf - image to compress
paramO: outBuf - result of compression
params: width - width of image
params: height - height of image
========================
*/
void idDxtEncoder::CompressImageDXT1AlphaFast_SSE2( const byte *inBuf, byte *outBuf, int width, int height ) {
ALIGN16( byte block[64] );
ALIGN16( byte minColor[4] );
ALIGN16( byte maxColor[4] );
assert( width >= 4 && ( width & 3 ) == 0 );
assert( height >= 4 && ( height & 3 ) == 0 );
this->width = width;
this->height = height;
this->outData = outBuf;
for ( int j = 0; j < height; j += 4, inBuf += width * 4*4 ) {
for ( int i = 0; i < width; i += 4 ) {
ExtractBlock_SSE2( inBuf + i * 4, width, block );
GetMinMaxBBox_SSE2( block, minColor, maxColor );
byte minAlpha = minColor[3];
InsetColorsBBox_SSE2( minColor, maxColor );
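// DXT1 block-mode select: if every alpha in the block is at least 128 the max color is written
// first, so color0 > color1 selects the opaque 4-color mode; otherwise the min color is written
// first, so color0 <= color1 selects the 3-color mode whose fourth index marks transparent
// texels (emitted by EmitColorAlphaIndices_SSE2).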
if ( minAlpha >= 128 ) {
EmitUShort( ColorTo565( maxColor ) );
EmitUShort( ColorTo565( minColor ) );
EmitColorIndices_SSE2( block, minColor, maxColor );
} else {
EmitUShort( ColorTo565( minColor ) );
EmitUShort( ColorTo565( maxColor ) );
EmitColorAlphaIndices_SSE2( block, minColor, maxColor );
}
}
outData += dstPadding;
inBuf += srcPadding;
}
#ifdef TEST_COMPRESSION
int tmpDstPadding = dstPadding;
dstPadding = 0;
byte * testOutBuf = (byte *) _alloca16( width * height / 2 );
inBuf -= ( height / 4 ) * ( width * 4*4 + srcPadding );	// the loops above advanced inBuf past the image; rewind before re-compressing
CompressImageDXT1AlphaFast_Generic( inBuf, testOutBuf, width, height );
for ( int j = 0; j < height/4; j++ ) {
for ( int i = 0; i < width/4; i++ ) {
byte * ptr1 = outBuf + ( j * width/4 + i ) * 8 + j * tmpDstPadding;
byte * ptr2 = testOutBuf + ( j * width/4 + i ) * 8;
for ( int k = 0; k < 8; k++ ) {
assert( ptr1[k] == ptr2[k] );
}
}
}
dstPadding = tmpDstPadding;
#endif
}
/*
========================
idDxtEncoder::CompressImageDXT5Fast_SSE2
params: inBuf - image to compress
paramO: outBuf - result of compression
params: width - width of image
params: height - height of image
========================
*/
void idDxtEncoder::CompressImageDXT5Fast_SSE2( const byte *inBuf, byte *outBuf, int width, int height ) {
ALIGN16( byte block[64] );
ALIGN16( byte minColor[4] );
ALIGN16( byte maxColor[4] );
assert( width >= 4 && ( width & 3 ) == 0 );
assert( height >= 4 && ( height & 3 ) == 0 );
this->width = width;
this->height = height;
this->outData = outBuf;
for ( int j = 0; j < height; j += 4, inBuf += width * 4*4 ) {
for ( int i = 0; i < width; i += 4 ) {
ExtractBlock_SSE2( inBuf + i * 4, width, block );
GetMinMaxBBox_SSE2( block, minColor, maxColor );
InsetColorsBBox_SSE2( minColor, maxColor );
EmitByte( maxColor[3] );
EmitByte( minColor[3] );
EmitAlphaIndices_SSE2( block, minColor[3], maxColor[3] );
EmitUShort( ColorTo565( maxColor ) );
EmitUShort( ColorTo565( minColor ) );
EmitColorIndices_SSE2( block, minColor, maxColor );
}
outData += dstPadding;
inBuf += srcPadding;
}
#ifdef TEST_COMPRESSION
int tmpDstPadding = dstPadding;
dstPadding = 0;
byte * testOutBuf = (byte *) _alloca16( width * height );
inBuf -= ( height / 4 ) * ( width * 4*4 + srcPadding );	// the loops above advanced inBuf past the image; rewind before re-compressing
CompressImageDXT5Fast_Generic( inBuf, testOutBuf, width, height );
for ( int j = 0; j < height / 4; j++ ) {
for ( int i = 0; i < width / 4; i++ ) {
byte * ptr1 = outBuf + ( j * width/4 + i ) * 16 + j * tmpDstPadding;
byte * ptr2 = testOutBuf + ( j * width/4 + i ) * 16;
for ( int k = 0; k < 16; k++ ) {
assert( ptr1[k] == ptr2[k] );
}
}
}
dstPadding = tmpDstPadding;
#endif
}
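// A reference sketch (not a type used by the encoder) of the 16-byte DXT5 block emitted above:
// an 8-byte alpha block followed by the same 8-byte color block as DXT1, so a full mip level
// needs width * height bytes of output plus any dstPadding per block row.
struct dxt5BlockSketch_t {
	byte	alpha0;				// max alpha, EmitByte( maxColor[3] )
	byte	alpha1;				// min alpha, EmitByte( minColor[3] )
	byte	alphaIndices[6];	// sixteen 3-bit ramp indices from EmitAlphaIndices_SSE2
	word	color0;				// 565 max color
	word	color1;				// 565 min color
	dword	colorIndices;		// sixteen 2-bit palette indices from EmitColorIndices_SSE2
};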
/*
========================
idDxtEncoder::ScaleYCoCg_SSE2
========================
*/
ID_INLINE void idDxtEncoder::ScaleYCoCg_SSE2( byte *colorBlock, byte *minColor, byte *maxColor ) const {
#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )
__asm {
mov esi, colorBlock
mov edx, minColor
mov ecx, maxColor
movd xmm0, dword ptr [edx]
movd xmm1, dword ptr [ecx]
punpcklbw xmm0, SIMD_SSE2_byte_0
punpcklbw xmm1, SIMD_SSE2_byte_0
movdqa xmm6, SIMD_SSE2_word_center_128
movdqa xmm7, SIMD_SSE2_word_center_128
psubw xmm6, xmm0
psubw xmm7, xmm1
psubw xmm0, SIMD_SSE2_word_center_128
psubw xmm1, SIMD_SSE2_word_center_128
pmaxsw xmm6, xmm0
pmaxsw xmm7, xmm1
pmaxsw xmm6, xmm7
pshuflw xmm7, xmm6, R_SHUFFLE_D( 1, 0, 1, 0 )
pmaxsw xmm6, xmm7
pshufd xmm6, xmm6, R_SHUFFLE_D( 0, 0, 0, 0 )
movdqa xmm7, xmm6
pcmpgtw xmm6, SIMD_SSE2_word_63 // mask0
pcmpgtw xmm7, SIMD_SSE2_word_31 // mask1
pandn xmm7, SIMD_SSE2_byte_2
por xmm7, SIMD_SSE2_byte_1
pandn xmm6, xmm7
movdqa xmm3, xmm6
movdqa xmm7, xmm6
pxor xmm7, SIMD_SSE2_byte_not
por xmm7, SIMD_SSE2_byte_scale_mask0 // 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0xFF, 0x00, 0x00
paddw xmm6, SIMD_SSE2_byte_1
pand xmm6, SIMD_SSE2_byte_scale_mask1 // 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0x00, 0xFF
por xmm6, SIMD_SSE2_byte_scale_mask2 // 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00
movd xmm4, dword ptr [edx]
movd xmm5, dword ptr [ecx]
pand xmm4, SIMD_SSE2_byte_scale_mask3 // 0x00, 0x00, 0x00, 0x00, 0xFF, 0x00, 0xFF, 0xFF
pand xmm5, SIMD_SSE2_byte_scale_mask3
pslld xmm3, 3
pand xmm3, SIMD_SSE2_byte_scale_mask4 // 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00
por xmm4, xmm3
por xmm5, xmm3
paddb xmm4, SIMD_SSE2_byte_minus_128_0
paddb xmm5, SIMD_SSE2_byte_minus_128_0
pmullw xmm4, xmm6
pmullw xmm5, xmm6
pand xmm4, xmm7
pand xmm5, xmm7
psubb xmm4, SIMD_SSE2_byte_minus_128_0
psubb xmm5, SIMD_SSE2_byte_minus_128_0
movd dword ptr [edx], xmm4
movd dword ptr [ecx], xmm5
movdqa xmm0, xmmword ptr [esi+ 0*4]
movdqa xmm1, xmmword ptr [esi+ 4*4]
movdqa xmm2, xmmword ptr [esi+ 8*4]
movdqa xmm3, xmmword ptr [esi+12*4]
paddb xmm0, SIMD_SSE2_byte_minus_128_0
paddb xmm1, SIMD_SSE2_byte_minus_128_0
paddb xmm2, SIMD_SSE2_byte_minus_128_0
paddb xmm3, SIMD_SSE2_byte_minus_128_0
pmullw xmm0, xmm6
pmullw xmm1, xmm6
pmullw xmm2, xmm6
pmullw xmm3, xmm6
pand xmm0, xmm7
pand xmm1, xmm7
pand xmm2, xmm7
pand xmm3, xmm7
psubb xmm0, SIMD_SSE2_byte_minus_128_0
psubb xmm1, SIMD_SSE2_byte_minus_128_0
psubb xmm2, SIMD_SSE2_byte_minus_128_0
psubb xmm3, SIMD_SSE2_byte_minus_128_0
movdqa xmmword ptr [esi+ 0*4], xmm0
movdqa xmmword ptr [esi+ 4*4], xmm1
movdqa xmmword ptr [esi+ 8*4], xmm2
movdqa xmmword ptr [esi+12*4], xmm3
}
#elif defined ( ID_WIN_X86_SSE2_INTRIN )
__m128i block0 = *((__m128i *)(&colorBlock[ 0]));
__m128i block1 = *((__m128i *)(&colorBlock[16]));
__m128i block2 = *((__m128i *)(&colorBlock[32]));
__m128i block3 = *((__m128i *)(&colorBlock[48]));
__m128c temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
temp0 = _mm_cvtsi32_si128( *(int *)minColor );
temp1 = _mm_cvtsi32_si128( *(int *)maxColor );
temp0 = _mm_unpacklo_epi8( temp0, (const __m128i &)SIMD_SSE2_byte_0 );
temp1 = _mm_unpacklo_epi8( temp1, (const __m128i &)SIMD_SSE2_byte_0 );
// Compute the largest absolute deviation of the end point channels from the 128 mid-point: max( 128 - c, c - 128 ) = abs( c - 128 )
temp6 = _mm_sub_epi16( (const __m128i &)SIMD_SSE2_word_center_128, temp0 );
temp7 = _mm_sub_epi16( (const __m128i &)SIMD_SSE2_word_center_128, temp1 );
temp0 = _mm_sub_epi16( temp0, (const __m128i &)SIMD_SSE2_word_center_128 );
temp1 = _mm_sub_epi16( temp1, (const __m128i &)SIMD_SSE2_word_center_128 );
temp6 = _mm_max_epi16( temp6, temp0 );
temp7 = _mm_max_epi16( temp7, temp1 );
temp6 = _mm_max_epi16( temp6, temp7 );
temp7 = _mm_shufflelo_epi16( temp6, R_SHUFFLE_D( 1, 0, 1, 0 ) );
temp6 = _mm_max_epi16( temp6, temp7 );
temp6 = _mm_shuffle_epi32( temp6, R_SHUFFLE_D( 0, 0, 0, 0 ) );
temp7 = temp6;
temp6 = _mm_cmpgt_epi16( temp6, (const __m128i &)SIMD_SSE2_word_63 ); // mask0
temp7 = _mm_cmpgt_epi16( temp7, (const __m128i &)SIMD_SSE2_word_31 ); // mask1
temp7 = _mm_andnot_si128( temp7, (const __m128i &)SIMD_SSE2_byte_2 );
temp7 = _mm_or_si128( temp7, (const __m128i &)SIMD_SSE2_byte_1 );
temp6 = _mm_andnot_si128( temp6, temp7 );
temp3 = temp6;
temp7 = temp6;
temp7 = _mm_xor_si128( temp7, (const __m128i &)SIMD_SSE2_byte_not );
temp7 = _mm_or_si128( temp7, (const __m128i &)SIMD_SSE2_byte_scale_mask0 ); // 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0xFF, 0x00, 0x00
temp6 = _mm_add_epi16( temp6, (const __m128i &)SIMD_SSE2_byte_1 );
temp6 = _mm_and_si128( temp6, (const __m128i &)SIMD_SSE2_byte_scale_mask1 ); // 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0x00, 0xFF
temp6 = _mm_or_si128( temp6, (const __m128i &)SIMD_SSE2_byte_scale_mask2 ); // 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00
// TODO: remove this second load of minColor / maxColor
temp4 = _mm_cvtsi32_si128( *(int *)minColor );
temp5 = _mm_cvtsi32_si128( *(int *)maxColor );
temp4 = _mm_and_si128( temp4, (const __m128i &)SIMD_SSE2_byte_scale_mask3 ); // 0x00, 0x00, 0x00, 0x00, 0xFF, 0x00, 0xFF, 0xFF
temp5 = _mm_and_si128( temp5, (const __m128i &)SIMD_SSE2_byte_scale_mask3 );
temp3 = _mm_slli_epi32( temp3, 3 );
temp3 = _mm_and_si128( temp3, (const __m128i &)SIMD_SSE2_byte_scale_mask4 ); // 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00
temp4 = _mm_or_si128( temp4, temp3 );
temp5 = _mm_or_si128( temp5, temp3 );
temp4 = _mm_add_epi8( temp4, (const __m128i &)SIMD_SSE2_byte_minus_128_0 );
temp5 = _mm_add_epi8( temp5, (const __m128i &)SIMD_SSE2_byte_minus_128_0 );
temp4 = _mm_mullo_epi16( temp4, temp6 );
temp5 = _mm_mullo_epi16( temp5, temp6 );
temp4 = _mm_and_si128( temp4, temp7 );
temp5 = _mm_and_si128( temp5, temp7 );
temp4 = _mm_sub_epi8( temp4, (const __m128i &)SIMD_SSE2_byte_minus_128_0 );
temp5 = _mm_sub_epi8( temp5, (const __m128i &)SIMD_SSE2_byte_minus_128_0 );
*(int *)minColor = _mm_cvtsi128_si32( temp4 );
*(int *)maxColor = _mm_cvtsi128_si32( temp5 );
temp0 = _mm_add_epi8( block0, (const __m128i &)SIMD_SSE2_byte_minus_128_0 );
temp1 = _mm_add_epi8( block1, (const __m128i &)SIMD_SSE2_byte_minus_128_0 );
temp2 = _mm_add_epi8( block2, (const __m128i &)SIMD_SSE2_byte_minus_128_0 );
temp3 = _mm_add_epi8( block3, (const __m128i &)SIMD_SSE2_byte_minus_128_0 );
temp0 = _mm_mullo_epi16( temp0, temp6 );
temp1 = _mm_mullo_epi16( temp1, temp6 );
temp2 = _mm_mullo_epi16( temp2, temp6 );
temp3 = _mm_mullo_epi16( temp3, temp6 );
temp0 = _mm_and_si128( temp0, temp7 );
temp1 = _mm_and_si128( temp1, temp7 );
temp2 = _mm_and_si128( temp2, temp7 );
temp3 = _mm_and_si128( temp3, temp7 );
*((__m128i *)(&colorBlock[ 0])) = _mm_sub_epi8( temp0, (const __m128i &)SIMD_SSE2_byte_minus_128_0 );
*((__m128i *)(&colorBlock[16])) = _mm_sub_epi8( temp1, (const __m128i &)SIMD_SSE2_byte_minus_128_0 );
*((__m128i *)(&colorBlock[32])) = _mm_sub_epi8( temp2, (const __m128i &)SIMD_SSE2_byte_minus_128_0 );
*((__m128i *)(&colorBlock[48])) = _mm_sub_epi8( temp3, (const __m128i &)SIMD_SSE2_byte_minus_128_0 );
#else
assert( false );
#endif
}
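// A scalar sketch of the net effect of ScaleYCoCg_SSE2 (illustrative only, kept out of the
// build with #if 0; the helper name is not part of the encoder and this is not a lane-by-lane
// translation of the SIMD code): pick a power-of-two scale from the largest deviation of the
// first two channels from the 128 mid-point, expand those channels of the end points and of
// every texel about 128, and store ( scale - 1 ) << 3 in the third channel so a decoder can
// undo the scale. The 31/63 thresholds correspond to the word_31/word_63 compares above.
#if 0
static void ScaleYCoCgSketch( byte *colorBlock, byte *minColor, byte *maxColor ) {
	// largest absolute deviation of the end point Co/Cg channels from 128
	int maxDeviation = 0;
	for ( int i = 0; i < 2; i++ ) {
		int d0 = minColor[i] - 128;
		int d1 = maxColor[i] - 128;
		if ( d0 < 0 ) d0 = -d0;
		if ( d1 < 0 ) d1 = -d1;
		if ( d0 > maxDeviation ) maxDeviation = d0;
		if ( d1 > maxDeviation ) maxDeviation = d1;
	}
	const int scale = ( maxDeviation <= 31 ) ? 4 : ( maxDeviation <= 63 ) ? 2 : 1;
	for ( int i = 0; i < 2; i++ ) {
		minColor[i] = (byte)( ( minColor[i] - 128 ) * scale + 128 );
		maxColor[i] = (byte)( ( maxColor[i] - 128 ) * scale + 128 );
	}
	minColor[2] = maxColor[2] = (byte)( ( scale - 1 ) << 3 );	// scale selector for the decoder
	for ( int i = 0; i < 16; i++ ) {
		colorBlock[i*4+0] = (byte)( ( colorBlock[i*4+0] - 128 ) * scale + 128 );
		colorBlock[i*4+1] = (byte)( ( colorBlock[i*4+1] - 128 ) * scale + 128 );
	}
}
#endif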
/*
========================
idDxtEncoder::InsetYCoCgBBox_SSE2
========================
*/
ID_INLINE void idDxtEncoder::InsetYCoCgBBox_SSE2( byte *minColor, byte *maxColor ) const {
#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )
__asm {
mov esi, minColor
mov edi, maxColor
movd xmm0, dword ptr [esi]
movd xmm1, dword ptr [edi]
punpcklbw xmm0, SIMD_SSE2_byte_0
punpcklbw xmm1, SIMD_SSE2_byte_0
movdqa xmm2, xmm1
psubw xmm2, xmm0
psubw xmm2, SIMD_SSE2_word_insetYCoCgRound
pand xmm2, SIMD_SSE2_word_insetYCoCgMask
pmullw xmm0, SIMD_SSE2_word_insetYCoCgShiftUp
pmullw xmm1, SIMD_SSE2_word_insetYCoCgShiftUp
paddw xmm0, xmm2
psubw xmm1, xmm2
pmulhw xmm0, SIMD_SSE2_word_insetYCoCgShiftDown
pmulhw xmm1, SIMD_SSE2_word_insetYCoCgShiftDown
pmaxsw xmm0, SIMD_SSE2_word_0
pmaxsw xmm1, SIMD_SSE2_word_0
pand xmm0, SIMD_SSE2_word_insetYCoCgQuantMask
pand xmm1, SIMD_SSE2_word_insetYCoCgQuantMask
movdqa xmm2, xmm0
movdqa xmm3, xmm1
pmulhw xmm2, SIMD_SSE2_word_insetYCoCgRep
pmulhw xmm3, SIMD_SSE2_word_insetYCoCgRep
por xmm0, xmm2
por xmm1, xmm3
packuswb xmm0, xmm0
packuswb xmm1, xmm1
movd dword ptr [esi], xmm0
movd dword ptr [edi], xmm1
}
#elif defined ( ID_WIN_X86_SSE2_INTRIN )
__m128c temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
temp0 = _mm_cvtsi32_si128( *(int *)minColor );
temp1 = _mm_cvtsi32_si128( *(int *)maxColor );
temp0 = _mm_unpacklo_epi8( temp0, (const __m128i &)SIMD_SSE2_byte_0 );
temp1 = _mm_unpacklo_epi8( temp1, (const __m128i &)SIMD_SSE2_byte_0 );
temp2 = _mm_sub_epi16( temp1, temp0 );
temp2 = _mm_sub_epi16( temp2, (const __m128i &)SIMD_SSE2_word_insetYCoCgRound );
temp2 = _mm_and_si128( temp2, (const __m128i &)SIMD_SSE2_word_insetYCoCgMask );
temp0 = _mm_mullo_epi16( temp0, (const __m128i &)SIMD_SSE2_word_insetYCoCgShiftUp );
temp1 = _mm_mullo_epi16( temp1, (const __m128i &)SIMD_SSE2_word_insetYCoCgShiftUp );
temp0 = _mm_add_epi16( temp0, temp2 );
temp1 = _mm_sub_epi16( temp1, temp2 );
temp0 = _mm_mulhi_epi16( temp0, (const __m128i &)SIMD_SSE2_word_insetYCoCgShiftDown );
temp1 = _mm_mulhi_epi16( temp1, (const __m128i &)SIMD_SSE2_word_insetYCoCgShiftDown );
temp0 = _mm_max_epi16( temp0, (const __m128i &)SIMD_SSE2_word_0 );
temp1 = _mm_max_epi16( temp1, (const __m128i &)SIMD_SSE2_word_0 );
temp0 = _mm_and_si128( temp0, (const __m128i &)SIMD_SSE2_word_insetYCoCgQuantMask );
temp1 = _mm_and_si128( temp1, (const __m128i &)SIMD_SSE2_word_insetYCoCgQuantMask );
temp2 = _mm_mulhi_epi16( temp0, (const __m128i &)SIMD_SSE2_word_insetYCoCgRep );
temp3 = _mm_mulhi_epi16( temp1, (const __m128i &)SIMD_SSE2_word_insetYCoCgRep );
temp0 = _mm_or_si128( temp0, temp2 );
temp1 = _mm_or_si128( temp1, temp3 );
temp0 = _mm_packus_epi16( temp0, temp0 );
temp1 = _mm_packus_epi16( temp1, temp1 );
*(int *)minColor = _mm_cvtsi128_si32( temp0 );
*(int *)maxColor = _mm_cvtsi128_si32( temp1 );
#else
assert( false );
#endif
}
/*
========================
idDxtEncoder::SelectYCoCgDiagonal_SSE2
params: colorBlock - 16 pixel block to select the best color ramp diagonal for
paramO: minColor - min color of the block, updated in place to select the diagonal
paramO: maxColor - max color of the block, updated in place to select the diagonal
========================
*/
ID_INLINE void idDxtEncoder::SelectYCoCgDiagonal_SSE2( const byte *colorBlock, byte *minColor, byte *maxColor ) const {
#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )
__asm {
mov esi, colorBlock
mov edx, minColor
mov ecx, maxColor
movdqa xmm0, xmmword ptr [esi+ 0]
movdqa xmm1, xmmword ptr [esi+16]
movdqa xmm2, xmmword ptr [esi+32]
movdqa xmm3, xmmword ptr [esi+48]
pand xmm0, SIMD_SSE2_dword_word_mask
pand xmm1, SIMD_SSE2_dword_word_mask
pand xmm2, SIMD_SSE2_dword_word_mask
pand xmm3, SIMD_SSE2_dword_word_mask
pslldq xmm1, 2
pslldq xmm3, 2
por xmm0, xmm1
por xmm2, xmm3
movd xmm1, dword ptr [edx] // minColor
movd xmm3, dword ptr [ecx] // maxColor
movdqa xmm6, xmm1
movdqa xmm7, xmm3
pavgb xmm1, xmm3
pshuflw xmm1, xmm1, R_SHUFFLE_D( 0, 0, 0, 0 )
pshufd xmm1, xmm1, R_SHUFFLE_D( 0, 0, 0, 0 )
movdqa xmm3, xmm1
pmaxub xmm1, xmm0
pmaxub xmm3, xmm2
pcmpeqb xmm1, xmm0
pcmpeqb xmm3, xmm2
movdqa xmm0, xmm1
movdqa xmm2, xmm3
psrldq xmm0, 1
psrldq xmm2, 1
pxor xmm0, xmm1
pxor xmm2, xmm3
pand xmm0, SIMD_SSE2_word_1
pand xmm2, SIMD_SSE2_word_1
paddw xmm0, xmm2
psadbw xmm0, SIMD_SSE2_byte_0
pshufd xmm1, xmm0, R_SHUFFLE_D( 2, 3, 0, 1 )
#ifdef NVIDIA_7X_HARDWARE_BUG_FIX
paddw xmm1, xmm0 // side
pcmpgtw xmm1, SIMD_SSE2_word_8 // mask = -( side > 8 )
pand xmm1, SIMD_SSE2_byte_diagonalMask
movdqa xmm0, xmm6
pcmpeqb xmm0, xmm7 // mask &= -( minColor[0] != maxColor[0] )
pslldq xmm0, 1
pandn xmm0, xmm1
#else
paddw xmm0, xmm1 // side
pcmpgtw xmm0, SIMD_SSE2_word_8 // mask = -( side > 8 )
pand xmm0, SIMD_SSE2_byte_diagonalMask
#endif
pxor xmm6, xmm7
pand xmm0, xmm6
pxor xmm7, xmm0
pxor xmm6, xmm7
movd dword ptr [edx], xmm6
movd dword ptr [ecx], xmm7
}
#elif defined ( ID_WIN_X86_SSE2_INTRIN )
__m128i block0 = *((__m128i *)(&colorBlock[ 0]));
__m128i block1 = *((__m128i *)(&colorBlock[16]));
__m128i block2 = *((__m128i *)(&colorBlock[32]));
__m128i block3 = *((__m128i *)(&colorBlock[48]));
__m128c temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
temp0 = _mm_and_si128( block0, (const __m128i &)SIMD_SSE2_dword_word_mask );
temp1 = _mm_and_si128( block1, (const __m128i &)SIMD_SSE2_dword_word_mask );
temp2 = _mm_and_si128( block2, (const __m128i &)SIMD_SSE2_dword_word_mask );
temp3 = _mm_and_si128( block3, (const __m128i &)SIMD_SSE2_dword_word_mask );
temp1 = _mm_slli_si128( temp1, 2 );
temp3 = _mm_slli_si128( temp3, 2 );
temp0 = _mm_or_si128( temp0, temp1 );
temp2 = _mm_or_si128( temp2, temp3 );
temp6 = _mm_cvtsi32_si128( *(int *)minColor );
temp7 = _mm_cvtsi32_si128( *(int *)maxColor );
temp1 = _mm_avg_epu8( temp6, temp7 );
temp1 = _mm_shufflelo_epi16( temp1, R_SHUFFLE_D( 0, 0, 0, 0 ) );
temp1 = _mm_shuffle_epi32( temp1, R_SHUFFLE_D( 0, 0, 0, 0 ) );
temp3 = _mm_max_epu8( temp1, temp2 );
temp1 = _mm_max_epu8( temp1, temp0 );
temp1 = _mm_cmpeq_epi8( temp1, temp0 );
temp3 = _mm_cmpeq_epi8( temp3, temp2 );
temp0 = _mm_srli_si128( temp1, 1 );
temp2 = _mm_srli_si128( temp3, 1 );
temp0 = _mm_xor_si128( temp0, temp1 );
temp2 = _mm_xor_si128( temp2, temp3 );
temp0 = _mm_and_si128( temp0, (const __m128i &)SIMD_SSE2_word_1 );
temp2 = _mm_and_si128( temp2, (const __m128i &)SIMD_SSE2_word_1 );
temp0 = _mm_add_epi16( temp0, temp2 );
temp0 = _mm_sad_epu8( temp0, (const __m128i &)SIMD_SSE2_byte_0 );
temp1 = _mm_shuffle_epi32( temp0, R_SHUFFLE_D( 2, 3, 0, 1 ) );
#ifdef NVIDIA_7X_HARDWARE_BUG_FIX
temp1 = _mm_add_epi16( temp1, temp0 );
temp1 = _mm_cmpgt_epi16( temp1, (const __m128i &)SIMD_SSE2_word_8 );
temp1 = _mm_and_si128( temp1, (const __m128i &)SIMD_SSE2_byte_diagonalMask );
temp0 = _mm_cmpeq_epi8( temp6, temp7 );
temp0 = _mm_slli_si128( temp0, 1 );
temp0 = _mm_andnot_si128( temp0, temp1 );
#else
temp0 = _mm_add_epi16( temp0, temp1 );
temp0 = _mm_cmpgt_epi16( temp0, (const __m128i &)SIMD_SSE2_word_8 );
temp0 = _mm_and_si128( temp0, (const __m128i &)SIMD_SSE2_byte_diagonalMask );
#endif
temp6 = _mm_xor_si128( temp6, temp7 );
temp0 = _mm_and_si128( temp0, temp6 );
temp7 = _mm_xor_si128( temp7, temp0 );
temp6 = _mm_xor_si128( temp6, temp7 );
*(int *)minColor = _mm_cvtsi128_si32( temp6 );
*(int *)maxColor = _mm_cvtsi128_si32( temp7 );
#else
assert( false );
#endif
}
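// A scalar sketch of the diagonal selection above (illustrative only, kept out of the build
// with #if 0; the helper name is not part of the encoder): count the texels whose first two
// channels fall on opposite sides of the box mid-points and, if that is the majority, swap the
// second channel between the end points so the color ramp follows the other box diagonal.
#if 0
static void SelectYCoCgDiagonalSketch( const byte *colorBlock, byte *minColor, byte *maxColor ) {
	const byte mid0 = (byte)( ( minColor[0] + maxColor[0] + 1 ) >> 1 );	// pavgb above
	const byte mid1 = (byte)( ( minColor[1] + maxColor[1] + 1 ) >> 1 );
	int side = 0;
	for ( int i = 0; i < 16; i++ ) {
		const int b0 = ( colorBlock[i*4+0] >= mid0 );
		const int b1 = ( colorBlock[i*4+1] >= mid1 );
		side += ( b0 ^ b1 );
	}
	bool swap = ( side > 8 );
#ifdef NVIDIA_7X_HARDWARE_BUG_FIX
	// matches the extra compare in the SIMD path: never swap when the first channels are equal
	swap = swap && ( minColor[0] != maxColor[0] );
#endif
	if ( swap ) {
		const byte t = minColor[1];
		minColor[1] = maxColor[1];
		maxColor[1] = t;
	}
}
#endif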
/*
========================
idDxtEncoder::CompressYCoCgDXT5Fast_SSE2
params: inBuf - image to compress
paramO: outBuf - result of compression
params: width - width of image
params: height - height of image
========================
*/
void idDxtEncoder::CompressYCoCgDXT5Fast_SSE2( const byte *inBuf, byte *outBuf, int width, int height ) {
ALIGN16( byte block[64] );
ALIGN16( byte minColor[4] );
ALIGN16( byte maxColor[4] );
//assert( HasConstantValuePer4x4Block( inBuf, width, height, 2 ) );
assert( width >= 4 && ( width & 3 ) == 0 );
assert( height >= 4 && ( height & 3 ) == 0 );
this->width = width;
this->height = height;
this->outData = outBuf;
for ( int j = 0; j < height; j += 4, inBuf += width * 4*4 ) {
for ( int i = 0; i < width; i += 4 ) {
ExtractBlock_SSE2( inBuf + i * 4, width, block );
GetMinMaxBBox_SSE2( block, minColor, maxColor );
ScaleYCoCg_SSE2( block, minColor, maxColor );
InsetYCoCgBBox_SSE2( minColor, maxColor );
SelectYCoCgDiagonal_SSE2( block, minColor, maxColor );
EmitByte( maxColor[3] );
EmitByte( minColor[3] );
EmitAlphaIndices_SSE2( block, minColor[3], maxColor[3] );
EmitUShort( ColorTo565( maxColor ) );
EmitUShort( ColorTo565( minColor ) );
EmitCoCgIndices_SSE2( block, minColor, maxColor );
}
outData += dstPadding;
inBuf += srcPadding;
}
#ifdef TEST_COMPRESSION
int tmpDstPadding = dstPadding;
dstPadding = 0;
byte * testOutBuf = (byte *) _alloca16( width * height );
inBuf -= ( height / 4 ) * ( width * 4*4 + srcPadding );	// the loops above advanced inBuf past the image; rewind before re-compressing
CompressYCoCgDXT5Fast_Generic( inBuf, testOutBuf, width, height );
for ( int j = 0; j < height / 4; j++ ) {
for ( int i = 0; i < width / 4; i++ ) {
byte * ptr1 = outBuf + ( j * width/4 + i ) * 16 + j * tmpDstPadding;
byte * ptr2 = testOutBuf + ( j * width/4 + i ) * 16;
for ( int k = 0; k < 16; k++ ) {
assert( ptr1[k] == ptr2[k] );
}
}
}
dstPadding = tmpDstPadding;
#endif
}
/*
========================
idDxtEncoder::EmitGreenIndices_SSE2
params: block - block of 16 normals for which to find the normal Y (green) indices
params: channelBitOffset - bit offset of the green channel within each pixel
params: minGreen - minimal normal Y found
params: maxGreen - maximal normal Y found
========================
*/
void idDxtEncoder::EmitGreenIndices_SSE2( const byte *block, const int channelBitOffset, const int minGreen, const int maxGreen ) {
#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )
assert( maxGreen >= minGreen );
byte *outPtr = outData;
__asm {
movd xmm7, channelBitOffset
mov esi, block
movdqa xmm0, xmmword ptr [esi+ 0]
movdqa xmm5, xmmword ptr [esi+16]
movdqa xmm6, xmmword ptr [esi+32]
movdqa xmm4, xmmword ptr [esi+48]
psrld xmm0, xmm7
psrld xmm5, xmm7
psrld xmm6, xmm7
psrld xmm4, xmm7
pand xmm0, SIMD_SSE2_dword_byte_mask
pand xmm5, SIMD_SSE2_dword_byte_mask
pand xmm6, SIMD_SSE2_dword_byte_mask
pand xmm4, SIMD_SSE2_dword_byte_mask
packuswb xmm0, xmm5
packuswb xmm6, xmm4
//---------------------
movd xmm2, maxGreen
pshuflw xmm2, xmm2, R_SHUFFLE_D( 0, 0, 0, 0 )
movd xmm3, minGreen
pshuflw xmm3, xmm3, R_SHUFFLE_D( 0, 0, 0, 0 )
pmullw xmm2, SIMD_SSE2_word_scale_5_3_1
pmullw xmm3, SIMD_SSE2_word_scale_1_3_5
paddw xmm2, SIMD_SSE2_word_3
paddw xmm3, xmm2
pmulhw xmm3, SIMD_SSE2_word_div_by_6
pshuflw xmm1, xmm3, R_SHUFFLE_D( 0, 0, 0, 0 )
pshuflw xmm2, xmm3, R_SHUFFLE_D( 1, 1, 1, 1 )
pshuflw xmm3, xmm3, R_SHUFFLE_D( 2, 2, 2, 2 )
pshufd xmm1, xmm1, R_SHUFFLE_D( 0, 0, 0, 0 )
pshufd xmm2, xmm2, R_SHUFFLE_D( 0, 0, 0, 0 )
pshufd xmm3, xmm3, R_SHUFFLE_D( 0, 0, 0, 0 )
packuswb xmm1, xmm1
packuswb xmm2, xmm2
packuswb xmm3, xmm3
packuswb xmm0, xmm6
pmaxub xmm1, xmm0
pmaxub xmm2, xmm0
pmaxub xmm3, xmm0
pcmpeqb xmm1, xmm0
pcmpeqb xmm2, xmm0
pcmpeqb xmm3, xmm0
movdqa xmm0, SIMD_SSE2_byte_4
paddsb xmm0, xmm1
paddsb xmm2, xmm3
paddsb xmm0, xmm2
pand xmm0, SIMD_SSE2_byte_3
movdqa xmm4, SIMD_SSE2_byte_2
pcmpgtb xmm4, xmm0
pand xmm4, SIMD_SSE2_byte_1
pxor xmm0, xmm4
movdqa xmm4, xmm0
movdqa xmm5, xmm0
movdqa xmm6, xmm0
movdqa xmm7, xmm0
psrlq xmm4, 8- 2
psrlq xmm5, 16- 4
psrlq xmm6, 24- 6
psrlq xmm7, 32- 8
pand xmm4, SIMD_SSE2_dword_color_bit_mask1
pand xmm5, SIMD_SSE2_dword_color_bit_mask2
pand xmm6, SIMD_SSE2_dword_color_bit_mask3
pand xmm7, SIMD_SSE2_dword_color_bit_mask4
por xmm5, xmm4
por xmm7, xmm6
por xmm7, xmm5
movdqa xmm4, xmm0
movdqa xmm5, xmm0
movdqa xmm6, xmm0
psrlq xmm4, 40-10
psrlq xmm5, 48-12
psrlq xmm6, 56-14
pand xmm0, SIMD_SSE2_dword_color_bit_mask0
pand xmm4, SIMD_SSE2_dword_color_bit_mask5
pand xmm5, SIMD_SSE2_dword_color_bit_mask6
pand xmm6, SIMD_SSE2_dword_color_bit_mask7
por xmm4, xmm5
por xmm0, xmm6
por xmm7, xmm4
por xmm7, xmm0
mov esi, outPtr
pshufd xmm7, xmm7, R_SHUFFLE_D( 0, 2, 1, 3 )
pshuflw xmm7, xmm7, R_SHUFFLE_D( 0, 2, 1, 3 )
movd [esi], xmm7
}
outData += 4;
#elif defined ( ID_WIN_X86_SSE2_INTRIN )
__m128i block0 = *((__m128i *)(&block[ 0]));
__m128i block1 = *((__m128i *)(&block[16]));
__m128i block2 = *((__m128i *)(&block[32]));
__m128i block3 = *((__m128i *)(&block[48]));
__m128c temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
temp7 = _mm_cvtsi32_si128( channelBitOffset );
temp0 = _mm_srl_epi32( block0, temp7 );
temp5 = _mm_srl_epi32( block1, temp7 );
temp6 = _mm_srl_epi32( block2, temp7 );
temp4 = _mm_srl_epi32( block3, temp7 );
temp0 = _mm_and_si128( temp0, (const __m128i &)SIMD_SSE2_dword_byte_mask );
temp5 = _mm_and_si128( temp5, (const __m128i &)SIMD_SSE2_dword_byte_mask );
temp6 = _mm_and_si128( temp6, (const __m128i &)SIMD_SSE2_dword_byte_mask );
temp4 = _mm_and_si128( temp4, (const __m128i &)SIMD_SSE2_dword_byte_mask );
temp0 = _mm_packus_epi16( temp0, temp5 );
temp6 = _mm_packus_epi16( temp6, temp4 );
//---------------------
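// The constants below build the three decision thresholds half way between the four DXT1
// palette greens { maxGreen, ( 2 * maxGreen + minGreen ) / 3, ( maxGreen + 2 * minGreen ) / 3, minGreen }:
// gb1 = ( 5 * maxGreen + 1 * minGreen + 3 ) / 6
// gb2 = ( 3 * maxGreen + 3 * minGreen + 3 ) / 6
// gb3 = ( 1 * maxGreen + 5 * minGreen + 3 ) / 6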
temp2 = _mm_cvtsi32_si128( maxGreen );
temp2 = _mm_shufflelo_epi16( temp2, R_SHUFFLE_D( 0, 0, 0, 0 ) );
temp3 = _mm_cvtsi32_si128( minGreen );
temp3 = _mm_shufflelo_epi16( temp3, R_SHUFFLE_D( 0, 0, 0, 0 ) );
temp2 = _mm_mullo_epi16( temp2, (const __m128i &)SIMD_SSE2_word_scale_5_3_1 );
temp3 = _mm_mullo_epi16( temp3, (const __m128i &)SIMD_SSE2_word_scale_1_3_5 );
temp2 = _mm_add_epi16( temp2, (const __m128i &)SIMD_SSE2_word_3 );
temp3 = _mm_add_epi16( temp3, temp2 );
temp3 = _mm_mulhi_epi16( temp3, (const __m128i &)SIMD_SSE2_word_div_by_6 );
temp1 = _mm_shufflelo_epi16( temp3, R_SHUFFLE_D( 0, 0, 0, 0 ) );
temp2 = _mm_shufflelo_epi16( temp3, R_SHUFFLE_D( 1, 1, 1, 1 ) );
temp3 = _mm_shufflelo_epi16( temp3, R_SHUFFLE_D( 2, 2, 2, 2 ) );
temp1 = _mm_shuffle_epi32( temp1, R_SHUFFLE_D( 0, 0, 0, 0 ) );
temp2 = _mm_shuffle_epi32( temp2, R_SHUFFLE_D( 0, 0, 0, 0 ) );
temp3 = _mm_shuffle_epi32( temp3, R_SHUFFLE_D( 0, 0, 0, 0 ) );
temp1 = _mm_packus_epi16( temp1, temp1 );
temp2 = _mm_packus_epi16( temp2, temp2 );
temp3 = _mm_packus_epi16( temp3, temp3 );
temp0 = _mm_packus_epi16( temp0, temp6 );
temp1 = _mm_max_epu8( temp1, temp0 );
temp2 = _mm_max_epu8( temp2, temp0 );
temp3 = _mm_max_epu8( temp3, temp0 );
temp1 = _mm_cmpeq_epi8( temp1, temp0 );
temp2 = _mm_cmpeq_epi8( temp2, temp0 );
temp3 = _mm_cmpeq_epi8( temp3, temp0 );
temp0 = (const __m128i &)SIMD_SSE2_byte_4;
temp0 = _mm_adds_epi8( temp0, temp1 );
temp2 = _mm_adds_epi8( temp2, temp3 );
temp0 = _mm_adds_epi8( temp0, temp2 );
temp0 = _mm_and_si128( temp0, (const __m128i &)SIMD_SSE2_byte_3 );
temp4 = (const __m128i &)SIMD_SSE2_byte_2;
temp4 = _mm_cmpgt_epi8( temp4, temp0 );
temp4 = _mm_and_si128( temp4, (const __m128i &)SIMD_SSE2_byte_1 );
temp0 = _mm_xor_si128( temp0, temp4 );
temp4 = _mm_srli_epi64( temp0, 8 - 2 );
temp5 = _mm_srli_epi64( temp0, 16 - 4 );
temp6 = _mm_srli_epi64( temp0, 24 - 6 );
temp7 = _mm_srli_epi64( temp0, 32 - 8 );
temp4 = _mm_and_si128( temp4, (const __m128i &)SIMD_SSE2_dword_color_bit_mask1 );
temp5 = _mm_and_si128( temp5, (const __m128i &)SIMD_SSE2_dword_color_bit_mask2 );
temp6 = _mm_and_si128( temp6, (const __m128i &)SIMD_SSE2_dword_color_bit_mask3 );
temp7 = _mm_and_si128( temp7, (const __m128i &)SIMD_SSE2_dword_color_bit_mask4 );
temp5 = _mm_or_si128( temp5, temp4 );
temp7 = _mm_or_si128( temp7, temp6 );
temp7 = _mm_or_si128( temp7, temp5 );
temp4 = _mm_srli_epi64( temp0, 40 - 10 );
temp5 = _mm_srli_epi64( temp0, 48 - 12 );
temp6 = _mm_srli_epi64( temp0, 56 - 14 );
temp0 = _mm_and_si128( temp0, (const __m128i &)SIMD_SSE2_dword_color_bit_mask0 );
temp4 = _mm_and_si128( temp4, (const __m128i &)SIMD_SSE2_dword_color_bit_mask5 );
temp5 = _mm_and_si128( temp5, (const __m128i &)SIMD_SSE2_dword_color_bit_mask6 );
temp6 = _mm_and_si128( temp6, (const __m128i &)SIMD_SSE2_dword_color_bit_mask7 );
temp4 = _mm_or_si128( temp4, temp5 );
temp0 = _mm_or_si128( temp0, temp6 );
temp7 = _mm_or_si128( temp7, temp4 );
temp7 = _mm_or_si128( temp7, temp0 );
temp7 = _mm_shuffle_epi32( temp7, R_SHUFFLE_D( 0, 2, 1, 3 ) );
temp7 = _mm_shufflelo_epi16( temp7, R_SHUFFLE_D( 0, 2, 1, 3 ) );
int result = _mm_cvtsi128_si32( temp7 );
EmitUInt( result );
#else
assert( false );
#endif
}
/*
========================
idDxtEncoder::InsetNormalsBBoxDXT5_SSE2
========================
*/
void idDxtEncoder::InsetNormalsBBoxDXT5_SSE2( byte *minNormal, byte *maxNormal ) const {
#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )
__asm {
mov esi, minNormal
mov edi, maxNormal
movd xmm0, dword ptr [esi] // xmm0 = minNormal
movd xmm1, dword ptr [edi] // xmm1 = maxNormal
punpcklbw xmm0, SIMD_SSE2_byte_0
punpcklbw xmm1, SIMD_SSE2_byte_0
movdqa xmm2, xmm1
psubw xmm2, xmm0
psubw xmm2, SIMD_SSE2_word_insetNormalDXT5Round
pand xmm2, SIMD_SSE2_word_insetNormalDXT5Mask // xmm2 = inset (1 & 3)
pmullw xmm0, SIMD_SSE2_word_insetNormalDXT5ShiftUp
pmullw xmm1, SIMD_SSE2_word_insetNormalDXT5ShiftUp
paddw xmm0, xmm2
psubw xmm1, xmm2
pmulhw xmm0, SIMD_SSE2_word_insetNormalDXT5ShiftDown // xmm0 = mini
pmulhw xmm1, SIMD_SSE2_word_insetNormalDXT5ShiftDown // xmm1 = maxi
// mini and maxi must be >= 0 and <= 255
pmaxsw xmm0, SIMD_SSE2_word_0
pmaxsw xmm1, SIMD_SSE2_word_0
pminsw xmm0, SIMD_SSE2_word_255
pminsw xmm1, SIMD_SSE2_word_255
movdqa xmm2, xmm0
movdqa xmm3, xmm1
pand xmm0, SIMD_SSE2_word_insetNormalDXT5QuantMask
pand xmm1, SIMD_SSE2_word_insetNormalDXT5QuantMask
pmulhw xmm2, SIMD_SSE2_word_insetNormalDXT5Rep
pmulhw xmm3, SIMD_SSE2_word_insetNormalDXT5Rep
por xmm0, xmm2
por xmm1, xmm3
packuswb xmm0, xmm0
packuswb xmm1, xmm1
movd dword ptr [esi], xmm0
movd dword ptr [edi], xmm1
}
#elif defined ( ID_WIN_X86_SSE2_INTRIN )
__m128i temp0, temp1, temp2, temp3;
temp0 = _mm_cvtsi32_si128( *(int *)minNormal );
temp1 = _mm_cvtsi32_si128( *(int *)maxNormal );
temp0 = _mm_unpacklo_epi8( temp0, (const __m128i &)SIMD_SSE2_byte_0 );
temp1 = _mm_unpacklo_epi8( temp1, (const __m128i &)SIMD_SSE2_byte_0 );
temp2 = _mm_sub_epi16( temp1, temp0 );
temp2 = _mm_sub_epi16( temp2, (const __m128i &)SIMD_SSE2_word_insetNormalDXT5Round );
temp2 = _mm_and_si128( temp2, (const __m128i &)SIMD_SSE2_word_insetNormalDXT5Mask ); // xmm2 = inset (1 & 3)
temp0 = _mm_mullo_epi16( temp0, (const __m128i &)SIMD_SSE2_word_insetNormalDXT5ShiftUp );
temp1 = _mm_mullo_epi16( temp1, (const __m128i &)SIMD_SSE2_word_insetNormalDXT5ShiftUp );
temp0 = _mm_add_epi16( temp0, temp2 );
temp1 = _mm_sub_epi16( temp1, temp2 );
temp0 = _mm_mulhi_epi16( temp0, (const __m128i &)SIMD_SSE2_word_insetNormalDXT5ShiftDown ); // xmm0 = mini
temp1 = _mm_mulhi_epi16( temp1, (const __m128i &)SIMD_SSE2_word_insetNormalDXT5ShiftDown ); // xmm1 = maxi
// mini and maxi must be >= 0 and <= 255
temp0 = _mm_max_epi16( temp0, (const __m128i &)SIMD_SSE2_word_0 );
temp1 = _mm_max_epi16( temp1, (const __m128i &)SIMD_SSE2_word_0 );
temp0 = _mm_min_epi16( temp0, (const __m128i &)SIMD_SSE2_word_255 );
temp1 = _mm_min_epi16( temp1, (const __m128i &)SIMD_SSE2_word_255 );
temp0 = _mm_and_si128( temp0, (const __m128i &)SIMD_SSE2_word_insetNormalDXT5QuantMask );
temp1 = _mm_and_si128( temp1, (const __m128i &)SIMD_SSE2_word_insetNormalDXT5QuantMask );
temp2 = _mm_mulhi_epi16( temp0, (const __m128i &)SIMD_SSE2_word_insetNormalDXT5Rep );
temp3 = _mm_mulhi_epi16( temp1, (const __m128i &)SIMD_SSE2_word_insetNormalDXT5Rep );
temp0 = _mm_or_si128( temp0, temp2 );
temp1 = _mm_or_si128( temp1, temp3 );
temp0 = _mm_packus_epi16( temp0, temp0 );
temp1 = _mm_packus_epi16( temp1, temp1 );
*(int *)minNormal = _mm_cvtsi128_si32( temp0 );
*(int *)maxNormal = _mm_cvtsi128_si32( temp1 );
#else
assert( false );
#endif
}
/*
========================
idDxtEncoder::CompressNormalMapDXT5Fast_SSE2
params: inBuf - image to compress in _y_x component order
paramO: outBuf - result of compression
params: width - width of image
params: height - height of image
========================
*/
void idDxtEncoder::CompressNormalMapDXT5Fast_SSE2( const byte *inBuf, byte *outBuf, int width, int height ) {
ALIGN16( byte block[64] );
ALIGN16( byte normal1[4] );
ALIGN16( byte normal2[4] );
assert( width >= 4 && ( width & 3 ) == 0 );
assert( height >= 4 && ( height & 3 ) == 0 );
this->width = width;
this->height = height;
this->outData = outBuf;
for ( int j = 0; j < height; j += 4, inBuf += width * 4*4 ) {
for ( int i = 0; i < width; i += 4 ) {
ExtractBlock_SSE2( inBuf + i * 4, width, block );
GetMinMaxBBox_SSE2( block, normal1, normal2 );
InsetNormalsBBoxDXT5_SSE2( normal1, normal2 );
// Write out Nx into alpha channel.
EmitByte( normal2[3] );
EmitByte( normal1[3] );
EmitAlphaIndices_SSE2( block, 3*8, normal1[3], normal2[3] );
// Write out Ny into green channel.
EmitUShort( ColorTo565( block[0], normal2[1], block[2] ) );
EmitUShort( ColorTo565( block[0], normal1[1], block[2] ) );
EmitGreenIndices_SSE2( block, 1*8, normal1[1], normal2[1] );
}
outData += dstPadding;
inBuf += srcPadding;
}
#ifdef TEST_COMPRESSION
int tmpDstPadding = dstPadding;
dstPadding = 0;
byte * testOutBuf = (byte *) _alloca16( width * height );
inBuf -= ( height / 4 ) * ( width * 4*4 + srcPadding );	// the loops above advanced inBuf past the image; rewind before re-compressing
CompressNormalMapDXT5Fast_Generic( inBuf, testOutBuf, width, height );
for ( int j = 0; j < height / 4; j++ ) {
for ( int i = 0; i < width / 4; i++ ) {
byte * ptr1 = outBuf + ( j * width/4 + i ) * 16 + j * tmpDstPadding;
byte * ptr2 = testOutBuf + ( j * width/4 + i ) * 16;
for ( int k = 0; k < 16; k++ ) {
assert( ptr1[k] == ptr2[k] );
}
}
}
dstPadding = tmpDstPadding;
#endif
}
#endif