Add the ETC compression algorithm portions of etcpak to the repository, with no modifications.

git-svn-id: https://svn.eduke32.com/eduke32@5680 1a8010ca-5511-0410-912e-c29ae57300e0
This commit is contained in:
hendricks266 2016-03-28 05:15:37 +00:00
parent bd6c92926e
commit 33634859c6
10 changed files with 1255 additions and 0 deletions

View file

@ -0,0 +1,2 @@
Bartosz Taudul <wolf.pld@gmail.com>
Daniel Jungmann <el.3d.source@gmail.com>

View file

@ -0,0 +1,24 @@
Copyright (c) 2013, Bartosz Taudul <wolf.pld@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the <organization> nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View file

@ -0,0 +1,89 @@
#ifndef __DARKRL__MATH_HPP__
#define __DARKRL__MATH_HPP__
#include <algorithm>
#include "Types.hpp"
template<typename T>
inline T AlignPOT( T val )
{
if( val == 0 ) return 1;
val--;
for( unsigned int i=1; i<sizeof( T ) * 8; i <<= 1 )
{
val |= val >> i;
}
return val + 1;
}
inline int CountSetBits( uint32 val )
{
val -= ( val >> 1 ) & 0x55555555;
val = ( ( val >> 2 ) & 0x33333333 ) + ( val & 0x33333333 );
val = ( ( val >> 4 ) + val ) & 0x0f0f0f0f;
val += val >> 8;
val += val >> 16;
return val & 0x0000003f;
}
inline int CountLeadingZeros( uint32 val )
{
val |= val >> 1;
val |= val >> 2;
val |= val >> 4;
val |= val >> 8;
val |= val >> 16;
return 32 - CountSetBits( val );
}
inline float sRGB2linear( float v )
{
const float a = 0.055f;
if( v <= 0.04045f )
{
return v / 12.92f;
}
else
{
return pow( ( v + a ) / ( 1 + a ), 2.4f );
}
}
inline float linear2sRGB( float v )
{
const float a = 0.055f;
if( v <= 0.0031308f )
{
return 12.92f * v;
}
else
{
return ( 1 + a ) * pow( v, 1/2.4f ) - a;
}
}
template<class T>
inline T SmoothStep( T x )
{
return x*x*(3-2*x);
}
inline uint8 clampu8( int32 val )
{
return std::min( std::max( 0, val ), 255 );
}
template<class T>
inline T sq( T val )
{
return val * val;
}
static inline int mul8bit( int a, int b )
{
int t = a*b + 128;
return ( t + ( t >> 8 ) ) >> 8;
}
#endif

View file

@ -0,0 +1,51 @@
#ifndef __PROCESSCOMMON_HPP__
#define __PROCESSCOMMON_HPP__
#include <assert.h>
#include <stddef.h>
#include "Types.hpp"
template<class T>
static size_t GetLeastError( const T* err, size_t num )
{
size_t idx = 0;
for( size_t i=1; i<num; i++ )
{
if( err[i] < err[idx] )
{
idx = i;
}
}
return idx;
}
static uint64 FixByteOrder( uint64 d )
{
return ( ( d & 0x00000000FFFFFFFF ) ) |
( ( d & 0xFF00000000000000 ) >> 24 ) |
( ( d & 0x000000FF00000000 ) << 24 ) |
( ( d & 0x00FF000000000000 ) >> 8 ) |
( ( d & 0x0000FF0000000000 ) << 8 );
}
template<class T, class S>
static uint64 EncodeSelectors( uint64 d, const T terr[2][8], const S tsel[16][8], const uint32* id )
{
size_t tidx[2];
tidx[0] = GetLeastError( terr[0], 8 );
tidx[1] = GetLeastError( terr[1], 8 );
d |= tidx[0] << 26;
d |= tidx[1] << 29;
for( int i=0; i<16; i++ )
{
uint64 t = tsel[i][tidx[id[i]%2]];
d |= ( t & 0x1 ) << ( i + 32 );
d |= ( t & 0x2 ) << ( i + 47 );
}
return d;
}
#endif

View file

@ -0,0 +1,707 @@
#include <array>
#include <string.h>
#include "Math.hpp"
#include "ProcessCommon.hpp"
#include "ProcessRGB.hpp"
#include "Tables.hpp"
#include "Types.hpp"
#include "Vector.hpp"
#ifdef __SSE4_1__
# ifdef _MSC_VER
# include <intrin.h>
# include <Windows.h>
# define _bswap(x) _byteswap_ulong(x)
# else
# include <x86intrin.h>
# endif
#endif
namespace
{
typedef std::array<uint16, 4> v4i;
void Average( const uint8* data, v4i* a )
{
#ifdef __SSE4_1__
__m128i d0 = _mm_loadu_si128(((__m128i*)data) + 0);
__m128i d1 = _mm_loadu_si128(((__m128i*)data) + 1);
__m128i d2 = _mm_loadu_si128(((__m128i*)data) + 2);
__m128i d3 = _mm_loadu_si128(((__m128i*)data) + 3);
__m128i d0l = _mm_unpacklo_epi8(d0, _mm_setzero_si128());
__m128i d0h = _mm_unpackhi_epi8(d0, _mm_setzero_si128());
__m128i d1l = _mm_unpacklo_epi8(d1, _mm_setzero_si128());
__m128i d1h = _mm_unpackhi_epi8(d1, _mm_setzero_si128());
__m128i d2l = _mm_unpacklo_epi8(d2, _mm_setzero_si128());
__m128i d2h = _mm_unpackhi_epi8(d2, _mm_setzero_si128());
__m128i d3l = _mm_unpacklo_epi8(d3, _mm_setzero_si128());
__m128i d3h = _mm_unpackhi_epi8(d3, _mm_setzero_si128());
__m128i sum0 = _mm_add_epi16(d0l, d1l);
__m128i sum1 = _mm_add_epi16(d0h, d1h);
__m128i sum2 = _mm_add_epi16(d2l, d3l);
__m128i sum3 = _mm_add_epi16(d2h, d3h);
__m128i sum0l = _mm_unpacklo_epi16(sum0, _mm_setzero_si128());
__m128i sum0h = _mm_unpackhi_epi16(sum0, _mm_setzero_si128());
__m128i sum1l = _mm_unpacklo_epi16(sum1, _mm_setzero_si128());
__m128i sum1h = _mm_unpackhi_epi16(sum1, _mm_setzero_si128());
__m128i sum2l = _mm_unpacklo_epi16(sum2, _mm_setzero_si128());
__m128i sum2h = _mm_unpackhi_epi16(sum2, _mm_setzero_si128());
__m128i sum3l = _mm_unpacklo_epi16(sum3, _mm_setzero_si128());
__m128i sum3h = _mm_unpackhi_epi16(sum3, _mm_setzero_si128());
__m128i b0 = _mm_add_epi32(sum0l, sum0h);
__m128i b1 = _mm_add_epi32(sum1l, sum1h);
__m128i b2 = _mm_add_epi32(sum2l, sum2h);
__m128i b3 = _mm_add_epi32(sum3l, sum3h);
__m128i a0 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b2, b3), _mm_set1_epi32(4)), 3);
__m128i a1 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b0, b1), _mm_set1_epi32(4)), 3);
__m128i a2 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b1, b3), _mm_set1_epi32(4)), 3);
__m128i a3 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b0, b2), _mm_set1_epi32(4)), 3);
_mm_storeu_si128((__m128i*)&a[0], _mm_packus_epi32(_mm_shuffle_epi32(a0, _MM_SHUFFLE(3, 0, 1, 2)), _mm_shuffle_epi32(a1, _MM_SHUFFLE(3, 0, 1, 2))));
_mm_storeu_si128((__m128i*)&a[2], _mm_packus_epi32(_mm_shuffle_epi32(a2, _MM_SHUFFLE(3, 0, 1, 2)), _mm_shuffle_epi32(a3, _MM_SHUFFLE(3, 0, 1, 2))));
#else
uint32 r[4];
uint32 g[4];
uint32 b[4];
memset(r, 0, sizeof(r));
memset(g, 0, sizeof(g));
memset(b, 0, sizeof(b));
for( int j=0; j<4; j++ )
{
for( int i=0; i<4; i++ )
{
int index = (j & 2) + (i >> 1);
b[index] += *data++;
g[index] += *data++;
r[index] += *data++;
data++;
}
}
a[0] = v4i{ uint16( (r[2] + r[3] + 4) / 8 ), uint16( (g[2] + g[3] + 4) / 8 ), uint16( (b[2] + b[3] + 4) / 8 ), 0};
a[1] = v4i{ uint16( (r[0] + r[1] + 4) / 8 ), uint16( (g[0] + g[1] + 4) / 8 ), uint16( (b[0] + b[1] + 4) / 8 ), 0};
a[2] = v4i{ uint16( (r[1] + r[3] + 4) / 8 ), uint16( (g[1] + g[3] + 4) / 8 ), uint16( (b[1] + b[3] + 4) / 8 ), 0};
a[3] = v4i{ uint16( (r[0] + r[2] + 4) / 8 ), uint16( (g[0] + g[2] + 4) / 8 ), uint16( (b[0] + b[2] + 4) / 8 ), 0};
#endif
}
void CalcErrorBlock( const uint8* data, uint err[4][4] )
{
#ifdef __SSE4_1__
__m128i d0 = _mm_loadu_si128(((__m128i*)data) + 0);
__m128i d1 = _mm_loadu_si128(((__m128i*)data) + 1);
__m128i d2 = _mm_loadu_si128(((__m128i*)data) + 2);
__m128i d3 = _mm_loadu_si128(((__m128i*)data) + 3);
__m128i dm0 = _mm_and_si128(d0, _mm_set1_epi32(0x00FFFFFF));
__m128i dm1 = _mm_and_si128(d1, _mm_set1_epi32(0x00FFFFFF));
__m128i dm2 = _mm_and_si128(d2, _mm_set1_epi32(0x00FFFFFF));
__m128i dm3 = _mm_and_si128(d3, _mm_set1_epi32(0x00FFFFFF));
__m128i d0l = _mm_unpacklo_epi8(dm0, _mm_setzero_si128());
__m128i d0h = _mm_unpackhi_epi8(dm0, _mm_setzero_si128());
__m128i d1l = _mm_unpacklo_epi8(dm1, _mm_setzero_si128());
__m128i d1h = _mm_unpackhi_epi8(dm1, _mm_setzero_si128());
__m128i d2l = _mm_unpacklo_epi8(dm2, _mm_setzero_si128());
__m128i d2h = _mm_unpackhi_epi8(dm2, _mm_setzero_si128());
__m128i d3l = _mm_unpacklo_epi8(dm3, _mm_setzero_si128());
__m128i d3h = _mm_unpackhi_epi8(dm3, _mm_setzero_si128());
__m128i sum0 = _mm_add_epi16(d0l, d1l);
__m128i sum1 = _mm_add_epi16(d0h, d1h);
__m128i sum2 = _mm_add_epi16(d2l, d3l);
__m128i sum3 = _mm_add_epi16(d2h, d3h);
__m128i sum0l = _mm_unpacklo_epi16(sum0, _mm_setzero_si128());
__m128i sum0h = _mm_unpackhi_epi16(sum0, _mm_setzero_si128());
__m128i sum1l = _mm_unpacklo_epi16(sum1, _mm_setzero_si128());
__m128i sum1h = _mm_unpackhi_epi16(sum1, _mm_setzero_si128());
__m128i sum2l = _mm_unpacklo_epi16(sum2, _mm_setzero_si128());
__m128i sum2h = _mm_unpackhi_epi16(sum2, _mm_setzero_si128());
__m128i sum3l = _mm_unpacklo_epi16(sum3, _mm_setzero_si128());
__m128i sum3h = _mm_unpackhi_epi16(sum3, _mm_setzero_si128());
__m128i b0 = _mm_add_epi32(sum0l, sum0h);
__m128i b1 = _mm_add_epi32(sum1l, sum1h);
__m128i b2 = _mm_add_epi32(sum2l, sum2h);
__m128i b3 = _mm_add_epi32(sum3l, sum3h);
__m128i a0 = _mm_add_epi32(b2, b3);
__m128i a1 = _mm_add_epi32(b0, b1);
__m128i a2 = _mm_add_epi32(b1, b3);
__m128i a3 = _mm_add_epi32(b0, b2);
_mm_storeu_si128((__m128i*)&err[0], a0);
_mm_storeu_si128((__m128i*)&err[1], a1);
_mm_storeu_si128((__m128i*)&err[2], a2);
_mm_storeu_si128((__m128i*)&err[3], a3);
#else
uint terr[4][4];
memset(terr, 0, 16 * sizeof(uint));
for( int j=0; j<4; j++ )
{
for( int i=0; i<4; i++ )
{
int index = (j & 2) + (i >> 1);
uint d = *data++;
terr[index][0] += d;
d = *data++;
terr[index][1] += d;
d = *data++;
terr[index][2] += d;
data++;
}
}
for( int i=0; i<3; i++ )
{
err[0][i] = terr[2][i] + terr[3][i];
err[1][i] = terr[0][i] + terr[1][i];
err[2][i] = terr[1][i] + terr[3][i];
err[3][i] = terr[0][i] + terr[2][i];
}
for( int i=0; i<4; i++ )
{
err[i][3] = 0;
}
#endif
}
uint CalcError( const uint block[4], const v4i& average )
{
uint err = 0x3FFFFFFF; // Big value to prevent negative values, but small enough to prevent overflow
err -= block[0] * 2 * average[2];
err -= block[1] * 2 * average[1];
err -= block[2] * 2 * average[0];
err += 8 * ( sq( average[0] ) + sq( average[1] ) + sq( average[2] ) );
return err;
}
void ProcessAverages( v4i* a )
{
#ifdef __SSE4_1__
for( int i=0; i<2; i++ )
{
__m128i d = _mm_loadu_si128((__m128i*)a[i*2].data());
__m128i t = _mm_add_epi16(_mm_mullo_epi16(d, _mm_set1_epi16(31)), _mm_set1_epi16(128));
__m128i c = _mm_srli_epi16(_mm_add_epi16(t, _mm_srli_epi16(t, 8)), 8);
__m128i c1 = _mm_shuffle_epi32(c, _MM_SHUFFLE(3, 2, 3, 2));
__m128i diff = _mm_sub_epi16(c, c1);
diff = _mm_max_epi16(diff, _mm_set1_epi16(-4));
diff = _mm_min_epi16(diff, _mm_set1_epi16(3));
__m128i co = _mm_add_epi16(c1, diff);
c = _mm_blend_epi16(co, c, 0xF0);
__m128i a0 = _mm_or_si128(_mm_slli_epi16(c, 3), _mm_srli_epi16(c, 2));
_mm_storeu_si128((__m128i*)a[4+i*2].data(), a0);
}
for( int i=0; i<2; i++ )
{
__m128i d = _mm_loadu_si128((__m128i*)a[i*2].data());
__m128i t0 = _mm_add_epi16(_mm_mullo_epi16(d, _mm_set1_epi16(15)), _mm_set1_epi16(128));
__m128i t1 = _mm_srli_epi16(_mm_add_epi16(t0, _mm_srli_epi16(t0, 8)), 8);
__m128i t2 = _mm_or_si128(t1, _mm_slli_epi16(t1, 4));
_mm_storeu_si128((__m128i*)a[i*2].data(), t2);
}
#else
for( int i=0; i<2; i++ )
{
for( int j=0; j<3; j++ )
{
int32 c1 = mul8bit( a[i*2+1][j], 31 );
int32 c2 = mul8bit( a[i*2][j], 31 );
int32 diff = c2 - c1;
if( diff > 3 ) diff = 3;
else if( diff < -4 ) diff = -4;
int32 co = c1 + diff;
a[5+i*2][j] = ( c1 << 3 ) | ( c1 >> 2 );
a[4+i*2][j] = ( co << 3 ) | ( co >> 2 );
}
}
for( int i=0; i<4; i++ )
{
a[i][0] = g_avg2[mul8bit( a[i][0], 15 )];
a[i][1] = g_avg2[mul8bit( a[i][1], 15 )];
a[i][2] = g_avg2[mul8bit( a[i][2], 15 )];
}
#endif
}
void EncodeAverages( uint64& _d, const v4i* a, size_t idx )
{
auto d = _d;
d |= ( idx << 24 );
size_t base = idx << 1;
if( ( idx & 0x2 ) == 0 )
{
for( int i=0; i<3; i++ )
{
d |= uint64( a[base+0][i] >> 4 ) << ( i*8 );
d |= uint64( a[base+1][i] >> 4 ) << ( i*8 + 4 );
}
}
else
{
for( int i=0; i<3; i++ )
{
d |= uint64( a[base+1][i] & 0xF8 ) << ( i*8 );
int32 c = ( ( a[base+0][i] & 0xF8 ) - ( a[base+1][i] & 0xF8 ) ) >> 3;
c &= ~0xFFFFFFF8;
d |= ((uint64)c) << ( i*8 );
}
}
_d = d;
}
uint64 CheckSolid( const uint8* src )
{
#ifdef __SSE4_1__
__m128i d0 = _mm_loadu_si128(((__m128i*)src) + 0);
__m128i d1 = _mm_loadu_si128(((__m128i*)src) + 1);
__m128i d2 = _mm_loadu_si128(((__m128i*)src) + 2);
__m128i d3 = _mm_loadu_si128(((__m128i*)src) + 3);
__m128i c = _mm_shuffle_epi32(d0, _MM_SHUFFLE(0, 0, 0, 0));
__m128i c0 = _mm_cmpeq_epi8(d0, c);
__m128i c1 = _mm_cmpeq_epi8(d1, c);
__m128i c2 = _mm_cmpeq_epi8(d2, c);
__m128i c3 = _mm_cmpeq_epi8(d3, c);
__m128i m0 = _mm_and_si128(c0, c1);
__m128i m1 = _mm_and_si128(c2, c3);
__m128i m = _mm_and_si128(m0, m1);
if (!_mm_testc_si128(m, _mm_set1_epi32(-1)))
{
return 0;
}
#else
const uint8* ptr = src + 4;
for( int i=1; i<16; i++ )
{
if( memcmp( src, ptr, 4 ) != 0 )
{
return 0;
}
ptr += 4;
}
#endif
return 0x02000000 |
( uint( src[0] & 0xF8 ) << 16 ) |
( uint( src[1] & 0xF8 ) << 8 ) |
( uint( src[2] & 0xF8 ) );
}
void PrepareAverages( v4i a[8], const uint8* src, uint err[4] )
{
Average( src, a );
ProcessAverages( a );
uint errblock[4][4];
CalcErrorBlock( src, errblock );
for( int i=0; i<4; i++ )
{
err[i/2] += CalcError( errblock[i], a[i] );
err[2+i/2] += CalcError( errblock[i], a[i+4] );
}
}
void FindBestFit( uint64 terr[2][8], uint16 tsel[16][8], v4i a[8], const uint32* id, const uint8* data )
{
for( size_t i=0; i<16; i++ )
{
uint16* sel = tsel[i];
uint bid = id[i];
uint64* ter = terr[bid%2];
uint8 b = *data++;
uint8 g = *data++;
uint8 r = *data++;
data++;
int dr = a[bid][0] - r;
int dg = a[bid][1] - g;
int db = a[bid][2] - b;
#ifdef __SSE4_1__
// Reference implementation
__m128i pix = _mm_set1_epi32(dr * 77 + dg * 151 + db * 28);
// Taking the absolute value is way faster. The values are only used to sort, so the result will be the same.
__m128i error0 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[0]));
__m128i error1 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[1]));
__m128i error2 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[0]));
__m128i error3 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[1]));
__m128i index0 = _mm_and_si128(_mm_cmplt_epi32(error1, error0), _mm_set1_epi32(1));
__m128i minError0 = _mm_min_epi32(error0, error1);
__m128i index1 = _mm_sub_epi32(_mm_set1_epi32(2), _mm_cmplt_epi32(error3, error2));
__m128i minError1 = _mm_min_epi32(error2, error3);
__m128i minIndex0 = _mm_blendv_epi8(index0, index1, _mm_cmplt_epi32(minError1, minError0));
__m128i minError = _mm_min_epi32(minError0, minError1);
// Squaring the minimum error to produce correct values when adding
__m128i minErrorLow = _mm_shuffle_epi32(minError, _MM_SHUFFLE(1, 1, 0, 0));
__m128i squareErrorLow = _mm_mul_epi32(minErrorLow, minErrorLow);
squareErrorLow = _mm_add_epi64(squareErrorLow, _mm_loadu_si128(((__m128i*)ter) + 0));
_mm_storeu_si128(((__m128i*)ter) + 0, squareErrorLow);
__m128i minErrorHigh = _mm_shuffle_epi32(minError, _MM_SHUFFLE(3, 3, 2, 2));
__m128i squareErrorHigh = _mm_mul_epi32(minErrorHigh, minErrorHigh);
squareErrorHigh = _mm_add_epi64(squareErrorHigh, _mm_loadu_si128(((__m128i*)ter) + 1));
_mm_storeu_si128(((__m128i*)ter) + 1, squareErrorHigh);
// Taking the absolute value is way faster. The values are only used to sort, so the result will be the same.
error0 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[2]));
error1 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[3]));
error2 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[2]));
error3 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[3]));
index0 = _mm_and_si128(_mm_cmplt_epi32(error1, error0), _mm_set1_epi32(1));
minError0 = _mm_min_epi32(error0, error1);
index1 = _mm_sub_epi32(_mm_set1_epi32(2), _mm_cmplt_epi32(error3, error2));
minError1 = _mm_min_epi32(error2, error3);
__m128i minIndex1 = _mm_blendv_epi8(index0, index1, _mm_cmplt_epi32(minError1, minError0));
minError = _mm_min_epi32(minError0, minError1);
// Squaring the minimum error to produce correct values when adding
minErrorLow = _mm_shuffle_epi32(minError, _MM_SHUFFLE(1, 1, 0, 0));
squareErrorLow = _mm_mul_epi32(minErrorLow, minErrorLow);
squareErrorLow = _mm_add_epi64(squareErrorLow, _mm_loadu_si128(((__m128i*)ter) + 2));
_mm_storeu_si128(((__m128i*)ter) + 2, squareErrorLow);
minErrorHigh = _mm_shuffle_epi32(minError, _MM_SHUFFLE(3, 3, 2, 2));
squareErrorHigh = _mm_mul_epi32(minErrorHigh, minErrorHigh);
squareErrorHigh = _mm_add_epi64(squareErrorHigh, _mm_loadu_si128(((__m128i*)ter) + 3));
_mm_storeu_si128(((__m128i*)ter) + 3, squareErrorHigh);
__m128i minIndex = _mm_packs_epi32(minIndex0, minIndex1);
_mm_storeu_si128((__m128i*)sel, minIndex);
#else
int pix = dr * 77 + dg * 151 + db * 28;
for( int t=0; t<8; t++ )
{
const int64* tab = g_table256[t];
uint idx = 0;
uint64 err = sq( tab[0] + pix );
for( int j=1; j<4; j++ )
{
uint64 local = sq( tab[j] + pix );
if( local < err )
{
err = local;
idx = j;
}
}
*sel++ = idx;
*ter++ += err;
}
#endif
}
}
#ifdef __SSE4_1__
// Non-reference implementation, but faster. Produces same results as the AVX2 version
void FindBestFit( uint32 terr[2][8], uint16 tsel[16][8], v4i a[8], const uint32* id, const uint8* data )
{
for( size_t i=0; i<16; i++ )
{
uint16* sel = tsel[i];
uint bid = id[i];
uint32* ter = terr[bid%2];
uint8 b = *data++;
uint8 g = *data++;
uint8 r = *data++;
data++;
int dr = a[bid][0] - r;
int dg = a[bid][1] - g;
int db = a[bid][2] - b;
// The scaling values are divided by two and rounded, to allow the differences to be in the range of signed int16
// This produces slightly different results, but is significant faster
__m128i pixel = _mm_set1_epi16(dr * 38 + dg * 76 + db * 14);
__m128i pix = _mm_abs_epi16(pixel);
// Taking the absolute value is way faster. The values are only used to sort, so the result will be the same.
// Since the selector table is symmetrical, we need to calculate the difference only for half of the entries.
__m128i error0 = _mm_abs_epi16(_mm_sub_epi16(pix, g_table128_SIMD[0]));
__m128i error1 = _mm_abs_epi16(_mm_sub_epi16(pix, g_table128_SIMD[1]));
__m128i index = _mm_and_si128(_mm_cmplt_epi16(error1, error0), _mm_set1_epi16(1));
__m128i minError = _mm_min_epi16(error0, error1);
// Exploiting symmetry of the selector table and use the sign bit
// This produces slightly different results, but is needed to produce same results as AVX2 implementation
__m128i indexBit = _mm_andnot_si128(_mm_srli_epi16(pixel, 15), _mm_set1_epi8(-1));
__m128i minIndex = _mm_or_si128(index, _mm_add_epi16(indexBit, indexBit));
// Squaring the minimum error to produce correct values when adding
__m128i squareErrorLo = _mm_mullo_epi16(minError, minError);
__m128i squareErrorHi = _mm_mulhi_epi16(minError, minError);
__m128i squareErrorLow = _mm_unpacklo_epi16(squareErrorLo, squareErrorHi);
__m128i squareErrorHigh = _mm_unpackhi_epi16(squareErrorLo, squareErrorHi);
squareErrorLow = _mm_add_epi32(squareErrorLow, _mm_loadu_si128(((__m128i*)ter) + 0));
_mm_storeu_si128(((__m128i*)ter) + 0, squareErrorLow);
squareErrorHigh = _mm_add_epi32(squareErrorHigh, _mm_loadu_si128(((__m128i*)ter) + 1));
_mm_storeu_si128(((__m128i*)ter) + 1, squareErrorHigh);
_mm_storeu_si128((__m128i*)sel, minIndex);
}
}
#endif
uint8_t convert6(float f)
{
int i = (std::min(std::max(static_cast<int>(f), 0), 1023) - 15) >> 1;
return (i + 11 - ((i + 11) >> 7) - ((i + 4) >> 7)) >> 3;
}
uint8_t convert7(float f)
{
int i = (std::min(std::max(static_cast<int>(f), 0), 1023) - 15) >> 1;
return (i + 9 - ((i + 9) >> 8) - ((i + 6) >> 8)) >> 2;
}
std::pair<uint64, uint64> Planar(const uint8* src)
{
int32 r = 0;
int32 g = 0;
int32 b = 0;
for (int i = 0; i < 16; ++i)
{
b += src[i * 4 + 0];
g += src[i * 4 + 1];
r += src[i * 4 + 2];
}
int32 difRyz = 0;
int32 difGyz = 0;
int32 difByz = 0;
int32 difRxz = 0;
int32 difGxz = 0;
int32 difBxz = 0;
const int32 scaling[] = { -255, -85, 85, 255 };
for (int i = 0; i < 16; ++i)
{
int32 difB = (static_cast<int>(src[i * 4 + 0]) << 4) - b;
int32 difG = (static_cast<int>(src[i * 4 + 1]) << 4) - g;
int32 difR = (static_cast<int>(src[i * 4 + 2]) << 4) - r;
difRyz += difR * scaling[i % 4];
difGyz += difG * scaling[i % 4];
difByz += difB * scaling[i % 4];
difRxz += difR * scaling[i / 4];
difGxz += difG * scaling[i / 4];
difBxz += difB * scaling[i / 4];
}
const float scale = -4.0f / ((255 * 255 * 8.0f + 85 * 85 * 8.0f) * 16.0f);
float aR = difRxz * scale;
float aG = difGxz * scale;
float aB = difBxz * scale;
float bR = difRyz * scale;
float bG = difGyz * scale;
float bB = difByz * scale;
float dR = r * (4.0f / 16.0f);
float dG = g * (4.0f / 16.0f);
float dB = b * (4.0f / 16.0f);
// calculating the three colors RGBO, RGBH, and RGBV. RGB = df - af * x - bf * y;
float cofR = std::fma(aR, 255.0f, std::fma(bR, 255.0f, dR));
float cofG = std::fma(aG, 255.0f, std::fma(bG, 255.0f, dG));
float cofB = std::fma(aB, 255.0f, std::fma(bB, 255.0f, dB));
float chfR = std::fma(aR, -425.0f, std::fma(bR, 255.0f, dR));
float chfG = std::fma(aG, -425.0f, std::fma(bG, 255.0f, dG));
float chfB = std::fma(aB, -425.0f, std::fma(bB, 255.0f, dB));
float cvfR = std::fma(aR, 255.0f, std::fma(bR, -425.0f, dR));
float cvfG = std::fma(aG, 255.0f, std::fma(bG, -425.0f, dG));
float cvfB = std::fma(aB, 255.0f, std::fma(bB, -425.0f, dB));
// convert to r6g7b6
int32 coR = convert6(cofR);
int32 coG = convert7(cofG);
int32 coB = convert6(cofB);
int32 chR = convert6(chfR);
int32 chG = convert7(chfG);
int32 chB = convert6(chfB);
int32 cvR = convert6(cvfR);
int32 cvG = convert7(cvfG);
int32 cvB = convert6(cvfB);
// Error calculation
auto ro0 = coR;
auto go0 = coG;
auto bo0 = coB;
auto ro1 = (ro0 >> 4) | (ro0 << 2);
auto go1 = (go0 >> 6) | (go0 << 1);
auto bo1 = (bo0 >> 4) | (bo0 << 2);
auto ro2 = (ro1 << 2) + 2;
auto go2 = (go1 << 2) + 2;
auto bo2 = (bo1 << 2) + 2;
auto rh0 = chR;
auto gh0 = chG;
auto bh0 = chB;
auto rh1 = (rh0 >> 4) | (rh0 << 2);
auto gh1 = (gh0 >> 6) | (gh0 << 1);
auto bh1 = (bh0 >> 4) | (bh0 << 2);
auto rh2 = rh1 - ro1;
auto gh2 = gh1 - go1;
auto bh2 = bh1 - bo1;
auto rv0 = cvR;
auto gv0 = cvG;
auto bv0 = cvB;
auto rv1 = (rv0 >> 4) | (rv0 << 2);
auto gv1 = (gv0 >> 6) | (gv0 << 1);
auto bv1 = (bv0 >> 4) | (bv0 << 2);
auto rv2 = rv1 - ro1;
auto gv2 = gv1 - go1;
auto bv2 = bv1 - bo1;
uint64 error = 0;
for (int i = 0; i < 16; ++i)
{
int32 cR = clampu8((rh2 * (i / 4) + rv2 * (i % 4) + ro2) >> 2);
int32 cG = clampu8((gh2 * (i / 4) + gv2 * (i % 4) + go2) >> 2);
int32 cB = clampu8((bh2 * (i / 4) + bv2 * (i % 4) + bo2) >> 2);
int32 difB = static_cast<int>(src[i * 4 + 0]) - cB;
int32 difG = static_cast<int>(src[i * 4 + 1]) - cG;
int32 difR = static_cast<int>(src[i * 4 + 2]) - cR;
int32 dif = difR * 38 + difG * 76 + difB * 14;
error += dif * dif;
}
/**/
uint32 rgbv = cvB | (cvG << 6) | (cvR << 13);
uint32 rgbh = chB | (chG << 6) | (chR << 13);
uint32 hi = rgbv | ((rgbh & 0x1FFF) << 19);
uint32 lo = (chR & 0x1) | 0x2 | ((chR << 1) & 0x7C);
lo |= ((coB & 0x07) << 7) | ((coB & 0x18) << 8) | ((coB & 0x20) << 11);
lo |= ((coG & 0x3F) << 17) | ((coG & 0x40) << 18);
lo |= coR << 25;
const auto idx = (coR & 0x20) | ((coG & 0x20) >> 1) | ((coB & 0x1E) >> 1);
lo |= g_flags[idx];
uint64 result = static_cast<uint32>(_bswap(lo));
result |= static_cast<uint64>(static_cast<uint32>(_bswap(hi))) << 32;
return std::make_pair(result, error);
}
template<class T, class S>
uint64 EncodeSelectors( uint64 d, const T terr[2][8], const S tsel[16][8], const uint32* id, const uint64 value, const uint64 error)
{
size_t tidx[2];
tidx[0] = GetLeastError( terr[0], 8 );
tidx[1] = GetLeastError( terr[1], 8 );
if ((terr[0][tidx[0]] + terr[1][tidx[1]]) >= error)
{
return value;
}
d |= tidx[0] << 26;
d |= tidx[1] << 29;
for( int i=0; i<16; i++ )
{
uint64 t = tsel[i][tidx[id[i]%2]];
d |= ( t & 0x1 ) << ( i + 32 );
d |= ( t & 0x2 ) << ( i + 47 );
}
return FixByteOrder(d);
}
}
uint64 ProcessRGB( const uint8* src )
{
uint64 d = CheckSolid( src );
if( d != 0 ) return d;
v4i a[8];
uint err[4] = {};
PrepareAverages( a, src, err );
size_t idx = GetLeastError( err, 4 );
EncodeAverages( d, a, idx );
#if defined __SSE4_1__ && !defined REFERENCE_IMPLEMENTATION
uint32 terr[2][8] = {};
#else
uint64 terr[2][8] = {};
#endif
uint16 tsel[16][8];
auto id = g_id[idx];
FindBestFit( terr, tsel, a, id, src );
return FixByteOrder( EncodeSelectors( d, terr, tsel, id ) );
}
uint64 ProcessRGB_ETC2( const uint8* src )
{
auto result = Planar( src );
uint64 d = 0;
v4i a[8];
uint err[4] = {};
PrepareAverages( a, src, err );
size_t idx = GetLeastError( err, 4 );
EncodeAverages( d, a, idx );
uint32 terr[2][8] = {};
uint16 tsel[16][8];
auto id = g_id[idx];
FindBestFit( terr, tsel, a, id, src );
return EncodeSelectors( d, terr, tsel, id, result.first, result.second );
}

View file

@ -0,0 +1,9 @@
#ifndef __PROCESSRGB_HPP__
#define __PROCESSRGB_HPP__
#include "Types.hpp"
uint64 ProcessRGB( const uint8* src );
uint64 ProcessRGB_ETC2( const uint8* src );
#endif

View file

@ -0,0 +1,109 @@
#include "Tables.hpp"
const int32 g_table[8][4] = {
{ 2, 8, -2, -8 },
{ 5, 17, -5, -17 },
{ 9, 29, -9, -29 },
{ 13, 42, -13, -42 },
{ 18, 60, -18, -60 },
{ 24, 80, -24, -80 },
{ 33, 106, -33, -106 },
{ 47, 183, -47, -183 }
};
const int64 g_table256[8][4] = {
{ 2*256, 8*256, -2*256, -8*256 },
{ 5*256, 17*256, -5*256, -17*256 },
{ 9*256, 29*256, -9*256, -29*256 },
{ 13*256, 42*256, -13*256, -42*256 },
{ 18*256, 60*256, -18*256, -60*256 },
{ 24*256, 80*256, -24*256, -80*256 },
{ 33*256, 106*256, -33*256, -106*256 },
{ 47*256, 183*256, -47*256, -183*256 }
};
const uint32 g_id[4][16] = {
{ 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 },
{ 3, 3, 2, 2, 3, 3, 2, 2, 3, 3, 2, 2, 3, 3, 2, 2 },
{ 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4 },
{ 7, 7, 6, 6, 7, 7, 6, 6, 7, 7, 6, 6, 7, 7, 6, 6 }
};
const uint32 g_avg2[16] = {
0x00,
0x11,
0x22,
0x33,
0x44,
0x55,
0x66,
0x77,
0x88,
0x99,
0xAA,
0xBB,
0xCC,
0xDD,
0xEE,
0xFF
};
const uint32 g_flags[64] = {
0x80800402, 0x80800402, 0x80800402, 0x80800402,
0x80800402, 0x80800402, 0x80800402, 0x8080E002,
0x80800402, 0x80800402, 0x8080E002, 0x8080E002,
0x80800402, 0x8080E002, 0x8080E002, 0x8080E002,
0x80000402, 0x80000402, 0x80000402, 0x80000402,
0x80000402, 0x80000402, 0x80000402, 0x8000E002,
0x80000402, 0x80000402, 0x8000E002, 0x8000E002,
0x80000402, 0x8000E002, 0x8000E002, 0x8000E002,
0x00800402, 0x00800402, 0x00800402, 0x00800402,
0x00800402, 0x00800402, 0x00800402, 0x0080E002,
0x00800402, 0x00800402, 0x0080E002, 0x0080E002,
0x00800402, 0x0080E002, 0x0080E002, 0x0080E002,
0x00000402, 0x00000402, 0x00000402, 0x00000402,
0x00000402, 0x00000402, 0x00000402, 0x0000E002,
0x00000402, 0x00000402, 0x0000E002, 0x0000E002,
0x00000402, 0x0000E002, 0x0000E002, 0x0000E002
};
#ifdef __SSE4_1__
const uint8 g_flags_AVX2[64] =
{
0x63, 0x63, 0x63, 0x63,
0x63, 0x63, 0x63, 0x7D,
0x63, 0x63, 0x7D, 0x7D,
0x63, 0x7D, 0x7D, 0x7D,
0x43, 0x43, 0x43, 0x43,
0x43, 0x43, 0x43, 0x5D,
0x43, 0x43, 0x5D, 0x5D,
0x43, 0x5D, 0x5D, 0x5D,
0x23, 0x23, 0x23, 0x23,
0x23, 0x23, 0x23, 0x3D,
0x23, 0x23, 0x3D, 0x3D,
0x23, 0x3D, 0x3D, 0x3D,
0x03, 0x03, 0x03, 0x03,
0x03, 0x03, 0x03, 0x1D,
0x03, 0x03, 0x1D, 0x1D,
0x03, 0x1D, 0x1D, 0x1D,
};
const __m128i g_table_SIMD[2] =
{
_mm_setr_epi16( 2, 5, 9, 13, 18, 24, 33, 47),
_mm_setr_epi16( 8, 17, 29, 42, 60, 80, 106, 183)
};
const __m128i g_table128_SIMD[2] =
{
_mm_setr_epi16( 2*128, 5*128, 9*128, 13*128, 18*128, 24*128, 33*128, 47*128),
_mm_setr_epi16( 8*128, 17*128, 29*128, 42*128, 60*128, 80*128, 106*128, 183*128)
};
const __m128i g_table256_SIMD[4] =
{
_mm_setr_epi32( 2*256, 5*256, 9*256, 13*256),
_mm_setr_epi32( 8*256, 17*256, 29*256, 42*256),
_mm_setr_epi32( 18*256, 24*256, 33*256, 47*256),
_mm_setr_epi32( 60*256, 80*256, 106*256, 183*256)
};
#endif

View file

@ -0,0 +1,25 @@
#ifndef __TABLES_HPP__
#define __TABLES_HPP__
#include "Types.hpp"
#ifdef __SSE4_1__
#include <smmintrin.h>
#endif
extern const int32 g_table[8][4];
extern const int64 g_table256[8][4];
extern const uint32 g_id[4][16];
extern const uint32 g_avg2[16];
extern const uint32 g_flags[64];
#ifdef __SSE4_1__
extern const uint8 g_flags_AVX2[64];
extern const __m128i g_table_SIMD[2];
extern const __m128i g_table128_SIMD[2];
extern const __m128i g_table256_SIMD[4];
#endif
#endif

View file

@ -0,0 +1,17 @@
#ifndef __DARKRL__TYPES_HPP__
#define __DARKRL__TYPES_HPP__
#include <cstdint>
typedef int8_t int8;
typedef uint8_t uint8;
typedef int16_t int16;
typedef uint16_t uint16;
typedef int32_t int32;
typedef uint32_t uint32;
typedef int64_t int64;
typedef uint64_t uint64;
typedef unsigned int uint;
#endif

View file

@ -0,0 +1,222 @@
#ifndef __DARKRL__VECTOR_HPP__
#define __DARKRL__VECTOR_HPP__
#include <assert.h>
#include <algorithm>
#include <math.h>
#include "Math.hpp"
#include "Types.hpp"
template<class T>
struct Vector2
{
Vector2() : x( 0 ), y( 0 ) {}
Vector2( T v ) : x( v ), y( v ) {}
Vector2( T _x, T _y ) : x( _x ), y( _y ) {}
bool operator==( const Vector2<T>& rhs ) const { return x == rhs.x && y == rhs.y; }
bool operator!=( const Vector2<T>& rhs ) const { return !( *this == rhs ); }
Vector2<T>& operator+=( const Vector2<T>& rhs )
{
x += rhs.x;
y += rhs.y;
return *this;
}
Vector2<T>& operator-=( const Vector2<T>& rhs )
{
x -= rhs.x;
y -= rhs.y;
return *this;
}
Vector2<T>& operator*=( const Vector2<T>& rhs )
{
x *= rhs.x;
y *= rhs.y;
return *this;
}
T x, y;
};
template<class T>
Vector2<T> operator+( const Vector2<T>& lhs, const Vector2<T>& rhs )
{
return Vector2<T>( lhs.x + rhs.x, lhs.y + rhs.y );
}
template<class T>
Vector2<T> operator-( const Vector2<T>& lhs, const Vector2<T>& rhs )
{
return Vector2<T>( lhs.x - rhs.x, lhs.y - rhs.y );
}
template<class T>
Vector2<T> operator*( const Vector2<T>& lhs, const float& rhs )
{
return Vector2<T>( lhs.x * rhs, lhs.y * rhs );
}
template<class T>
Vector2<T> operator/( const Vector2<T>& lhs, const T& rhs )
{
return Vector2<T>( lhs.x / rhs, lhs.y / rhs );
}
typedef Vector2<int32> v2i;
typedef Vector2<float> v2f;
template<class T>
struct Vector3
{
Vector3() : x( 0 ), y( 0 ), z( 0 ) {}
Vector3( T v ) : x( v ), y( v ), z( v ) {}
Vector3( T _x, T _y, T _z ) : x( _x ), y( _y ), z( _z ) {}
template<class Y>
Vector3( const Vector3<Y>& v ) : x( T( v.x ) ), y( T( v.y ) ), z( T( v.z ) ) {}
T Luminance() const { return T( x * 0.3f + y * 0.59f + z * 0.11f ); }
void Clamp()
{
x = std::min( T(1), std::max( T(0), x ) );
y = std::min( T(1), std::max( T(0), y ) );
z = std::min( T(1), std::max( T(0), z ) );
}
bool operator==( const Vector3<T>& rhs ) const { return x == rhs.x && y == rhs.y && z == rhs.z; }
bool operator!=( const Vector2<T>& rhs ) const { return !( *this == rhs ); }
T& operator[]( uint idx ) { assert( idx < 3 ); return ((T*)this)[idx]; }
const T& operator[]( uint idx ) const { assert( idx < 3 ); return ((T*)this)[idx]; }
Vector3<T> operator+=( const Vector3<T>& rhs )
{
x += rhs.x;
y += rhs.y;
z += rhs.z;
return *this;
}
Vector3<T> operator*=( const Vector3<T>& rhs )
{
x *= rhs.x;
y *= rhs.y;
z *= rhs.z;
return *this;
}
Vector3<T> operator*=( const float& rhs )
{
x *= rhs;
y *= rhs;
z *= rhs;
return *this;
}
T x, y, z;
T padding;
};
template<class T>
Vector3<T> operator+( const Vector3<T>& lhs, const Vector3<T>& rhs )
{
return Vector3<T>( lhs.x + rhs.x, lhs.y + rhs.y, lhs.z + rhs.z );
}
template<class T>
Vector3<T> operator-( const Vector3<T>& lhs, const Vector3<T>& rhs )
{
return Vector3<T>( lhs.x - rhs.x, lhs.y - rhs.y, lhs.z - rhs.z );
}
template<class T>
Vector3<T> operator*( const Vector3<T>& lhs, const Vector3<T>& rhs )
{
return Vector3<T>( lhs.x * rhs.x, lhs.y * rhs.y, lhs.z * rhs.z );
}
template<class T>
Vector3<T> operator*( const Vector3<T>& lhs, const float& rhs )
{
return Vector3<T>( T( lhs.x * rhs ), T( lhs.y * rhs ), T( lhs.z * rhs ) );
}
template<class T>
Vector3<T> operator/( const Vector3<T>& lhs, const T& rhs )
{
return Vector3<T>( lhs.x / rhs, lhs.y / rhs, lhs.z / rhs );
}
template<class T>
bool operator<( const Vector3<T>& lhs, const Vector3<T>& rhs )
{
return lhs.Luminance() < rhs.Luminance();
}
typedef Vector3<int32> v3i;
typedef Vector3<float> v3f;
typedef Vector3<uint8> v3b;
static inline v3b v3f_to_v3b( const v3f& v )
{
return v3b( uint8( std::min( 1.f, v.x ) * 255 ), uint8( std::min( 1.f, v.y ) * 255 ), uint8( std::min( 1.f, v.z ) * 255 ) );
}
template<class T>
Vector3<T> Mix( const Vector3<T>& v1, const Vector3<T>& v2, float amount )
{
return v1 + ( v2 - v1 ) * amount;
}
template<>
inline v3b Mix( const v3b& v1, const v3b& v2, float amount )
{
return v3b( v3f( v1 ) + ( v3f( v2 ) - v3f( v1 ) ) * amount );
}
template<class T>
Vector3<T> Desaturate( const Vector3<T>& v )
{
T l = v.Luminance();
return Vector3<T>( l, l, l );
}
template<class T>
Vector3<T> Desaturate( const Vector3<T>& v, float mul )
{
T l = T( v.Luminance() * mul );
return Vector3<T>( l, l, l );
}
template<class T>
Vector3<T> pow( const Vector3<T>& base, float exponent )
{
return Vector3<T>(
pow( base.x, exponent ),
pow( base.y, exponent ),
pow( base.z, exponent ) );
}
template<class T>
Vector3<T> sRGB2linear( const Vector3<T>& v )
{
return Vector3<T>(
sRGB2linear( v.x ),
sRGB2linear( v.y ),
sRGB2linear( v.z ) );
}
template<class T>
Vector3<T> linear2sRGB( const Vector3<T>& v )
{
return Vector3<T>(
linear2sRGB( v.x ),
linear2sRGB( v.y ),
linear2sRGB( v.z ) );
}
#endif