mirror of
https://github.com/ZDoom/qzdoom.git
synced 2024-12-11 05:01:09 +00:00
Moved vectorized drawers to their own files
This commit is contained in:
parent
3e7eb79729
commit
3f905197d0
5 changed files with 1212 additions and 1079 deletions
|
@ -48,10 +48,6 @@
|
|||
#endif
|
||||
#include <vector>
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning(disable: 4752) // warning C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX
|
||||
#endif
|
||||
|
||||
extern int vlinebits;
|
||||
extern int mvlinebits;
|
||||
extern int tmvlinebits;
|
||||
|
@ -62,8 +58,38 @@ extern float rw_lightstep;
|
|||
extern int wallshade;
|
||||
|
||||
CVAR(Bool, r_multithreaded, true, 0)
|
||||
CVAR(Bool, r_linearlight, false, 0)
|
||||
|
||||
//#define USE_AVX // Use AVX2 256 bit intrinsics (requires Haswell or newer)
|
||||
#ifndef NO_SSE
|
||||
|
||||
// Generate SSE drawers:
|
||||
#define VecCommand(name) name##_SSE_Command
|
||||
#define VEC_SHADE_SIMPLE_INIT SSE_SHADE_SIMPLE_INIT
|
||||
#define VEC_SHADE_SIMPLE_INIT4 SSE_SHADE_SIMPLE_INIT4
|
||||
#define VEC_SHADE_SIMPLE SSE_SHADE_SIMPLE
|
||||
#define VEC_SHADE_INIT SSE_SHADE_INIT
|
||||
#define VEC_SHADE_INIT4 SSE_SHADE_INIT4
|
||||
#define VEC_SHADE SSE_SHADE
|
||||
#include "r_draw_rgba_sse.h"
|
||||
|
||||
// Generate AVX drawers:
|
||||
#undef VecCommand
|
||||
#undef VEC_SHADE_SIMPLE_INIT
|
||||
#undef VEC_SHADE_SIMPLE_INIT4
|
||||
#undef VEC_SHADE_SIMPLE
|
||||
#undef VEC_SHADE_INIT
|
||||
#undef VEC_SHADE_INIT4
|
||||
#undef VEC_SHADE
|
||||
#define VecCommand(name) name##_AVX_Command
|
||||
#define VEC_SHADE_SIMPLE_INIT AVX_LINEAR_SHADE_SIMPLE_INIT
|
||||
#define VEC_SHADE_SIMPLE_INIT4 AVX_LINEAR_SHADE_SIMPLE_INIT4
|
||||
#define VEC_SHADE_SIMPLE AVX_LINEAR_SHADE_SIMPLE
|
||||
#define VEC_SHADE_INIT AVX_LINEAR_SHADE_INIT
|
||||
#define VEC_SHADE_INIT4 AVX_LINEAR_SHADE_INIT4
|
||||
#define VEC_SHADE AVX_LINEAR_SHADE
|
||||
#include "r_draw_rgba_sse.h"
|
||||
|
||||
#endif
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
@ -1495,7 +1521,6 @@ public:
|
|||
_shade_constants = ds_shade_constants;
|
||||
}
|
||||
|
||||
#ifdef NO_SSE
|
||||
void Execute(DrawerThread *thread) override
|
||||
{
|
||||
if (thread->line_skipped_by_thread(_y))
|
||||
|
@ -1560,401 +1585,6 @@ public:
|
|||
} while (--count);
|
||||
}
|
||||
}
|
||||
#elif defined(USE_AVX)
|
||||
void Execute(DrawerThread *thread) override
|
||||
{
|
||||
if (thread->line_skipped_by_thread(_y))
|
||||
return;
|
||||
|
||||
dsfixed_t xfrac;
|
||||
dsfixed_t yfrac;
|
||||
dsfixed_t xstep;
|
||||
dsfixed_t ystep;
|
||||
uint32_t* dest;
|
||||
const uint32_t* source = _source;
|
||||
int count;
|
||||
int spot;
|
||||
|
||||
xfrac = _xfrac;
|
||||
yfrac = _yfrac;
|
||||
|
||||
dest = ylookup[_y] + _x1 + (uint32_t*)_destorg;
|
||||
|
||||
count = _x2 - _x1 + 1;
|
||||
|
||||
xstep = _xstep;
|
||||
ystep = _ystep;
|
||||
|
||||
uint32_t light = calc_light_multiplier(_light);
|
||||
ShadeConstants shade_constants = _shade_constants;
|
||||
|
||||
if (_xbits == 6 && _ybits == 6)
|
||||
{
|
||||
// 64x64 is the most common case by far, so special case it.
|
||||
|
||||
int sse_count = count / 8;
|
||||
count -= sse_count * 8;
|
||||
|
||||
if (shade_constants.simple_shade)
|
||||
{
|
||||
AVX2_SHADE_SIMPLE_INIT(light);
|
||||
|
||||
while (sse_count--)
|
||||
{
|
||||
uint32_t fg_pixels[8];
|
||||
for (int i = 0; i < 8; i++)
|
||||
{
|
||||
// Current texture index in u,v.
|
||||
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
|
||||
fg_pixels[i] = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
}
|
||||
|
||||
// Lookup pixel from flat texture tile,
|
||||
// re-index using light/colormap.
|
||||
__m256i fg = _mm256_loadu_si256((const __m256i*)fg_pixels);
|
||||
AVX2_SHADE_SIMPLE(fg);
|
||||
_mm256_storeu_si256((__m256i*)dest, fg);
|
||||
|
||||
// Next step in u,v.
|
||||
dest += 8;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
AVX2_SHADE_INIT(light, shade_constants);
|
||||
|
||||
while (sse_count--)
|
||||
{
|
||||
uint32_t fg_pixels[8];
|
||||
for (int i = 0; i < 8; i++)
|
||||
{
|
||||
// Current texture index in u,v.
|
||||
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
|
||||
fg_pixels[i] = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
}
|
||||
|
||||
// Lookup pixel from flat texture tile,
|
||||
// re-index using light/colormap.
|
||||
__m256i fg = _mm256_loadu_si256((const __m256i*)fg_pixels);
|
||||
AVX2_SHADE(fg, shade_constants);
|
||||
_mm256_storeu_si256((__m256i*)dest, fg);
|
||||
|
||||
// Next step in u,v.
|
||||
dest += 8;
|
||||
}
|
||||
}
|
||||
|
||||
if (count == 0)
|
||||
return;
|
||||
|
||||
do
|
||||
{
|
||||
// Current texture index in u,v.
|
||||
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
|
||||
|
||||
// Lookup pixel from flat texture tile
|
||||
*dest++ = shade_bgra(source[spot], light, shade_constants);
|
||||
|
||||
// Next step in u,v.
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
} while (--count);
|
||||
}
|
||||
else
|
||||
{
|
||||
BYTE yshift = 32 - _ybits;
|
||||
BYTE xshift = yshift - _xbits;
|
||||
int xmask = ((1 << _xbits) - 1) << _ybits;
|
||||
|
||||
int sse_count = count / 8;
|
||||
count -= sse_count * 8;
|
||||
|
||||
if (shade_constants.simple_shade)
|
||||
{
|
||||
AVX2_SHADE_SIMPLE_INIT(light);
|
||||
|
||||
while (sse_count--)
|
||||
{
|
||||
uint32_t fg_pixels[8];
|
||||
for (int i = 0; i < 8; i++)
|
||||
{
|
||||
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
|
||||
fg_pixels[i] = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
}
|
||||
|
||||
// Lookup pixel from flat texture tile
|
||||
__m256i fg = _mm256_loadu_si256((const __m256i*)fg_pixels);
|
||||
AVX2_SHADE_SIMPLE(fg);
|
||||
_mm256_storeu_si256((__m256i*)dest, fg);
|
||||
dest += 8;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
AVX2_SHADE_INIT(light, shade_constants);
|
||||
|
||||
while (sse_count--)
|
||||
{
|
||||
uint32_t fg_pixels[8];
|
||||
for (int i = 0; i < 8; i++)
|
||||
{
|
||||
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
|
||||
fg_pixels[i] = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
}
|
||||
|
||||
// Lookup pixel from flat texture tile
|
||||
__m256i fg = _mm256_loadu_si256((const __m256i*)fg_pixels);
|
||||
AVX2_SHADE_SIMPLE(fg);
|
||||
_mm256_storeu_si256((__m256i*)dest, fg);
|
||||
dest += 4;
|
||||
}
|
||||
}
|
||||
|
||||
if (count == 0)
|
||||
return;
|
||||
|
||||
do
|
||||
{
|
||||
// Current texture index in u,v.
|
||||
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
|
||||
|
||||
// Lookup pixel from flat texture tile
|
||||
*dest++ = shade_bgra(source[spot], light, shade_constants);
|
||||
|
||||
// Next step in u,v.
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
} while (--count);
|
||||
}
|
||||
}
|
||||
#else
|
||||
void Execute(DrawerThread *thread) override
|
||||
{
|
||||
if (thread->line_skipped_by_thread(_y))
|
||||
return;
|
||||
|
||||
dsfixed_t xfrac;
|
||||
dsfixed_t yfrac;
|
||||
dsfixed_t xstep;
|
||||
dsfixed_t ystep;
|
||||
uint32_t* dest;
|
||||
const uint32_t* source = _source;
|
||||
int count;
|
||||
int spot;
|
||||
|
||||
xfrac = _xfrac;
|
||||
yfrac = _yfrac;
|
||||
|
||||
dest = ylookup[_y] + _x1 + (uint32_t*)_destorg;
|
||||
|
||||
count = _x2 - _x1 + 1;
|
||||
|
||||
xstep = _xstep;
|
||||
ystep = _ystep;
|
||||
|
||||
uint32_t light = calc_light_multiplier(_light);
|
||||
ShadeConstants shade_constants = _shade_constants;
|
||||
|
||||
if (_xbits == 6 && _ybits == 6)
|
||||
{
|
||||
// 64x64 is the most common case by far, so special case it.
|
||||
|
||||
int sse_count = count / 4;
|
||||
count -= sse_count * 4;
|
||||
|
||||
if (shade_constants.simple_shade)
|
||||
{
|
||||
SSE_SHADE_SIMPLE_INIT(light);
|
||||
|
||||
while (sse_count--)
|
||||
{
|
||||
// Current texture index in u,v.
|
||||
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
|
||||
uint32_t p0 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
|
||||
uint32_t p1 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
|
||||
uint32_t p2 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
|
||||
uint32_t p3 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
// Lookup pixel from flat texture tile,
|
||||
// re-index using light/colormap.
|
||||
__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
|
||||
SSE_SHADE_SIMPLE(fg);
|
||||
_mm_storeu_si128((__m128i*)dest, fg);
|
||||
|
||||
// Next step in u,v.
|
||||
dest += 4;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
SSE_SHADE_INIT(light, shade_constants);
|
||||
|
||||
while (sse_count--)
|
||||
{
|
||||
// Current texture index in u,v.
|
||||
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
|
||||
uint32_t p0 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
|
||||
uint32_t p1 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
|
||||
uint32_t p2 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
|
||||
uint32_t p3 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
// Lookup pixel from flat texture tile,
|
||||
// re-index using light/colormap.
|
||||
__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
|
||||
SSE_SHADE(fg, shade_constants);
|
||||
_mm_storeu_si128((__m128i*)dest, fg);
|
||||
|
||||
// Next step in u,v.
|
||||
dest += 4;
|
||||
}
|
||||
}
|
||||
|
||||
if (count == 0)
|
||||
return;
|
||||
|
||||
do
|
||||
{
|
||||
// Current texture index in u,v.
|
||||
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
|
||||
|
||||
// Lookup pixel from flat texture tile
|
||||
*dest++ = shade_bgra(source[spot], light, shade_constants);
|
||||
|
||||
// Next step in u,v.
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
} while (--count);
|
||||
}
|
||||
else
|
||||
{
|
||||
BYTE yshift = 32 - _ybits;
|
||||
BYTE xshift = yshift - _xbits;
|
||||
int xmask = ((1 << _xbits) - 1) << _ybits;
|
||||
|
||||
int sse_count = count / 4;
|
||||
count -= sse_count * 4;
|
||||
|
||||
if (shade_constants.simple_shade)
|
||||
{
|
||||
SSE_SHADE_SIMPLE_INIT(light);
|
||||
|
||||
while (sse_count--)
|
||||
{
|
||||
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
|
||||
uint32_t p0 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
|
||||
uint32_t p1 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
|
||||
uint32_t p2 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
|
||||
uint32_t p3 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
// Lookup pixel from flat texture tile
|
||||
__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
|
||||
SSE_SHADE_SIMPLE(fg);
|
||||
_mm_storeu_si128((__m128i*)dest, fg);
|
||||
dest += 4;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
SSE_SHADE_INIT(light, shade_constants);
|
||||
|
||||
while (sse_count--)
|
||||
{
|
||||
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
|
||||
uint32_t p0 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
|
||||
uint32_t p1 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
|
||||
uint32_t p2 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
|
||||
uint32_t p3 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
// Lookup pixel from flat texture tile
|
||||
__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
|
||||
SSE_SHADE(fg, shade_constants);
|
||||
_mm_storeu_si128((__m128i*)dest, fg);
|
||||
dest += 4;
|
||||
}
|
||||
}
|
||||
|
||||
if (count == 0)
|
||||
return;
|
||||
|
||||
do
|
||||
{
|
||||
// Current texture index in u,v.
|
||||
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
|
||||
|
||||
// Lookup pixel from flat texture tile
|
||||
*dest++ = shade_bgra(source[spot], light, shade_constants);
|
||||
|
||||
// Next step in u,v.
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
} while (--count);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
class DrawSpanMaskedRGBACommand : public DrawerCommand
|
||||
|
@ -2698,7 +2328,6 @@ public:
|
|||
}
|
||||
}
|
||||
|
||||
#ifdef NO_SSE
|
||||
void Execute(DrawerThread *thread) override
|
||||
{
|
||||
int count = thread->count_for_thread(_dest_y, _count);
|
||||
|
@ -2735,165 +2364,6 @@ public:
|
|||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
#elif defined(USE_AVX)
|
||||
void Execute(DrawerThread *thread) override
|
||||
{
|
||||
int count = thread->count_for_thread(_dest_y, _count);
|
||||
if (count <= 0)
|
||||
return;
|
||||
|
||||
uint32_t *dest = thread->dest_for_thread(_dest_y, _pitch, (uint32_t*)_dest);
|
||||
int bits = vlinebits;
|
||||
int pitch = _pitch * thread->num_cores;
|
||||
|
||||
uint32_t light0 = calc_light_multiplier(palookuplight[0]);
|
||||
uint32_t light1 = calc_light_multiplier(palookuplight[1]);
|
||||
uint32_t light2 = calc_light_multiplier(palookuplight[2]);
|
||||
uint32_t light3 = calc_light_multiplier(palookuplight[3]);
|
||||
|
||||
ShadeConstants shade_constants = _shade_constants;
|
||||
|
||||
DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] };
|
||||
DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] };
|
||||
int skipped = thread->skipped_by_thread(_dest_y);
|
||||
for (int i = 0; i < 4; i++)
|
||||
{
|
||||
local_vplce[i] += local_vince[i] * skipped;
|
||||
local_vince[i] *= thread->num_cores;
|
||||
}
|
||||
|
||||
if (count & 1)
|
||||
{
|
||||
DWORD place;
|
||||
dest[0] = shade_bgra(bufplce[0][(place = local_vplce[0]) >> bits], light0, shade_constants); local_vplce[0] = place + local_vince[0];
|
||||
dest[1] = shade_bgra(bufplce[1][(place = local_vplce[1]) >> bits], light1, shade_constants); local_vplce[1] = place + local_vince[1];
|
||||
dest[2] = shade_bgra(bufplce[2][(place = local_vplce[2]) >> bits], light2, shade_constants); local_vplce[2] = place + local_vince[2];
|
||||
dest[3] = shade_bgra(bufplce[3][(place = local_vplce[3]) >> bits], light3, shade_constants); local_vplce[3] = place + local_vince[3];
|
||||
dest += pitch;
|
||||
}
|
||||
count /= 2;
|
||||
|
||||
// Assume all columns come from the same texture (which they do):
|
||||
const uint32_t *base_addr = MIN(MIN(MIN(bufplce[0], bufplce[1]), bufplce[2]), bufplce[3]);
|
||||
__m256i column_offsets = _mm256_set_epi32(
|
||||
bufplce[3] - base_addr, bufplce[2] - base_addr, bufplce[1] - base_addr, bufplce[0] - base_addr,
|
||||
bufplce[3] - base_addr, bufplce[2] - base_addr, bufplce[1] - base_addr, bufplce[0] - base_addr);
|
||||
|
||||
__m256i place = _mm256_set_epi32(
|
||||
local_vplce[3] + local_vince[3], local_vplce[2] + local_vince[2], local_vplce[1] + local_vince[1], local_vplce[0] + local_vince[0],
|
||||
local_vplce[3], local_vplce[2], local_vplce[1], local_vplce[0]);
|
||||
|
||||
__m256i step = _mm256_set_epi32(
|
||||
local_vince[3], local_vince[2], local_vince[1], local_vince[0],
|
||||
local_vince[3], local_vince[2], local_vince[1], local_vince[0]);
|
||||
step = _mm256_add_epi32(step, step);
|
||||
|
||||
if (shade_constants.simple_shade)
|
||||
{
|
||||
AVX2_SHADE_SIMPLE_INIT4(light3, light2, light1, light0);
|
||||
while (count--)
|
||||
{
|
||||
__m256i fg = _mm256_i32gather_epi32((const int *)base_addr, _mm256_add_epi32(column_offsets, _mm256_srli_epi32(place, bits)), 4);
|
||||
place = _mm256_add_epi32(place, step);
|
||||
AVX2_SHADE_SIMPLE(fg);
|
||||
_mm256_storeu2_m128i((__m128i*)(dest + pitch), (__m128i*)dest, fg);
|
||||
dest += pitch * 2;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
AVX2_SHADE_INIT4(light3, light2, light1, light0, shade_constants);
|
||||
while (count--)
|
||||
{
|
||||
__m256i fg = _mm256_i32gather_epi32((const int *)base_addr, _mm256_add_epi32(column_offsets, _mm256_srai_epi32(place, bits)), 4);
|
||||
place = _mm256_add_epi32(place, step);
|
||||
AVX2_SHADE(fg, shade_constants);
|
||||
_mm256_storeu2_m128i((__m128i*)(dest + pitch), (__m128i*)dest, fg);
|
||||
dest += pitch * 2;
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
void Execute(DrawerThread *thread) override
|
||||
{
|
||||
int count = thread->count_for_thread(_dest_y, _count);
|
||||
if (count <= 0)
|
||||
return;
|
||||
|
||||
uint32_t *dest = thread->dest_for_thread(_dest_y, _pitch, (uint32_t*)_dest);
|
||||
int bits = vlinebits;
|
||||
int pitch = _pitch * thread->num_cores;
|
||||
|
||||
uint32_t light0 = calc_light_multiplier(palookuplight[0]);
|
||||
uint32_t light1 = calc_light_multiplier(palookuplight[1]);
|
||||
uint32_t light2 = calc_light_multiplier(palookuplight[2]);
|
||||
uint32_t light3 = calc_light_multiplier(palookuplight[3]);
|
||||
|
||||
ShadeConstants shade_constants = _shade_constants;
|
||||
|
||||
DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] };
|
||||
DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] };
|
||||
int skipped = thread->skipped_by_thread(_dest_y);
|
||||
for (int i = 0; i < 4; i++)
|
||||
{
|
||||
local_vplce[i] += local_vince[i] * skipped;
|
||||
local_vince[i] *= thread->num_cores;
|
||||
}
|
||||
|
||||
if (shade_constants.simple_shade)
|
||||
{
|
||||
SSE_SHADE_SIMPLE_INIT4(light3, light2, light1, light0);
|
||||
do
|
||||
{
|
||||
DWORD place0 = local_vplce[0];
|
||||
DWORD place1 = local_vplce[1];
|
||||
DWORD place2 = local_vplce[2];
|
||||
DWORD place3 = local_vplce[3];
|
||||
|
||||
uint32_t p0 = bufplce[0][place0 >> bits];
|
||||
uint32_t p1 = bufplce[1][place1 >> bits];
|
||||
uint32_t p2 = bufplce[2][place2 >> bits];
|
||||
uint32_t p3 = bufplce[3][place3 >> bits];
|
||||
|
||||
local_vplce[0] = place0 + local_vince[0];
|
||||
local_vplce[1] = place1 + local_vince[1];
|
||||
local_vplce[2] = place2 + local_vince[2];
|
||||
local_vplce[3] = place3 + local_vince[3];
|
||||
|
||||
__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
|
||||
SSE_SHADE_SIMPLE(fg);
|
||||
_mm_storeu_si128((__m128i*)dest, fg);
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
else
|
||||
{
|
||||
SSE_SHADE_INIT4(light3, light2, light1, light0, shade_constants);
|
||||
do
|
||||
{
|
||||
DWORD place0 = local_vplce[0];
|
||||
DWORD place1 = local_vplce[1];
|
||||
DWORD place2 = local_vplce[2];
|
||||
DWORD place3 = local_vplce[3];
|
||||
|
||||
uint32_t p0 = bufplce[0][place0 >> bits];
|
||||
uint32_t p1 = bufplce[1][place1 >> bits];
|
||||
uint32_t p2 = bufplce[2][place2 >> bits];
|
||||
uint32_t p3 = bufplce[3][place3 >> bits];
|
||||
|
||||
local_vplce[0] = place0 + local_vince[0];
|
||||
local_vplce[1] = place1 + local_vince[1];
|
||||
local_vplce[2] = place2 + local_vince[2];
|
||||
local_vplce[3] = place3 + local_vince[3];
|
||||
|
||||
__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
|
||||
SSE_SHADE(fg, shade_constants);
|
||||
_mm_storeu_si128((__m128i*)dest, fg);
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
class Mvlinec1RGBACommand : public DrawerCommand
|
||||
|
@ -2980,7 +2450,6 @@ public:
|
|||
}
|
||||
}
|
||||
|
||||
#ifdef NO_SSE
|
||||
void Execute(DrawerThread *thread) override
|
||||
{
|
||||
int count = thread->count_for_thread(_dest_y, _count);
|
||||
|
@ -3018,93 +2487,6 @@ public:
|
|||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
#else
|
||||
void Execute(DrawerThread *thread) override
|
||||
{
|
||||
int count = thread->count_for_thread(_dest_y, _count);
|
||||
if (count <= 0)
|
||||
return;
|
||||
|
||||
uint32_t *dest = thread->dest_for_thread(_dest_y, _pitch, (uint32_t*)_dest);
|
||||
int pitch = _pitch * thread->num_cores;
|
||||
int bits = mvlinebits;
|
||||
|
||||
uint32_t light0 = calc_light_multiplier(palookuplight[0]);
|
||||
uint32_t light1 = calc_light_multiplier(palookuplight[1]);
|
||||
uint32_t light2 = calc_light_multiplier(palookuplight[2]);
|
||||
uint32_t light3 = calc_light_multiplier(palookuplight[3]);
|
||||
|
||||
ShadeConstants shade_constants = _shade_constants;
|
||||
|
||||
DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] };
|
||||
DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] };
|
||||
int skipped = thread->skipped_by_thread(_dest_y);
|
||||
for (int i = 0; i < 4; i++)
|
||||
{
|
||||
local_vplce[i] += local_vince[i] * skipped;
|
||||
local_vince[i] *= thread->num_cores;
|
||||
}
|
||||
|
||||
if (shade_constants.simple_shade)
|
||||
{
|
||||
SSE_SHADE_SIMPLE_INIT4(light3, light2, light1, light0);
|
||||
do
|
||||
{
|
||||
DWORD place0 = local_vplce[0];
|
||||
DWORD place1 = local_vplce[1];
|
||||
DWORD place2 = local_vplce[2];
|
||||
DWORD place3 = local_vplce[3];
|
||||
|
||||
uint32_t pix0 = bufplce[0][place0 >> bits];
|
||||
uint32_t pix1 = bufplce[1][place1 >> bits];
|
||||
uint32_t pix2 = bufplce[2][place2 >> bits];
|
||||
uint32_t pix3 = bufplce[3][place3 >> bits];
|
||||
|
||||
// movemask = !(pix == 0)
|
||||
__m128i movemask = _mm_xor_si128(_mm_cmpeq_epi32(_mm_set_epi32(pix3, pix2, pix1, pix0), _mm_setzero_si128()), _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));
|
||||
|
||||
local_vplce[0] = place0 + local_vince[0];
|
||||
local_vplce[1] = place1 + local_vince[1];
|
||||
local_vplce[2] = place2 + local_vince[2];
|
||||
local_vplce[3] = place3 + local_vince[3];
|
||||
|
||||
__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
|
||||
SSE_SHADE_SIMPLE(fg);
|
||||
_mm_maskmoveu_si128(fg, movemask, (char*)dest);
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
else
|
||||
{
|
||||
SSE_SHADE_INIT4(light3, light2, light1, light0, shade_constants);
|
||||
do
|
||||
{
|
||||
DWORD place0 = local_vplce[0];
|
||||
DWORD place1 = local_vplce[1];
|
||||
DWORD place2 = local_vplce[2];
|
||||
DWORD place3 = local_vplce[3];
|
||||
|
||||
uint32_t pix0 = bufplce[0][place0 >> bits];
|
||||
uint32_t pix1 = bufplce[1][place1 >> bits];
|
||||
uint32_t pix2 = bufplce[2][place2 >> bits];
|
||||
uint32_t pix3 = bufplce[3][place3 >> bits];
|
||||
|
||||
// movemask = !(pix == 0)
|
||||
__m128i movemask = _mm_xor_si128(_mm_cmpeq_epi32(_mm_set_epi32(pix3, pix2, pix1, pix0), _mm_setzero_si128()), _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));
|
||||
|
||||
local_vplce[0] = place0 + local_vince[0];
|
||||
local_vplce[1] = place1 + local_vince[1];
|
||||
local_vplce[2] = place2 + local_vince[2];
|
||||
local_vplce[3] = place3 + local_vince[3];
|
||||
|
||||
__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
|
||||
SSE_SHADE(fg, shade_constants);
|
||||
_mm_maskmoveu_si128(fg, movemask, (char*)dest);
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
class Tmvline1AddRGBACommand : public DrawerCommand
|
||||
|
@ -4254,7 +3636,14 @@ void R_DrawRevSubClampTranslatedColumn_rgba()
|
|||
|
||||
void R_DrawSpan_rgba()
|
||||
{
|
||||
#ifdef NO_SSE
|
||||
DrawerCommandQueue::QueueCommand<DrawSpanRGBACommand>();
|
||||
#else
|
||||
if (!r_linearlight)
|
||||
DrawerCommandQueue::QueueCommand<DrawSpanRGBA_SSE_Command>();
|
||||
else
|
||||
DrawerCommandQueue::QueueCommand<DrawSpanRGBA_AVX_Command>();
|
||||
#endif
|
||||
}
|
||||
|
||||
void R_DrawSpanMasked_rgba()
|
||||
|
@ -4304,7 +3693,14 @@ DWORD vlinec1_rgba()
|
|||
|
||||
void vlinec4_rgba()
|
||||
{
|
||||
#ifdef NO_SSE
|
||||
DrawerCommandQueue::QueueCommand<Vlinec4RGBACommand>();
|
||||
#else
|
||||
if (!r_linearlight)
|
||||
DrawerCommandQueue::QueueCommand<Vlinec4RGBA_SSE_Command>();
|
||||
else
|
||||
DrawerCommandQueue::QueueCommand<Vlinec4RGBA_AVX_Command>();
|
||||
#endif
|
||||
for (int i = 0; i < 4; i++)
|
||||
vplce[i] += vince[i] * dc_count;
|
||||
}
|
||||
|
@ -4317,7 +3713,14 @@ DWORD mvlinec1_rgba()
|
|||
|
||||
void mvlinec4_rgba()
|
||||
{
|
||||
#ifdef NO_SSE
|
||||
DrawerCommandQueue::QueueCommand<Mvlinec4RGBACommand>();
|
||||
#else
|
||||
if (!r_linearlight)
|
||||
DrawerCommandQueue::QueueCommand<Mvlinec4RGBA_SSE_Command>();
|
||||
else
|
||||
DrawerCommandQueue::QueueCommand<Mvlinec4RGBA_AVX_Command>();
|
||||
#endif
|
||||
for (int i = 0; i < 4; i++)
|
||||
vplce[i] += vince[i] * dc_count;
|
||||
}
|
||||
|
|
|
@ -424,59 +424,124 @@ FORCEINLINE uint32_t alpha_blend(uint32_t fg, uint32_t bg)
|
|||
return 0xff000000 | (red << 16) | (green << 8) | blue;
|
||||
}
|
||||
|
||||
// Calculate constants for a simple shade
|
||||
#define AVX2_SHADE_SIMPLE_INIT(light) \
|
||||
__m256i mlight = _mm256_set_epi16(256, light, light, light, 256, light, light, light, 256, light, light, light, 256, light, light, light);
|
||||
// Calculate constants for a simple shade with gamma correction
|
||||
#define AVX_LINEAR_SHADE_SIMPLE_INIT(light) \
|
||||
__m256 mlight_hi = _mm256_set_ps(1.0f, light * (1.0f/256.0f), light * (1.0f/256.0f), light * (1.0f/256.0f), 1.0f, light * (1.0f/256.0f), light * (1.0f/256.0f), light * (1.0f/256.0f)); \
|
||||
mlight_hi = _mm256_mul_ps(mlight_hi, mlight_hi); \
|
||||
__m256 mlight_lo = mlight_hi; \
|
||||
__m256 mrcp_255 = _mm256_set1_ps(1.0f/255.0f); \
|
||||
__m256 m255 = _mm256_set1_ps(255.0f);
|
||||
|
||||
// Calculate constants for a simple shade with different light levels for each pixel
|
||||
#define AVX2_SHADE_SIMPLE_INIT4(light3, light2, light1, light0) \
|
||||
__m256i mlight = _mm256_set_epi16(256, light3, light3, light3, 256, light2, light2, light2, 256, light1, light1, light1, 256, light0, light0, light0);
|
||||
// Calculate constants for a simple shade with different light levels for each pixel and gamma correction
|
||||
#define AVX_LINEAR_SHADE_SIMPLE_INIT4(light3, light2, light1, light0) \
|
||||
__m256 mlight_hi = _mm256_set_ps(1.0f, light1 * (1.0f/256.0f), light1 * (1.0f/256.0f), light1 * (1.0f/256.0f), 1.0f, light0 * (1.0f/256.0f), light0 * (1.0f/256.0f), light0 * (1.0f/256.0f)); \
|
||||
__m256 mlight_lo = _mm256_set_ps(1.0f, light3 * (1.0f/256.0f), light3 * (1.0f/256.0f), light3 * (1.0f/256.0f), 1.0f, light2 * (1.0f/256.0f), light2 * (1.0f/256.0f), light2 * (1.0f/256.0f)); \
|
||||
mlight_hi = _mm256_mul_ps(mlight_hi, mlight_hi); \
|
||||
mlight_lo = _mm256_mul_ps(mlight_lo, mlight_lo); \
|
||||
__m256 mrcp_255 = _mm256_set1_ps(1.0f/255.0f); \
|
||||
__m256 m255 = _mm256_set1_ps(255.0f);
|
||||
|
||||
// Simple shade 8 pixels
|
||||
#define AVX2_SHADE_SIMPLE(fg) { \
|
||||
__m256i fg_hi = _mm256_unpackhi_epi8(fg, _mm256_setzero_si256()); \
|
||||
__m256i fg_lo = _mm256_unpacklo_epi8(fg, _mm256_setzero_si256()); \
|
||||
fg_hi = _mm256_mullo_epi16(fg_hi, mlight); \
|
||||
fg_hi = _mm256_srli_epi16(fg_hi, 8); \
|
||||
fg_lo = _mm256_mullo_epi16(fg_lo, mlight); \
|
||||
fg_lo = _mm256_srli_epi16(fg_lo, 8); \
|
||||
fg = _mm256_packus_epi16(fg_lo, fg_hi); \
|
||||
// Simple shade 4 pixels with gamma correction
|
||||
#define AVX_LINEAR_SHADE_SIMPLE(fg) { \
|
||||
__m256i fg_16 = _mm256_set_m128i(_mm_unpackhi_epi8(fg, _mm_setzero_si128()), _mm_unpacklo_epi8(fg, _mm_setzero_si128())); \
|
||||
__m256 fg_hi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(fg_16, _mm256_setzero_si256())); \
|
||||
__m256 fg_lo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi16(fg_16, _mm256_setzero_si256())); \
|
||||
fg_hi = _mm256_mul_ps(fg_hi, mrcp_255); \
|
||||
fg_hi = _mm256_mul_ps(fg_hi, fg_hi); \
|
||||
fg_hi = _mm256_mul_ps(fg_hi, mlight_hi); \
|
||||
fg_hi = _mm256_sqrt_ps(fg_hi); \
|
||||
fg_hi = _mm256_mul_ps(fg_hi, m255); \
|
||||
fg_lo = _mm256_mul_ps(fg_lo, mrcp_255); \
|
||||
fg_lo = _mm256_mul_ps(fg_lo, fg_lo); \
|
||||
fg_lo = _mm256_mul_ps(fg_lo, mlight_lo); \
|
||||
fg_lo = _mm256_sqrt_ps(fg_lo); \
|
||||
fg_lo = _mm256_mul_ps(fg_lo, m255); \
|
||||
fg_16 = _mm256_packus_epi32(_mm256_cvtps_epi32(fg_lo), _mm256_cvtps_epi32(fg_hi)); \
|
||||
fg = _mm_packus_epi16(_mm256_extractf128_si256(fg_16, 0), _mm256_extractf128_si256(fg_16, 1)); \
|
||||
}
|
||||
|
||||
// Calculate constants for a complex shade
|
||||
#define AVX2_SHADE_INIT(light, shade_constants) \
|
||||
__m256i mlight = _mm256_set_epi16(256, light, light, light, 256, light, light, light, 256, light, light, light, 256, light, light, light); \
|
||||
__m256i color = _mm256_set_epi16( \
|
||||
shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, \
|
||||
shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, \
|
||||
shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, \
|
||||
shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); \
|
||||
__m256i fade = _mm256_set_epi16( \
|
||||
shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, \
|
||||
shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, \
|
||||
shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, \
|
||||
shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); \
|
||||
__m256i fade_amount = _mm256_mullo_epi16(fade, _mm256_subs_epu16(_mm256_set1_epi16(256), mlight)); \
|
||||
__m256i desaturate = _mm256_set1_epi16(shade_constants.desaturate); \
|
||||
__m256i inv_desaturate = _mm256_set1_epi16(256 - shade_constants.desaturate);
|
||||
// Calculate constants for a complex shade with gamma correction
|
||||
#define AVX_LINEAR_SHADE_INIT(light, shade_constants) \
|
||||
__m256 mlight_hi = _mm256_set_ps(1.0f, light * (1.0f/256.0f), light * (1.0f/256.0f), light * (1.0f/256.0f), 1.0f, light * (1.0f/256.0f), light * (1.0f/256.0f), light * (1.0f/256.0f)); \
|
||||
mlight_hi = _mm256_mul_ps(mlight_hi, mlight_hi); \
|
||||
__m256 mlight_lo = mlight_hi; \
|
||||
__m256 mrcp_255 = _mm256_set1_ps(1.0f/255.0f); \
|
||||
__m256 m255 = _mm256_set1_ps(255.0f); \
|
||||
__m256 color = _mm256_set_ps( \
|
||||
shade_constants.light_alpha * (1.0f/256.0f), shade_constants.light_red * (1.0f/256.0f), shade_constants.light_green * (1.0f/256.0f), shade_constants.light_blue * (1.0f/256.0f), \
|
||||
shade_constants.light_alpha * (1.0f/256.0f), shade_constants.light_red * (1.0f/256.0f), shade_constants.light_green * (1.0f/256.0f), shade_constants.light_blue * (1.0f/256.0f)); \
|
||||
__m256 fade = _mm256_set_ps( \
|
||||
shade_constants.fade_alpha * (1.0f/256.0f), shade_constants.fade_red * (1.0f/256.0f), shade_constants.fade_green * (1.0f/256.0f), shade_constants.fade_blue * (1.0f/256.0f), \
|
||||
shade_constants.fade_alpha * (1.0f/256.0f), shade_constants.fade_red * (1.0f/256.0f), shade_constants.fade_green * (1.0f/256.0f), shade_constants.fade_blue * (1.0f/256.0f)); \
|
||||
__m256 fade_amount_hi = _mm256_mul_ps(fade, _mm256_sub_ps(_mm256_set1_ps(1.0f), mlight_hi)); \
|
||||
__m256 fade_amount_lo = _mm256_mul_ps(fade, _mm256_sub_ps(_mm256_set1_ps(1.0f), mlight_lo)); \
|
||||
__m256 inv_desaturate = _mm256_set1_ps((256 - shade_constants.desaturate) * (1.0f/256.0f)); \
|
||||
__m128 ss_desaturate = _mm_set_ss(shade_constants.desaturate * (1.0f/256.0f)); \
|
||||
__m128 intensity_weight = _mm_set_ps(0.0f, 77.0f/256.0f, 143.0f/256.0f, 37.0f/256.0f);
|
||||
|
||||
// Calculate constants for a complex shade with different light levels for each pixel
|
||||
#define AVX2_SHADE_INIT4(light3, light2, light1, light0, shade_constants) \
|
||||
__m256i mlight = _mm256_set_epi16(256, light3, light3, light3, 256, light2, light2, light2, 256, light1, light1, light1, 256, light0, light0, light0); \
|
||||
__m256i color = _mm256_set_epi16( \
|
||||
shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, \
|
||||
shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, \
|
||||
shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, \
|
||||
shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); \
|
||||
__m256i fade = _mm256_set_epi16( \
|
||||
shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, \
|
||||
shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, \
|
||||
shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, \
|
||||
shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); \
|
||||
__m256i fade_amount = _mm256_mullo_epi16(fade, _mm256_subs_epu16(_mm256_set1_epi16(256), mlight)); \
|
||||
__m256i desaturate = _mm256_set1_epi16(shade_constants.desaturate); \
|
||||
__m256i inv_desaturate = _mm256_set1_epi16(256 - shade_constants.desaturate);
|
||||
// Calculate constants for a complex shade with different light levels for each pixel and gamma correction
|
||||
#define AVX_LINEAR_SHADE_INIT4(light3, light2, light1, light0, shade_constants) \
|
||||
__m256 mlight_hi = _mm256_set_ps(1.0f, light1 * (1.0f/256.0f), light1 * (1.0f/256.0f), light1 * (1.0f/256.0f), 1.0f, light0 * (1.0f/256.0f), light0 * (1.0f/256.0f), light0 * (1.0f/256.0f)); \
|
||||
__m256 mlight_lo = _mm256_set_ps(1.0f, light3 * (1.0f/256.0f), light3 * (1.0f/256.0f), light3 * (1.0f/256.0f), 1.0f, light2 * (1.0f/256.0f), light2 * (1.0f/256.0f), light2 * (1.0f/256.0f)); \
|
||||
mlight_hi = _mm256_mul_ps(mlight_hi, mlight_hi); \
|
||||
mlight_lo = _mm256_mul_ps(mlight_lo, mlight_lo); \
|
||||
__m256 mrcp_255 = _mm256_set1_ps(1.0f/255.0f); \
|
||||
__m256 m255 = _mm256_set1_ps(255.0f); \
|
||||
__m256 color = _mm256_set_ps( \
|
||||
shade_constants.light_alpha * (1.0f/256.0f), shade_constants.light_red * (1.0f/256.0f), shade_constants.light_green * (1.0f/256.0f), shade_constants.light_blue * (1.0f/256.0f), \
|
||||
shade_constants.light_alpha * (1.0f/256.0f), shade_constants.light_red * (1.0f/256.0f), shade_constants.light_green * (1.0f/256.0f), shade_constants.light_blue * (1.0f/256.0f)); \
|
||||
__m256 fade = _mm256_set_ps( \
|
||||
shade_constants.fade_alpha * (1.0f/256.0f), shade_constants.fade_red * (1.0f/256.0f), shade_constants.fade_green * (1.0f/256.0f), shade_constants.fade_blue * (1.0f/256.0f), \
|
||||
shade_constants.fade_alpha * (1.0f/256.0f), shade_constants.fade_red * (1.0f/256.0f), shade_constants.fade_green * (1.0f/256.0f), shade_constants.fade_blue * (1.0f/256.0f)); \
|
||||
__m256 fade_amount_hi = _mm256_mul_ps(fade, _mm256_sub_ps(_mm256_set1_ps(1.0f), mlight_hi)); \
|
||||
__m256 fade_amount_lo = _mm256_mul_ps(fade, _mm256_sub_ps(_mm256_set1_ps(1.0f), mlight_lo)); \
|
||||
__m256 inv_desaturate = _mm256_set1_ps((256 - shade_constants.desaturate) * (1.0f/256.0f)); \
|
||||
__m128 ss_desaturate = _mm_set_ss(shade_constants.desaturate * (1.0f/256.0f)); \
|
||||
__m128 intensity_weight = _mm_set_ps(0.0f, 77.0f/256.0f, 143.0f/256.0f, 37.0f/256.0f);
|
||||
|
||||
// Complex shade 4 pixels with gamma correction
|
||||
#define AVX_LINEAR_SHADE(fg, shade_constants) { \
|
||||
__m256i fg_16 = _mm256_set_m128i(_mm_unpackhi_epi8(fg, _mm_setzero_si128()), _mm_unpacklo_epi8(fg, _mm_setzero_si128())); \
|
||||
__m256 fg_hi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(fg_16, _mm256_setzero_si256())); \
|
||||
__m256 fg_lo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi16(fg_16, _mm256_setzero_si256())); \
|
||||
fg_hi = _mm256_mul_ps(fg_hi, mrcp_255); \
|
||||
fg_hi = _mm256_mul_ps(fg_hi, fg_hi); \
|
||||
fg_lo = _mm256_mul_ps(fg_lo, mrcp_255); \
|
||||
fg_lo = _mm256_mul_ps(fg_lo, fg_lo); \
|
||||
\
|
||||
__m128 intensity_hi0 = _mm_mul_ps(_mm256_extractf128_ps(fg_hi, 0), intensity_weight); \
|
||||
__m128 intensity_hi1 = _mm_mul_ps(_mm256_extractf128_ps(fg_hi, 1), intensity_weight); \
|
||||
intensity_hi0 = _mm_mul_ss(_mm_add_ss(_mm_add_ss(intensity_hi0, _mm_shuffle_ps(intensity_hi0, intensity_hi0, _MM_SHUFFLE(1,1,1,1))), _mm_shuffle_ps(intensity_hi0, intensity_hi0, _MM_SHUFFLE(2,2,2,2))), ss_desaturate); \
|
||||
intensity_hi0 = _mm_shuffle_ps(intensity_hi0, intensity_hi0, _MM_SHUFFLE(0,0,0,0)); \
|
||||
intensity_hi1 = _mm_mul_ss(_mm_add_ss(_mm_add_ss(intensity_hi1, _mm_shuffle_ps(intensity_hi1, intensity_hi1, _MM_SHUFFLE(1,1,1,1))), _mm_shuffle_ps(intensity_hi1, intensity_hi1, _MM_SHUFFLE(2,2,2,2))), ss_desaturate); \
|
||||
intensity_hi1 = _mm_shuffle_ps(intensity_hi1, intensity_hi1, _MM_SHUFFLE(0,0,0,0)); \
|
||||
__m256 intensity_hi = _mm256_set_m128(intensity_hi1, intensity_hi0); \
|
||||
\
|
||||
fg_hi = _mm256_add_ps(_mm256_mul_ps(fg_hi, inv_desaturate), intensity_hi); \
|
||||
fg_hi = _mm256_add_ps(_mm256_mul_ps(fg_hi, mlight_hi), fade_amount_hi); \
|
||||
fg_hi = _mm256_mul_ps(fg_hi, color); \
|
||||
\
|
||||
__m128 intensity_lo0 = _mm_mul_ps(_mm256_extractf128_ps(fg_lo, 0), intensity_weight); \
|
||||
__m128 intensity_lo1 = _mm_mul_ps(_mm256_extractf128_ps(fg_lo, 1), intensity_weight); \
|
||||
intensity_lo0 = _mm_mul_ss(_mm_add_ss(_mm_add_ss(intensity_lo0, _mm_shuffle_ps(intensity_lo0, intensity_lo0, _MM_SHUFFLE(1,1,1,1))), _mm_shuffle_ps(intensity_lo0, intensity_lo0, _MM_SHUFFLE(2,2,2,2))), ss_desaturate); \
|
||||
intensity_lo0 = _mm_shuffle_ps(intensity_lo0, intensity_lo0, _MM_SHUFFLE(0,0,0,0)); \
|
||||
intensity_lo1 = _mm_mul_ss(_mm_add_ss(_mm_add_ss(intensity_lo1, _mm_shuffle_ps(intensity_lo1, intensity_lo1, _MM_SHUFFLE(1,1,1,1))), _mm_shuffle_ps(intensity_lo1, intensity_lo1, _MM_SHUFFLE(2,2,2,2))), ss_desaturate); \
|
||||
intensity_lo1 = _mm_shuffle_ps(intensity_lo1, intensity_lo1, _MM_SHUFFLE(0,0,0,0)); \
|
||||
__m256 intensity_lo = _mm256_set_m128(intensity_lo1, intensity_lo0); \
|
||||
\
|
||||
fg_lo = _mm256_add_ps(_mm256_mul_ps(fg_lo, inv_desaturate), intensity_lo); \
|
||||
fg_lo = _mm256_add_ps(_mm256_mul_ps(fg_lo, mlight_lo), fade_amount_lo); \
|
||||
fg_lo = _mm256_mul_ps(fg_lo, color); \
|
||||
\
|
||||
fg_hi = _mm256_sqrt_ps(fg_hi); \
|
||||
fg_hi = _mm256_mul_ps(fg_hi, m255); \
|
||||
fg_lo = _mm256_sqrt_ps(fg_lo); \
|
||||
fg_lo = _mm256_mul_ps(fg_lo, m255); \
|
||||
fg_16 = _mm256_packus_epi32(_mm256_cvtps_epi32(fg_lo), _mm256_cvtps_epi32(fg_hi)); \
|
||||
fg = _mm_packus_epi16(_mm256_extractf128_si256(fg_16, 0), _mm256_extractf128_si256(fg_16, 1)); \
|
||||
}
|
||||
|
||||
/*
|
||||
// Complex shade 8 pixels
|
||||
#define AVX2_SHADE(fg, shade_constants) { \
|
||||
__m256i fg_hi = _mm256_unpackhi_epi8(fg, _mm256_setzero_si256()); \
|
||||
|
@ -499,7 +564,7 @@ FORCEINLINE uint32_t alpha_blend(uint32_t fg, uint32_t bg)
|
|||
\
|
||||
fg = _mm256_packus_epi16(fg_lo, fg_hi); \
|
||||
}
|
||||
|
||||
*/
|
||||
|
||||
|
||||
|
||||
|
|
491
src/r_draw_rgba_sse.h
Normal file
491
src/r_draw_rgba_sse.h
Normal file
|
@ -0,0 +1,491 @@
|
|||
//
|
||||
// SSE/AVX intrinsics based drawers for the r_draw family of drawers.
|
||||
//
|
||||
// Note: This header file is intentionally not guarded by a __R_DRAW_RGBA_SSE__ define.
|
||||
// It is because the code is nearly identical for SSE vs AVX. The file is included
|
||||
// multiple times by r_draw_rgba.cpp with different defines that changes the class
|
||||
// names outputted and the type of intrinsics used.
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning(disable: 4752) // warning C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX
|
||||
#endif
|
||||
|
||||
class VecCommand(DrawSpanRGBA) : public DrawerCommand
|
||||
{
|
||||
const uint32_t * RESTRICT _source;
|
||||
fixed_t _xfrac;
|
||||
fixed_t _yfrac;
|
||||
fixed_t _xstep;
|
||||
fixed_t _ystep;
|
||||
int _x1;
|
||||
int _x2;
|
||||
int _y;
|
||||
int _xbits;
|
||||
int _ybits;
|
||||
BYTE * RESTRICT _destorg;
|
||||
fixed_t _light;
|
||||
ShadeConstants _shade_constants;
|
||||
|
||||
public:
|
||||
VecCommand(DrawSpanRGBA)()
|
||||
{
|
||||
_source = (const uint32_t*)ds_source;
|
||||
_xfrac = ds_xfrac;
|
||||
_yfrac = ds_yfrac;
|
||||
_xstep = ds_xstep;
|
||||
_ystep = ds_ystep;
|
||||
_x1 = ds_x1;
|
||||
_x2 = ds_x2;
|
||||
_y = ds_y;
|
||||
_xbits = ds_xbits;
|
||||
_ybits = ds_ybits;
|
||||
_destorg = dc_destorg;
|
||||
_light = ds_light;
|
||||
_shade_constants = ds_shade_constants;
|
||||
}
|
||||
|
||||
void Execute(DrawerThread *thread) override
|
||||
{
|
||||
if (thread->line_skipped_by_thread(_y))
|
||||
return;
|
||||
|
||||
dsfixed_t xfrac;
|
||||
dsfixed_t yfrac;
|
||||
dsfixed_t xstep;
|
||||
dsfixed_t ystep;
|
||||
uint32_t* dest;
|
||||
const uint32_t* source = _source;
|
||||
int count;
|
||||
int spot;
|
||||
|
||||
xfrac = _xfrac;
|
||||
yfrac = _yfrac;
|
||||
|
||||
dest = ylookup[_y] + _x1 + (uint32_t*)_destorg;
|
||||
|
||||
count = _x2 - _x1 + 1;
|
||||
|
||||
xstep = _xstep;
|
||||
ystep = _ystep;
|
||||
|
||||
uint32_t light = calc_light_multiplier(_light);
|
||||
ShadeConstants shade_constants = _shade_constants;
|
||||
|
||||
if (_xbits == 6 && _ybits == 6)
|
||||
{
|
||||
// 64x64 is the most common case by far, so special case it.
|
||||
|
||||
int sse_count = count / 4;
|
||||
count -= sse_count * 4;
|
||||
|
||||
if (shade_constants.simple_shade)
|
||||
{
|
||||
VEC_SHADE_SIMPLE_INIT(light);
|
||||
|
||||
while (sse_count--)
|
||||
{
|
||||
// Current texture index in u,v.
|
||||
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
|
||||
uint32_t p0 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
|
||||
uint32_t p1 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
|
||||
uint32_t p2 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
|
||||
uint32_t p3 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
// Lookup pixel from flat texture tile,
|
||||
// re-index using light/colormap.
|
||||
__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
|
||||
VEC_SHADE_SIMPLE(fg);
|
||||
_mm_storeu_si128((__m128i*)dest, fg);
|
||||
|
||||
// Next step in u,v.
|
||||
dest += 4;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
VEC_SHADE_INIT(light, shade_constants);
|
||||
|
||||
while (sse_count--)
|
||||
{
|
||||
// Current texture index in u,v.
|
||||
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
|
||||
uint32_t p0 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
|
||||
uint32_t p1 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
|
||||
uint32_t p2 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
|
||||
uint32_t p3 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
// Lookup pixel from flat texture tile,
|
||||
// re-index using light/colormap.
|
||||
__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
|
||||
VEC_SHADE(fg, shade_constants);
|
||||
_mm_storeu_si128((__m128i*)dest, fg);
|
||||
|
||||
// Next step in u,v.
|
||||
dest += 4;
|
||||
}
|
||||
}
|
||||
|
||||
if (count == 0)
|
||||
return;
|
||||
|
||||
do
|
||||
{
|
||||
// Current texture index in u,v.
|
||||
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
|
||||
|
||||
// Lookup pixel from flat texture tile
|
||||
*dest++ = shade_bgra(source[spot], light, shade_constants);
|
||||
|
||||
// Next step in u,v.
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
} while (--count);
|
||||
}
|
||||
else
|
||||
{
|
||||
BYTE yshift = 32 - _ybits;
|
||||
BYTE xshift = yshift - _xbits;
|
||||
int xmask = ((1 << _xbits) - 1) << _ybits;
|
||||
|
||||
int sse_count = count / 4;
|
||||
count -= sse_count * 4;
|
||||
|
||||
if (shade_constants.simple_shade)
|
||||
{
|
||||
VEC_SHADE_SIMPLE_INIT(light);
|
||||
|
||||
while (sse_count--)
|
||||
{
|
||||
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
|
||||
uint32_t p0 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
|
||||
uint32_t p1 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
|
||||
uint32_t p2 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
|
||||
uint32_t p3 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
// Lookup pixel from flat texture tile
|
||||
__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
|
||||
VEC_SHADE_SIMPLE(fg);
|
||||
_mm_storeu_si128((__m128i*)dest, fg);
|
||||
dest += 4;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
VEC_SHADE_INIT(light, shade_constants);
|
||||
|
||||
while (sse_count--)
|
||||
{
|
||||
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
|
||||
uint32_t p0 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
|
||||
uint32_t p1 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
|
||||
uint32_t p2 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
|
||||
uint32_t p3 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
// Lookup pixel from flat texture tile
|
||||
__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
|
||||
VEC_SHADE(fg, shade_constants);
|
||||
_mm_storeu_si128((__m128i*)dest, fg);
|
||||
dest += 4;
|
||||
}
|
||||
}
|
||||
|
||||
if (count == 0)
|
||||
return;
|
||||
|
||||
do
|
||||
{
|
||||
// Current texture index in u,v.
|
||||
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
|
||||
|
||||
// Lookup pixel from flat texture tile
|
||||
*dest++ = shade_bgra(source[spot], light, shade_constants);
|
||||
|
||||
// Next step in u,v.
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
} while (--count);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
class VecCommand(Vlinec4RGBA) : public DrawerCommand
|
||||
{
|
||||
BYTE * RESTRICT _dest;
|
||||
int _count;
|
||||
int _pitch;
|
||||
ShadeConstants _shade_constants;
|
||||
int vlinebits;
|
||||
fixed_t palookuplight[4];
|
||||
DWORD vplce[4];
|
||||
DWORD vince[4];
|
||||
const uint32 * RESTRICT bufplce[4];
|
||||
|
||||
public:
|
||||
VecCommand(Vlinec4RGBA)()
|
||||
{
|
||||
_dest = dc_dest;
|
||||
_count = dc_count;
|
||||
_pitch = dc_pitch;
|
||||
_shade_constants = dc_shade_constants;
|
||||
vlinebits = ::vlinebits;
|
||||
for (int i = 0; i < 4; i++)
|
||||
{
|
||||
palookuplight[i] = ::palookuplight[i];
|
||||
vplce[i] = ::vplce[i];
|
||||
vince[i] = ::vince[i];
|
||||
bufplce[i] = (const uint32 *)::bufplce[i];
|
||||
}
|
||||
}
|
||||
|
||||
void Execute(DrawerThread *thread) override
|
||||
{
|
||||
int count = thread->count_for_thread(_dest_y, _count);
|
||||
if (count <= 0)
|
||||
return;
|
||||
|
||||
uint32_t *dest = thread->dest_for_thread(_dest_y, _pitch, (uint32_t*)_dest);
|
||||
int bits = vlinebits;
|
||||
int pitch = _pitch * thread->num_cores;
|
||||
|
||||
uint32_t light0 = calc_light_multiplier(palookuplight[0]);
|
||||
uint32_t light1 = calc_light_multiplier(palookuplight[1]);
|
||||
uint32_t light2 = calc_light_multiplier(palookuplight[2]);
|
||||
uint32_t light3 = calc_light_multiplier(palookuplight[3]);
|
||||
|
||||
ShadeConstants shade_constants = _shade_constants;
|
||||
|
||||
DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] };
|
||||
DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] };
|
||||
int skipped = thread->skipped_by_thread(_dest_y);
|
||||
for (int i = 0; i < 4; i++)
|
||||
{
|
||||
local_vplce[i] += local_vince[i] * skipped;
|
||||
local_vince[i] *= thread->num_cores;
|
||||
}
|
||||
|
||||
if (shade_constants.simple_shade)
|
||||
{
|
||||
VEC_SHADE_SIMPLE_INIT4(light3, light2, light1, light0);
|
||||
do
|
||||
{
|
||||
DWORD place0 = local_vplce[0];
|
||||
DWORD place1 = local_vplce[1];
|
||||
DWORD place2 = local_vplce[2];
|
||||
DWORD place3 = local_vplce[3];
|
||||
|
||||
uint32_t p0 = bufplce[0][place0 >> bits];
|
||||
uint32_t p1 = bufplce[1][place1 >> bits];
|
||||
uint32_t p2 = bufplce[2][place2 >> bits];
|
||||
uint32_t p3 = bufplce[3][place3 >> bits];
|
||||
|
||||
local_vplce[0] = place0 + local_vince[0];
|
||||
local_vplce[1] = place1 + local_vince[1];
|
||||
local_vplce[2] = place2 + local_vince[2];
|
||||
local_vplce[3] = place3 + local_vince[3];
|
||||
|
||||
__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
|
||||
VEC_SHADE_SIMPLE(fg);
|
||||
_mm_storeu_si128((__m128i*)dest, fg);
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
else
|
||||
{
|
||||
VEC_SHADE_INIT4(light3, light2, light1, light0, shade_constants);
|
||||
do
|
||||
{
|
||||
DWORD place0 = local_vplce[0];
|
||||
DWORD place1 = local_vplce[1];
|
||||
DWORD place2 = local_vplce[2];
|
||||
DWORD place3 = local_vplce[3];
|
||||
|
||||
uint32_t p0 = bufplce[0][place0 >> bits];
|
||||
uint32_t p1 = bufplce[1][place1 >> bits];
|
||||
uint32_t p2 = bufplce[2][place2 >> bits];
|
||||
uint32_t p3 = bufplce[3][place3 >> bits];
|
||||
|
||||
local_vplce[0] = place0 + local_vince[0];
|
||||
local_vplce[1] = place1 + local_vince[1];
|
||||
local_vplce[2] = place2 + local_vince[2];
|
||||
local_vplce[3] = place3 + local_vince[3];
|
||||
|
||||
__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
|
||||
VEC_SHADE(fg, shade_constants);
|
||||
_mm_storeu_si128((__m128i*)dest, fg);
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
class VecCommand(Mvlinec4RGBA) : public DrawerCommand
|
||||
{
|
||||
BYTE * RESTRICT _dest;
|
||||
int _count;
|
||||
int _pitch;
|
||||
ShadeConstants _shade_constants;
|
||||
int mvlinebits;
|
||||
fixed_t palookuplight[4];
|
||||
DWORD vplce[4];
|
||||
DWORD vince[4];
|
||||
const uint32 * RESTRICT bufplce[4];
|
||||
|
||||
public:
|
||||
VecCommand(Mvlinec4RGBA)()
|
||||
{
|
||||
_dest = dc_dest;
|
||||
_count = dc_count;
|
||||
_pitch = dc_pitch;
|
||||
_shade_constants = dc_shade_constants;
|
||||
mvlinebits = ::mvlinebits;
|
||||
for (int i = 0; i < 4; i++)
|
||||
{
|
||||
palookuplight[i] = ::palookuplight[i];
|
||||
vplce[i] = ::vplce[i];
|
||||
vince[i] = ::vince[i];
|
||||
bufplce[i] = (const uint32 *)::bufplce[i];
|
||||
}
|
||||
}
|
||||
|
||||
void Execute(DrawerThread *thread) override
|
||||
{
|
||||
int count = thread->count_for_thread(_dest_y, _count);
|
||||
if (count <= 0)
|
||||
return;
|
||||
|
||||
uint32_t *dest = thread->dest_for_thread(_dest_y, _pitch, (uint32_t*)_dest);
|
||||
int pitch = _pitch * thread->num_cores;
|
||||
int bits = mvlinebits;
|
||||
|
||||
uint32_t light0 = calc_light_multiplier(palookuplight[0]);
|
||||
uint32_t light1 = calc_light_multiplier(palookuplight[1]);
|
||||
uint32_t light2 = calc_light_multiplier(palookuplight[2]);
|
||||
uint32_t light3 = calc_light_multiplier(palookuplight[3]);
|
||||
|
||||
ShadeConstants shade_constants = _shade_constants;
|
||||
|
||||
DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] };
|
||||
DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] };
|
||||
int skipped = thread->skipped_by_thread(_dest_y);
|
||||
for (int i = 0; i < 4; i++)
|
||||
{
|
||||
local_vplce[i] += local_vince[i] * skipped;
|
||||
local_vince[i] *= thread->num_cores;
|
||||
}
|
||||
|
||||
if (shade_constants.simple_shade)
|
||||
{
|
||||
VEC_SHADE_SIMPLE_INIT4(light3, light2, light1, light0);
|
||||
do
|
||||
{
|
||||
DWORD place0 = local_vplce[0];
|
||||
DWORD place1 = local_vplce[1];
|
||||
DWORD place2 = local_vplce[2];
|
||||
DWORD place3 = local_vplce[3];
|
||||
|
||||
uint32_t pix0 = bufplce[0][place0 >> bits];
|
||||
uint32_t pix1 = bufplce[1][place1 >> bits];
|
||||
uint32_t pix2 = bufplce[2][place2 >> bits];
|
||||
uint32_t pix3 = bufplce[3][place3 >> bits];
|
||||
|
||||
// movemask = !(pix == 0)
|
||||
__m128i movemask = _mm_xor_si128(_mm_cmpeq_epi32(_mm_set_epi32(pix3, pix2, pix1, pix0), _mm_setzero_si128()), _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));
|
||||
|
||||
local_vplce[0] = place0 + local_vince[0];
|
||||
local_vplce[1] = place1 + local_vince[1];
|
||||
local_vplce[2] = place2 + local_vince[2];
|
||||
local_vplce[3] = place3 + local_vince[3];
|
||||
|
||||
__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
|
||||
VEC_SHADE_SIMPLE(fg);
|
||||
_mm_maskmoveu_si128(fg, movemask, (char*)dest);
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
else
|
||||
{
|
||||
VEC_SHADE_INIT4(light3, light2, light1, light0, shade_constants);
|
||||
do
|
||||
{
|
||||
DWORD place0 = local_vplce[0];
|
||||
DWORD place1 = local_vplce[1];
|
||||
DWORD place2 = local_vplce[2];
|
||||
DWORD place3 = local_vplce[3];
|
||||
|
||||
uint32_t pix0 = bufplce[0][place0 >> bits];
|
||||
uint32_t pix1 = bufplce[1][place1 >> bits];
|
||||
uint32_t pix2 = bufplce[2][place2 >> bits];
|
||||
uint32_t pix3 = bufplce[3][place3 >> bits];
|
||||
|
||||
// movemask = !(pix == 0)
|
||||
__m128i movemask = _mm_xor_si128(_mm_cmpeq_epi32(_mm_set_epi32(pix3, pix2, pix1, pix0), _mm_setzero_si128()), _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));
|
||||
|
||||
local_vplce[0] = place0 + local_vince[0];
|
||||
local_vplce[1] = place1 + local_vince[1];
|
||||
local_vplce[2] = place2 + local_vince[2];
|
||||
local_vplce[3] = place3 + local_vince[3];
|
||||
|
||||
__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
|
||||
VEC_SHADE(fg, shade_constants);
|
||||
_mm_maskmoveu_si128(fg, movemask, (char*)dest);
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
}
|
||||
};
|
|
@ -51,6 +51,39 @@ extern unsigned int dc_tspans[4][MAXHEIGHT];
|
|||
extern unsigned int *dc_ctspan[4];
|
||||
extern unsigned int *horizspan[4];
|
||||
|
||||
EXTERN_CVAR(Bool, r_linearlight)
|
||||
|
||||
#ifndef NO_SSE
|
||||
|
||||
// Generate SSE drawers:
|
||||
#define VecCommand(name) name##_SSE_Command
|
||||
#define VEC_SHADE_SIMPLE_INIT SSE_SHADE_SIMPLE_INIT
|
||||
#define VEC_SHADE_SIMPLE_INIT4 SSE_SHADE_SIMPLE_INIT4
|
||||
#define VEC_SHADE_SIMPLE SSE_SHADE_SIMPLE
|
||||
#define VEC_SHADE_INIT SSE_SHADE_INIT
|
||||
#define VEC_SHADE_INIT4 SSE_SHADE_INIT4
|
||||
#define VEC_SHADE SSE_SHADE
|
||||
#include "r_drawt_rgba_sse.h"
|
||||
|
||||
// Generate AVX drawers:
|
||||
#undef VecCommand
|
||||
#undef VEC_SHADE_SIMPLE_INIT
|
||||
#undef VEC_SHADE_SIMPLE_INIT4
|
||||
#undef VEC_SHADE_SIMPLE
|
||||
#undef VEC_SHADE_INIT
|
||||
#undef VEC_SHADE_INIT4
|
||||
#undef VEC_SHADE
|
||||
#define VecCommand(name) name##_AVX_Command
|
||||
#define VEC_SHADE_SIMPLE_INIT AVX_LINEAR_SHADE_SIMPLE_INIT
|
||||
#define VEC_SHADE_SIMPLE_INIT4 AVX_LINEAR_SHADE_SIMPLE_INIT4
|
||||
#define VEC_SHADE_SIMPLE AVX_LINEAR_SHADE_SIMPLE
|
||||
#define VEC_SHADE_INIT AVX_LINEAR_SHADE_INIT
|
||||
#define VEC_SHADE_INIT4 AVX_LINEAR_SHADE_INIT4
|
||||
#define VEC_SHADE AVX_LINEAR_SHADE
|
||||
#include "r_drawt_rgba_sse.h"
|
||||
|
||||
#endif
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class RtCopy1colRGBACommand : public DrawerCommand
|
||||
|
@ -206,7 +239,6 @@ public:
|
|||
_colormap = dc_colormap;
|
||||
}
|
||||
|
||||
#ifdef NO_SSE
|
||||
void Execute(DrawerThread *thread) override
|
||||
{
|
||||
uint32_t *source;
|
||||
|
@ -253,132 +285,6 @@ public:
|
|||
dest += pitch * 2;
|
||||
} while (--count);
|
||||
}
|
||||
#else
|
||||
void Execute(DrawerThread *thread) override
|
||||
{
|
||||
uint32_t *source;
|
||||
uint32_t *dest;
|
||||
int count;
|
||||
int pitch;
|
||||
int sincr;
|
||||
|
||||
count = thread->count_for_thread(yl, yh - yl + 1);
|
||||
if (count <= 0)
|
||||
return;
|
||||
|
||||
ShadeConstants shade_constants = _shade_constants;
|
||||
uint32_t light = calc_light_multiplier(_light);
|
||||
uint32_t *palette = (uint32_t*)GPalette.BaseColors;
|
||||
|
||||
dest = thread->dest_for_thread(yl, _pitch, ylookup[yl] + sx + (uint32_t*)_destorg);
|
||||
source = &thread->dc_temp_rgba[yl * 4] + thread->skipped_by_thread(yl) * 4;
|
||||
pitch = _pitch * thread->num_cores;
|
||||
sincr = thread->num_cores * 4;
|
||||
|
||||
BYTE *colormap = _colormap;
|
||||
|
||||
if (shade_constants.simple_shade)
|
||||
{
|
||||
SSE_SHADE_SIMPLE_INIT(light);
|
||||
|
||||
if (count & 1) {
|
||||
uint32_t p0 = colormap[source[0]];
|
||||
uint32_t p1 = colormap[source[1]];
|
||||
uint32_t p2 = colormap[source[2]];
|
||||
uint32_t p3 = colormap[source[3]];
|
||||
|
||||
// shade_pal_index:
|
||||
__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
|
||||
SSE_SHADE_SIMPLE(fg);
|
||||
_mm_storeu_si128((__m128i*)dest, fg);
|
||||
|
||||
source += sincr;
|
||||
dest += pitch;
|
||||
}
|
||||
if (!(count >>= 1))
|
||||
return;
|
||||
|
||||
do {
|
||||
// shade_pal_index 0-3
|
||||
{
|
||||
uint32_t p0 = colormap[source[0]];
|
||||
uint32_t p1 = colormap[source[1]];
|
||||
uint32_t p2 = colormap[source[2]];
|
||||
uint32_t p3 = colormap[source[3]];
|
||||
|
||||
__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
|
||||
SSE_SHADE_SIMPLE(fg);
|
||||
_mm_storeu_si128((__m128i*)dest, fg);
|
||||
}
|
||||
|
||||
// shade_pal_index 4-7 (pitch)
|
||||
{
|
||||
uint32_t p0 = colormap[source[sincr]];
|
||||
uint32_t p1 = colormap[source[sincr + 1]];
|
||||
uint32_t p2 = colormap[source[sincr + 2]];
|
||||
uint32_t p3 = colormap[source[sincr + 3]];
|
||||
|
||||
__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
|
||||
SSE_SHADE_SIMPLE(fg);
|
||||
_mm_storeu_si128((__m128i*)(dest + pitch), fg);
|
||||
}
|
||||
|
||||
source += sincr * 2;
|
||||
dest += pitch * 2;
|
||||
} while (--count);
|
||||
}
|
||||
else
|
||||
{
|
||||
SSE_SHADE_INIT(light, shade_constants);
|
||||
|
||||
if (count & 1) {
|
||||
uint32_t p0 = colormap[source[0]];
|
||||
uint32_t p1 = colormap[source[1]];
|
||||
uint32_t p2 = colormap[source[2]];
|
||||
uint32_t p3 = colormap[source[3]];
|
||||
|
||||
// shade_pal_index:
|
||||
__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
|
||||
SSE_SHADE(fg, shade_constants);
|
||||
_mm_storeu_si128((__m128i*)dest, fg);
|
||||
|
||||
source += sincr;
|
||||
dest += pitch;
|
||||
}
|
||||
if (!(count >>= 1))
|
||||
return;
|
||||
|
||||
do {
|
||||
// shade_pal_index 0-3
|
||||
{
|
||||
uint32_t p0 = colormap[source[0]];
|
||||
uint32_t p1 = colormap[source[1]];
|
||||
uint32_t p2 = colormap[source[2]];
|
||||
uint32_t p3 = colormap[source[3]];
|
||||
|
||||
__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
|
||||
SSE_SHADE(fg, shade_constants);
|
||||
_mm_storeu_si128((__m128i*)dest, fg);
|
||||
}
|
||||
|
||||
// shade_pal_index 4-7 (pitch)
|
||||
{
|
||||
uint32_t p0 = colormap[source[sincr]];
|
||||
uint32_t p1 = colormap[source[sincr + 1]];
|
||||
uint32_t p2 = colormap[source[sincr + 2]];
|
||||
uint32_t p3 = colormap[source[sincr + 3]];
|
||||
|
||||
__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
|
||||
SSE_SHADE(fg, shade_constants);
|
||||
_mm_storeu_si128((__m128i*)(dest + pitch), fg);
|
||||
}
|
||||
|
||||
source += sincr * 2;
|
||||
dest += pitch * 2;
|
||||
} while (--count);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
class RtTranslate1colRGBACommand : public DrawerCommand
|
||||
|
@ -607,7 +513,6 @@ public:
|
|||
_destalpha = dc_destalpha;
|
||||
}
|
||||
|
||||
#ifdef NO_SSE
|
||||
void Execute(DrawerThread *thread) override
|
||||
{
|
||||
uint32_t *source;
|
||||
|
@ -655,107 +560,6 @@ public:
|
|||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
#else
|
||||
void Execute(DrawerThread *thread) override
|
||||
{
|
||||
uint32_t *source;
|
||||
uint32_t *dest;
|
||||
int count;
|
||||
int pitch;
|
||||
int sincr;
|
||||
|
||||
count = thread->count_for_thread(yl, yh - yl + 1);
|
||||
if (count <= 0)
|
||||
return;
|
||||
|
||||
dest = thread->dest_for_thread(yl, _pitch, ylookup[yl] + sx + (uint32_t*)_destorg);
|
||||
source = &thread->dc_temp_rgba[yl * 4] + thread->skipped_by_thread(yl) * 4;
|
||||
pitch = _pitch * thread->num_cores;
|
||||
sincr = 4 * thread->num_cores;
|
||||
|
||||
uint32_t light = calc_light_multiplier(_light);
|
||||
uint32_t *palette = (uint32_t*)GPalette.BaseColors;
|
||||
BYTE *colormap = _colormap;
|
||||
|
||||
uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8);
|
||||
uint32_t bg_alpha = _destalpha >> (FRACBITS - 8);
|
||||
|
||||
ShadeConstants shade_constants = _shade_constants;
|
||||
|
||||
if (shade_constants.simple_shade)
|
||||
{
|
||||
SSE_SHADE_SIMPLE_INIT(light);
|
||||
|
||||
__m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha);
|
||||
__m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha);
|
||||
|
||||
do {
|
||||
uint32_t p0 = colormap[source[0]];
|
||||
uint32_t p1 = colormap[source[1]];
|
||||
uint32_t p2 = colormap[source[2]];
|
||||
uint32_t p3 = colormap[source[3]];
|
||||
|
||||
// shade_pal_index:
|
||||
__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
|
||||
SSE_SHADE_SIMPLE(fg);
|
||||
|
||||
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
|
||||
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
|
||||
|
||||
// unpack bg:
|
||||
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
|
||||
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
|
||||
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
|
||||
|
||||
// (fg_red * fg_alpha + bg_red * bg_alpha) / 256:
|
||||
__m128i color_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, mfg_alpha), _mm_mullo_epi16(bg_hi, mbg_alpha)), 8);
|
||||
__m128i color_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, mfg_alpha), _mm_mullo_epi16(bg_lo, mbg_alpha)), 8);
|
||||
|
||||
__m128i color = _mm_packus_epi16(color_lo, color_hi);
|
||||
_mm_storeu_si128((__m128i*)dest, color);
|
||||
|
||||
source += sincr;
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
else
|
||||
{
|
||||
SSE_SHADE_INIT(light, shade_constants);
|
||||
|
||||
__m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha);
|
||||
__m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha);
|
||||
|
||||
do {
|
||||
uint32_t p0 = colormap[source[0]];
|
||||
uint32_t p1 = colormap[source[1]];
|
||||
uint32_t p2 = colormap[source[2]];
|
||||
uint32_t p3 = colormap[source[3]];
|
||||
|
||||
// shade_pal_index:
|
||||
__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
|
||||
SSE_SHADE(fg, shade_constants);
|
||||
|
||||
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
|
||||
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
|
||||
|
||||
// unpack bg:
|
||||
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
|
||||
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
|
||||
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
|
||||
|
||||
// (fg_red * fg_alpha + bg_red * bg_alpha) / 256:
|
||||
__m128i color_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, mfg_alpha), _mm_mullo_epi16(bg_hi, mbg_alpha)), 8);
|
||||
__m128i color_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, mfg_alpha), _mm_mullo_epi16(bg_lo, mbg_alpha)), 8);
|
||||
|
||||
__m128i color = _mm_packus_epi16(color_lo, color_hi);
|
||||
_mm_storeu_si128((__m128i*)dest, color);
|
||||
|
||||
source += sincr;
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
class RtShaded1colRGBACommand : public DrawerCommand
|
||||
|
@ -853,7 +657,6 @@ public:
|
|||
_light = dc_light;
|
||||
}
|
||||
|
||||
#ifdef NO_SSE
|
||||
void Execute(DrawerThread *thread) override
|
||||
{
|
||||
BYTE *colormap;
|
||||
|
@ -898,57 +701,6 @@ public:
|
|||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
#else
|
||||
void Execute(DrawerThread *thread) override
|
||||
{
|
||||
BYTE *colormap;
|
||||
uint32_t *source;
|
||||
uint32_t *dest;
|
||||
int count;
|
||||
int pitch;
|
||||
int sincr;
|
||||
|
||||
count = thread->count_for_thread(yl, yh - yl + 1);
|
||||
if (count <= 0)
|
||||
return;
|
||||
|
||||
colormap = _colormap;
|
||||
dest = thread->dest_for_thread(yl, _pitch, ylookup[yl] + sx + (uint32_t*)_destorg);
|
||||
source = &thread->dc_temp_rgba[yl * 4] + thread->skipped_by_thread(yl) * 4;
|
||||
pitch = _pitch * thread->num_cores;
|
||||
sincr = 4 * thread->num_cores;
|
||||
|
||||
__m128i fg = _mm_unpackhi_epi8(_mm_set1_epi32(shade_pal_index_simple(_color, calc_light_multiplier(_light))), _mm_setzero_si128());
|
||||
__m128i alpha_one = _mm_set1_epi16(64);
|
||||
|
||||
do {
|
||||
uint32_t p0 = colormap[source[0]];
|
||||
uint32_t p1 = colormap[source[1]];
|
||||
uint32_t p2 = colormap[source[2]];
|
||||
uint32_t p3 = colormap[source[3]];
|
||||
|
||||
__m128i alpha_hi = _mm_set_epi16(64, p3, p3, p3, 64, p2, p2, p2);
|
||||
__m128i alpha_lo = _mm_set_epi16(64, p1, p1, p1, 64, p0, p0, p0);
|
||||
__m128i inv_alpha_hi = _mm_subs_epu16(alpha_one, alpha_hi);
|
||||
__m128i inv_alpha_lo = _mm_subs_epu16(alpha_one, alpha_lo);
|
||||
|
||||
// unpack bg:
|
||||
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
|
||||
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
|
||||
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
|
||||
|
||||
// (fg_red * alpha + bg_red * inv_alpha) / 64:
|
||||
__m128i color_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg, alpha_hi), _mm_mullo_epi16(bg_hi, inv_alpha_hi)), 6);
|
||||
__m128i color_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg, alpha_lo), _mm_mullo_epi16(bg_lo, inv_alpha_lo)), 6);
|
||||
|
||||
__m128i color = _mm_packus_epi16(color_lo, color_hi);
|
||||
_mm_storeu_si128((__m128i*)dest, color);
|
||||
|
||||
source += sincr;
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
class RtAddClamp1colRGBACommand : public DrawerCommand
|
||||
|
@ -1051,7 +803,6 @@ public:
|
|||
_shade_constants = dc_shade_constants;
|
||||
}
|
||||
|
||||
#ifdef NO_SSE
|
||||
void Execute(DrawerThread *thread) override
|
||||
{
|
||||
uint32_t *source;
|
||||
|
@ -1097,106 +848,6 @@ public:
|
|||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
#else
|
||||
void Execute(DrawerThread *thread) override
|
||||
{
|
||||
uint32_t *source;
|
||||
uint32_t *dest;
|
||||
int count;
|
||||
int pitch;
|
||||
int sincr;
|
||||
|
||||
count = thread->count_for_thread(yl, yh - yl + 1);
|
||||
if (count <= 0)
|
||||
return;
|
||||
|
||||
dest = thread->dest_for_thread(yl, _pitch, ylookup[yl] + sx + (uint32_t*)_destorg);
|
||||
source = &thread->dc_temp_rgba[yl * 4] + thread->skipped_by_thread(yl) * 4;
|
||||
pitch = _pitch * thread->num_cores;
|
||||
sincr = 4 * thread->num_cores;
|
||||
|
||||
uint32_t light = calc_light_multiplier(_light);
|
||||
uint32_t *palette = (uint32_t*)GPalette.BaseColors;
|
||||
|
||||
uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8);
|
||||
uint32_t bg_alpha = _destalpha >> (FRACBITS - 8);
|
||||
|
||||
ShadeConstants shade_constants = _shade_constants;
|
||||
|
||||
if (shade_constants.simple_shade)
|
||||
{
|
||||
SSE_SHADE_SIMPLE_INIT(light);
|
||||
|
||||
__m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha);
|
||||
__m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha);
|
||||
|
||||
do {
|
||||
uint32_t p0 = source[0];
|
||||
uint32_t p1 = source[1];
|
||||
uint32_t p2 = source[2];
|
||||
uint32_t p3 = source[3];
|
||||
|
||||
// shade_pal_index:
|
||||
__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
|
||||
SSE_SHADE_SIMPLE(fg);
|
||||
|
||||
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
|
||||
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
|
||||
|
||||
// unpack bg:
|
||||
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
|
||||
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
|
||||
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
|
||||
|
||||
// (fg_red * fg_alpha + bg_red * bg_alpha) / 256:
|
||||
__m128i color_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, mfg_alpha), _mm_mullo_epi16(bg_hi, mbg_alpha)), 8);
|
||||
__m128i color_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, mfg_alpha), _mm_mullo_epi16(bg_lo, mbg_alpha)), 8);
|
||||
|
||||
__m128i color = _mm_packus_epi16(color_lo, color_hi);
|
||||
_mm_storeu_si128((__m128i*)dest, color);
|
||||
|
||||
source += sincr;
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
else
|
||||
{
|
||||
SSE_SHADE_INIT(light, shade_constants);
|
||||
|
||||
__m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha);
|
||||
__m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha);
|
||||
|
||||
do {
|
||||
uint32_t p0 = source[0];
|
||||
uint32_t p1 = source[1];
|
||||
uint32_t p2 = source[2];
|
||||
uint32_t p3 = source[3];
|
||||
|
||||
// shade_pal_index:
|
||||
__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
|
||||
SSE_SHADE(fg, shade_constants);
|
||||
|
||||
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
|
||||
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
|
||||
|
||||
// unpack bg:
|
||||
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
|
||||
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
|
||||
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
|
||||
|
||||
// (fg_red * fg_alpha + bg_red * bg_alpha) / 256:
|
||||
__m128i color_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, mfg_alpha), _mm_mullo_epi16(bg_hi, mbg_alpha)), 8);
|
||||
__m128i color_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, mfg_alpha), _mm_mullo_epi16(bg_lo, mbg_alpha)), 8);
|
||||
|
||||
__m128i color = _mm_packus_epi16(color_lo, color_hi);
|
||||
_mm_storeu_si128((__m128i*)dest, color);
|
||||
|
||||
source += sincr;
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
class RtSubClamp1colRGBACommand : public DrawerCommand
|
||||
|
@ -1657,7 +1308,14 @@ void rt_map1col_rgba (int hx, int sx, int yl, int yh)
|
|||
// Maps all four spans to the screen starting at sx.
|
||||
void rt_map4cols_rgba (int sx, int yl, int yh)
|
||||
{
|
||||
#ifdef NO_SSE
|
||||
DrawerCommandQueue::QueueCommand<RtMap4colsRGBACommand>(sx, yl, yh);
|
||||
#else
|
||||
if (!r_linearlight)
|
||||
DrawerCommandQueue::QueueCommand<RtMap4colsRGBA_SSE_Command>(sx, yl, yh);
|
||||
else
|
||||
DrawerCommandQueue::QueueCommand<RtMap4colsRGBA_AVX_Command>(sx, yl, yh);
|
||||
#endif
|
||||
}
|
||||
|
||||
void rt_Translate1col_rgba(const BYTE *translation, int hx, int yl, int yh)
|
||||
|
@ -1693,7 +1351,14 @@ void rt_add1col_rgba (int hx, int sx, int yl, int yh)
|
|||
// Adds all four spans to the screen starting at sx without clamping.
|
||||
void rt_add4cols_rgba (int sx, int yl, int yh)
|
||||
{
|
||||
#ifdef NO_SSE
|
||||
DrawerCommandQueue::QueueCommand<RtAdd4colsRGBACommand>(sx, yl, yh);
|
||||
#else
|
||||
if (!r_linearlight)
|
||||
DrawerCommandQueue::QueueCommand<RtAdd4colsRGBA_SSE_Command>(sx, yl, yh);
|
||||
else
|
||||
DrawerCommandQueue::QueueCommand<RtAdd4colsRGBA_AVX_Command>(sx, yl, yh);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Translates and adds one span at hx to the screen at sx without clamping.
|
||||
|
@ -1719,7 +1384,14 @@ void rt_shaded1col_rgba (int hx, int sx, int yl, int yh)
|
|||
// Shades all four spans to the screen starting at sx.
|
||||
void rt_shaded4cols_rgba (int sx, int yl, int yh)
|
||||
{
|
||||
#ifdef NO_SSE
|
||||
DrawerCommandQueue::QueueCommand<RtShaded4colsRGBACommand>(sx, yl, yh);
|
||||
#else
|
||||
if (!r_linearlight)
|
||||
DrawerCommandQueue::QueueCommand<RtShaded4colsRGBA_SSE_Command>(sx, yl, yh);
|
||||
else
|
||||
DrawerCommandQueue::QueueCommand<RtShaded4colsRGBA_AVX_Command>(sx, yl, yh);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Adds one span at hx to the screen at sx with clamping.
|
||||
|
@ -1731,7 +1403,14 @@ void rt_addclamp1col_rgba (int hx, int sx, int yl, int yh)
|
|||
// Adds all four spans to the screen starting at sx with clamping.
|
||||
void rt_addclamp4cols_rgba (int sx, int yl, int yh)
|
||||
{
|
||||
#ifdef NO_SSE
|
||||
DrawerCommandQueue::QueueCommand<RtAddClamp4colsRGBACommand>(sx, yl, yh);
|
||||
#else
|
||||
if (!r_linearlight)
|
||||
DrawerCommandQueue::QueueCommand<RtAddClamp4colsRGBA_SSE_Command>(sx, yl, yh);
|
||||
else
|
||||
DrawerCommandQueue::QueueCommand<RtAddClamp4colsRGBA_AVX_Command>(sx, yl, yh);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Translates and adds one span at hx to the screen at sx with clamping.
|
||||
|
|
495
src/r_drawt_rgba_sse.h
Normal file
495
src/r_drawt_rgba_sse.h
Normal file
|
@ -0,0 +1,495 @@
|
|||
//
|
||||
// SSE/AVX intrinsics based drawers for the r_drawt family of drawers.
|
||||
//
|
||||
// Note: This header file is intentionally not guarded by a __R_DRAWT_RGBA_SSE__ define.
|
||||
// It is because the code is nearly identical for SSE vs AVX. The file is included
|
||||
// multiple times by r_drawt_rgba.cpp with different defines that changes the class
|
||||
// names outputted and the type of intrinsics used.
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning(disable: 4752) // warning C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX
|
||||
#endif
|
||||
|
||||
class VecCommand(RtMap4colsRGBA) : public DrawerCommand
|
||||
{
|
||||
int sx;
|
||||
int yl;
|
||||
int yh;
|
||||
fixed_t _light;
|
||||
ShadeConstants _shade_constants;
|
||||
BYTE * RESTRICT _destorg;
|
||||
int _pitch;
|
||||
BYTE * RESTRICT _colormap;
|
||||
|
||||
public:
|
||||
VecCommand(RtMap4colsRGBA)(int sx, int yl, int yh)
|
||||
{
|
||||
this->sx = sx;
|
||||
this->yl = yl;
|
||||
this->yh = yh;
|
||||
|
||||
_light = dc_light;
|
||||
_shade_constants = dc_shade_constants;
|
||||
_destorg = dc_destorg;
|
||||
_pitch = dc_pitch;
|
||||
_colormap = dc_colormap;
|
||||
}
|
||||
|
||||
void Execute(DrawerThread *thread) override
|
||||
{
|
||||
uint32_t *source;
|
||||
uint32_t *dest;
|
||||
int count;
|
||||
int pitch;
|
||||
int sincr;
|
||||
|
||||
count = thread->count_for_thread(yl, yh - yl + 1);
|
||||
if (count <= 0)
|
||||
return;
|
||||
|
||||
ShadeConstants shade_constants = _shade_constants;
|
||||
uint32_t light = calc_light_multiplier(_light);
|
||||
uint32_t *palette = (uint32_t*)GPalette.BaseColors;
|
||||
|
||||
dest = thread->dest_for_thread(yl, _pitch, ylookup[yl] + sx + (uint32_t*)_destorg);
|
||||
source = &thread->dc_temp_rgba[yl * 4] + thread->skipped_by_thread(yl) * 4;
|
||||
pitch = _pitch * thread->num_cores;
|
||||
sincr = thread->num_cores * 4;
|
||||
|
||||
BYTE *colormap = _colormap;
|
||||
|
||||
if (shade_constants.simple_shade)
|
||||
{
|
||||
VEC_SHADE_SIMPLE_INIT(light);
|
||||
|
||||
if (count & 1) {
|
||||
uint32_t p0 = colormap[source[0]];
|
||||
uint32_t p1 = colormap[source[1]];
|
||||
uint32_t p2 = colormap[source[2]];
|
||||
uint32_t p3 = colormap[source[3]];
|
||||
|
||||
// shade_pal_index:
|
||||
__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
|
||||
VEC_SHADE_SIMPLE(fg);
|
||||
_mm_storeu_si128((__m128i*)dest, fg);
|
||||
|
||||
source += sincr;
|
||||
dest += pitch;
|
||||
}
|
||||
if (!(count >>= 1))
|
||||
return;
|
||||
|
||||
do {
|
||||
// shade_pal_index 0-3
|
||||
{
|
||||
uint32_t p0 = colormap[source[0]];
|
||||
uint32_t p1 = colormap[source[1]];
|
||||
uint32_t p2 = colormap[source[2]];
|
||||
uint32_t p3 = colormap[source[3]];
|
||||
|
||||
__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
|
||||
VEC_SHADE_SIMPLE(fg);
|
||||
_mm_storeu_si128((__m128i*)dest, fg);
|
||||
}
|
||||
|
||||
// shade_pal_index 4-7 (pitch)
|
||||
{
|
||||
uint32_t p0 = colormap[source[sincr]];
|
||||
uint32_t p1 = colormap[source[sincr + 1]];
|
||||
uint32_t p2 = colormap[source[sincr + 2]];
|
||||
uint32_t p3 = colormap[source[sincr + 3]];
|
||||
|
||||
__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
|
||||
VEC_SHADE_SIMPLE(fg);
|
||||
_mm_storeu_si128((__m128i*)(dest + pitch), fg);
|
||||
}
|
||||
|
||||
source += sincr * 2;
|
||||
dest += pitch * 2;
|
||||
} while (--count);
|
||||
}
|
||||
else
|
||||
{
|
||||
VEC_SHADE_INIT(light, shade_constants);
|
||||
|
||||
if (count & 1) {
|
||||
uint32_t p0 = colormap[source[0]];
|
||||
uint32_t p1 = colormap[source[1]];
|
||||
uint32_t p2 = colormap[source[2]];
|
||||
uint32_t p3 = colormap[source[3]];
|
||||
|
||||
// shade_pal_index:
|
||||
__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
|
||||
VEC_SHADE(fg, shade_constants);
|
||||
_mm_storeu_si128((__m128i*)dest, fg);
|
||||
|
||||
source += sincr;
|
||||
dest += pitch;
|
||||
}
|
||||
if (!(count >>= 1))
|
||||
return;
|
||||
|
||||
do {
|
||||
// shade_pal_index 0-3
|
||||
{
|
||||
uint32_t p0 = colormap[source[0]];
|
||||
uint32_t p1 = colormap[source[1]];
|
||||
uint32_t p2 = colormap[source[2]];
|
||||
uint32_t p3 = colormap[source[3]];
|
||||
|
||||
__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
|
||||
VEC_SHADE(fg, shade_constants);
|
||||
_mm_storeu_si128((__m128i*)dest, fg);
|
||||
}
|
||||
|
||||
// shade_pal_index 4-7 (pitch)
|
||||
{
|
||||
uint32_t p0 = colormap[source[sincr]];
|
||||
uint32_t p1 = colormap[source[sincr + 1]];
|
||||
uint32_t p2 = colormap[source[sincr + 2]];
|
||||
uint32_t p3 = colormap[source[sincr + 3]];
|
||||
|
||||
__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
|
||||
VEC_SHADE(fg, shade_constants);
|
||||
_mm_storeu_si128((__m128i*)(dest + pitch), fg);
|
||||
}
|
||||
|
||||
source += sincr * 2;
|
||||
dest += pitch * 2;
|
||||
} while (--count);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
class VecCommand(RtAdd4colsRGBA) : public DrawerCommand
|
||||
{
|
||||
int sx;
|
||||
int yl;
|
||||
int yh;
|
||||
BYTE * RESTRICT _destorg;
|
||||
int _pitch;
|
||||
fixed_t _light;
|
||||
ShadeConstants _shade_constants;
|
||||
BYTE * RESTRICT _colormap;
|
||||
fixed_t _srcalpha;
|
||||
fixed_t _destalpha;
|
||||
|
||||
public:
|
||||
VecCommand(RtAdd4colsRGBA)(int sx, int yl, int yh)
|
||||
{
|
||||
this->sx = sx;
|
||||
this->yl = yl;
|
||||
this->yh = yh;
|
||||
|
||||
_destorg = dc_destorg;
|
||||
_pitch = dc_pitch;
|
||||
_light = dc_light;
|
||||
_shade_constants = dc_shade_constants;
|
||||
_colormap = dc_colormap;
|
||||
_srcalpha = dc_srcalpha;
|
||||
_destalpha = dc_destalpha;
|
||||
}
|
||||
|
||||
void Execute(DrawerThread *thread) override
|
||||
{
|
||||
uint32_t *source;
|
||||
uint32_t *dest;
|
||||
int count;
|
||||
int pitch;
|
||||
int sincr;
|
||||
|
||||
count = thread->count_for_thread(yl, yh - yl + 1);
|
||||
if (count <= 0)
|
||||
return;
|
||||
|
||||
dest = thread->dest_for_thread(yl, _pitch, ylookup[yl] + sx + (uint32_t*)_destorg);
|
||||
source = &thread->dc_temp_rgba[yl * 4] + thread->skipped_by_thread(yl) * 4;
|
||||
pitch = _pitch * thread->num_cores;
|
||||
sincr = 4 * thread->num_cores;
|
||||
|
||||
uint32_t light = calc_light_multiplier(_light);
|
||||
uint32_t *palette = (uint32_t*)GPalette.BaseColors;
|
||||
BYTE *colormap = _colormap;
|
||||
|
||||
uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8);
|
||||
uint32_t bg_alpha = _destalpha >> (FRACBITS - 8);
|
||||
|
||||
ShadeConstants shade_constants = _shade_constants;
|
||||
|
||||
if (shade_constants.simple_shade)
|
||||
{
|
||||
VEC_SHADE_SIMPLE_INIT(light);
|
||||
|
||||
__m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha);
|
||||
__m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha);
|
||||
|
||||
do {
|
||||
uint32_t p0 = colormap[source[0]];
|
||||
uint32_t p1 = colormap[source[1]];
|
||||
uint32_t p2 = colormap[source[2]];
|
||||
uint32_t p3 = colormap[source[3]];
|
||||
|
||||
// shade_pal_index:
|
||||
__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
|
||||
VEC_SHADE_SIMPLE(fg);
|
||||
|
||||
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
|
||||
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
|
||||
|
||||
// unpack bg:
|
||||
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
|
||||
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
|
||||
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
|
||||
|
||||
// (fg_red * fg_alpha + bg_red * bg_alpha) / 256:
|
||||
__m128i color_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, mfg_alpha), _mm_mullo_epi16(bg_hi, mbg_alpha)), 8);
|
||||
__m128i color_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, mfg_alpha), _mm_mullo_epi16(bg_lo, mbg_alpha)), 8);
|
||||
|
||||
__m128i color = _mm_packus_epi16(color_lo, color_hi);
|
||||
_mm_storeu_si128((__m128i*)dest, color);
|
||||
|
||||
source += sincr;
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
else
|
||||
{
|
||||
VEC_SHADE_INIT(light, shade_constants);
|
||||
|
||||
__m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha);
|
||||
__m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha);
|
||||
|
||||
do {
|
||||
uint32_t p0 = colormap[source[0]];
|
||||
uint32_t p1 = colormap[source[1]];
|
||||
uint32_t p2 = colormap[source[2]];
|
||||
uint32_t p3 = colormap[source[3]];
|
||||
|
||||
// shade_pal_index:
|
||||
__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
|
||||
VEC_SHADE(fg, shade_constants);
|
||||
|
||||
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
|
||||
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
|
||||
|
||||
// unpack bg:
|
||||
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
|
||||
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
|
||||
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
|
||||
|
||||
// (fg_red * fg_alpha + bg_red * bg_alpha) / 256:
|
||||
__m128i color_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, mfg_alpha), _mm_mullo_epi16(bg_hi, mbg_alpha)), 8);
|
||||
__m128i color_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, mfg_alpha), _mm_mullo_epi16(bg_lo, mbg_alpha)), 8);
|
||||
|
||||
__m128i color = _mm_packus_epi16(color_lo, color_hi);
|
||||
_mm_storeu_si128((__m128i*)dest, color);
|
||||
|
||||
source += sincr;
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
class VecCommand(RtShaded4colsRGBA) : public DrawerCommand
|
||||
{
|
||||
int sx;
|
||||
int yl;
|
||||
int yh;
|
||||
lighttable_t * RESTRICT _colormap;
|
||||
int _color;
|
||||
BYTE * RESTRICT _destorg;
|
||||
int _pitch;
|
||||
fixed_t _light;
|
||||
|
||||
public:
|
||||
VecCommand(RtShaded4colsRGBA)(int sx, int yl, int yh)
|
||||
{
|
||||
this->sx = sx;
|
||||
this->yl = yl;
|
||||
this->yh = yh;
|
||||
|
||||
_colormap = dc_colormap;
|
||||
_color = dc_color;
|
||||
_destorg = dc_destorg;
|
||||
_pitch = dc_pitch;
|
||||
_light = dc_light;
|
||||
}
|
||||
|
||||
void Execute(DrawerThread *thread) override
|
||||
{
|
||||
BYTE *colormap;
|
||||
uint32_t *source;
|
||||
uint32_t *dest;
|
||||
int count;
|
||||
int pitch;
|
||||
int sincr;
|
||||
|
||||
count = thread->count_for_thread(yl, yh - yl + 1);
|
||||
if (count <= 0)
|
||||
return;
|
||||
|
||||
colormap = _colormap;
|
||||
dest = thread->dest_for_thread(yl, _pitch, ylookup[yl] + sx + (uint32_t*)_destorg);
|
||||
source = &thread->dc_temp_rgba[yl * 4] + thread->skipped_by_thread(yl) * 4;
|
||||
pitch = _pitch * thread->num_cores;
|
||||
sincr = 4 * thread->num_cores;
|
||||
|
||||
__m128i fg = _mm_unpackhi_epi8(_mm_set1_epi32(shade_pal_index_simple(_color, calc_light_multiplier(_light))), _mm_setzero_si128());
|
||||
__m128i alpha_one = _mm_set1_epi16(64);
|
||||
|
||||
do {
|
||||
uint32_t p0 = colormap[source[0]];
|
||||
uint32_t p1 = colormap[source[1]];
|
||||
uint32_t p2 = colormap[source[2]];
|
||||
uint32_t p3 = colormap[source[3]];
|
||||
|
||||
__m128i alpha_hi = _mm_set_epi16(64, p3, p3, p3, 64, p2, p2, p2);
|
||||
__m128i alpha_lo = _mm_set_epi16(64, p1, p1, p1, 64, p0, p0, p0);
|
||||
__m128i inv_alpha_hi = _mm_subs_epu16(alpha_one, alpha_hi);
|
||||
__m128i inv_alpha_lo = _mm_subs_epu16(alpha_one, alpha_lo);
|
||||
|
||||
// unpack bg:
|
||||
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
|
||||
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
|
||||
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
|
||||
|
||||
// (fg_red * alpha + bg_red * inv_alpha) / 64:
|
||||
__m128i color_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg, alpha_hi), _mm_mullo_epi16(bg_hi, inv_alpha_hi)), 6);
|
||||
__m128i color_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg, alpha_lo), _mm_mullo_epi16(bg_lo, inv_alpha_lo)), 6);
|
||||
|
||||
__m128i color = _mm_packus_epi16(color_lo, color_hi);
|
||||
_mm_storeu_si128((__m128i*)dest, color);
|
||||
|
||||
source += sincr;
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
};
|
||||
|
||||
class VecCommand(RtAddClamp4colsRGBA) : public DrawerCommand
|
||||
{
|
||||
int sx;
|
||||
int yl;
|
||||
int yh;
|
||||
BYTE * RESTRICT _destorg;
|
||||
int _pitch;
|
||||
fixed_t _light;
|
||||
fixed_t _srcalpha;
|
||||
fixed_t _destalpha;
|
||||
ShadeConstants _shade_constants;
|
||||
|
||||
public:
|
||||
VecCommand(RtAddClamp4colsRGBA)(int sx, int yl, int yh)
|
||||
{
|
||||
this->sx = sx;
|
||||
this->yl = yl;
|
||||
this->yh = yh;
|
||||
|
||||
_destorg = dc_destorg;
|
||||
_pitch = dc_pitch;
|
||||
_light = dc_light;
|
||||
_srcalpha = dc_srcalpha;
|
||||
_destalpha = dc_destalpha;
|
||||
_shade_constants = dc_shade_constants;
|
||||
}
|
||||
|
||||
void Execute(DrawerThread *thread) override
|
||||
{
|
||||
uint32_t *source;
|
||||
uint32_t *dest;
|
||||
int count;
|
||||
int pitch;
|
||||
int sincr;
|
||||
|
||||
count = thread->count_for_thread(yl, yh - yl + 1);
|
||||
if (count <= 0)
|
||||
return;
|
||||
|
||||
dest = thread->dest_for_thread(yl, _pitch, ylookup[yl] + sx + (uint32_t*)_destorg);
|
||||
source = &thread->dc_temp_rgba[yl * 4] + thread->skipped_by_thread(yl) * 4;
|
||||
pitch = _pitch * thread->num_cores;
|
||||
sincr = 4 * thread->num_cores;
|
||||
|
||||
uint32_t light = calc_light_multiplier(_light);
|
||||
uint32_t *palette = (uint32_t*)GPalette.BaseColors;
|
||||
|
||||
uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8);
|
||||
uint32_t bg_alpha = _destalpha >> (FRACBITS - 8);
|
||||
|
||||
ShadeConstants shade_constants = _shade_constants;
|
||||
|
||||
if (shade_constants.simple_shade)
|
||||
{
|
||||
VEC_SHADE_SIMPLE_INIT(light);
|
||||
|
||||
__m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha);
|
||||
__m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha);
|
||||
|
||||
do {
|
||||
uint32_t p0 = source[0];
|
||||
uint32_t p1 = source[1];
|
||||
uint32_t p2 = source[2];
|
||||
uint32_t p3 = source[3];
|
||||
|
||||
// shade_pal_index:
|
||||
__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
|
||||
VEC_SHADE_SIMPLE(fg);
|
||||
|
||||
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
|
||||
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
|
||||
|
||||
// unpack bg:
|
||||
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
|
||||
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
|
||||
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
|
||||
|
||||
// (fg_red * fg_alpha + bg_red * bg_alpha) / 256:
|
||||
__m128i color_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, mfg_alpha), _mm_mullo_epi16(bg_hi, mbg_alpha)), 8);
|
||||
__m128i color_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, mfg_alpha), _mm_mullo_epi16(bg_lo, mbg_alpha)), 8);
|
||||
|
||||
__m128i color = _mm_packus_epi16(color_lo, color_hi);
|
||||
_mm_storeu_si128((__m128i*)dest, color);
|
||||
|
||||
source += sincr;
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
else
|
||||
{
|
||||
VEC_SHADE_INIT(light, shade_constants);
|
||||
|
||||
__m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha);
|
||||
__m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha);
|
||||
|
||||
do {
|
||||
uint32_t p0 = source[0];
|
||||
uint32_t p1 = source[1];
|
||||
uint32_t p2 = source[2];
|
||||
uint32_t p3 = source[3];
|
||||
|
||||
// shade_pal_index:
|
||||
__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
|
||||
VEC_SHADE(fg, shade_constants);
|
||||
|
||||
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
|
||||
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
|
||||
|
||||
// unpack bg:
|
||||
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
|
||||
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
|
||||
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
|
||||
|
||||
// (fg_red * fg_alpha + bg_red * bg_alpha) / 256:
|
||||
__m128i color_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, mfg_alpha), _mm_mullo_epi16(bg_hi, mbg_alpha)), 8);
|
||||
__m128i color_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, mfg_alpha), _mm_mullo_epi16(bg_lo, mbg_alpha)), 8);
|
||||
|
||||
__m128i color = _mm_packus_epi16(color_lo, color_hi);
|
||||
_mm_storeu_si128((__m128i*)dest, color);
|
||||
|
||||
source += sincr;
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
}
|
||||
};
|
Loading…
Reference in a new issue