mirror of
https://github.com/ZDoom/gzdoom.git
synced 2024-11-28 23:02:07 +00:00
Added more SSE drawers
This commit is contained in:
parent
3f905197d0
commit
38aba81dcc
5 changed files with 1072 additions and 148 deletions
|
@ -2411,10 +2411,7 @@ public:
|
|||
do
|
||||
{
|
||||
uint32_t pix = source[frac >> bits];
|
||||
if (pix != 0)
|
||||
{
|
||||
*dest = shade_bgra(pix, light, shade_constants);
|
||||
}
|
||||
*dest = alpha_blend(shade_bgra(pix, light, shade_constants), *dest);
|
||||
frac += fracstep;
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
|
@ -2480,10 +2477,10 @@ public:
|
|||
do
|
||||
{
|
||||
uint32_t pix;
|
||||
pix = bufplce[0][(place = local_vplce[0]) >> bits]; if (pix) dest[0] = shade_bgra(pix, light0, shade_constants); local_vplce[0] = place + local_vince[0];
|
||||
pix = bufplce[1][(place = local_vplce[1]) >> bits]; if (pix) dest[1] = shade_bgra(pix, light1, shade_constants); local_vplce[1] = place + local_vince[1];
|
||||
pix = bufplce[2][(place = local_vplce[2]) >> bits]; if (pix) dest[2] = shade_bgra(pix, light2, shade_constants); local_vplce[2] = place + local_vince[2];
|
||||
pix = bufplce[3][(place = local_vplce[3]) >> bits]; if (pix) dest[3] = shade_bgra(pix, light3, shade_constants); local_vplce[3] = place + local_vince[3];
|
||||
pix = bufplce[0][(place = local_vplce[0]) >> bits]; dest[0] = alpha_blend(shade_bgra(pix, light0, shade_constants), dest[0]); local_vplce[0] = place + local_vince[0];
|
||||
pix = bufplce[1][(place = local_vplce[1]) >> bits]; dest[1] = alpha_blend(shade_bgra(pix, light1, shade_constants), dest[1]); local_vplce[1] = place + local_vince[1];
|
||||
pix = bufplce[2][(place = local_vplce[2]) >> bits]; dest[2] = alpha_blend(shade_bgra(pix, light2, shade_constants), dest[2]); local_vplce[2] = place + local_vince[2];
|
||||
pix = bufplce[3][(place = local_vplce[3]) >> bits]; dest[3] = alpha_blend(shade_bgra(pix, light3, shade_constants), dest[3]); local_vplce[3] = place + local_vince[3];
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
|
@ -2535,14 +2532,16 @@ public:
|
|||
uint32_t light = calc_light_multiplier(_light);
|
||||
ShadeConstants shade_constants = _shade_constants;
|
||||
|
||||
uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8);
|
||||
uint32_t bg_alpha = _destalpha >> (FRACBITS - 8);
|
||||
uint32_t src_alpha = _srcalpha >> (FRACBITS - 8);
|
||||
uint32_t dest_alpha = _destalpha >> (FRACBITS - 8);
|
||||
|
||||
do
|
||||
{
|
||||
uint32_t pix = source[frac >> bits];
|
||||
if (pix != 0)
|
||||
{
|
||||
|
||||
uint32_t fg_alpha, bg_alpha;
|
||||
calc_blend_alpha(pix, src_alpha, dest_alpha, fg_alpha, bg_alpha);
|
||||
|
||||
uint32_t fg = shade_bgra(pix, light, shade_constants);
|
||||
uint32_t fg_red = (fg >> 16) & 0xff;
|
||||
uint32_t fg_green = (fg >> 8) & 0xff;
|
||||
|
@ -2557,7 +2556,7 @@ public:
|
|||
uint32_t blue = clamp<uint32_t>((fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 0, 255);
|
||||
|
||||
*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
|
||||
}
|
||||
|
||||
frac += fracstep;
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
|
@ -2615,8 +2614,8 @@ public:
|
|||
|
||||
ShadeConstants shade_constants = _shade_constants;
|
||||
|
||||
uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8);
|
||||
uint32_t bg_alpha = _destalpha >> (FRACBITS - 8);
|
||||
uint32_t src_alpha = _srcalpha >> (FRACBITS - 8);
|
||||
uint32_t dest_alpha = _destalpha >> (FRACBITS - 8);
|
||||
|
||||
DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] };
|
||||
DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] };
|
||||
|
@ -2632,8 +2631,10 @@ public:
|
|||
for (int i = 0; i < 4; ++i)
|
||||
{
|
||||
uint32_t pix = bufplce[i][local_vplce[i] >> bits];
|
||||
if (pix != 0)
|
||||
{
|
||||
|
||||
uint32_t fg_alpha, bg_alpha;
|
||||
calc_blend_alpha(pix, src_alpha, dest_alpha, fg_alpha, bg_alpha);
|
||||
|
||||
uint32_t fg = shade_bgra(pix, light[i], shade_constants);
|
||||
uint32_t fg_red = (fg >> 16) & 0xff;
|
||||
uint32_t fg_green = (fg >> 8) & 0xff;
|
||||
|
@ -2648,7 +2649,7 @@ public:
|
|||
uint32_t blue = clamp<uint32_t>((fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 0, 255);
|
||||
|
||||
dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue;
|
||||
}
|
||||
|
||||
local_vplce[i] += local_vince[i];
|
||||
}
|
||||
dest += pitch;
|
||||
|
@ -2702,14 +2703,16 @@ public:
|
|||
uint32_t light = calc_light_multiplier(_light);
|
||||
ShadeConstants shade_constants = _shade_constants;
|
||||
|
||||
uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8);
|
||||
uint32_t bg_alpha = _destalpha >> (FRACBITS - 8);
|
||||
uint32_t src_alpha = _srcalpha >> (FRACBITS - 8);
|
||||
uint32_t dest_alpha = _destalpha >> (FRACBITS - 8);
|
||||
|
||||
do
|
||||
{
|
||||
uint32_t pix = source[frac >> bits];
|
||||
if (pix != 0)
|
||||
{
|
||||
|
||||
uint32_t fg_alpha, bg_alpha;
|
||||
calc_blend_alpha(pix, src_alpha, dest_alpha, fg_alpha, bg_alpha);
|
||||
|
||||
uint32_t fg = shade_bgra(pix, light, shade_constants);
|
||||
uint32_t fg_red = (fg >> 16) & 0xff;
|
||||
uint32_t fg_green = (fg >> 8) & 0xff;
|
||||
|
@ -2724,7 +2727,7 @@ public:
|
|||
uint32_t blue = clamp<uint32_t>((fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 0, 255);
|
||||
|
||||
*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
|
||||
}
|
||||
|
||||
frac += fracstep;
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
|
@ -2782,8 +2785,8 @@ public:
|
|||
|
||||
ShadeConstants shade_constants = _shade_constants;
|
||||
|
||||
uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8);
|
||||
uint32_t bg_alpha = _destalpha >> (FRACBITS - 8);
|
||||
uint32_t src_alpha = _srcalpha >> (FRACBITS - 8);
|
||||
uint32_t dest_alpha = _destalpha >> (FRACBITS - 8);
|
||||
|
||||
DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] };
|
||||
DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] };
|
||||
|
@ -2799,8 +2802,10 @@ public:
|
|||
for (int i = 0; i < 4; ++i)
|
||||
{
|
||||
uint32_t pix = bufplce[i][local_vplce[i] >> bits];
|
||||
if (pix != 0)
|
||||
{
|
||||
|
||||
uint32_t fg_alpha, bg_alpha;
|
||||
calc_blend_alpha(pix, src_alpha, dest_alpha, fg_alpha, bg_alpha);
|
||||
|
||||
uint32_t fg = shade_bgra(pix, light[i], shade_constants);
|
||||
uint32_t fg_red = (fg >> 16) & 0xff;
|
||||
uint32_t fg_green = (fg >> 8) & 0xff;
|
||||
|
@ -2815,7 +2820,7 @@ public:
|
|||
uint32_t blue = clamp<uint32_t>((fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 0, 255);
|
||||
|
||||
dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue;
|
||||
}
|
||||
|
||||
local_vplce[i] += local_vince[i];
|
||||
}
|
||||
dest += pitch;
|
||||
|
@ -2869,14 +2874,16 @@ public:
|
|||
uint32_t light = calc_light_multiplier(_light);
|
||||
ShadeConstants shade_constants = _shade_constants;
|
||||
|
||||
uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8);
|
||||
uint32_t bg_alpha = _destalpha >> (FRACBITS - 8);
|
||||
uint32_t src_alpha = _srcalpha >> (FRACBITS - 8);
|
||||
uint32_t dest_alpha = _destalpha >> (FRACBITS - 8);
|
||||
|
||||
do
|
||||
{
|
||||
uint32_t pix = source[frac >> bits];
|
||||
if (pix != 0)
|
||||
{
|
||||
|
||||
uint32_t fg_alpha, bg_alpha;
|
||||
calc_blend_alpha(pix, src_alpha, dest_alpha, fg_alpha, bg_alpha);
|
||||
|
||||
uint32_t fg = shade_bgra(pix, light, shade_constants);
|
||||
uint32_t fg_red = (fg >> 16) & 0xff;
|
||||
uint32_t fg_green = (fg >> 8) & 0xff;
|
||||
|
@ -2891,7 +2898,7 @@ public:
|
|||
uint32_t blue = clamp<uint32_t>((0x10000 - fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 256, 256 + 255) - 256;
|
||||
|
||||
*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
|
||||
}
|
||||
|
||||
frac += fracstep;
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
|
@ -2949,8 +2956,8 @@ public:
|
|||
|
||||
ShadeConstants shade_constants = _shade_constants;
|
||||
|
||||
uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8);
|
||||
uint32_t bg_alpha = _destalpha >> (FRACBITS - 8);
|
||||
uint32_t src_alpha = _srcalpha >> (FRACBITS - 8);
|
||||
uint32_t dest_alpha = _destalpha >> (FRACBITS - 8);
|
||||
|
||||
DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] };
|
||||
DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] };
|
||||
|
@ -2966,8 +2973,10 @@ public:
|
|||
for (int i = 0; i < 4; ++i)
|
||||
{
|
||||
uint32_t pix = bufplce[i][local_vplce[i] >> bits];
|
||||
if (pix != 0)
|
||||
{
|
||||
|
||||
uint32_t fg_alpha, bg_alpha;
|
||||
calc_blend_alpha(pix, src_alpha, dest_alpha, fg_alpha, bg_alpha);
|
||||
|
||||
uint32_t fg = shade_bgra(pix, light[i], shade_constants);
|
||||
uint32_t fg_red = (fg >> 16) & 0xff;
|
||||
uint32_t fg_green = (fg >> 8) & 0xff;
|
||||
|
@ -2982,7 +2991,7 @@ public:
|
|||
uint32_t blue = clamp<uint32_t>((0x10000 - fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 256, 256 + 255) - 256;
|
||||
|
||||
dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue;
|
||||
}
|
||||
|
||||
local_vplce[i] += local_vince[i];
|
||||
}
|
||||
dest += pitch;
|
||||
|
@ -3036,14 +3045,16 @@ public:
|
|||
uint32_t light = calc_light_multiplier(_light);
|
||||
ShadeConstants shade_constants = _shade_constants;
|
||||
|
||||
uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8);
|
||||
uint32_t bg_alpha = _destalpha >> (FRACBITS - 8);
|
||||
uint32_t src_alpha = _srcalpha >> (FRACBITS - 8);
|
||||
uint32_t dest_alpha = _destalpha >> (FRACBITS - 8);
|
||||
|
||||
do
|
||||
{
|
||||
uint32_t pix = source[frac >> bits];
|
||||
if (pix != 0)
|
||||
{
|
||||
|
||||
uint32_t fg_alpha, bg_alpha;
|
||||
calc_blend_alpha(pix, src_alpha, dest_alpha, fg_alpha, bg_alpha);
|
||||
|
||||
uint32_t fg = shade_bgra(pix, light, shade_constants);
|
||||
uint32_t fg_red = (fg >> 16) & 0xff;
|
||||
uint32_t fg_green = (fg >> 8) & 0xff;
|
||||
|
@ -3058,7 +3069,7 @@ public:
|
|||
uint32_t blue = clamp<uint32_t>((0x10000 + fg_blue * fg_alpha - bg_blue * bg_alpha) / 256, 256, 256 + 255) - 256;
|
||||
|
||||
*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
|
||||
}
|
||||
|
||||
frac += fracstep;
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
|
@ -3116,8 +3127,8 @@ public:
|
|||
|
||||
ShadeConstants shade_constants = _shade_constants;
|
||||
|
||||
uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8);
|
||||
uint32_t bg_alpha = _destalpha >> (FRACBITS - 8);
|
||||
uint32_t src_alpha = _srcalpha >> (FRACBITS - 8);
|
||||
uint32_t dest_alpha = _destalpha >> (FRACBITS - 8);
|
||||
|
||||
DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] };
|
||||
DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] };
|
||||
|
@ -3133,8 +3144,10 @@ public:
|
|||
for (int i = 0; i < 4; ++i)
|
||||
{
|
||||
uint32_t pix = bufplce[i][local_vplce[i] >> bits];
|
||||
if (pix != 0)
|
||||
{
|
||||
|
||||
uint32_t fg_alpha, bg_alpha;
|
||||
calc_blend_alpha(pix, src_alpha, dest_alpha, fg_alpha, bg_alpha);
|
||||
|
||||
uint32_t fg = shade_bgra(pix, light[i], shade_constants);
|
||||
uint32_t fg_red = (fg >> 16) & 0xff;
|
||||
uint32_t fg_green = (fg >> 8) & 0xff;
|
||||
|
@ -3149,7 +3162,7 @@ public:
|
|||
uint32_t blue = clamp<uint32_t>((0x10000 + fg_blue * fg_alpha - bg_blue * bg_alpha) / 256, 256, 256 + 255) - 256;
|
||||
|
||||
dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue;
|
||||
}
|
||||
|
||||
local_vplce[i] += local_vince[i];
|
||||
}
|
||||
dest += pitch;
|
||||
|
@ -3733,7 +3746,14 @@ fixed_t tmvline1_add_rgba()
|
|||
|
||||
void tmvline4_add_rgba()
|
||||
{
|
||||
#ifdef NO_SSE
|
||||
DrawerCommandQueue::QueueCommand<Tmvline4AddRGBACommand>();
|
||||
#else
|
||||
if (!r_linearlight)
|
||||
DrawerCommandQueue::QueueCommand<Tmvline4AddRGBA_SSE_Command>();
|
||||
else
|
||||
DrawerCommandQueue::QueueCommand<Tmvline4AddRGBA_AVX_Command>();
|
||||
#endif
|
||||
for (int i = 0; i < 4; i++)
|
||||
vplce[i] += vince[i] * dc_count;
|
||||
}
|
||||
|
@ -3746,7 +3766,14 @@ fixed_t tmvline1_addclamp_rgba()
|
|||
|
||||
void tmvline4_addclamp_rgba()
|
||||
{
|
||||
#ifdef NO_SSE
|
||||
DrawerCommandQueue::QueueCommand<Tmvline4AddClampRGBACommand>();
|
||||
#else
|
||||
if (!r_linearlight)
|
||||
DrawerCommandQueue::QueueCommand<Tmvline4AddClampRGBA_SSE_Command>();
|
||||
else
|
||||
DrawerCommandQueue::QueueCommand<Tmvline4AddClampRGBA_AVX_Command>();
|
||||
#endif
|
||||
for (int i = 0; i < 4; i++)
|
||||
vplce[i] += vince[i] * dc_count;
|
||||
}
|
||||
|
@ -3759,7 +3786,14 @@ fixed_t tmvline1_subclamp_rgba()
|
|||
|
||||
void tmvline4_subclamp_rgba()
|
||||
{
|
||||
#ifdef NO_SSE
|
||||
DrawerCommandQueue::QueueCommand<Tmvline4SubClampRGBACommand>();
|
||||
#else
|
||||
if (!r_linearlight)
|
||||
DrawerCommandQueue::QueueCommand<Tmvline4SubClampRGBA_SSE_Command>();
|
||||
else
|
||||
DrawerCommandQueue::QueueCommand<Tmvline4SubClampRGBA_AVX_Command>();
|
||||
#endif
|
||||
for (int i = 0; i < 4; i++)
|
||||
vplce[i] += vince[i] * dc_count;
|
||||
}
|
||||
|
@ -3772,7 +3806,14 @@ fixed_t tmvline1_revsubclamp_rgba()
|
|||
|
||||
void tmvline4_revsubclamp_rgba()
|
||||
{
|
||||
#ifdef NO_SSE
|
||||
DrawerCommandQueue::QueueCommand<Tmvline4RevSubClampRGBACommand>();
|
||||
#else
|
||||
if (!r_linearlight)
|
||||
DrawerCommandQueue::QueueCommand<Tmvline4RevSubClampRGBA_SSE_Command>();
|
||||
else
|
||||
DrawerCommandQueue::QueueCommand<Tmvline4RevSubClampRGBA_AVX_Command>();
|
||||
#endif
|
||||
for (int i = 0; i < 4; i++)
|
||||
vplce[i] += vince[i] * dc_count;
|
||||
}
|
||||
|
|
|
@ -417,9 +417,9 @@ FORCEINLINE uint32_t alpha_blend(uint32_t fg, uint32_t bg)
|
|||
uint32_t bg_green = (bg >> 8) & 0xff;
|
||||
uint32_t bg_blue = bg & 0xff;
|
||||
|
||||
uint32_t red = ((fg_red * alpha) + (bg_red * inv_alpha)) / 256;
|
||||
uint32_t green = ((fg_green * alpha) + (bg_green * inv_alpha)) / 256;
|
||||
uint32_t blue = ((fg_blue * alpha) + (bg_blue * inv_alpha)) / 256;
|
||||
uint32_t red = clamp<uint32_t>(fg_red + (bg_red * inv_alpha) / 256, 0, 255);
|
||||
uint32_t green = clamp<uint32_t>(fg_green + (bg_green * inv_alpha) / 256, 0, 255);
|
||||
uint32_t blue = clamp<uint32_t>(fg_blue + (bg_blue * inv_alpha) / 256, 0, 255);
|
||||
|
||||
return 0xff000000 | (red << 16) | (green << 8) | blue;
|
||||
}
|
||||
|
@ -543,7 +543,7 @@ FORCEINLINE uint32_t alpha_blend(uint32_t fg, uint32_t bg)
|
|||
|
||||
/*
|
||||
// Complex shade 8 pixels
|
||||
#define AVX2_SHADE(fg, shade_constants) { \
|
||||
#define AVX_SHADE(fg, shade_constants) { \
|
||||
__m256i fg_hi = _mm256_unpackhi_epi8(fg, _mm256_setzero_si256()); \
|
||||
__m256i fg_lo = _mm256_unpacklo_epi8(fg, _mm256_setzero_si256()); \
|
||||
\
|
||||
|
@ -566,8 +566,58 @@ FORCEINLINE uint32_t alpha_blend(uint32_t fg, uint32_t bg)
|
|||
}
|
||||
*/
|
||||
|
||||
// Normal premultiplied alpha blend using the alpha from fg
|
||||
#define VEC_ALPHA_BLEND(fg,bg) { \
|
||||
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); \
|
||||
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); \
|
||||
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); \
|
||||
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); \
|
||||
__m128i m255 = _mm_set1_epi16(255); \
|
||||
__m128i inv_alpha_hi = _mm_sub_epi16(m255, _mm_shufflehi_epi16(_mm_shufflelo_epi16(fg_hi, _MM_SHUFFLE(3,3,3,3)), _MM_SHUFFLE(3,3,3,3))); \
|
||||
__m128i inv_alpha_lo = _mm_sub_epi16(m255, _mm_shufflehi_epi16(_mm_shufflelo_epi16(fg_lo, _MM_SHUFFLE(3,3,3,3)), _MM_SHUFFLE(3,3,3,3))); \
|
||||
inv_alpha_hi = _mm_add_epi16(inv_alpha_hi, _mm_srli_epi16(inv_alpha_hi, 7)); \
|
||||
inv_alpha_lo = _mm_add_epi16(inv_alpha_lo, _mm_srli_epi16(inv_alpha_lo, 7)); \
|
||||
bg_hi = _mm_mullo_epi16(bg_hi, inv_alpha_hi); \
|
||||
bg_hi = _mm_srli_epi16(bg_hi, 8); \
|
||||
bg_lo = _mm_mullo_epi16(bg_lo, inv_alpha_lo); \
|
||||
bg_lo = _mm_srli_epi16(bg_lo, 8); \
|
||||
bg = _mm_packus_epi16(bg_lo, bg_hi); \
|
||||
fg = _mm_adds_epu8(fg, bg); \
|
||||
}
|
||||
|
||||
/*
|
||||
FORCEINLINE void calc_blend_alpha(uint32_t fg, uint32_t src_alpha, uint32_t dest_alpha, uint32_t &fg_alpha, uint32_t &bg_alpha)
|
||||
{
|
||||
fg_alpha = src_alpha;
|
||||
bg_alpha = dest_alpha;
|
||||
}
|
||||
|
||||
#define VEC_CALC_BLEND_ALPHA(fg, msrc_alpha, mdest_alpha) \
|
||||
__m128i fg_alpha_hi = msrc_alpha; \
|
||||
__m128i fg_alpha_lo = msrc_alpha; \
|
||||
__m128i bg_alpha_hi = mdest_alpha; \
|
||||
__m128i bg_alpha_lo = mdest_alpha;
|
||||
*/
|
||||
|
||||
// Calculates the final alpha values to be used when combined with the source texture alpha channel
|
||||
FORCEINLINE void calc_blend_alpha(uint32_t fg, uint32_t src_alpha, uint32_t dest_alpha, uint32_t &fg_alpha, uint32_t &bg_alpha)
|
||||
{
|
||||
fg_alpha = (fg >> 24) & 0xff;
|
||||
fg_alpha += fg_alpha >> 7;
|
||||
bg_alpha = (dest_alpha * (256 - fg_alpha)) >> 8;
|
||||
fg_alpha = (src_alpha * fg_alpha) >> 8;
|
||||
}
|
||||
|
||||
// Calculates the final alpha values to be used when combined with the source texture alpha channel
|
||||
#define VEC_CALC_BLEND_ALPHA(fg, msrc_alpha, mdest_alpha) \
|
||||
__m128i fg_alpha_hi = _mm_shufflehi_epi16(_mm_shufflelo_epi16(_mm_unpackhi_epi8(fg, _mm_setzero_si128()), _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)); \
|
||||
__m128i fg_alpha_lo = _mm_shufflehi_epi16(_mm_shufflelo_epi16(_mm_unpacklo_epi8(fg, _mm_setzero_si128()), _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)); \
|
||||
fg_alpha_hi = _mm_add_epi16(fg_alpha_hi, _mm_srli_epi16(fg_alpha_hi, 7)); \
|
||||
fg_alpha_lo = _mm_add_epi16(fg_alpha_lo, _mm_srli_epi16(fg_alpha_lo, 7)); \
|
||||
__m128i bg_alpha_hi = _mm_srli_epi16(_mm_mullo_epi16(_mm_sub_epi16(_mm_set1_epi16(256), fg_alpha_hi), mdest_alpha), 8); \
|
||||
__m128i bg_alpha_lo = _mm_srli_epi16(_mm_mullo_epi16(_mm_sub_epi16(_mm_set1_epi16(256), fg_alpha_lo), mdest_alpha), 8); \
|
||||
fg_alpha_hi = _mm_srli_epi16(_mm_mullo_epi16(fg_alpha_hi, msrc_alpha), 8); \
|
||||
fg_alpha_lo = _mm_srli_epi16(_mm_mullo_epi16(fg_alpha_lo, msrc_alpha), 8);
|
||||
|
||||
// Calculate constants for a simple shade
|
||||
#define SSE_SHADE_SIMPLE_INIT(light) \
|
||||
|
|
|
@ -444,17 +444,16 @@ public:
|
|||
uint32_t pix2 = bufplce[2][place2 >> bits];
|
||||
uint32_t pix3 = bufplce[3][place3 >> bits];
|
||||
|
||||
// movemask = !(pix == 0)
|
||||
__m128i movemask = _mm_xor_si128(_mm_cmpeq_epi32(_mm_set_epi32(pix3, pix2, pix1, pix0), _mm_setzero_si128()), _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));
|
||||
|
||||
local_vplce[0] = place0 + local_vince[0];
|
||||
local_vplce[1] = place1 + local_vince[1];
|
||||
local_vplce[2] = place2 + local_vince[2];
|
||||
local_vplce[3] = place3 + local_vince[3];
|
||||
|
||||
__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
|
||||
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
|
||||
VEC_SHADE_SIMPLE(fg);
|
||||
_mm_maskmoveu_si128(fg, movemask, (char*)dest);
|
||||
VEC_ALPHA_BLEND(fg, bg);
|
||||
_mm_storeu_si128((__m128i*)dest, fg);
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
|
@ -473,17 +472,585 @@ public:
|
|||
uint32_t pix2 = bufplce[2][place2 >> bits];
|
||||
uint32_t pix3 = bufplce[3][place3 >> bits];
|
||||
|
||||
// movemask = !(pix == 0)
|
||||
__m128i movemask = _mm_xor_si128(_mm_cmpeq_epi32(_mm_set_epi32(pix3, pix2, pix1, pix0), _mm_setzero_si128()), _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));
|
||||
|
||||
local_vplce[0] = place0 + local_vince[0];
|
||||
local_vplce[1] = place1 + local_vince[1];
|
||||
local_vplce[2] = place2 + local_vince[2];
|
||||
local_vplce[3] = place3 + local_vince[3];
|
||||
|
||||
__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
|
||||
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
|
||||
VEC_SHADE(fg, shade_constants);
|
||||
_mm_maskmoveu_si128(fg, movemask, (char*)dest);
|
||||
VEC_ALPHA_BLEND(fg, bg);
|
||||
_mm_storeu_si128((__m128i*)dest, fg);
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
class VecCommand(Tmvline4AddRGBA) : public DrawerCommand
|
||||
{
|
||||
BYTE * RESTRICT _dest;
|
||||
int _count;
|
||||
int _pitch;
|
||||
ShadeConstants _shade_constants;
|
||||
fixed_t _srcalpha;
|
||||
fixed_t _destalpha;
|
||||
int tmvlinebits;
|
||||
fixed_t palookuplight[4];
|
||||
DWORD vplce[4];
|
||||
DWORD vince[4];
|
||||
const uint32 * RESTRICT bufplce[4];
|
||||
|
||||
public:
|
||||
VecCommand(Tmvline4AddRGBA)()
|
||||
{
|
||||
_dest = dc_dest;
|
||||
_count = dc_count;
|
||||
_pitch = dc_pitch;
|
||||
_shade_constants = dc_shade_constants;
|
||||
_srcalpha = dc_srcalpha;
|
||||
_destalpha = dc_destalpha;
|
||||
tmvlinebits = ::tmvlinebits;
|
||||
for (int i = 0; i < 4; i++)
|
||||
{
|
||||
palookuplight[i] = ::palookuplight[i];
|
||||
vplce[i] = ::vplce[i];
|
||||
vince[i] = ::vince[i];
|
||||
bufplce[i] = (const uint32 *)::bufplce[i];
|
||||
}
|
||||
}
|
||||
|
||||
void Execute(DrawerThread *thread) override
|
||||
{
|
||||
int count = thread->count_for_thread(_dest_y, _count);
|
||||
if (count <= 0)
|
||||
return;
|
||||
|
||||
uint32_t *dest = thread->dest_for_thread(_dest_y, _pitch, (uint32_t*)_dest);
|
||||
int pitch = _pitch * thread->num_cores;
|
||||
int bits = tmvlinebits;
|
||||
|
||||
uint32_t light[4];
|
||||
light[0] = calc_light_multiplier(palookuplight[0]);
|
||||
light[1] = calc_light_multiplier(palookuplight[1]);
|
||||
light[2] = calc_light_multiplier(palookuplight[2]);
|
||||
light[3] = calc_light_multiplier(palookuplight[3]);
|
||||
|
||||
ShadeConstants shade_constants = _shade_constants;
|
||||
|
||||
uint32_t src_alpha = _srcalpha >> (FRACBITS - 8);
|
||||
uint32_t dest_alpha = _destalpha >> (FRACBITS - 8);
|
||||
|
||||
DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] };
|
||||
DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] };
|
||||
int skipped = thread->skipped_by_thread(_dest_y);
|
||||
for (int i = 0; i < 4; i++)
|
||||
{
|
||||
local_vplce[i] += local_vince[i] * skipped;
|
||||
local_vince[i] *= thread->num_cores;
|
||||
}
|
||||
|
||||
if (shade_constants.simple_shade)
|
||||
{
|
||||
VEC_SHADE_SIMPLE_INIT4(light[3], light[2], light[1], light[0]);
|
||||
|
||||
__m128i msrc_alpha = _mm_set1_epi16(src_alpha);
|
||||
__m128i mdest_alpha = _mm_set1_epi16(dest_alpha);
|
||||
|
||||
do
|
||||
{
|
||||
uint32_t pix0 = bufplce[0][local_vplce[0] >> bits];
|
||||
uint32_t pix1 = bufplce[1][local_vplce[1] >> bits];
|
||||
uint32_t pix2 = bufplce[2][local_vplce[2] >> bits];
|
||||
uint32_t pix3 = bufplce[3][local_vplce[3] >> bits];
|
||||
|
||||
local_vplce[0] = local_vplce[0] + local_vince[0];
|
||||
local_vplce[1] = local_vplce[1] + local_vince[1];
|
||||
local_vplce[2] = local_vplce[2] + local_vince[2];
|
||||
local_vplce[3] = local_vplce[3] + local_vince[3];
|
||||
|
||||
__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
|
||||
|
||||
VEC_CALC_BLEND_ALPHA(fg, msrc_alpha, mdest_alpha);
|
||||
VEC_SHADE_SIMPLE(fg);
|
||||
|
||||
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
|
||||
|
||||
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
|
||||
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
|
||||
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
|
||||
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
|
||||
|
||||
__m128i out_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8);
|
||||
__m128i out_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8);
|
||||
__m128i out = _mm_packus_epi16(out_lo, out_hi);
|
||||
|
||||
_mm_storeu_si128((__m128i*)dest, out);
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
else
|
||||
{
|
||||
VEC_SHADE_INIT4(light[3], light[2], light[1], light[0], shade_constants);
|
||||
|
||||
__m128i msrc_alpha = _mm_set1_epi16(src_alpha);
|
||||
__m128i mdest_alpha = _mm_set1_epi16(dest_alpha);
|
||||
|
||||
do
|
||||
{
|
||||
uint32_t pix0 = bufplce[0][local_vplce[0] >> bits];
|
||||
uint32_t pix1 = bufplce[1][local_vplce[1] >> bits];
|
||||
uint32_t pix2 = bufplce[2][local_vplce[2] >> bits];
|
||||
uint32_t pix3 = bufplce[3][local_vplce[3] >> bits];
|
||||
|
||||
local_vplce[0] = local_vplce[0] + local_vince[0];
|
||||
local_vplce[1] = local_vplce[1] + local_vince[1];
|
||||
local_vplce[2] = local_vplce[2] + local_vince[2];
|
||||
local_vplce[3] = local_vplce[3] + local_vince[3];
|
||||
|
||||
__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
|
||||
VEC_CALC_BLEND_ALPHA(fg, msrc_alpha, mdest_alpha);
|
||||
VEC_SHADE(fg, shade_constants);
|
||||
|
||||
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
|
||||
|
||||
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
|
||||
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
|
||||
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
|
||||
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
|
||||
|
||||
__m128i out_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8);
|
||||
__m128i out_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8);
|
||||
__m128i out = _mm_packus_epi16(out_lo, out_hi);
|
||||
|
||||
_mm_storeu_si128((__m128i*)dest, out);
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
class VecCommand(Tmvline4AddClampRGBA) : public DrawerCommand
|
||||
{
|
||||
BYTE * RESTRICT _dest;
|
||||
int _count;
|
||||
int _pitch;
|
||||
ShadeConstants _shade_constants;
|
||||
fixed_t _srcalpha;
|
||||
fixed_t _destalpha;
|
||||
int tmvlinebits;
|
||||
fixed_t palookuplight[4];
|
||||
DWORD vplce[4];
|
||||
DWORD vince[4];
|
||||
const uint32 *RESTRICT bufplce[4];
|
||||
|
||||
public:
|
||||
VecCommand(Tmvline4AddClampRGBA)()
|
||||
{
|
||||
_dest = dc_dest;
|
||||
_count = dc_count;
|
||||
_pitch = dc_pitch;
|
||||
_shade_constants = dc_shade_constants;
|
||||
_srcalpha = dc_srcalpha;
|
||||
_destalpha = dc_destalpha;
|
||||
tmvlinebits = ::tmvlinebits;
|
||||
for (int i = 0; i < 4; i++)
|
||||
{
|
||||
palookuplight[i] = ::palookuplight[i];
|
||||
vplce[i] = ::vplce[i];
|
||||
vince[i] = ::vince[i];
|
||||
bufplce[i] = (const uint32 *)::bufplce[i];
|
||||
}
|
||||
}
|
||||
|
||||
void Execute(DrawerThread *thread) override
|
||||
{
|
||||
int count = thread->count_for_thread(_dest_y, _count);
|
||||
if (count <= 0)
|
||||
return;
|
||||
|
||||
uint32_t *dest = thread->dest_for_thread(_dest_y, _pitch, (uint32_t*)_dest);
|
||||
int pitch = _pitch * thread->num_cores;
|
||||
int bits = tmvlinebits;
|
||||
|
||||
uint32_t light[4];
|
||||
light[0] = calc_light_multiplier(palookuplight[0]);
|
||||
light[1] = calc_light_multiplier(palookuplight[1]);
|
||||
light[2] = calc_light_multiplier(palookuplight[2]);
|
||||
light[3] = calc_light_multiplier(palookuplight[3]);
|
||||
|
||||
ShadeConstants shade_constants = _shade_constants;
|
||||
|
||||
uint32_t src_alpha = _srcalpha >> (FRACBITS - 8);
|
||||
uint32_t dest_alpha = _destalpha >> (FRACBITS - 8);
|
||||
|
||||
DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] };
|
||||
DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] };
|
||||
int skipped = thread->skipped_by_thread(_dest_y);
|
||||
for (int i = 0; i < 4; i++)
|
||||
{
|
||||
local_vplce[i] += local_vince[i] * skipped;
|
||||
local_vince[i] *= thread->num_cores;
|
||||
}
|
||||
|
||||
if (shade_constants.simple_shade)
|
||||
{
|
||||
VEC_SHADE_SIMPLE_INIT4(light[3], light[2], light[1], light[0]);
|
||||
|
||||
__m128i msrc_alpha = _mm_set1_epi16(src_alpha);
|
||||
__m128i mdest_alpha = _mm_set1_epi16(dest_alpha);
|
||||
|
||||
do
|
||||
{
|
||||
uint32_t pix0 = bufplce[0][local_vplce[0] >> bits];
|
||||
uint32_t pix1 = bufplce[1][local_vplce[1] >> bits];
|
||||
uint32_t pix2 = bufplce[2][local_vplce[2] >> bits];
|
||||
uint32_t pix3 = bufplce[3][local_vplce[3] >> bits];
|
||||
|
||||
local_vplce[0] = local_vplce[0] + local_vince[0];
|
||||
local_vplce[1] = local_vplce[1] + local_vince[1];
|
||||
local_vplce[2] = local_vplce[2] + local_vince[2];
|
||||
local_vplce[3] = local_vplce[3] + local_vince[3];
|
||||
|
||||
__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
|
||||
VEC_CALC_BLEND_ALPHA(fg, msrc_alpha, mdest_alpha);
|
||||
VEC_SHADE_SIMPLE(fg);
|
||||
|
||||
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
|
||||
|
||||
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
|
||||
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
|
||||
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
|
||||
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
|
||||
|
||||
__m128i out_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8);
|
||||
__m128i out_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8);
|
||||
__m128i out = _mm_packus_epi16(out_lo, out_hi);
|
||||
|
||||
_mm_storeu_si128((__m128i*)dest, out);
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
else
|
||||
{
|
||||
VEC_SHADE_INIT4(light[3], light[2], light[1], light[0], shade_constants);
|
||||
|
||||
__m128i msrc_alpha = _mm_set1_epi16(src_alpha);
|
||||
__m128i mdest_alpha = _mm_set1_epi16(dest_alpha);
|
||||
|
||||
do
|
||||
{
|
||||
uint32_t pix0 = bufplce[0][local_vplce[0] >> bits];
|
||||
uint32_t pix1 = bufplce[1][local_vplce[1] >> bits];
|
||||
uint32_t pix2 = bufplce[2][local_vplce[2] >> bits];
|
||||
uint32_t pix3 = bufplce[3][local_vplce[3] >> bits];
|
||||
|
||||
local_vplce[0] = local_vplce[0] + local_vince[0];
|
||||
local_vplce[1] = local_vplce[1] + local_vince[1];
|
||||
local_vplce[2] = local_vplce[2] + local_vince[2];
|
||||
local_vplce[3] = local_vplce[3] + local_vince[3];
|
||||
|
||||
__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
|
||||
VEC_CALC_BLEND_ALPHA(fg, msrc_alpha, mdest_alpha);
|
||||
VEC_SHADE(fg, shade_constants);
|
||||
|
||||
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
|
||||
|
||||
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
|
||||
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
|
||||
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
|
||||
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
|
||||
|
||||
__m128i out_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8);
|
||||
__m128i out_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8);
|
||||
__m128i out = _mm_packus_epi16(out_lo, out_hi);
|
||||
|
||||
_mm_storeu_si128((__m128i*)dest, out);
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
class VecCommand(Tmvline4SubClampRGBA) : public DrawerCommand
|
||||
{
|
||||
BYTE * RESTRICT _dest;
|
||||
int _count;
|
||||
int _pitch;
|
||||
ShadeConstants _shade_constants;
|
||||
fixed_t _srcalpha;
|
||||
fixed_t _destalpha;
|
||||
int tmvlinebits;
|
||||
fixed_t palookuplight[4];
|
||||
DWORD vplce[4];
|
||||
DWORD vince[4];
|
||||
const uint32 *RESTRICT bufplce[4];
|
||||
|
||||
public:
|
||||
VecCommand(Tmvline4SubClampRGBA)()
|
||||
{
|
||||
_dest = dc_dest;
|
||||
_count = dc_count;
|
||||
_pitch = dc_pitch;
|
||||
_shade_constants = dc_shade_constants;
|
||||
_srcalpha = dc_srcalpha;
|
||||
_destalpha = dc_destalpha;
|
||||
tmvlinebits = ::tmvlinebits;
|
||||
for (int i = 0; i < 4; i++)
|
||||
{
|
||||
palookuplight[i] = ::palookuplight[i];
|
||||
vplce[i] = ::vplce[i];
|
||||
vince[i] = ::vince[i];
|
||||
bufplce[i] = (const uint32 *)::bufplce[i];
|
||||
}
|
||||
}
|
||||
|
||||
void Execute(DrawerThread *thread) override
|
||||
{
|
||||
int count = thread->count_for_thread(_dest_y, _count);
|
||||
if (count <= 0)
|
||||
return;
|
||||
|
||||
uint32_t *dest = thread->dest_for_thread(_dest_y, _pitch, (uint32_t*)_dest);
|
||||
int pitch = _pitch * thread->num_cores;
|
||||
int bits = tmvlinebits;
|
||||
|
||||
uint32_t light[4];
|
||||
light[0] = calc_light_multiplier(palookuplight[0]);
|
||||
light[1] = calc_light_multiplier(palookuplight[1]);
|
||||
light[2] = calc_light_multiplier(palookuplight[2]);
|
||||
light[3] = calc_light_multiplier(palookuplight[3]);
|
||||
|
||||
ShadeConstants shade_constants = _shade_constants;
|
||||
|
||||
uint32_t src_alpha = _srcalpha >> (FRACBITS - 8);
|
||||
uint32_t dest_alpha = _destalpha >> (FRACBITS - 8);
|
||||
|
||||
DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] };
|
||||
DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] };
|
||||
int skipped = thread->skipped_by_thread(_dest_y);
|
||||
for (int i = 0; i < 4; i++)
|
||||
{
|
||||
local_vplce[i] += local_vince[i] * skipped;
|
||||
local_vince[i] *= thread->num_cores;
|
||||
}
|
||||
|
||||
if (shade_constants.simple_shade)
|
||||
{
|
||||
VEC_SHADE_SIMPLE_INIT4(light[3], light[2], light[1], light[0]);
|
||||
|
||||
__m128i msrc_alpha = _mm_set1_epi16(src_alpha);
|
||||
__m128i mdest_alpha = _mm_set1_epi16(dest_alpha);
|
||||
|
||||
do
|
||||
{
|
||||
uint32_t pix0 = bufplce[0][local_vplce[0] >> bits];
|
||||
uint32_t pix1 = bufplce[1][local_vplce[1] >> bits];
|
||||
uint32_t pix2 = bufplce[2][local_vplce[2] >> bits];
|
||||
uint32_t pix3 = bufplce[3][local_vplce[3] >> bits];
|
||||
|
||||
local_vplce[0] = local_vplce[0] + local_vince[0];
|
||||
local_vplce[1] = local_vplce[1] + local_vince[1];
|
||||
local_vplce[2] = local_vplce[2] + local_vince[2];
|
||||
local_vplce[3] = local_vplce[3] + local_vince[3];
|
||||
|
||||
__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
|
||||
VEC_CALC_BLEND_ALPHA(fg, msrc_alpha, mdest_alpha);
|
||||
VEC_SHADE_SIMPLE(fg);
|
||||
|
||||
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
|
||||
|
||||
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
|
||||
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
|
||||
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
|
||||
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
|
||||
|
||||
__m128i out_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_hi, bg_alpha_hi), _mm_mullo_epi16(fg_hi, fg_alpha_hi)), 8);
|
||||
__m128i out_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_lo, bg_alpha_lo), _mm_mullo_epi16(fg_lo, fg_alpha_lo)), 8);
|
||||
__m128i out = _mm_packus_epi16(out_lo, out_hi);
|
||||
|
||||
_mm_storeu_si128((__m128i*)dest, out);
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
else
|
||||
{
|
||||
VEC_SHADE_INIT4(light[3], light[2], light[1], light[0], shade_constants);
|
||||
|
||||
__m128i msrc_alpha = _mm_set1_epi16(src_alpha);
|
||||
__m128i mdest_alpha = _mm_set1_epi16(dest_alpha);
|
||||
|
||||
do
|
||||
{
|
||||
uint32_t pix0 = bufplce[0][local_vplce[0] >> bits];
|
||||
uint32_t pix1 = bufplce[1][local_vplce[1] >> bits];
|
||||
uint32_t pix2 = bufplce[2][local_vplce[2] >> bits];
|
||||
uint32_t pix3 = bufplce[3][local_vplce[3] >> bits];
|
||||
|
||||
local_vplce[0] = local_vplce[0] + local_vince[0];
|
||||
local_vplce[1] = local_vplce[1] + local_vince[1];
|
||||
local_vplce[2] = local_vplce[2] + local_vince[2];
|
||||
local_vplce[3] = local_vplce[3] + local_vince[3];
|
||||
|
||||
__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
|
||||
VEC_CALC_BLEND_ALPHA(fg, msrc_alpha, mdest_alpha);
|
||||
VEC_SHADE(fg, shade_constants);
|
||||
|
||||
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
|
||||
|
||||
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
|
||||
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
|
||||
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
|
||||
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
|
||||
|
||||
__m128i out_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_hi, bg_alpha_hi), _mm_mullo_epi16(fg_hi, fg_alpha_hi)), 8);
|
||||
__m128i out_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_lo, bg_alpha_lo), _mm_mullo_epi16(fg_lo, fg_alpha_lo)), 8);
|
||||
__m128i out = _mm_packus_epi16(out_lo, out_hi);
|
||||
|
||||
_mm_storeu_si128((__m128i*)dest, out);
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
class VecCommand(Tmvline4RevSubClampRGBA) : public DrawerCommand
|
||||
{
|
||||
BYTE * RESTRICT _dest;
|
||||
int _count;
|
||||
int _pitch;
|
||||
ShadeConstants _shade_constants;
|
||||
fixed_t _srcalpha;
|
||||
fixed_t _destalpha;
|
||||
int tmvlinebits;
|
||||
fixed_t palookuplight[4];
|
||||
DWORD vplce[4];
|
||||
DWORD vince[4];
|
||||
const uint32 *RESTRICT bufplce[4];
|
||||
|
||||
public:
|
||||
VecCommand(Tmvline4RevSubClampRGBA)()
|
||||
{
|
||||
_dest = dc_dest;
|
||||
_count = dc_count;
|
||||
_pitch = dc_pitch;
|
||||
_shade_constants = dc_shade_constants;
|
||||
_srcalpha = dc_srcalpha;
|
||||
_destalpha = dc_destalpha;
|
||||
tmvlinebits = ::tmvlinebits;
|
||||
for (int i = 0; i < 4; i++)
|
||||
{
|
||||
palookuplight[i] = ::palookuplight[i];
|
||||
vplce[i] = ::vplce[i];
|
||||
vince[i] = ::vince[i];
|
||||
bufplce[i] = (const uint32 *)::bufplce[i];
|
||||
}
|
||||
}
|
||||
|
||||
void Execute(DrawerThread *thread) override
|
||||
{
|
||||
int count = thread->count_for_thread(_dest_y, _count);
|
||||
if (count <= 0)
|
||||
return;
|
||||
|
||||
uint32_t *dest = thread->dest_for_thread(_dest_y, _pitch, (uint32_t*)_dest);
|
||||
int pitch = _pitch * thread->num_cores;
|
||||
int bits = tmvlinebits;
|
||||
|
||||
uint32_t light[4];
|
||||
light[0] = calc_light_multiplier(palookuplight[0]);
|
||||
light[1] = calc_light_multiplier(palookuplight[1]);
|
||||
light[2] = calc_light_multiplier(palookuplight[2]);
|
||||
light[3] = calc_light_multiplier(palookuplight[3]);
|
||||
|
||||
ShadeConstants shade_constants = _shade_constants;
|
||||
|
||||
uint32_t src_alpha = _srcalpha >> (FRACBITS - 8);
|
||||
uint32_t dest_alpha = _destalpha >> (FRACBITS - 8);
|
||||
|
||||
DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] };
|
||||
DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] };
|
||||
int skipped = thread->skipped_by_thread(_dest_y);
|
||||
for (int i = 0; i < 4; i++)
|
||||
{
|
||||
local_vplce[i] += local_vince[i] * skipped;
|
||||
local_vince[i] *= thread->num_cores;
|
||||
}
|
||||
|
||||
if (shade_constants.simple_shade)
|
||||
{
|
||||
VEC_SHADE_SIMPLE_INIT4(light[3], light[2], light[1], light[0]);
|
||||
|
||||
__m128i msrc_alpha = _mm_set1_epi16(src_alpha);
|
||||
__m128i mdest_alpha = _mm_set1_epi16(dest_alpha);
|
||||
|
||||
do
|
||||
{
|
||||
uint32_t pix0 = bufplce[0][local_vplce[0] >> bits];
|
||||
uint32_t pix1 = bufplce[1][local_vplce[1] >> bits];
|
||||
uint32_t pix2 = bufplce[2][local_vplce[2] >> bits];
|
||||
uint32_t pix3 = bufplce[3][local_vplce[3] >> bits];
|
||||
|
||||
local_vplce[0] = local_vplce[0] + local_vince[0];
|
||||
local_vplce[1] = local_vplce[1] + local_vince[1];
|
||||
local_vplce[2] = local_vplce[2] + local_vince[2];
|
||||
local_vplce[3] = local_vplce[3] + local_vince[3];
|
||||
|
||||
__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
|
||||
VEC_CALC_BLEND_ALPHA(fg, msrc_alpha, mdest_alpha);
|
||||
VEC_SHADE_SIMPLE(fg);
|
||||
|
||||
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
|
||||
|
||||
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
|
||||
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
|
||||
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
|
||||
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
|
||||
|
||||
__m128i out_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8);
|
||||
__m128i out_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8);
|
||||
__m128i out = _mm_packus_epi16(out_lo, out_hi);
|
||||
|
||||
_mm_storeu_si128((__m128i*)dest, out);
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
else
|
||||
{
|
||||
VEC_SHADE_INIT4(light[3], light[2], light[1], light[0], shade_constants);
|
||||
|
||||
__m128i msrc_alpha = _mm_set1_epi16(src_alpha);
|
||||
__m128i mdest_alpha = _mm_set1_epi16(dest_alpha);
|
||||
|
||||
do
|
||||
{
|
||||
uint32_t pix0 = bufplce[0][local_vplce[0] >> bits];
|
||||
uint32_t pix1 = bufplce[1][local_vplce[1] >> bits];
|
||||
uint32_t pix2 = bufplce[2][local_vplce[2] >> bits];
|
||||
uint32_t pix3 = bufplce[3][local_vplce[3] >> bits];
|
||||
|
||||
local_vplce[0] = local_vplce[0] + local_vince[0];
|
||||
local_vplce[1] = local_vplce[1] + local_vince[1];
|
||||
local_vplce[2] = local_vplce[2] + local_vince[2];
|
||||
local_vplce[3] = local_vplce[3] + local_vince[3];
|
||||
|
||||
__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
|
||||
VEC_CALC_BLEND_ALPHA(fg, msrc_alpha, mdest_alpha);
|
||||
VEC_SHADE(fg, shade_constants);
|
||||
|
||||
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
|
||||
|
||||
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
|
||||
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
|
||||
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
|
||||
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
|
||||
|
||||
__m128i out_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8);
|
||||
__m128i out_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8);
|
||||
__m128i out = _mm_packus_epi16(out_lo, out_hi);
|
||||
|
||||
_mm_storeu_si128((__m128i*)dest, out);
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
|
|
|
@ -1436,7 +1436,14 @@ void rt_subclamp1col_rgba (int hx, int sx, int yl, int yh)
|
|||
// Subtracts all four spans to the screen starting at sx with clamping.
|
||||
void rt_subclamp4cols_rgba (int sx, int yl, int yh)
|
||||
{
|
||||
#ifdef NO_SSE
|
||||
DrawerCommandQueue::QueueCommand<RtSubClamp4colsRGBACommand>(sx, yl, yh);
|
||||
#else
|
||||
if (!r_linearlight)
|
||||
DrawerCommandQueue::QueueCommand<RtSubClamp4colsRGBA_SSE_Command>(sx, yl, yh);
|
||||
else
|
||||
DrawerCommandQueue::QueueCommand<RtSubClamp4colsRGBA_AVX_Command>(sx, yl, yh);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Translates and subtracts one span at hx to the screen at sx with clamping.
|
||||
|
@ -1462,7 +1469,14 @@ void rt_revsubclamp1col_rgba (int hx, int sx, int yl, int yh)
|
|||
// Subtracts all four spans from the screen starting at sx with clamping.
|
||||
void rt_revsubclamp4cols_rgba (int sx, int yl, int yh)
|
||||
{
|
||||
#ifdef NO_SSE
|
||||
DrawerCommandQueue::QueueCommand<RtRevSubClamp4colsRGBACommand>(sx, yl, yh);
|
||||
#else
|
||||
if (!r_linearlight)
|
||||
DrawerCommandQueue::QueueCommand<RtRevSubClamp4colsRGBA_SSE_Command>(sx, yl, yh);
|
||||
else
|
||||
DrawerCommandQueue::QueueCommand<RtRevSubClamp4colsRGBA_AVX_Command>(sx, yl, yh);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Translates and subtracts one span at hx from the screen at sx with clamping.
|
||||
|
|
|
@ -493,3 +493,255 @@ public:
|
|||
}
|
||||
}
|
||||
};
|
||||
|
||||
class VecCommand(RtSubClamp4colsRGBA) : public DrawerCommand
|
||||
{
|
||||
int sx;
|
||||
int yl;
|
||||
int yh;
|
||||
BYTE * RESTRICT _destorg;
|
||||
int _pitch;
|
||||
fixed_t _light;
|
||||
fixed_t _srcalpha;
|
||||
fixed_t _destalpha;
|
||||
ShadeConstants _shade_constants;
|
||||
|
||||
public:
|
||||
VecCommand(RtSubClamp4colsRGBA)(int sx, int yl, int yh)
|
||||
{
|
||||
this->sx = sx;
|
||||
this->yl = yl;
|
||||
this->yh = yh;
|
||||
|
||||
_destorg = dc_destorg;
|
||||
_pitch = dc_pitch;
|
||||
_light = dc_light;
|
||||
_srcalpha = dc_srcalpha;
|
||||
_destalpha = dc_destalpha;
|
||||
_shade_constants = dc_shade_constants;
|
||||
}
|
||||
|
||||
void Execute(DrawerThread *thread) override
|
||||
{
|
||||
uint32_t *source;
|
||||
uint32_t *dest;
|
||||
int count;
|
||||
int pitch;
|
||||
int sincr;
|
||||
|
||||
count = thread->count_for_thread(yl, yh - yl + 1);
|
||||
if (count <= 0)
|
||||
return;
|
||||
|
||||
dest = thread->dest_for_thread(yl, _pitch, ylookup[yl] + sx + (uint32_t*)_destorg);
|
||||
source = &thread->dc_temp_rgba[yl * 4] + thread->skipped_by_thread(yl) * 4;
|
||||
pitch = _pitch * thread->num_cores;
|
||||
sincr = 4 * thread->num_cores;
|
||||
|
||||
uint32_t light = calc_light_multiplier(_light);
|
||||
uint32_t *palette = (uint32_t*)GPalette.BaseColors;
|
||||
ShadeConstants shade_constants = _shade_constants;
|
||||
|
||||
uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8);
|
||||
uint32_t bg_alpha = _destalpha >> (FRACBITS - 8);
|
||||
|
||||
if (shade_constants.simple_shade)
|
||||
{
|
||||
VEC_SHADE_SIMPLE_INIT(light);
|
||||
|
||||
__m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha);
|
||||
__m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha);
|
||||
|
||||
do {
|
||||
uint32_t p0 = source[0];
|
||||
uint32_t p1 = source[1];
|
||||
uint32_t p2 = source[2];
|
||||
uint32_t p3 = source[3];
|
||||
|
||||
// shade_pal_index:
|
||||
__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
|
||||
VEC_SHADE_SIMPLE(fg);
|
||||
|
||||
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
|
||||
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
|
||||
|
||||
// unpack bg:
|
||||
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
|
||||
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
|
||||
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
|
||||
|
||||
// (bg_red * bg_alpha - fg_red * fg_alpha) / 256:
|
||||
__m128i color_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_hi, mbg_alpha), _mm_mullo_epi16(fg_hi, mfg_alpha)), 8);
|
||||
__m128i color_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_lo, mbg_alpha), _mm_mullo_epi16(fg_lo, mfg_alpha)), 8);
|
||||
|
||||
__m128i color = _mm_packus_epi16(color_lo, color_hi);
|
||||
_mm_storeu_si128((__m128i*)dest, color);
|
||||
|
||||
source += sincr;
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
else
|
||||
{
|
||||
VEC_SHADE_INIT(light, shade_constants);
|
||||
|
||||
__m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha);
|
||||
__m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha);
|
||||
|
||||
do {
|
||||
uint32_t p0 = source[0];
|
||||
uint32_t p1 = source[1];
|
||||
uint32_t p2 = source[2];
|
||||
uint32_t p3 = source[3];
|
||||
|
||||
// shade_pal_index:
|
||||
__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
|
||||
VEC_SHADE(fg, shade_constants);
|
||||
|
||||
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
|
||||
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
|
||||
|
||||
// unpack bg:
|
||||
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
|
||||
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
|
||||
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
|
||||
|
||||
// (bg_red * bg_alpha - fg_red * fg_alpha) / 256:
|
||||
__m128i color_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_hi, mbg_alpha), _mm_mullo_epi16(fg_hi, mfg_alpha)), 8);
|
||||
__m128i color_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_lo, mbg_alpha), _mm_mullo_epi16(fg_lo, mfg_alpha)), 8);
|
||||
|
||||
__m128i color = _mm_packus_epi16(color_lo, color_hi);
|
||||
_mm_storeu_si128((__m128i*)dest, color);
|
||||
|
||||
source += sincr;
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
class VecCommand(RtRevSubClamp4colsRGBA) : public DrawerCommand
|
||||
{
|
||||
int sx;
|
||||
int yl;
|
||||
int yh;
|
||||
BYTE * RESTRICT _destorg;
|
||||
int _pitch;
|
||||
fixed_t _light;
|
||||
fixed_t _srcalpha;
|
||||
fixed_t _destalpha;
|
||||
ShadeConstants _shade_constants;
|
||||
|
||||
public:
|
||||
VecCommand(RtRevSubClamp4colsRGBA)(int sx, int yl, int yh)
|
||||
{
|
||||
this->sx = sx;
|
||||
this->yl = yl;
|
||||
this->yh = yh;
|
||||
|
||||
_destorg = dc_destorg;
|
||||
_pitch = dc_pitch;
|
||||
_light = dc_light;
|
||||
_srcalpha = dc_srcalpha;
|
||||
_destalpha = dc_destalpha;
|
||||
_shade_constants = dc_shade_constants;
|
||||
}
|
||||
|
||||
void Execute(DrawerThread *thread) override
|
||||
{
|
||||
uint32_t *source;
|
||||
uint32_t *dest;
|
||||
int count;
|
||||
int pitch;
|
||||
int sincr;
|
||||
|
||||
count = thread->count_for_thread(yl, yh - yl + 1);
|
||||
if (count <= 0)
|
||||
return;
|
||||
|
||||
dest = thread->dest_for_thread(yl, _pitch, ylookup[yl] + sx + (uint32_t*)_destorg);
|
||||
source = &thread->dc_temp_rgba[yl * 4] + thread->skipped_by_thread(yl) * 4;
|
||||
pitch = _pitch * thread->num_cores;
|
||||
sincr = 4 * thread->num_cores;
|
||||
|
||||
uint32_t light = calc_light_multiplier(_light);
|
||||
uint32_t *palette = (uint32_t*)GPalette.BaseColors;
|
||||
ShadeConstants shade_constants = _shade_constants;
|
||||
|
||||
uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8);
|
||||
uint32_t bg_alpha = _destalpha >> (FRACBITS - 8);
|
||||
|
||||
if (shade_constants.simple_shade)
|
||||
{
|
||||
VEC_SHADE_SIMPLE_INIT(light);
|
||||
|
||||
__m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha);
|
||||
__m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha);
|
||||
|
||||
do {
|
||||
uint32_t p0 = source[0];
|
||||
uint32_t p1 = source[1];
|
||||
uint32_t p2 = source[2];
|
||||
uint32_t p3 = source[3];
|
||||
|
||||
// shade_pal_index:
|
||||
__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
|
||||
VEC_SHADE_SIMPLE(fg);
|
||||
|
||||
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
|
||||
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
|
||||
|
||||
// unpack bg:
|
||||
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
|
||||
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
|
||||
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
|
||||
|
||||
// (fg_red * fg_alpha - bg_red * bg_alpha) / 256:
|
||||
__m128i color_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_hi, mfg_alpha), _mm_mullo_epi16(bg_hi, mbg_alpha)), 8);
|
||||
__m128i color_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_lo, mfg_alpha), _mm_mullo_epi16(bg_lo, mbg_alpha)), 8);
|
||||
|
||||
__m128i color = _mm_packus_epi16(color_lo, color_hi);
|
||||
_mm_storeu_si128((__m128i*)dest, color);
|
||||
|
||||
source += sincr;
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
else
|
||||
{
|
||||
VEC_SHADE_INIT(light, shade_constants);
|
||||
|
||||
__m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha);
|
||||
__m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha);
|
||||
|
||||
do {
|
||||
uint32_t p0 = source[0];
|
||||
uint32_t p1 = source[1];
|
||||
uint32_t p2 = source[2];
|
||||
uint32_t p3 = source[3];
|
||||
|
||||
// shade_pal_index:
|
||||
__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
|
||||
VEC_SHADE(fg, shade_constants);
|
||||
|
||||
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
|
||||
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
|
||||
|
||||
// unpack bg:
|
||||
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
|
||||
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
|
||||
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
|
||||
|
||||
// (fg_red * fg_alpha - bg_red * bg_alpha) / 256:
|
||||
__m128i color_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_hi, mfg_alpha), _mm_mullo_epi16(bg_hi, mbg_alpha)), 8);
|
||||
__m128i color_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_lo, mfg_alpha), _mm_mullo_epi16(bg_lo, mbg_alpha)), 8);
|
||||
|
||||
__m128i color = _mm_packus_epi16(color_lo, color_hi);
|
||||
_mm_storeu_si128((__m128i*)dest, color);
|
||||
|
||||
source += sincr;
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
|
Loading…
Reference in a new issue