Added more SSE drawers

This commit is contained in:
Magnus Norddahl 2016-06-19 23:11:41 +02:00
parent 3f905197d0
commit 38aba81dcc
5 changed files with 1072 additions and 148 deletions

View file

@ -2411,10 +2411,7 @@ public:
do do
{ {
uint32_t pix = source[frac >> bits]; uint32_t pix = source[frac >> bits];
if (pix != 0) *dest = alpha_blend(shade_bgra(pix, light, shade_constants), *dest);
{
*dest = shade_bgra(pix, light, shade_constants);
}
frac += fracstep; frac += fracstep;
dest += pitch; dest += pitch;
} while (--count); } while (--count);
@ -2480,10 +2477,10 @@ public:
do do
{ {
uint32_t pix; uint32_t pix;
pix = bufplce[0][(place = local_vplce[0]) >> bits]; if (pix) dest[0] = shade_bgra(pix, light0, shade_constants); local_vplce[0] = place + local_vince[0]; pix = bufplce[0][(place = local_vplce[0]) >> bits]; dest[0] = alpha_blend(shade_bgra(pix, light0, shade_constants), dest[0]); local_vplce[0] = place + local_vince[0];
pix = bufplce[1][(place = local_vplce[1]) >> bits]; if (pix) dest[1] = shade_bgra(pix, light1, shade_constants); local_vplce[1] = place + local_vince[1]; pix = bufplce[1][(place = local_vplce[1]) >> bits]; dest[1] = alpha_blend(shade_bgra(pix, light1, shade_constants), dest[1]); local_vplce[1] = place + local_vince[1];
pix = bufplce[2][(place = local_vplce[2]) >> bits]; if (pix) dest[2] = shade_bgra(pix, light2, shade_constants); local_vplce[2] = place + local_vince[2]; pix = bufplce[2][(place = local_vplce[2]) >> bits]; dest[2] = alpha_blend(shade_bgra(pix, light2, shade_constants), dest[2]); local_vplce[2] = place + local_vince[2];
pix = bufplce[3][(place = local_vplce[3]) >> bits]; if (pix) dest[3] = shade_bgra(pix, light3, shade_constants); local_vplce[3] = place + local_vince[3]; pix = bufplce[3][(place = local_vplce[3]) >> bits]; dest[3] = alpha_blend(shade_bgra(pix, light3, shade_constants), dest[3]); local_vplce[3] = place + local_vince[3];
dest += pitch; dest += pitch;
} while (--count); } while (--count);
} }
@ -2535,29 +2532,31 @@ public:
uint32_t light = calc_light_multiplier(_light); uint32_t light = calc_light_multiplier(_light);
ShadeConstants shade_constants = _shade_constants; ShadeConstants shade_constants = _shade_constants;
uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8); uint32_t src_alpha = _srcalpha >> (FRACBITS - 8);
uint32_t bg_alpha = _destalpha >> (FRACBITS - 8); uint32_t dest_alpha = _destalpha >> (FRACBITS - 8);
do do
{ {
uint32_t pix = source[frac >> bits]; uint32_t pix = source[frac >> bits];
if (pix != 0)
{
uint32_t fg = shade_bgra(pix, light, shade_constants);
uint32_t fg_red = (fg >> 16) & 0xff;
uint32_t fg_green = (fg >> 8) & 0xff;
uint32_t fg_blue = fg & 0xff;
uint32_t bg_red = (*dest >> 16) & 0xff; uint32_t fg_alpha, bg_alpha;
uint32_t bg_green = (*dest >> 8) & 0xff; calc_blend_alpha(pix, src_alpha, dest_alpha, fg_alpha, bg_alpha);
uint32_t bg_blue = (*dest) & 0xff;
uint32_t red = clamp<uint32_t>((fg_red * fg_alpha + bg_red * bg_alpha) / 256, 0, 255); uint32_t fg = shade_bgra(pix, light, shade_constants);
uint32_t green = clamp<uint32_t>((fg_green * fg_alpha + bg_green * bg_alpha) / 256, 0, 255); uint32_t fg_red = (fg >> 16) & 0xff;
uint32_t blue = clamp<uint32_t>((fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 0, 255); uint32_t fg_green = (fg >> 8) & 0xff;
uint32_t fg_blue = fg & 0xff;
uint32_t bg_red = (*dest >> 16) & 0xff;
uint32_t bg_green = (*dest >> 8) & 0xff;
uint32_t bg_blue = (*dest) & 0xff;
uint32_t red = clamp<uint32_t>((fg_red * fg_alpha + bg_red * bg_alpha) / 256, 0, 255);
uint32_t green = clamp<uint32_t>((fg_green * fg_alpha + bg_green * bg_alpha) / 256, 0, 255);
uint32_t blue = clamp<uint32_t>((fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 0, 255);
*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
}
frac += fracstep; frac += fracstep;
dest += pitch; dest += pitch;
} while (--count); } while (--count);
@ -2615,8 +2614,8 @@ public:
ShadeConstants shade_constants = _shade_constants; ShadeConstants shade_constants = _shade_constants;
uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8); uint32_t src_alpha = _srcalpha >> (FRACBITS - 8);
uint32_t bg_alpha = _destalpha >> (FRACBITS - 8); uint32_t dest_alpha = _destalpha >> (FRACBITS - 8);
DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] }; DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] };
DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] }; DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] };
@ -2632,23 +2631,25 @@ public:
for (int i = 0; i < 4; ++i) for (int i = 0; i < 4; ++i)
{ {
uint32_t pix = bufplce[i][local_vplce[i] >> bits]; uint32_t pix = bufplce[i][local_vplce[i] >> bits];
if (pix != 0)
{
uint32_t fg = shade_bgra(pix, light[i], shade_constants);
uint32_t fg_red = (fg >> 16) & 0xff;
uint32_t fg_green = (fg >> 8) & 0xff;
uint32_t fg_blue = fg & 0xff;
uint32_t bg_red = (*dest >> 16) & 0xff; uint32_t fg_alpha, bg_alpha;
uint32_t bg_green = (*dest >> 8) & 0xff; calc_blend_alpha(pix, src_alpha, dest_alpha, fg_alpha, bg_alpha);
uint32_t bg_blue = (*dest) & 0xff;
uint32_t red = clamp<uint32_t>((fg_red * fg_alpha + bg_red * bg_alpha) / 256, 0, 255); uint32_t fg = shade_bgra(pix, light[i], shade_constants);
uint32_t green = clamp<uint32_t>((fg_green * fg_alpha + bg_green * bg_alpha) / 256, 0, 255); uint32_t fg_red = (fg >> 16) & 0xff;
uint32_t blue = clamp<uint32_t>((fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 0, 255); uint32_t fg_green = (fg >> 8) & 0xff;
uint32_t fg_blue = fg & 0xff;
uint32_t bg_red = (*dest >> 16) & 0xff;
uint32_t bg_green = (*dest >> 8) & 0xff;
uint32_t bg_blue = (*dest) & 0xff;
uint32_t red = clamp<uint32_t>((fg_red * fg_alpha + bg_red * bg_alpha) / 256, 0, 255);
uint32_t green = clamp<uint32_t>((fg_green * fg_alpha + bg_green * bg_alpha) / 256, 0, 255);
uint32_t blue = clamp<uint32_t>((fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 0, 255);
dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue;
dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue;
}
local_vplce[i] += local_vince[i]; local_vplce[i] += local_vince[i];
} }
dest += pitch; dest += pitch;
@ -2702,29 +2703,31 @@ public:
uint32_t light = calc_light_multiplier(_light); uint32_t light = calc_light_multiplier(_light);
ShadeConstants shade_constants = _shade_constants; ShadeConstants shade_constants = _shade_constants;
uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8); uint32_t src_alpha = _srcalpha >> (FRACBITS - 8);
uint32_t bg_alpha = _destalpha >> (FRACBITS - 8); uint32_t dest_alpha = _destalpha >> (FRACBITS - 8);
do do
{ {
uint32_t pix = source[frac >> bits]; uint32_t pix = source[frac >> bits];
if (pix != 0)
{
uint32_t fg = shade_bgra(pix, light, shade_constants);
uint32_t fg_red = (fg >> 16) & 0xff;
uint32_t fg_green = (fg >> 8) & 0xff;
uint32_t fg_blue = fg & 0xff;
uint32_t bg_red = (*dest >> 16) & 0xff; uint32_t fg_alpha, bg_alpha;
uint32_t bg_green = (*dest >> 8) & 0xff; calc_blend_alpha(pix, src_alpha, dest_alpha, fg_alpha, bg_alpha);
uint32_t bg_blue = (*dest) & 0xff;
uint32_t red = clamp<uint32_t>((fg_red * fg_alpha + bg_red * bg_alpha) / 256, 0, 255); uint32_t fg = shade_bgra(pix, light, shade_constants);
uint32_t green = clamp<uint32_t>((fg_green * fg_alpha + bg_green * bg_alpha) / 256, 0, 255); uint32_t fg_red = (fg >> 16) & 0xff;
uint32_t blue = clamp<uint32_t>((fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 0, 255); uint32_t fg_green = (fg >> 8) & 0xff;
uint32_t fg_blue = fg & 0xff;
uint32_t bg_red = (*dest >> 16) & 0xff;
uint32_t bg_green = (*dest >> 8) & 0xff;
uint32_t bg_blue = (*dest) & 0xff;
uint32_t red = clamp<uint32_t>((fg_red * fg_alpha + bg_red * bg_alpha) / 256, 0, 255);
uint32_t green = clamp<uint32_t>((fg_green * fg_alpha + bg_green * bg_alpha) / 256, 0, 255);
uint32_t blue = clamp<uint32_t>((fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 0, 255);
*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
}
frac += fracstep; frac += fracstep;
dest += pitch; dest += pitch;
} while (--count); } while (--count);
@ -2782,8 +2785,8 @@ public:
ShadeConstants shade_constants = _shade_constants; ShadeConstants shade_constants = _shade_constants;
uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8); uint32_t src_alpha = _srcalpha >> (FRACBITS - 8);
uint32_t bg_alpha = _destalpha >> (FRACBITS - 8); uint32_t dest_alpha = _destalpha >> (FRACBITS - 8);
DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] }; DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] };
DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] }; DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] };
@ -2799,23 +2802,25 @@ public:
for (int i = 0; i < 4; ++i) for (int i = 0; i < 4; ++i)
{ {
uint32_t pix = bufplce[i][local_vplce[i] >> bits]; uint32_t pix = bufplce[i][local_vplce[i] >> bits];
if (pix != 0)
{
uint32_t fg = shade_bgra(pix, light[i], shade_constants);
uint32_t fg_red = (fg >> 16) & 0xff;
uint32_t fg_green = (fg >> 8) & 0xff;
uint32_t fg_blue = fg & 0xff;
uint32_t bg_red = (dest[i] >> 16) & 0xff; uint32_t fg_alpha, bg_alpha;
uint32_t bg_green = (dest[i] >> 8) & 0xff; calc_blend_alpha(pix, src_alpha, dest_alpha, fg_alpha, bg_alpha);
uint32_t bg_blue = (dest[i]) & 0xff;
uint32_t red = clamp<uint32_t>((fg_red * fg_alpha + bg_red * bg_alpha) / 256, 0, 255); uint32_t fg = shade_bgra(pix, light[i], shade_constants);
uint32_t green = clamp<uint32_t>((fg_green * fg_alpha + bg_green * bg_alpha) / 256, 0, 255); uint32_t fg_red = (fg >> 16) & 0xff;
uint32_t blue = clamp<uint32_t>((fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 0, 255); uint32_t fg_green = (fg >> 8) & 0xff;
uint32_t fg_blue = fg & 0xff;
uint32_t bg_red = (dest[i] >> 16) & 0xff;
uint32_t bg_green = (dest[i] >> 8) & 0xff;
uint32_t bg_blue = (dest[i]) & 0xff;
uint32_t red = clamp<uint32_t>((fg_red * fg_alpha + bg_red * bg_alpha) / 256, 0, 255);
uint32_t green = clamp<uint32_t>((fg_green * fg_alpha + bg_green * bg_alpha) / 256, 0, 255);
uint32_t blue = clamp<uint32_t>((fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 0, 255);
dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue;
dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue;
}
local_vplce[i] += local_vince[i]; local_vplce[i] += local_vince[i];
} }
dest += pitch; dest += pitch;
@ -2869,29 +2874,31 @@ public:
uint32_t light = calc_light_multiplier(_light); uint32_t light = calc_light_multiplier(_light);
ShadeConstants shade_constants = _shade_constants; ShadeConstants shade_constants = _shade_constants;
uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8); uint32_t src_alpha = _srcalpha >> (FRACBITS - 8);
uint32_t bg_alpha = _destalpha >> (FRACBITS - 8); uint32_t dest_alpha = _destalpha >> (FRACBITS - 8);
do do
{ {
uint32_t pix = source[frac >> bits]; uint32_t pix = source[frac >> bits];
if (pix != 0)
{
uint32_t fg = shade_bgra(pix, light, shade_constants);
uint32_t fg_red = (fg >> 16) & 0xff;
uint32_t fg_green = (fg >> 8) & 0xff;
uint32_t fg_blue = fg & 0xff;
uint32_t bg_red = (*dest >> 16) & 0xff; uint32_t fg_alpha, bg_alpha;
uint32_t bg_green = (*dest >> 8) & 0xff; calc_blend_alpha(pix, src_alpha, dest_alpha, fg_alpha, bg_alpha);
uint32_t bg_blue = (*dest) & 0xff;
uint32_t red = clamp<uint32_t>((0x10000 - fg_red * fg_alpha + bg_red * bg_alpha) / 256, 256, 256 + 255) - 256; uint32_t fg = shade_bgra(pix, light, shade_constants);
uint32_t green = clamp<uint32_t>((0x10000 - fg_green * fg_alpha + bg_green * bg_alpha) / 256, 256, 256 + 255) - 256; uint32_t fg_red = (fg >> 16) & 0xff;
uint32_t blue = clamp<uint32_t>((0x10000 - fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 256, 256 + 255) - 256; uint32_t fg_green = (fg >> 8) & 0xff;
uint32_t fg_blue = fg & 0xff;
uint32_t bg_red = (*dest >> 16) & 0xff;
uint32_t bg_green = (*dest >> 8) & 0xff;
uint32_t bg_blue = (*dest) & 0xff;
uint32_t red = clamp<uint32_t>((0x10000 - fg_red * fg_alpha + bg_red * bg_alpha) / 256, 256, 256 + 255) - 256;
uint32_t green = clamp<uint32_t>((0x10000 - fg_green * fg_alpha + bg_green * bg_alpha) / 256, 256, 256 + 255) - 256;
uint32_t blue = clamp<uint32_t>((0x10000 - fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 256, 256 + 255) - 256;
*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
}
frac += fracstep; frac += fracstep;
dest += pitch; dest += pitch;
} while (--count); } while (--count);
@ -2949,8 +2956,8 @@ public:
ShadeConstants shade_constants = _shade_constants; ShadeConstants shade_constants = _shade_constants;
uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8); uint32_t src_alpha = _srcalpha >> (FRACBITS - 8);
uint32_t bg_alpha = _destalpha >> (FRACBITS - 8); uint32_t dest_alpha = _destalpha >> (FRACBITS - 8);
DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] }; DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] };
DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] }; DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] };
@ -2966,23 +2973,25 @@ public:
for (int i = 0; i < 4; ++i) for (int i = 0; i < 4; ++i)
{ {
uint32_t pix = bufplce[i][local_vplce[i] >> bits]; uint32_t pix = bufplce[i][local_vplce[i] >> bits];
if (pix != 0)
{
uint32_t fg = shade_bgra(pix, light[i], shade_constants);
uint32_t fg_red = (fg >> 16) & 0xff;
uint32_t fg_green = (fg >> 8) & 0xff;
uint32_t fg_blue = fg & 0xff;
uint32_t bg_red = (dest[i] >> 16) & 0xff; uint32_t fg_alpha, bg_alpha;
uint32_t bg_green = (dest[i] >> 8) & 0xff; calc_blend_alpha(pix, src_alpha, dest_alpha, fg_alpha, bg_alpha);
uint32_t bg_blue = (dest[i]) & 0xff;
uint32_t red = clamp<uint32_t>((0x10000 - fg_red * fg_alpha + bg_red * bg_alpha) / 256, 256, 256 + 255) - 256; uint32_t fg = shade_bgra(pix, light[i], shade_constants);
uint32_t green = clamp<uint32_t>((0x10000 - fg_green * fg_alpha + bg_green * bg_alpha) / 256, 256, 256 + 255) - 256; uint32_t fg_red = (fg >> 16) & 0xff;
uint32_t blue = clamp<uint32_t>((0x10000 - fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 256, 256 + 255) - 256; uint32_t fg_green = (fg >> 8) & 0xff;
uint32_t fg_blue = fg & 0xff;
uint32_t bg_red = (dest[i] >> 16) & 0xff;
uint32_t bg_green = (dest[i] >> 8) & 0xff;
uint32_t bg_blue = (dest[i]) & 0xff;
uint32_t red = clamp<uint32_t>((0x10000 - fg_red * fg_alpha + bg_red * bg_alpha) / 256, 256, 256 + 255) - 256;
uint32_t green = clamp<uint32_t>((0x10000 - fg_green * fg_alpha + bg_green * bg_alpha) / 256, 256, 256 + 255) - 256;
uint32_t blue = clamp<uint32_t>((0x10000 - fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 256, 256 + 255) - 256;
dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue;
dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue;
}
local_vplce[i] += local_vince[i]; local_vplce[i] += local_vince[i];
} }
dest += pitch; dest += pitch;
@ -3036,29 +3045,31 @@ public:
uint32_t light = calc_light_multiplier(_light); uint32_t light = calc_light_multiplier(_light);
ShadeConstants shade_constants = _shade_constants; ShadeConstants shade_constants = _shade_constants;
uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8); uint32_t src_alpha = _srcalpha >> (FRACBITS - 8);
uint32_t bg_alpha = _destalpha >> (FRACBITS - 8); uint32_t dest_alpha = _destalpha >> (FRACBITS - 8);
do do
{ {
uint32_t pix = source[frac >> bits]; uint32_t pix = source[frac >> bits];
if (pix != 0)
{
uint32_t fg = shade_bgra(pix, light, shade_constants);
uint32_t fg_red = (fg >> 16) & 0xff;
uint32_t fg_green = (fg >> 8) & 0xff;
uint32_t fg_blue = fg & 0xff;
uint32_t bg_red = (*dest >> 16) & 0xff; uint32_t fg_alpha, bg_alpha;
uint32_t bg_green = (*dest >> 8) & 0xff; calc_blend_alpha(pix, src_alpha, dest_alpha, fg_alpha, bg_alpha);
uint32_t bg_blue = (*dest) & 0xff;
uint32_t red = clamp<uint32_t>((0x10000 + fg_red * fg_alpha - bg_red * bg_alpha) / 256, 256, 256 + 255) - 256; uint32_t fg = shade_bgra(pix, light, shade_constants);
uint32_t green = clamp<uint32_t>((0x10000 + fg_green * fg_alpha - bg_green * bg_alpha) / 256, 256, 256 + 255) - 256; uint32_t fg_red = (fg >> 16) & 0xff;
uint32_t blue = clamp<uint32_t>((0x10000 + fg_blue * fg_alpha - bg_blue * bg_alpha) / 256, 256, 256 + 255) - 256; uint32_t fg_green = (fg >> 8) & 0xff;
uint32_t fg_blue = fg & 0xff;
uint32_t bg_red = (*dest >> 16) & 0xff;
uint32_t bg_green = (*dest >> 8) & 0xff;
uint32_t bg_blue = (*dest) & 0xff;
uint32_t red = clamp<uint32_t>((0x10000 + fg_red * fg_alpha - bg_red * bg_alpha) / 256, 256, 256 + 255) - 256;
uint32_t green = clamp<uint32_t>((0x10000 + fg_green * fg_alpha - bg_green * bg_alpha) / 256, 256, 256 + 255) - 256;
uint32_t blue = clamp<uint32_t>((0x10000 + fg_blue * fg_alpha - bg_blue * bg_alpha) / 256, 256, 256 + 255) - 256;
*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
}
frac += fracstep; frac += fracstep;
dest += pitch; dest += pitch;
} while (--count); } while (--count);
@ -3116,8 +3127,8 @@ public:
ShadeConstants shade_constants = _shade_constants; ShadeConstants shade_constants = _shade_constants;
uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8); uint32_t src_alpha = _srcalpha >> (FRACBITS - 8);
uint32_t bg_alpha = _destalpha >> (FRACBITS - 8); uint32_t dest_alpha = _destalpha >> (FRACBITS - 8);
DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] }; DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] };
DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] }; DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] };
@ -3133,23 +3144,25 @@ public:
for (int i = 0; i < 4; ++i) for (int i = 0; i < 4; ++i)
{ {
uint32_t pix = bufplce[i][local_vplce[i] >> bits]; uint32_t pix = bufplce[i][local_vplce[i] >> bits];
if (pix != 0)
{
uint32_t fg = shade_bgra(pix, light[i], shade_constants);
uint32_t fg_red = (fg >> 16) & 0xff;
uint32_t fg_green = (fg >> 8) & 0xff;
uint32_t fg_blue = fg & 0xff;
uint32_t bg_red = (dest[i] >> 16) & 0xff; uint32_t fg_alpha, bg_alpha;
uint32_t bg_green = (dest[i] >> 8) & 0xff; calc_blend_alpha(pix, src_alpha, dest_alpha, fg_alpha, bg_alpha);
uint32_t bg_blue = (dest[i]) & 0xff;
uint32_t red = clamp<uint32_t>((0x10000 + fg_red * fg_alpha - bg_red * bg_alpha) / 256, 256, 256 + 255) - 256; uint32_t fg = shade_bgra(pix, light[i], shade_constants);
uint32_t green = clamp<uint32_t>((0x10000 + fg_green * fg_alpha - bg_green * bg_alpha) / 256, 256, 256 + 255) - 256; uint32_t fg_red = (fg >> 16) & 0xff;
uint32_t blue = clamp<uint32_t>((0x10000 + fg_blue * fg_alpha - bg_blue * bg_alpha) / 256, 256, 256 + 255) - 256; uint32_t fg_green = (fg >> 8) & 0xff;
uint32_t fg_blue = fg & 0xff;
uint32_t bg_red = (dest[i] >> 16) & 0xff;
uint32_t bg_green = (dest[i] >> 8) & 0xff;
uint32_t bg_blue = (dest[i]) & 0xff;
uint32_t red = clamp<uint32_t>((0x10000 + fg_red * fg_alpha - bg_red * bg_alpha) / 256, 256, 256 + 255) - 256;
uint32_t green = clamp<uint32_t>((0x10000 + fg_green * fg_alpha - bg_green * bg_alpha) / 256, 256, 256 + 255) - 256;
uint32_t blue = clamp<uint32_t>((0x10000 + fg_blue * fg_alpha - bg_blue * bg_alpha) / 256, 256, 256 + 255) - 256;
dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue;
dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue;
}
local_vplce[i] += local_vince[i]; local_vplce[i] += local_vince[i];
} }
dest += pitch; dest += pitch;
@ -3733,7 +3746,14 @@ fixed_t tmvline1_add_rgba()
void tmvline4_add_rgba() void tmvline4_add_rgba()
{ {
#ifdef NO_SSE
DrawerCommandQueue::QueueCommand<Tmvline4AddRGBACommand>(); DrawerCommandQueue::QueueCommand<Tmvline4AddRGBACommand>();
#else
if (!r_linearlight)
DrawerCommandQueue::QueueCommand<Tmvline4AddRGBA_SSE_Command>();
else
DrawerCommandQueue::QueueCommand<Tmvline4AddRGBA_AVX_Command>();
#endif
for (int i = 0; i < 4; i++) for (int i = 0; i < 4; i++)
vplce[i] += vince[i] * dc_count; vplce[i] += vince[i] * dc_count;
} }
@ -3746,7 +3766,14 @@ fixed_t tmvline1_addclamp_rgba()
void tmvline4_addclamp_rgba() void tmvline4_addclamp_rgba()
{ {
#ifdef NO_SSE
DrawerCommandQueue::QueueCommand<Tmvline4AddClampRGBACommand>(); DrawerCommandQueue::QueueCommand<Tmvline4AddClampRGBACommand>();
#else
if (!r_linearlight)
DrawerCommandQueue::QueueCommand<Tmvline4AddClampRGBA_SSE_Command>();
else
DrawerCommandQueue::QueueCommand<Tmvline4AddClampRGBA_AVX_Command>();
#endif
for (int i = 0; i < 4; i++) for (int i = 0; i < 4; i++)
vplce[i] += vince[i] * dc_count; vplce[i] += vince[i] * dc_count;
} }
@ -3759,7 +3786,14 @@ fixed_t tmvline1_subclamp_rgba()
void tmvline4_subclamp_rgba() void tmvline4_subclamp_rgba()
{ {
#ifdef NO_SSE
DrawerCommandQueue::QueueCommand<Tmvline4SubClampRGBACommand>(); DrawerCommandQueue::QueueCommand<Tmvline4SubClampRGBACommand>();
#else
if (!r_linearlight)
DrawerCommandQueue::QueueCommand<Tmvline4SubClampRGBA_SSE_Command>();
else
DrawerCommandQueue::QueueCommand<Tmvline4SubClampRGBA_AVX_Command>();
#endif
for (int i = 0; i < 4; i++) for (int i = 0; i < 4; i++)
vplce[i] += vince[i] * dc_count; vplce[i] += vince[i] * dc_count;
} }
@ -3772,7 +3806,14 @@ fixed_t tmvline1_revsubclamp_rgba()
void tmvline4_revsubclamp_rgba() void tmvline4_revsubclamp_rgba()
{ {
#ifdef NO_SSE
DrawerCommandQueue::QueueCommand<Tmvline4RevSubClampRGBACommand>(); DrawerCommandQueue::QueueCommand<Tmvline4RevSubClampRGBACommand>();
#else
if (!r_linearlight)
DrawerCommandQueue::QueueCommand<Tmvline4RevSubClampRGBA_SSE_Command>();
else
DrawerCommandQueue::QueueCommand<Tmvline4RevSubClampRGBA_AVX_Command>();
#endif
for (int i = 0; i < 4; i++) for (int i = 0; i < 4; i++)
vplce[i] += vince[i] * dc_count; vplce[i] += vince[i] * dc_count;
} }

View file

@ -417,9 +417,9 @@ FORCEINLINE uint32_t alpha_blend(uint32_t fg, uint32_t bg)
uint32_t bg_green = (bg >> 8) & 0xff; uint32_t bg_green = (bg >> 8) & 0xff;
uint32_t bg_blue = bg & 0xff; uint32_t bg_blue = bg & 0xff;
uint32_t red = ((fg_red * alpha) + (bg_red * inv_alpha)) / 256; uint32_t red = clamp<uint32_t>(fg_red + (bg_red * inv_alpha) / 256, 0, 255);
uint32_t green = ((fg_green * alpha) + (bg_green * inv_alpha)) / 256; uint32_t green = clamp<uint32_t>(fg_green + (bg_green * inv_alpha) / 256, 0, 255);
uint32_t blue = ((fg_blue * alpha) + (bg_blue * inv_alpha)) / 256; uint32_t blue = clamp<uint32_t>(fg_blue + (bg_blue * inv_alpha) / 256, 0, 255);
return 0xff000000 | (red << 16) | (green << 8) | blue; return 0xff000000 | (red << 16) | (green << 8) | blue;
} }
@ -543,7 +543,7 @@ FORCEINLINE uint32_t alpha_blend(uint32_t fg, uint32_t bg)
/* /*
// Complex shade 8 pixels // Complex shade 8 pixels
#define AVX2_SHADE(fg, shade_constants) { \ #define AVX_SHADE(fg, shade_constants) { \
__m256i fg_hi = _mm256_unpackhi_epi8(fg, _mm256_setzero_si256()); \ __m256i fg_hi = _mm256_unpackhi_epi8(fg, _mm256_setzero_si256()); \
__m256i fg_lo = _mm256_unpacklo_epi8(fg, _mm256_setzero_si256()); \ __m256i fg_lo = _mm256_unpacklo_epi8(fg, _mm256_setzero_si256()); \
\ \
@ -566,8 +566,58 @@ FORCEINLINE uint32_t alpha_blend(uint32_t fg, uint32_t bg)
} }
*/ */
// Normal premultiplied alpha blend using the alpha from fg
#define VEC_ALPHA_BLEND(fg,bg) { \
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); \
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); \
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); \
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); \
__m128i m255 = _mm_set1_epi16(255); \
__m128i inv_alpha_hi = _mm_sub_epi16(m255, _mm_shufflehi_epi16(_mm_shufflelo_epi16(fg_hi, _MM_SHUFFLE(3,3,3,3)), _MM_SHUFFLE(3,3,3,3))); \
__m128i inv_alpha_lo = _mm_sub_epi16(m255, _mm_shufflehi_epi16(_mm_shufflelo_epi16(fg_lo, _MM_SHUFFLE(3,3,3,3)), _MM_SHUFFLE(3,3,3,3))); \
inv_alpha_hi = _mm_add_epi16(inv_alpha_hi, _mm_srli_epi16(inv_alpha_hi, 7)); \
inv_alpha_lo = _mm_add_epi16(inv_alpha_lo, _mm_srli_epi16(inv_alpha_lo, 7)); \
bg_hi = _mm_mullo_epi16(bg_hi, inv_alpha_hi); \
bg_hi = _mm_srli_epi16(bg_hi, 8); \
bg_lo = _mm_mullo_epi16(bg_lo, inv_alpha_lo); \
bg_lo = _mm_srli_epi16(bg_lo, 8); \
bg = _mm_packus_epi16(bg_lo, bg_hi); \
fg = _mm_adds_epu8(fg, bg); \
}
/*
FORCEINLINE void calc_blend_alpha(uint32_t fg, uint32_t src_alpha, uint32_t dest_alpha, uint32_t &fg_alpha, uint32_t &bg_alpha)
{
fg_alpha = src_alpha;
bg_alpha = dest_alpha;
}
#define VEC_CALC_BLEND_ALPHA(fg, msrc_alpha, mdest_alpha) \
__m128i fg_alpha_hi = msrc_alpha; \
__m128i fg_alpha_lo = msrc_alpha; \
__m128i bg_alpha_hi = mdest_alpha; \
__m128i bg_alpha_lo = mdest_alpha;
*/
// Calculates the final alpha values to be used when combined with the source texture alpha channel
FORCEINLINE void calc_blend_alpha(uint32_t fg, uint32_t src_alpha, uint32_t dest_alpha, uint32_t &fg_alpha, uint32_t &bg_alpha)
{
fg_alpha = (fg >> 24) & 0xff;
fg_alpha += fg_alpha >> 7;
bg_alpha = (dest_alpha * (256 - fg_alpha)) >> 8;
fg_alpha = (src_alpha * fg_alpha) >> 8;
}
// Calculates the final alpha values to be used when combined with the source texture alpha channel
#define VEC_CALC_BLEND_ALPHA(fg, msrc_alpha, mdest_alpha) \
__m128i fg_alpha_hi = _mm_shufflehi_epi16(_mm_shufflelo_epi16(_mm_unpackhi_epi8(fg, _mm_setzero_si128()), _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)); \
__m128i fg_alpha_lo = _mm_shufflehi_epi16(_mm_shufflelo_epi16(_mm_unpacklo_epi8(fg, _mm_setzero_si128()), _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)); \
fg_alpha_hi = _mm_add_epi16(fg_alpha_hi, _mm_srli_epi16(fg_alpha_hi, 7)); \
fg_alpha_lo = _mm_add_epi16(fg_alpha_lo, _mm_srli_epi16(fg_alpha_lo, 7)); \
__m128i bg_alpha_hi = _mm_srli_epi16(_mm_mullo_epi16(_mm_sub_epi16(_mm_set1_epi16(256), fg_alpha_hi), mdest_alpha), 8); \
__m128i bg_alpha_lo = _mm_srli_epi16(_mm_mullo_epi16(_mm_sub_epi16(_mm_set1_epi16(256), fg_alpha_lo), mdest_alpha), 8); \
fg_alpha_hi = _mm_srli_epi16(_mm_mullo_epi16(fg_alpha_hi, msrc_alpha), 8); \
fg_alpha_lo = _mm_srli_epi16(_mm_mullo_epi16(fg_alpha_lo, msrc_alpha), 8);
// Calculate constants for a simple shade // Calculate constants for a simple shade
#define SSE_SHADE_SIMPLE_INIT(light) \ #define SSE_SHADE_SIMPLE_INIT(light) \

View file

@ -444,17 +444,16 @@ public:
uint32_t pix2 = bufplce[2][place2 >> bits]; uint32_t pix2 = bufplce[2][place2 >> bits];
uint32_t pix3 = bufplce[3][place3 >> bits]; uint32_t pix3 = bufplce[3][place3 >> bits];
// movemask = !(pix == 0)
__m128i movemask = _mm_xor_si128(_mm_cmpeq_epi32(_mm_set_epi32(pix3, pix2, pix1, pix0), _mm_setzero_si128()), _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));
local_vplce[0] = place0 + local_vince[0]; local_vplce[0] = place0 + local_vince[0];
local_vplce[1] = place1 + local_vince[1]; local_vplce[1] = place1 + local_vince[1];
local_vplce[2] = place2 + local_vince[2]; local_vplce[2] = place2 + local_vince[2];
local_vplce[3] = place3 + local_vince[3]; local_vplce[3] = place3 + local_vince[3];
__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0); __m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
VEC_SHADE_SIMPLE(fg); VEC_SHADE_SIMPLE(fg);
_mm_maskmoveu_si128(fg, movemask, (char*)dest); VEC_ALPHA_BLEND(fg, bg);
_mm_storeu_si128((__m128i*)dest, fg);
dest += pitch; dest += pitch;
} while (--count); } while (--count);
} }
@ -473,17 +472,585 @@ public:
uint32_t pix2 = bufplce[2][place2 >> bits]; uint32_t pix2 = bufplce[2][place2 >> bits];
uint32_t pix3 = bufplce[3][place3 >> bits]; uint32_t pix3 = bufplce[3][place3 >> bits];
// movemask = !(pix == 0)
__m128i movemask = _mm_xor_si128(_mm_cmpeq_epi32(_mm_set_epi32(pix3, pix2, pix1, pix0), _mm_setzero_si128()), _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));
local_vplce[0] = place0 + local_vince[0]; local_vplce[0] = place0 + local_vince[0];
local_vplce[1] = place1 + local_vince[1]; local_vplce[1] = place1 + local_vince[1];
local_vplce[2] = place2 + local_vince[2]; local_vplce[2] = place2 + local_vince[2];
local_vplce[3] = place3 + local_vince[3]; local_vplce[3] = place3 + local_vince[3];
__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0); __m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
VEC_SHADE(fg, shade_constants); VEC_SHADE(fg, shade_constants);
_mm_maskmoveu_si128(fg, movemask, (char*)dest); VEC_ALPHA_BLEND(fg, bg);
_mm_storeu_si128((__m128i*)dest, fg);
dest += pitch;
} while (--count);
}
}
};
class VecCommand(Tmvline4AddRGBA) : public DrawerCommand
{
BYTE * RESTRICT _dest;
int _count;
int _pitch;
ShadeConstants _shade_constants;
fixed_t _srcalpha;
fixed_t _destalpha;
int tmvlinebits;
fixed_t palookuplight[4];
DWORD vplce[4];
DWORD vince[4];
const uint32 * RESTRICT bufplce[4];
public:
VecCommand(Tmvline4AddRGBA)()
{
_dest = dc_dest;
_count = dc_count;
_pitch = dc_pitch;
_shade_constants = dc_shade_constants;
_srcalpha = dc_srcalpha;
_destalpha = dc_destalpha;
tmvlinebits = ::tmvlinebits;
for (int i = 0; i < 4; i++)
{
palookuplight[i] = ::palookuplight[i];
vplce[i] = ::vplce[i];
vince[i] = ::vince[i];
bufplce[i] = (const uint32 *)::bufplce[i];
}
}
void Execute(DrawerThread *thread) override
{
int count = thread->count_for_thread(_dest_y, _count);
if (count <= 0)
return;
uint32_t *dest = thread->dest_for_thread(_dest_y, _pitch, (uint32_t*)_dest);
int pitch = _pitch * thread->num_cores;
int bits = tmvlinebits;
uint32_t light[4];
light[0] = calc_light_multiplier(palookuplight[0]);
light[1] = calc_light_multiplier(palookuplight[1]);
light[2] = calc_light_multiplier(palookuplight[2]);
light[3] = calc_light_multiplier(palookuplight[3]);
ShadeConstants shade_constants = _shade_constants;
uint32_t src_alpha = _srcalpha >> (FRACBITS - 8);
uint32_t dest_alpha = _destalpha >> (FRACBITS - 8);
DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] };
DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] };
int skipped = thread->skipped_by_thread(_dest_y);
for (int i = 0; i < 4; i++)
{
local_vplce[i] += local_vince[i] * skipped;
local_vince[i] *= thread->num_cores;
}
if (shade_constants.simple_shade)
{
VEC_SHADE_SIMPLE_INIT4(light[3], light[2], light[1], light[0]);
__m128i msrc_alpha = _mm_set1_epi16(src_alpha);
__m128i mdest_alpha = _mm_set1_epi16(dest_alpha);
do
{
uint32_t pix0 = bufplce[0][local_vplce[0] >> bits];
uint32_t pix1 = bufplce[1][local_vplce[1] >> bits];
uint32_t pix2 = bufplce[2][local_vplce[2] >> bits];
uint32_t pix3 = bufplce[3][local_vplce[3] >> bits];
local_vplce[0] = local_vplce[0] + local_vince[0];
local_vplce[1] = local_vplce[1] + local_vince[1];
local_vplce[2] = local_vplce[2] + local_vince[2];
local_vplce[3] = local_vplce[3] + local_vince[3];
__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
VEC_CALC_BLEND_ALPHA(fg, msrc_alpha, mdest_alpha);
VEC_SHADE_SIMPLE(fg);
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
__m128i out_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8);
__m128i out_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8);
__m128i out = _mm_packus_epi16(out_lo, out_hi);
_mm_storeu_si128((__m128i*)dest, out);
dest += pitch;
} while (--count);
}
else
{
VEC_SHADE_INIT4(light[3], light[2], light[1], light[0], shade_constants);
__m128i msrc_alpha = _mm_set1_epi16(src_alpha);
__m128i mdest_alpha = _mm_set1_epi16(dest_alpha);
do
{
uint32_t pix0 = bufplce[0][local_vplce[0] >> bits];
uint32_t pix1 = bufplce[1][local_vplce[1] >> bits];
uint32_t pix2 = bufplce[2][local_vplce[2] >> bits];
uint32_t pix3 = bufplce[3][local_vplce[3] >> bits];
local_vplce[0] = local_vplce[0] + local_vince[0];
local_vplce[1] = local_vplce[1] + local_vince[1];
local_vplce[2] = local_vplce[2] + local_vince[2];
local_vplce[3] = local_vplce[3] + local_vince[3];
__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
VEC_CALC_BLEND_ALPHA(fg, msrc_alpha, mdest_alpha);
VEC_SHADE(fg, shade_constants);
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
__m128i out_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8);
__m128i out_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8);
__m128i out = _mm_packus_epi16(out_lo, out_hi);
_mm_storeu_si128((__m128i*)dest, out);
dest += pitch;
} while (--count);
}
}
};
class VecCommand(Tmvline4AddClampRGBA) : public DrawerCommand
{
BYTE * RESTRICT _dest;
int _count;
int _pitch;
ShadeConstants _shade_constants;
fixed_t _srcalpha;
fixed_t _destalpha;
int tmvlinebits;
fixed_t palookuplight[4];
DWORD vplce[4];
DWORD vince[4];
const uint32 *RESTRICT bufplce[4];
public:
VecCommand(Tmvline4AddClampRGBA)()
{
_dest = dc_dest;
_count = dc_count;
_pitch = dc_pitch;
_shade_constants = dc_shade_constants;
_srcalpha = dc_srcalpha;
_destalpha = dc_destalpha;
tmvlinebits = ::tmvlinebits;
for (int i = 0; i < 4; i++)
{
palookuplight[i] = ::palookuplight[i];
vplce[i] = ::vplce[i];
vince[i] = ::vince[i];
bufplce[i] = (const uint32 *)::bufplce[i];
}
}
void Execute(DrawerThread *thread) override
{
int count = thread->count_for_thread(_dest_y, _count);
if (count <= 0)
return;
uint32_t *dest = thread->dest_for_thread(_dest_y, _pitch, (uint32_t*)_dest);
int pitch = _pitch * thread->num_cores;
int bits = tmvlinebits;
uint32_t light[4];
light[0] = calc_light_multiplier(palookuplight[0]);
light[1] = calc_light_multiplier(palookuplight[1]);
light[2] = calc_light_multiplier(palookuplight[2]);
light[3] = calc_light_multiplier(palookuplight[3]);
ShadeConstants shade_constants = _shade_constants;
uint32_t src_alpha = _srcalpha >> (FRACBITS - 8);
uint32_t dest_alpha = _destalpha >> (FRACBITS - 8);
DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] };
DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] };
int skipped = thread->skipped_by_thread(_dest_y);
for (int i = 0; i < 4; i++)
{
local_vplce[i] += local_vince[i] * skipped;
local_vince[i] *= thread->num_cores;
}
if (shade_constants.simple_shade)
{
VEC_SHADE_SIMPLE_INIT4(light[3], light[2], light[1], light[0]);
__m128i msrc_alpha = _mm_set1_epi16(src_alpha);
__m128i mdest_alpha = _mm_set1_epi16(dest_alpha);
do
{
uint32_t pix0 = bufplce[0][local_vplce[0] >> bits];
uint32_t pix1 = bufplce[1][local_vplce[1] >> bits];
uint32_t pix2 = bufplce[2][local_vplce[2] >> bits];
uint32_t pix3 = bufplce[3][local_vplce[3] >> bits];
local_vplce[0] = local_vplce[0] + local_vince[0];
local_vplce[1] = local_vplce[1] + local_vince[1];
local_vplce[2] = local_vplce[2] + local_vince[2];
local_vplce[3] = local_vplce[3] + local_vince[3];
__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
VEC_CALC_BLEND_ALPHA(fg, msrc_alpha, mdest_alpha);
VEC_SHADE_SIMPLE(fg);
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
__m128i out_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8);
__m128i out_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8);
__m128i out = _mm_packus_epi16(out_lo, out_hi);
_mm_storeu_si128((__m128i*)dest, out);
dest += pitch;
} while (--count);
}
else
{
VEC_SHADE_INIT4(light[3], light[2], light[1], light[0], shade_constants);
__m128i msrc_alpha = _mm_set1_epi16(src_alpha);
__m128i mdest_alpha = _mm_set1_epi16(dest_alpha);
do
{
uint32_t pix0 = bufplce[0][local_vplce[0] >> bits];
uint32_t pix1 = bufplce[1][local_vplce[1] >> bits];
uint32_t pix2 = bufplce[2][local_vplce[2] >> bits];
uint32_t pix3 = bufplce[3][local_vplce[3] >> bits];
local_vplce[0] = local_vplce[0] + local_vince[0];
local_vplce[1] = local_vplce[1] + local_vince[1];
local_vplce[2] = local_vplce[2] + local_vince[2];
local_vplce[3] = local_vplce[3] + local_vince[3];
__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
VEC_CALC_BLEND_ALPHA(fg, msrc_alpha, mdest_alpha);
VEC_SHADE(fg, shade_constants);
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
__m128i out_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8);
__m128i out_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8);
__m128i out = _mm_packus_epi16(out_lo, out_hi);
_mm_storeu_si128((__m128i*)dest, out);
dest += pitch;
} while (--count);
}
}
};
class VecCommand(Tmvline4SubClampRGBA) : public DrawerCommand
{
BYTE * RESTRICT _dest;
int _count;
int _pitch;
ShadeConstants _shade_constants;
fixed_t _srcalpha;
fixed_t _destalpha;
int tmvlinebits;
fixed_t palookuplight[4];
DWORD vplce[4];
DWORD vince[4];
const uint32 *RESTRICT bufplce[4];
public:
VecCommand(Tmvline4SubClampRGBA)()
{
_dest = dc_dest;
_count = dc_count;
_pitch = dc_pitch;
_shade_constants = dc_shade_constants;
_srcalpha = dc_srcalpha;
_destalpha = dc_destalpha;
tmvlinebits = ::tmvlinebits;
for (int i = 0; i < 4; i++)
{
palookuplight[i] = ::palookuplight[i];
vplce[i] = ::vplce[i];
vince[i] = ::vince[i];
bufplce[i] = (const uint32 *)::bufplce[i];
}
}
void Execute(DrawerThread *thread) override
{
int count = thread->count_for_thread(_dest_y, _count);
if (count <= 0)
return;
uint32_t *dest = thread->dest_for_thread(_dest_y, _pitch, (uint32_t*)_dest);
int pitch = _pitch * thread->num_cores;
int bits = tmvlinebits;
uint32_t light[4];
light[0] = calc_light_multiplier(palookuplight[0]);
light[1] = calc_light_multiplier(palookuplight[1]);
light[2] = calc_light_multiplier(palookuplight[2]);
light[3] = calc_light_multiplier(palookuplight[3]);
ShadeConstants shade_constants = _shade_constants;
uint32_t src_alpha = _srcalpha >> (FRACBITS - 8);
uint32_t dest_alpha = _destalpha >> (FRACBITS - 8);
DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] };
DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] };
int skipped = thread->skipped_by_thread(_dest_y);
for (int i = 0; i < 4; i++)
{
local_vplce[i] += local_vince[i] * skipped;
local_vince[i] *= thread->num_cores;
}
if (shade_constants.simple_shade)
{
VEC_SHADE_SIMPLE_INIT4(light[3], light[2], light[1], light[0]);
__m128i msrc_alpha = _mm_set1_epi16(src_alpha);
__m128i mdest_alpha = _mm_set1_epi16(dest_alpha);
do
{
uint32_t pix0 = bufplce[0][local_vplce[0] >> bits];
uint32_t pix1 = bufplce[1][local_vplce[1] >> bits];
uint32_t pix2 = bufplce[2][local_vplce[2] >> bits];
uint32_t pix3 = bufplce[3][local_vplce[3] >> bits];
local_vplce[0] = local_vplce[0] + local_vince[0];
local_vplce[1] = local_vplce[1] + local_vince[1];
local_vplce[2] = local_vplce[2] + local_vince[2];
local_vplce[3] = local_vplce[3] + local_vince[3];
__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
VEC_CALC_BLEND_ALPHA(fg, msrc_alpha, mdest_alpha);
VEC_SHADE_SIMPLE(fg);
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
__m128i out_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_hi, bg_alpha_hi), _mm_mullo_epi16(fg_hi, fg_alpha_hi)), 8);
__m128i out_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_lo, bg_alpha_lo), _mm_mullo_epi16(fg_lo, fg_alpha_lo)), 8);
__m128i out = _mm_packus_epi16(out_lo, out_hi);
_mm_storeu_si128((__m128i*)dest, out);
dest += pitch;
} while (--count);
}
else
{
VEC_SHADE_INIT4(light[3], light[2], light[1], light[0], shade_constants);
__m128i msrc_alpha = _mm_set1_epi16(src_alpha);
__m128i mdest_alpha = _mm_set1_epi16(dest_alpha);
do
{
uint32_t pix0 = bufplce[0][local_vplce[0] >> bits];
uint32_t pix1 = bufplce[1][local_vplce[1] >> bits];
uint32_t pix2 = bufplce[2][local_vplce[2] >> bits];
uint32_t pix3 = bufplce[3][local_vplce[3] >> bits];
local_vplce[0] = local_vplce[0] + local_vince[0];
local_vplce[1] = local_vplce[1] + local_vince[1];
local_vplce[2] = local_vplce[2] + local_vince[2];
local_vplce[3] = local_vplce[3] + local_vince[3];
__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
VEC_CALC_BLEND_ALPHA(fg, msrc_alpha, mdest_alpha);
VEC_SHADE(fg, shade_constants);
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
__m128i out_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_hi, bg_alpha_hi), _mm_mullo_epi16(fg_hi, fg_alpha_hi)), 8);
__m128i out_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_lo, bg_alpha_lo), _mm_mullo_epi16(fg_lo, fg_alpha_lo)), 8);
__m128i out = _mm_packus_epi16(out_lo, out_hi);
_mm_storeu_si128((__m128i*)dest, out);
dest += pitch;
} while (--count);
}
}
};
class VecCommand(Tmvline4RevSubClampRGBA) : public DrawerCommand
{
BYTE * RESTRICT _dest;
int _count;
int _pitch;
ShadeConstants _shade_constants;
fixed_t _srcalpha;
fixed_t _destalpha;
int tmvlinebits;
fixed_t palookuplight[4];
DWORD vplce[4];
DWORD vince[4];
const uint32 *RESTRICT bufplce[4];
public:
VecCommand(Tmvline4RevSubClampRGBA)()
{
_dest = dc_dest;
_count = dc_count;
_pitch = dc_pitch;
_shade_constants = dc_shade_constants;
_srcalpha = dc_srcalpha;
_destalpha = dc_destalpha;
tmvlinebits = ::tmvlinebits;
for (int i = 0; i < 4; i++)
{
palookuplight[i] = ::palookuplight[i];
vplce[i] = ::vplce[i];
vince[i] = ::vince[i];
bufplce[i] = (const uint32 *)::bufplce[i];
}
}
void Execute(DrawerThread *thread) override
{
int count = thread->count_for_thread(_dest_y, _count);
if (count <= 0)
return;
uint32_t *dest = thread->dest_for_thread(_dest_y, _pitch, (uint32_t*)_dest);
int pitch = _pitch * thread->num_cores;
int bits = tmvlinebits;
uint32_t light[4];
light[0] = calc_light_multiplier(palookuplight[0]);
light[1] = calc_light_multiplier(palookuplight[1]);
light[2] = calc_light_multiplier(palookuplight[2]);
light[3] = calc_light_multiplier(palookuplight[3]);
ShadeConstants shade_constants = _shade_constants;
uint32_t src_alpha = _srcalpha >> (FRACBITS - 8);
uint32_t dest_alpha = _destalpha >> (FRACBITS - 8);
DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] };
DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] };
int skipped = thread->skipped_by_thread(_dest_y);
for (int i = 0; i < 4; i++)
{
local_vplce[i] += local_vince[i] * skipped;
local_vince[i] *= thread->num_cores;
}
if (shade_constants.simple_shade)
{
VEC_SHADE_SIMPLE_INIT4(light[3], light[2], light[1], light[0]);
__m128i msrc_alpha = _mm_set1_epi16(src_alpha);
__m128i mdest_alpha = _mm_set1_epi16(dest_alpha);
do
{
uint32_t pix0 = bufplce[0][local_vplce[0] >> bits];
uint32_t pix1 = bufplce[1][local_vplce[1] >> bits];
uint32_t pix2 = bufplce[2][local_vplce[2] >> bits];
uint32_t pix3 = bufplce[3][local_vplce[3] >> bits];
local_vplce[0] = local_vplce[0] + local_vince[0];
local_vplce[1] = local_vplce[1] + local_vince[1];
local_vplce[2] = local_vplce[2] + local_vince[2];
local_vplce[3] = local_vplce[3] + local_vince[3];
__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
VEC_CALC_BLEND_ALPHA(fg, msrc_alpha, mdest_alpha);
VEC_SHADE_SIMPLE(fg);
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
__m128i out_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8);
__m128i out_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8);
__m128i out = _mm_packus_epi16(out_lo, out_hi);
_mm_storeu_si128((__m128i*)dest, out);
dest += pitch;
} while (--count);
}
else
{
VEC_SHADE_INIT4(light[3], light[2], light[1], light[0], shade_constants);
__m128i msrc_alpha = _mm_set1_epi16(src_alpha);
__m128i mdest_alpha = _mm_set1_epi16(dest_alpha);
do
{
uint32_t pix0 = bufplce[0][local_vplce[0] >> bits];
uint32_t pix1 = bufplce[1][local_vplce[1] >> bits];
uint32_t pix2 = bufplce[2][local_vplce[2] >> bits];
uint32_t pix3 = bufplce[3][local_vplce[3] >> bits];
local_vplce[0] = local_vplce[0] + local_vince[0];
local_vplce[1] = local_vplce[1] + local_vince[1];
local_vplce[2] = local_vplce[2] + local_vince[2];
local_vplce[3] = local_vplce[3] + local_vince[3];
__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
VEC_CALC_BLEND_ALPHA(fg, msrc_alpha, mdest_alpha);
VEC_SHADE(fg, shade_constants);
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
__m128i out_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8);
__m128i out_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8);
__m128i out = _mm_packus_epi16(out_lo, out_hi);
_mm_storeu_si128((__m128i*)dest, out);
dest += pitch; dest += pitch;
} while (--count); } while (--count);
} }

View file

@ -1436,7 +1436,14 @@ void rt_subclamp1col_rgba (int hx, int sx, int yl, int yh)
// Subtracts all four spans to the screen starting at sx with clamping. // Subtracts all four spans to the screen starting at sx with clamping.
void rt_subclamp4cols_rgba (int sx, int yl, int yh) void rt_subclamp4cols_rgba (int sx, int yl, int yh)
{ {
#ifdef NO_SSE
DrawerCommandQueue::QueueCommand<RtSubClamp4colsRGBACommand>(sx, yl, yh); DrawerCommandQueue::QueueCommand<RtSubClamp4colsRGBACommand>(sx, yl, yh);
#else
if (!r_linearlight)
DrawerCommandQueue::QueueCommand<RtSubClamp4colsRGBA_SSE_Command>(sx, yl, yh);
else
DrawerCommandQueue::QueueCommand<RtSubClamp4colsRGBA_AVX_Command>(sx, yl, yh);
#endif
} }
// Translates and subtracts one span at hx to the screen at sx with clamping. // Translates and subtracts one span at hx to the screen at sx with clamping.
@ -1462,7 +1469,14 @@ void rt_revsubclamp1col_rgba (int hx, int sx, int yl, int yh)
// Subtracts all four spans from the screen starting at sx with clamping. // Subtracts all four spans from the screen starting at sx with clamping.
void rt_revsubclamp4cols_rgba (int sx, int yl, int yh) void rt_revsubclamp4cols_rgba (int sx, int yl, int yh)
{ {
#ifdef NO_SSE
DrawerCommandQueue::QueueCommand<RtRevSubClamp4colsRGBACommand>(sx, yl, yh); DrawerCommandQueue::QueueCommand<RtRevSubClamp4colsRGBACommand>(sx, yl, yh);
#else
if (!r_linearlight)
DrawerCommandQueue::QueueCommand<RtRevSubClamp4colsRGBA_SSE_Command>(sx, yl, yh);
else
DrawerCommandQueue::QueueCommand<RtRevSubClamp4colsRGBA_AVX_Command>(sx, yl, yh);
#endif
} }
// Translates and subtracts one span at hx from the screen at sx with clamping. // Translates and subtracts one span at hx from the screen at sx with clamping.

View file

@ -493,3 +493,255 @@ public:
} }
} }
}; };
class VecCommand(RtSubClamp4colsRGBA) : public DrawerCommand
{
int sx;
int yl;
int yh;
BYTE * RESTRICT _destorg;
int _pitch;
fixed_t _light;
fixed_t _srcalpha;
fixed_t _destalpha;
ShadeConstants _shade_constants;
public:
VecCommand(RtSubClamp4colsRGBA)(int sx, int yl, int yh)
{
this->sx = sx;
this->yl = yl;
this->yh = yh;
_destorg = dc_destorg;
_pitch = dc_pitch;
_light = dc_light;
_srcalpha = dc_srcalpha;
_destalpha = dc_destalpha;
_shade_constants = dc_shade_constants;
}
void Execute(DrawerThread *thread) override
{
uint32_t *source;
uint32_t *dest;
int count;
int pitch;
int sincr;
count = thread->count_for_thread(yl, yh - yl + 1);
if (count <= 0)
return;
dest = thread->dest_for_thread(yl, _pitch, ylookup[yl] + sx + (uint32_t*)_destorg);
source = &thread->dc_temp_rgba[yl * 4] + thread->skipped_by_thread(yl) * 4;
pitch = _pitch * thread->num_cores;
sincr = 4 * thread->num_cores;
uint32_t light = calc_light_multiplier(_light);
uint32_t *palette = (uint32_t*)GPalette.BaseColors;
ShadeConstants shade_constants = _shade_constants;
uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8);
uint32_t bg_alpha = _destalpha >> (FRACBITS - 8);
if (shade_constants.simple_shade)
{
VEC_SHADE_SIMPLE_INIT(light);
__m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha);
__m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha);
do {
uint32_t p0 = source[0];
uint32_t p1 = source[1];
uint32_t p2 = source[2];
uint32_t p3 = source[3];
// shade_pal_index:
__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
VEC_SHADE_SIMPLE(fg);
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
// unpack bg:
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
// (bg_red * bg_alpha - fg_red * fg_alpha) / 256:
__m128i color_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_hi, mbg_alpha), _mm_mullo_epi16(fg_hi, mfg_alpha)), 8);
__m128i color_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_lo, mbg_alpha), _mm_mullo_epi16(fg_lo, mfg_alpha)), 8);
__m128i color = _mm_packus_epi16(color_lo, color_hi);
_mm_storeu_si128((__m128i*)dest, color);
source += sincr;
dest += pitch;
} while (--count);
}
else
{
VEC_SHADE_INIT(light, shade_constants);
__m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha);
__m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha);
do {
uint32_t p0 = source[0];
uint32_t p1 = source[1];
uint32_t p2 = source[2];
uint32_t p3 = source[3];
// shade_pal_index:
__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
VEC_SHADE(fg, shade_constants);
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
// unpack bg:
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
// (bg_red * bg_alpha - fg_red * fg_alpha) / 256:
__m128i color_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_hi, mbg_alpha), _mm_mullo_epi16(fg_hi, mfg_alpha)), 8);
__m128i color_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_lo, mbg_alpha), _mm_mullo_epi16(fg_lo, mfg_alpha)), 8);
__m128i color = _mm_packus_epi16(color_lo, color_hi);
_mm_storeu_si128((__m128i*)dest, color);
source += sincr;
dest += pitch;
} while (--count);
}
}
};
class VecCommand(RtRevSubClamp4colsRGBA) : public DrawerCommand
{
int sx;
int yl;
int yh;
BYTE * RESTRICT _destorg;
int _pitch;
fixed_t _light;
fixed_t _srcalpha;
fixed_t _destalpha;
ShadeConstants _shade_constants;
public:
VecCommand(RtRevSubClamp4colsRGBA)(int sx, int yl, int yh)
{
this->sx = sx;
this->yl = yl;
this->yh = yh;
_destorg = dc_destorg;
_pitch = dc_pitch;
_light = dc_light;
_srcalpha = dc_srcalpha;
_destalpha = dc_destalpha;
_shade_constants = dc_shade_constants;
}
void Execute(DrawerThread *thread) override
{
uint32_t *source;
uint32_t *dest;
int count;
int pitch;
int sincr;
count = thread->count_for_thread(yl, yh - yl + 1);
if (count <= 0)
return;
dest = thread->dest_for_thread(yl, _pitch, ylookup[yl] + sx + (uint32_t*)_destorg);
source = &thread->dc_temp_rgba[yl * 4] + thread->skipped_by_thread(yl) * 4;
pitch = _pitch * thread->num_cores;
sincr = 4 * thread->num_cores;
uint32_t light = calc_light_multiplier(_light);
uint32_t *palette = (uint32_t*)GPalette.BaseColors;
ShadeConstants shade_constants = _shade_constants;
uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8);
uint32_t bg_alpha = _destalpha >> (FRACBITS - 8);
if (shade_constants.simple_shade)
{
VEC_SHADE_SIMPLE_INIT(light);
__m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha);
__m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha);
do {
uint32_t p0 = source[0];
uint32_t p1 = source[1];
uint32_t p2 = source[2];
uint32_t p3 = source[3];
// shade_pal_index:
__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
VEC_SHADE_SIMPLE(fg);
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
// unpack bg:
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
// (fg_red * fg_alpha - bg_red * bg_alpha) / 256:
__m128i color_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_hi, mfg_alpha), _mm_mullo_epi16(bg_hi, mbg_alpha)), 8);
__m128i color_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_lo, mfg_alpha), _mm_mullo_epi16(bg_lo, mbg_alpha)), 8);
__m128i color = _mm_packus_epi16(color_lo, color_hi);
_mm_storeu_si128((__m128i*)dest, color);
source += sincr;
dest += pitch;
} while (--count);
}
else
{
VEC_SHADE_INIT(light, shade_constants);
__m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha);
__m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha);
do {
uint32_t p0 = source[0];
uint32_t p1 = source[1];
uint32_t p2 = source[2];
uint32_t p3 = source[3];
// shade_pal_index:
__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
VEC_SHADE(fg, shade_constants);
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
// unpack bg:
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
// (fg_red * fg_alpha - bg_red * bg_alpha) / 256:
__m128i color_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_hi, mfg_alpha), _mm_mullo_epi16(bg_hi, mbg_alpha)), 8);
__m128i color_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_lo, mfg_alpha), _mm_mullo_epi16(bg_lo, mbg_alpha)), 8);
__m128i color = _mm_packus_epi16(color_lo, color_hi);
_mm_storeu_si128((__m128i*)dest, color);
source += sincr;
dest += pitch;
} while (--count);
}
}
};