From 38aba81dcc816ce9bb0888f95b94f73714771f67 Mon Sep 17 00:00:00 2001 From: Magnus Norddahl Date: Sun, 19 Jun 2016 23:11:41 +0200 Subject: [PATCH] Added more SSE drawers --- src/r_draw_rgba.cpp | 313 ++++++++++++---------- src/r_draw_rgba.h | 58 +++- src/r_draw_rgba_sse.h | 583 ++++++++++++++++++++++++++++++++++++++++- src/r_drawt_rgba.cpp | 14 + src/r_drawt_rgba_sse.h | 252 ++++++++++++++++++ 5 files changed, 1072 insertions(+), 148 deletions(-) diff --git a/src/r_draw_rgba.cpp b/src/r_draw_rgba.cpp index 28c5df2ac6..96232ab0c0 100644 --- a/src/r_draw_rgba.cpp +++ b/src/r_draw_rgba.cpp @@ -2411,10 +2411,7 @@ public: do { uint32_t pix = source[frac >> bits]; - if (pix != 0) - { - *dest = shade_bgra(pix, light, shade_constants); - } + *dest = alpha_blend(shade_bgra(pix, light, shade_constants), *dest); frac += fracstep; dest += pitch; } while (--count); @@ -2480,10 +2477,10 @@ public: do { uint32_t pix; - pix = bufplce[0][(place = local_vplce[0]) >> bits]; if (pix) dest[0] = shade_bgra(pix, light0, shade_constants); local_vplce[0] = place + local_vince[0]; - pix = bufplce[1][(place = local_vplce[1]) >> bits]; if (pix) dest[1] = shade_bgra(pix, light1, shade_constants); local_vplce[1] = place + local_vince[1]; - pix = bufplce[2][(place = local_vplce[2]) >> bits]; if (pix) dest[2] = shade_bgra(pix, light2, shade_constants); local_vplce[2] = place + local_vince[2]; - pix = bufplce[3][(place = local_vplce[3]) >> bits]; if (pix) dest[3] = shade_bgra(pix, light3, shade_constants); local_vplce[3] = place + local_vince[3]; + pix = bufplce[0][(place = local_vplce[0]) >> bits]; dest[0] = alpha_blend(shade_bgra(pix, light0, shade_constants), dest[0]); local_vplce[0] = place + local_vince[0]; + pix = bufplce[1][(place = local_vplce[1]) >> bits]; dest[1] = alpha_blend(shade_bgra(pix, light1, shade_constants), dest[1]); local_vplce[1] = place + local_vince[1]; + pix = bufplce[2][(place = local_vplce[2]) >> bits]; dest[2] = alpha_blend(shade_bgra(pix, light2, shade_constants), dest[2]); local_vplce[2] = place + local_vince[2]; + pix = bufplce[3][(place = local_vplce[3]) >> bits]; dest[3] = alpha_blend(shade_bgra(pix, light3, shade_constants), dest[3]); local_vplce[3] = place + local_vince[3]; dest += pitch; } while (--count); } @@ -2535,29 +2532,31 @@ public: uint32_t light = calc_light_multiplier(_light); ShadeConstants shade_constants = _shade_constants; - uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8); - uint32_t bg_alpha = _destalpha >> (FRACBITS - 8); + uint32_t src_alpha = _srcalpha >> (FRACBITS - 8); + uint32_t dest_alpha = _destalpha >> (FRACBITS - 8); do { uint32_t pix = source[frac >> bits]; - if (pix != 0) - { - uint32_t fg = shade_bgra(pix, light, shade_constants); - uint32_t fg_red = (fg >> 16) & 0xff; - uint32_t fg_green = (fg >> 8) & 0xff; - uint32_t fg_blue = fg & 0xff; - uint32_t bg_red = (*dest >> 16) & 0xff; - uint32_t bg_green = (*dest >> 8) & 0xff; - uint32_t bg_blue = (*dest) & 0xff; + uint32_t fg_alpha, bg_alpha; + calc_blend_alpha(pix, src_alpha, dest_alpha, fg_alpha, bg_alpha); - uint32_t red = clamp((fg_red * fg_alpha + bg_red * bg_alpha) / 256, 0, 255); - uint32_t green = clamp((fg_green * fg_alpha + bg_green * bg_alpha) / 256, 0, 255); - uint32_t blue = clamp((fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 0, 255); + uint32_t fg = shade_bgra(pix, light, shade_constants); + uint32_t fg_red = (fg >> 16) & 0xff; + uint32_t fg_green = (fg >> 8) & 0xff; + uint32_t fg_blue = fg & 0xff; + + uint32_t bg_red = (*dest >> 16) & 0xff; + uint32_t bg_green = (*dest >> 8) & 0xff; + uint32_t bg_blue = (*dest) & 0xff; + + uint32_t red = clamp((fg_red * fg_alpha + bg_red * bg_alpha) / 256, 0, 255); + uint32_t green = clamp((fg_green * fg_alpha + bg_green * bg_alpha) / 256, 0, 255); + uint32_t blue = clamp((fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 0, 255); + + *dest = 0xff000000 | (red << 16) | (green << 8) | blue; - *dest = 0xff000000 | (red << 16) | (green << 8) | blue; - } frac += fracstep; dest += pitch; } while (--count); @@ -2615,8 +2614,8 @@ public: ShadeConstants shade_constants = _shade_constants; - uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8); - uint32_t bg_alpha = _destalpha >> (FRACBITS - 8); + uint32_t src_alpha = _srcalpha >> (FRACBITS - 8); + uint32_t dest_alpha = _destalpha >> (FRACBITS - 8); DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] }; DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] }; @@ -2632,23 +2631,25 @@ public: for (int i = 0; i < 4; ++i) { uint32_t pix = bufplce[i][local_vplce[i] >> bits]; - if (pix != 0) - { - uint32_t fg = shade_bgra(pix, light[i], shade_constants); - uint32_t fg_red = (fg >> 16) & 0xff; - uint32_t fg_green = (fg >> 8) & 0xff; - uint32_t fg_blue = fg & 0xff; - uint32_t bg_red = (*dest >> 16) & 0xff; - uint32_t bg_green = (*dest >> 8) & 0xff; - uint32_t bg_blue = (*dest) & 0xff; + uint32_t fg_alpha, bg_alpha; + calc_blend_alpha(pix, src_alpha, dest_alpha, fg_alpha, bg_alpha); - uint32_t red = clamp((fg_red * fg_alpha + bg_red * bg_alpha) / 256, 0, 255); - uint32_t green = clamp((fg_green * fg_alpha + bg_green * bg_alpha) / 256, 0, 255); - uint32_t blue = clamp((fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 0, 255); + uint32_t fg = shade_bgra(pix, light[i], shade_constants); + uint32_t fg_red = (fg >> 16) & 0xff; + uint32_t fg_green = (fg >> 8) & 0xff; + uint32_t fg_blue = fg & 0xff; + + uint32_t bg_red = (*dest >> 16) & 0xff; + uint32_t bg_green = (*dest >> 8) & 0xff; + uint32_t bg_blue = (*dest) & 0xff; + + uint32_t red = clamp((fg_red * fg_alpha + bg_red * bg_alpha) / 256, 0, 255); + uint32_t green = clamp((fg_green * fg_alpha + bg_green * bg_alpha) / 256, 0, 255); + uint32_t blue = clamp((fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 0, 255); + + dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue; - dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue; - } local_vplce[i] += local_vince[i]; } dest += pitch; @@ -2702,29 +2703,31 @@ public: uint32_t light = calc_light_multiplier(_light); ShadeConstants shade_constants = _shade_constants; - uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8); - uint32_t bg_alpha = _destalpha >> (FRACBITS - 8); + uint32_t src_alpha = _srcalpha >> (FRACBITS - 8); + uint32_t dest_alpha = _destalpha >> (FRACBITS - 8); do { uint32_t pix = source[frac >> bits]; - if (pix != 0) - { - uint32_t fg = shade_bgra(pix, light, shade_constants); - uint32_t fg_red = (fg >> 16) & 0xff; - uint32_t fg_green = (fg >> 8) & 0xff; - uint32_t fg_blue = fg & 0xff; - uint32_t bg_red = (*dest >> 16) & 0xff; - uint32_t bg_green = (*dest >> 8) & 0xff; - uint32_t bg_blue = (*dest) & 0xff; + uint32_t fg_alpha, bg_alpha; + calc_blend_alpha(pix, src_alpha, dest_alpha, fg_alpha, bg_alpha); - uint32_t red = clamp((fg_red * fg_alpha + bg_red * bg_alpha) / 256, 0, 255); - uint32_t green = clamp((fg_green * fg_alpha + bg_green * bg_alpha) / 256, 0, 255); - uint32_t blue = clamp((fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 0, 255); + uint32_t fg = shade_bgra(pix, light, shade_constants); + uint32_t fg_red = (fg >> 16) & 0xff; + uint32_t fg_green = (fg >> 8) & 0xff; + uint32_t fg_blue = fg & 0xff; + + uint32_t bg_red = (*dest >> 16) & 0xff; + uint32_t bg_green = (*dest >> 8) & 0xff; + uint32_t bg_blue = (*dest) & 0xff; + + uint32_t red = clamp((fg_red * fg_alpha + bg_red * bg_alpha) / 256, 0, 255); + uint32_t green = clamp((fg_green * fg_alpha + bg_green * bg_alpha) / 256, 0, 255); + uint32_t blue = clamp((fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 0, 255); + + *dest = 0xff000000 | (red << 16) | (green << 8) | blue; - *dest = 0xff000000 | (red << 16) | (green << 8) | blue; - } frac += fracstep; dest += pitch; } while (--count); @@ -2782,8 +2785,8 @@ public: ShadeConstants shade_constants = _shade_constants; - uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8); - uint32_t bg_alpha = _destalpha >> (FRACBITS - 8); + uint32_t src_alpha = _srcalpha >> (FRACBITS - 8); + uint32_t dest_alpha = _destalpha >> (FRACBITS - 8); DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] }; DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] }; @@ -2799,23 +2802,25 @@ public: for (int i = 0; i < 4; ++i) { uint32_t pix = bufplce[i][local_vplce[i] >> bits]; - if (pix != 0) - { - uint32_t fg = shade_bgra(pix, light[i], shade_constants); - uint32_t fg_red = (fg >> 16) & 0xff; - uint32_t fg_green = (fg >> 8) & 0xff; - uint32_t fg_blue = fg & 0xff; - uint32_t bg_red = (dest[i] >> 16) & 0xff; - uint32_t bg_green = (dest[i] >> 8) & 0xff; - uint32_t bg_blue = (dest[i]) & 0xff; + uint32_t fg_alpha, bg_alpha; + calc_blend_alpha(pix, src_alpha, dest_alpha, fg_alpha, bg_alpha); - uint32_t red = clamp((fg_red * fg_alpha + bg_red * bg_alpha) / 256, 0, 255); - uint32_t green = clamp((fg_green * fg_alpha + bg_green * bg_alpha) / 256, 0, 255); - uint32_t blue = clamp((fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 0, 255); + uint32_t fg = shade_bgra(pix, light[i], shade_constants); + uint32_t fg_red = (fg >> 16) & 0xff; + uint32_t fg_green = (fg >> 8) & 0xff; + uint32_t fg_blue = fg & 0xff; + + uint32_t bg_red = (dest[i] >> 16) & 0xff; + uint32_t bg_green = (dest[i] >> 8) & 0xff; + uint32_t bg_blue = (dest[i]) & 0xff; + + uint32_t red = clamp((fg_red * fg_alpha + bg_red * bg_alpha) / 256, 0, 255); + uint32_t green = clamp((fg_green * fg_alpha + bg_green * bg_alpha) / 256, 0, 255); + uint32_t blue = clamp((fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 0, 255); + + dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue; - dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue; - } local_vplce[i] += local_vince[i]; } dest += pitch; @@ -2869,29 +2874,31 @@ public: uint32_t light = calc_light_multiplier(_light); ShadeConstants shade_constants = _shade_constants; - uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8); - uint32_t bg_alpha = _destalpha >> (FRACBITS - 8); + uint32_t src_alpha = _srcalpha >> (FRACBITS - 8); + uint32_t dest_alpha = _destalpha >> (FRACBITS - 8); do { uint32_t pix = source[frac >> bits]; - if (pix != 0) - { - uint32_t fg = shade_bgra(pix, light, shade_constants); - uint32_t fg_red = (fg >> 16) & 0xff; - uint32_t fg_green = (fg >> 8) & 0xff; - uint32_t fg_blue = fg & 0xff; - uint32_t bg_red = (*dest >> 16) & 0xff; - uint32_t bg_green = (*dest >> 8) & 0xff; - uint32_t bg_blue = (*dest) & 0xff; + uint32_t fg_alpha, bg_alpha; + calc_blend_alpha(pix, src_alpha, dest_alpha, fg_alpha, bg_alpha); - uint32_t red = clamp((0x10000 - fg_red * fg_alpha + bg_red * bg_alpha) / 256, 256, 256 + 255) - 256; - uint32_t green = clamp((0x10000 - fg_green * fg_alpha + bg_green * bg_alpha) / 256, 256, 256 + 255) - 256; - uint32_t blue = clamp((0x10000 - fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 256, 256 + 255) - 256; + uint32_t fg = shade_bgra(pix, light, shade_constants); + uint32_t fg_red = (fg >> 16) & 0xff; + uint32_t fg_green = (fg >> 8) & 0xff; + uint32_t fg_blue = fg & 0xff; + + uint32_t bg_red = (*dest >> 16) & 0xff; + uint32_t bg_green = (*dest >> 8) & 0xff; + uint32_t bg_blue = (*dest) & 0xff; + + uint32_t red = clamp((0x10000 - fg_red * fg_alpha + bg_red * bg_alpha) / 256, 256, 256 + 255) - 256; + uint32_t green = clamp((0x10000 - fg_green * fg_alpha + bg_green * bg_alpha) / 256, 256, 256 + 255) - 256; + uint32_t blue = clamp((0x10000 - fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 256, 256 + 255) - 256; + + *dest = 0xff000000 | (red << 16) | (green << 8) | blue; - *dest = 0xff000000 | (red << 16) | (green << 8) | blue; - } frac += fracstep; dest += pitch; } while (--count); @@ -2949,8 +2956,8 @@ public: ShadeConstants shade_constants = _shade_constants; - uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8); - uint32_t bg_alpha = _destalpha >> (FRACBITS - 8); + uint32_t src_alpha = _srcalpha >> (FRACBITS - 8); + uint32_t dest_alpha = _destalpha >> (FRACBITS - 8); DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] }; DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] }; @@ -2966,23 +2973,25 @@ public: for (int i = 0; i < 4; ++i) { uint32_t pix = bufplce[i][local_vplce[i] >> bits]; - if (pix != 0) - { - uint32_t fg = shade_bgra(pix, light[i], shade_constants); - uint32_t fg_red = (fg >> 16) & 0xff; - uint32_t fg_green = (fg >> 8) & 0xff; - uint32_t fg_blue = fg & 0xff; - uint32_t bg_red = (dest[i] >> 16) & 0xff; - uint32_t bg_green = (dest[i] >> 8) & 0xff; - uint32_t bg_blue = (dest[i]) & 0xff; + uint32_t fg_alpha, bg_alpha; + calc_blend_alpha(pix, src_alpha, dest_alpha, fg_alpha, bg_alpha); - uint32_t red = clamp((0x10000 - fg_red * fg_alpha + bg_red * bg_alpha) / 256, 256, 256 + 255) - 256; - uint32_t green = clamp((0x10000 - fg_green * fg_alpha + bg_green * bg_alpha) / 256, 256, 256 + 255) - 256; - uint32_t blue = clamp((0x10000 - fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 256, 256 + 255) - 256; + uint32_t fg = shade_bgra(pix, light[i], shade_constants); + uint32_t fg_red = (fg >> 16) & 0xff; + uint32_t fg_green = (fg >> 8) & 0xff; + uint32_t fg_blue = fg & 0xff; + + uint32_t bg_red = (dest[i] >> 16) & 0xff; + uint32_t bg_green = (dest[i] >> 8) & 0xff; + uint32_t bg_blue = (dest[i]) & 0xff; + + uint32_t red = clamp((0x10000 - fg_red * fg_alpha + bg_red * bg_alpha) / 256, 256, 256 + 255) - 256; + uint32_t green = clamp((0x10000 - fg_green * fg_alpha + bg_green * bg_alpha) / 256, 256, 256 + 255) - 256; + uint32_t blue = clamp((0x10000 - fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 256, 256 + 255) - 256; + + dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue; - dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue; - } local_vplce[i] += local_vince[i]; } dest += pitch; @@ -3036,29 +3045,31 @@ public: uint32_t light = calc_light_multiplier(_light); ShadeConstants shade_constants = _shade_constants; - uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8); - uint32_t bg_alpha = _destalpha >> (FRACBITS - 8); + uint32_t src_alpha = _srcalpha >> (FRACBITS - 8); + uint32_t dest_alpha = _destalpha >> (FRACBITS - 8); do { uint32_t pix = source[frac >> bits]; - if (pix != 0) - { - uint32_t fg = shade_bgra(pix, light, shade_constants); - uint32_t fg_red = (fg >> 16) & 0xff; - uint32_t fg_green = (fg >> 8) & 0xff; - uint32_t fg_blue = fg & 0xff; - uint32_t bg_red = (*dest >> 16) & 0xff; - uint32_t bg_green = (*dest >> 8) & 0xff; - uint32_t bg_blue = (*dest) & 0xff; + uint32_t fg_alpha, bg_alpha; + calc_blend_alpha(pix, src_alpha, dest_alpha, fg_alpha, bg_alpha); - uint32_t red = clamp((0x10000 + fg_red * fg_alpha - bg_red * bg_alpha) / 256, 256, 256 + 255) - 256; - uint32_t green = clamp((0x10000 + fg_green * fg_alpha - bg_green * bg_alpha) / 256, 256, 256 + 255) - 256; - uint32_t blue = clamp((0x10000 + fg_blue * fg_alpha - bg_blue * bg_alpha) / 256, 256, 256 + 255) - 256; + uint32_t fg = shade_bgra(pix, light, shade_constants); + uint32_t fg_red = (fg >> 16) & 0xff; + uint32_t fg_green = (fg >> 8) & 0xff; + uint32_t fg_blue = fg & 0xff; + + uint32_t bg_red = (*dest >> 16) & 0xff; + uint32_t bg_green = (*dest >> 8) & 0xff; + uint32_t bg_blue = (*dest) & 0xff; + + uint32_t red = clamp((0x10000 + fg_red * fg_alpha - bg_red * bg_alpha) / 256, 256, 256 + 255) - 256; + uint32_t green = clamp((0x10000 + fg_green * fg_alpha - bg_green * bg_alpha) / 256, 256, 256 + 255) - 256; + uint32_t blue = clamp((0x10000 + fg_blue * fg_alpha - bg_blue * bg_alpha) / 256, 256, 256 + 255) - 256; + + *dest = 0xff000000 | (red << 16) | (green << 8) | blue; - *dest = 0xff000000 | (red << 16) | (green << 8) | blue; - } frac += fracstep; dest += pitch; } while (--count); @@ -3116,8 +3127,8 @@ public: ShadeConstants shade_constants = _shade_constants; - uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8); - uint32_t bg_alpha = _destalpha >> (FRACBITS - 8); + uint32_t src_alpha = _srcalpha >> (FRACBITS - 8); + uint32_t dest_alpha = _destalpha >> (FRACBITS - 8); DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] }; DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] }; @@ -3133,23 +3144,25 @@ public: for (int i = 0; i < 4; ++i) { uint32_t pix = bufplce[i][local_vplce[i] >> bits]; - if (pix != 0) - { - uint32_t fg = shade_bgra(pix, light[i], shade_constants); - uint32_t fg_red = (fg >> 16) & 0xff; - uint32_t fg_green = (fg >> 8) & 0xff; - uint32_t fg_blue = fg & 0xff; - uint32_t bg_red = (dest[i] >> 16) & 0xff; - uint32_t bg_green = (dest[i] >> 8) & 0xff; - uint32_t bg_blue = (dest[i]) & 0xff; + uint32_t fg_alpha, bg_alpha; + calc_blend_alpha(pix, src_alpha, dest_alpha, fg_alpha, bg_alpha); - uint32_t red = clamp((0x10000 + fg_red * fg_alpha - bg_red * bg_alpha) / 256, 256, 256 + 255) - 256; - uint32_t green = clamp((0x10000 + fg_green * fg_alpha - bg_green * bg_alpha) / 256, 256, 256 + 255) - 256; - uint32_t blue = clamp((0x10000 + fg_blue * fg_alpha - bg_blue * bg_alpha) / 256, 256, 256 + 255) - 256; + uint32_t fg = shade_bgra(pix, light[i], shade_constants); + uint32_t fg_red = (fg >> 16) & 0xff; + uint32_t fg_green = (fg >> 8) & 0xff; + uint32_t fg_blue = fg & 0xff; + + uint32_t bg_red = (dest[i] >> 16) & 0xff; + uint32_t bg_green = (dest[i] >> 8) & 0xff; + uint32_t bg_blue = (dest[i]) & 0xff; + + uint32_t red = clamp((0x10000 + fg_red * fg_alpha - bg_red * bg_alpha) / 256, 256, 256 + 255) - 256; + uint32_t green = clamp((0x10000 + fg_green * fg_alpha - bg_green * bg_alpha) / 256, 256, 256 + 255) - 256; + uint32_t blue = clamp((0x10000 + fg_blue * fg_alpha - bg_blue * bg_alpha) / 256, 256, 256 + 255) - 256; + + dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue; - dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue; - } local_vplce[i] += local_vince[i]; } dest += pitch; @@ -3733,7 +3746,14 @@ fixed_t tmvline1_add_rgba() void tmvline4_add_rgba() { +#ifdef NO_SSE DrawerCommandQueue::QueueCommand(); +#else + if (!r_linearlight) + DrawerCommandQueue::QueueCommand(); + else + DrawerCommandQueue::QueueCommand(); +#endif for (int i = 0; i < 4; i++) vplce[i] += vince[i] * dc_count; } @@ -3746,7 +3766,14 @@ fixed_t tmvline1_addclamp_rgba() void tmvline4_addclamp_rgba() { +#ifdef NO_SSE DrawerCommandQueue::QueueCommand(); +#else + if (!r_linearlight) + DrawerCommandQueue::QueueCommand(); + else + DrawerCommandQueue::QueueCommand(); +#endif for (int i = 0; i < 4; i++) vplce[i] += vince[i] * dc_count; } @@ -3759,7 +3786,14 @@ fixed_t tmvline1_subclamp_rgba() void tmvline4_subclamp_rgba() { +#ifdef NO_SSE DrawerCommandQueue::QueueCommand(); +#else + if (!r_linearlight) + DrawerCommandQueue::QueueCommand(); + else + DrawerCommandQueue::QueueCommand(); +#endif for (int i = 0; i < 4; i++) vplce[i] += vince[i] * dc_count; } @@ -3772,7 +3806,14 @@ fixed_t tmvline1_revsubclamp_rgba() void tmvline4_revsubclamp_rgba() { +#ifdef NO_SSE DrawerCommandQueue::QueueCommand(); +#else + if (!r_linearlight) + DrawerCommandQueue::QueueCommand(); + else + DrawerCommandQueue::QueueCommand(); +#endif for (int i = 0; i < 4; i++) vplce[i] += vince[i] * dc_count; } diff --git a/src/r_draw_rgba.h b/src/r_draw_rgba.h index 1744781624..66be1f38b8 100644 --- a/src/r_draw_rgba.h +++ b/src/r_draw_rgba.h @@ -417,9 +417,9 @@ FORCEINLINE uint32_t alpha_blend(uint32_t fg, uint32_t bg) uint32_t bg_green = (bg >> 8) & 0xff; uint32_t bg_blue = bg & 0xff; - uint32_t red = ((fg_red * alpha) + (bg_red * inv_alpha)) / 256; - uint32_t green = ((fg_green * alpha) + (bg_green * inv_alpha)) / 256; - uint32_t blue = ((fg_blue * alpha) + (bg_blue * inv_alpha)) / 256; + uint32_t red = clamp(fg_red + (bg_red * inv_alpha) / 256, 0, 255); + uint32_t green = clamp(fg_green + (bg_green * inv_alpha) / 256, 0, 255); + uint32_t blue = clamp(fg_blue + (bg_blue * inv_alpha) / 256, 0, 255); return 0xff000000 | (red << 16) | (green << 8) | blue; } @@ -543,7 +543,7 @@ FORCEINLINE uint32_t alpha_blend(uint32_t fg, uint32_t bg) /* // Complex shade 8 pixels -#define AVX2_SHADE(fg, shade_constants) { \ +#define AVX_SHADE(fg, shade_constants) { \ __m256i fg_hi = _mm256_unpackhi_epi8(fg, _mm256_setzero_si256()); \ __m256i fg_lo = _mm256_unpacklo_epi8(fg, _mm256_setzero_si256()); \ \ @@ -566,8 +566,58 @@ FORCEINLINE uint32_t alpha_blend(uint32_t fg, uint32_t bg) } */ +// Normal premultiplied alpha blend using the alpha from fg +#define VEC_ALPHA_BLEND(fg,bg) { \ + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); \ + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); \ + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); \ + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); \ + __m128i m255 = _mm_set1_epi16(255); \ + __m128i inv_alpha_hi = _mm_sub_epi16(m255, _mm_shufflehi_epi16(_mm_shufflelo_epi16(fg_hi, _MM_SHUFFLE(3,3,3,3)), _MM_SHUFFLE(3,3,3,3))); \ + __m128i inv_alpha_lo = _mm_sub_epi16(m255, _mm_shufflehi_epi16(_mm_shufflelo_epi16(fg_lo, _MM_SHUFFLE(3,3,3,3)), _MM_SHUFFLE(3,3,3,3))); \ + inv_alpha_hi = _mm_add_epi16(inv_alpha_hi, _mm_srli_epi16(inv_alpha_hi, 7)); \ + inv_alpha_lo = _mm_add_epi16(inv_alpha_lo, _mm_srli_epi16(inv_alpha_lo, 7)); \ + bg_hi = _mm_mullo_epi16(bg_hi, inv_alpha_hi); \ + bg_hi = _mm_srli_epi16(bg_hi, 8); \ + bg_lo = _mm_mullo_epi16(bg_lo, inv_alpha_lo); \ + bg_lo = _mm_srli_epi16(bg_lo, 8); \ + bg = _mm_packus_epi16(bg_lo, bg_hi); \ + fg = _mm_adds_epu8(fg, bg); \ +} +/* +FORCEINLINE void calc_blend_alpha(uint32_t fg, uint32_t src_alpha, uint32_t dest_alpha, uint32_t &fg_alpha, uint32_t &bg_alpha) +{ + fg_alpha = src_alpha; + bg_alpha = dest_alpha; +} +#define VEC_CALC_BLEND_ALPHA(fg, msrc_alpha, mdest_alpha) \ + __m128i fg_alpha_hi = msrc_alpha; \ + __m128i fg_alpha_lo = msrc_alpha; \ + __m128i bg_alpha_hi = mdest_alpha; \ + __m128i bg_alpha_lo = mdest_alpha; +*/ + +// Calculates the final alpha values to be used when combined with the source texture alpha channel +FORCEINLINE void calc_blend_alpha(uint32_t fg, uint32_t src_alpha, uint32_t dest_alpha, uint32_t &fg_alpha, uint32_t &bg_alpha) +{ + fg_alpha = (fg >> 24) & 0xff; + fg_alpha += fg_alpha >> 7; + bg_alpha = (dest_alpha * (256 - fg_alpha)) >> 8; + fg_alpha = (src_alpha * fg_alpha) >> 8; +} + +// Calculates the final alpha values to be used when combined with the source texture alpha channel +#define VEC_CALC_BLEND_ALPHA(fg, msrc_alpha, mdest_alpha) \ + __m128i fg_alpha_hi = _mm_shufflehi_epi16(_mm_shufflelo_epi16(_mm_unpackhi_epi8(fg, _mm_setzero_si128()), _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)); \ + __m128i fg_alpha_lo = _mm_shufflehi_epi16(_mm_shufflelo_epi16(_mm_unpacklo_epi8(fg, _mm_setzero_si128()), _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)); \ + fg_alpha_hi = _mm_add_epi16(fg_alpha_hi, _mm_srli_epi16(fg_alpha_hi, 7)); \ + fg_alpha_lo = _mm_add_epi16(fg_alpha_lo, _mm_srli_epi16(fg_alpha_lo, 7)); \ + __m128i bg_alpha_hi = _mm_srli_epi16(_mm_mullo_epi16(_mm_sub_epi16(_mm_set1_epi16(256), fg_alpha_hi), mdest_alpha), 8); \ + __m128i bg_alpha_lo = _mm_srli_epi16(_mm_mullo_epi16(_mm_sub_epi16(_mm_set1_epi16(256), fg_alpha_lo), mdest_alpha), 8); \ + fg_alpha_hi = _mm_srli_epi16(_mm_mullo_epi16(fg_alpha_hi, msrc_alpha), 8); \ + fg_alpha_lo = _mm_srli_epi16(_mm_mullo_epi16(fg_alpha_lo, msrc_alpha), 8); // Calculate constants for a simple shade #define SSE_SHADE_SIMPLE_INIT(light) \ diff --git a/src/r_draw_rgba_sse.h b/src/r_draw_rgba_sse.h index 14ebbbb41d..0597580e1e 100644 --- a/src/r_draw_rgba_sse.h +++ b/src/r_draw_rgba_sse.h @@ -444,17 +444,16 @@ public: uint32_t pix2 = bufplce[2][place2 >> bits]; uint32_t pix3 = bufplce[3][place3 >> bits]; - // movemask = !(pix == 0) - __m128i movemask = _mm_xor_si128(_mm_cmpeq_epi32(_mm_set_epi32(pix3, pix2, pix1, pix0), _mm_setzero_si128()), _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())); - local_vplce[0] = place0 + local_vince[0]; local_vplce[1] = place1 + local_vince[1]; local_vplce[2] = place2 + local_vince[2]; local_vplce[3] = place3 + local_vince[3]; __m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0); + __m128i bg = _mm_loadu_si128((const __m128i*)dest); VEC_SHADE_SIMPLE(fg); - _mm_maskmoveu_si128(fg, movemask, (char*)dest); + VEC_ALPHA_BLEND(fg, bg); + _mm_storeu_si128((__m128i*)dest, fg); dest += pitch; } while (--count); } @@ -473,17 +472,585 @@ public: uint32_t pix2 = bufplce[2][place2 >> bits]; uint32_t pix3 = bufplce[3][place3 >> bits]; - // movemask = !(pix == 0) - __m128i movemask = _mm_xor_si128(_mm_cmpeq_epi32(_mm_set_epi32(pix3, pix2, pix1, pix0), _mm_setzero_si128()), _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())); - local_vplce[0] = place0 + local_vince[0]; local_vplce[1] = place1 + local_vince[1]; local_vplce[2] = place2 + local_vince[2]; local_vplce[3] = place3 + local_vince[3]; __m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0); + __m128i bg = _mm_loadu_si128((const __m128i*)dest); VEC_SHADE(fg, shade_constants); - _mm_maskmoveu_si128(fg, movemask, (char*)dest); + VEC_ALPHA_BLEND(fg, bg); + _mm_storeu_si128((__m128i*)dest, fg); + dest += pitch; + } while (--count); + } + } +}; + +class VecCommand(Tmvline4AddRGBA) : public DrawerCommand +{ + BYTE * RESTRICT _dest; + int _count; + int _pitch; + ShadeConstants _shade_constants; + fixed_t _srcalpha; + fixed_t _destalpha; + int tmvlinebits; + fixed_t palookuplight[4]; + DWORD vplce[4]; + DWORD vince[4]; + const uint32 * RESTRICT bufplce[4]; + +public: + VecCommand(Tmvline4AddRGBA)() + { + _dest = dc_dest; + _count = dc_count; + _pitch = dc_pitch; + _shade_constants = dc_shade_constants; + _srcalpha = dc_srcalpha; + _destalpha = dc_destalpha; + tmvlinebits = ::tmvlinebits; + for (int i = 0; i < 4; i++) + { + palookuplight[i] = ::palookuplight[i]; + vplce[i] = ::vplce[i]; + vince[i] = ::vince[i]; + bufplce[i] = (const uint32 *)::bufplce[i]; + } + } + + void Execute(DrawerThread *thread) override + { + int count = thread->count_for_thread(_dest_y, _count); + if (count <= 0) + return; + + uint32_t *dest = thread->dest_for_thread(_dest_y, _pitch, (uint32_t*)_dest); + int pitch = _pitch * thread->num_cores; + int bits = tmvlinebits; + + uint32_t light[4]; + light[0] = calc_light_multiplier(palookuplight[0]); + light[1] = calc_light_multiplier(palookuplight[1]); + light[2] = calc_light_multiplier(palookuplight[2]); + light[3] = calc_light_multiplier(palookuplight[3]); + + ShadeConstants shade_constants = _shade_constants; + + uint32_t src_alpha = _srcalpha >> (FRACBITS - 8); + uint32_t dest_alpha = _destalpha >> (FRACBITS - 8); + + DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] }; + DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] }; + int skipped = thread->skipped_by_thread(_dest_y); + for (int i = 0; i < 4; i++) + { + local_vplce[i] += local_vince[i] * skipped; + local_vince[i] *= thread->num_cores; + } + + if (shade_constants.simple_shade) + { + VEC_SHADE_SIMPLE_INIT4(light[3], light[2], light[1], light[0]); + + __m128i msrc_alpha = _mm_set1_epi16(src_alpha); + __m128i mdest_alpha = _mm_set1_epi16(dest_alpha); + + do + { + uint32_t pix0 = bufplce[0][local_vplce[0] >> bits]; + uint32_t pix1 = bufplce[1][local_vplce[1] >> bits]; + uint32_t pix2 = bufplce[2][local_vplce[2] >> bits]; + uint32_t pix3 = bufplce[3][local_vplce[3] >> bits]; + + local_vplce[0] = local_vplce[0] + local_vince[0]; + local_vplce[1] = local_vplce[1] + local_vince[1]; + local_vplce[2] = local_vplce[2] + local_vince[2]; + local_vplce[3] = local_vplce[3] + local_vince[3]; + + __m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0); + + VEC_CALC_BLEND_ALPHA(fg, msrc_alpha, mdest_alpha); + VEC_SHADE_SIMPLE(fg); + + __m128i bg = _mm_loadu_si128((const __m128i*)dest); + + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); + + __m128i out_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8); + __m128i out_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8); + __m128i out = _mm_packus_epi16(out_lo, out_hi); + + _mm_storeu_si128((__m128i*)dest, out); + dest += pitch; + } while (--count); + } + else + { + VEC_SHADE_INIT4(light[3], light[2], light[1], light[0], shade_constants); + + __m128i msrc_alpha = _mm_set1_epi16(src_alpha); + __m128i mdest_alpha = _mm_set1_epi16(dest_alpha); + + do + { + uint32_t pix0 = bufplce[0][local_vplce[0] >> bits]; + uint32_t pix1 = bufplce[1][local_vplce[1] >> bits]; + uint32_t pix2 = bufplce[2][local_vplce[2] >> bits]; + uint32_t pix3 = bufplce[3][local_vplce[3] >> bits]; + + local_vplce[0] = local_vplce[0] + local_vince[0]; + local_vplce[1] = local_vplce[1] + local_vince[1]; + local_vplce[2] = local_vplce[2] + local_vince[2]; + local_vplce[3] = local_vplce[3] + local_vince[3]; + + __m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0); + VEC_CALC_BLEND_ALPHA(fg, msrc_alpha, mdest_alpha); + VEC_SHADE(fg, shade_constants); + + __m128i bg = _mm_loadu_si128((const __m128i*)dest); + + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); + + __m128i out_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8); + __m128i out_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8); + __m128i out = _mm_packus_epi16(out_lo, out_hi); + + _mm_storeu_si128((__m128i*)dest, out); + dest += pitch; + } while (--count); + } + } +}; + +class VecCommand(Tmvline4AddClampRGBA) : public DrawerCommand +{ + BYTE * RESTRICT _dest; + int _count; + int _pitch; + ShadeConstants _shade_constants; + fixed_t _srcalpha; + fixed_t _destalpha; + int tmvlinebits; + fixed_t palookuplight[4]; + DWORD vplce[4]; + DWORD vince[4]; + const uint32 *RESTRICT bufplce[4]; + +public: + VecCommand(Tmvline4AddClampRGBA)() + { + _dest = dc_dest; + _count = dc_count; + _pitch = dc_pitch; + _shade_constants = dc_shade_constants; + _srcalpha = dc_srcalpha; + _destalpha = dc_destalpha; + tmvlinebits = ::tmvlinebits; + for (int i = 0; i < 4; i++) + { + palookuplight[i] = ::palookuplight[i]; + vplce[i] = ::vplce[i]; + vince[i] = ::vince[i]; + bufplce[i] = (const uint32 *)::bufplce[i]; + } + } + + void Execute(DrawerThread *thread) override + { + int count = thread->count_for_thread(_dest_y, _count); + if (count <= 0) + return; + + uint32_t *dest = thread->dest_for_thread(_dest_y, _pitch, (uint32_t*)_dest); + int pitch = _pitch * thread->num_cores; + int bits = tmvlinebits; + + uint32_t light[4]; + light[0] = calc_light_multiplier(palookuplight[0]); + light[1] = calc_light_multiplier(palookuplight[1]); + light[2] = calc_light_multiplier(palookuplight[2]); + light[3] = calc_light_multiplier(palookuplight[3]); + + ShadeConstants shade_constants = _shade_constants; + + uint32_t src_alpha = _srcalpha >> (FRACBITS - 8); + uint32_t dest_alpha = _destalpha >> (FRACBITS - 8); + + DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] }; + DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] }; + int skipped = thread->skipped_by_thread(_dest_y); + for (int i = 0; i < 4; i++) + { + local_vplce[i] += local_vince[i] * skipped; + local_vince[i] *= thread->num_cores; + } + + if (shade_constants.simple_shade) + { + VEC_SHADE_SIMPLE_INIT4(light[3], light[2], light[1], light[0]); + + __m128i msrc_alpha = _mm_set1_epi16(src_alpha); + __m128i mdest_alpha = _mm_set1_epi16(dest_alpha); + + do + { + uint32_t pix0 = bufplce[0][local_vplce[0] >> bits]; + uint32_t pix1 = bufplce[1][local_vplce[1] >> bits]; + uint32_t pix2 = bufplce[2][local_vplce[2] >> bits]; + uint32_t pix3 = bufplce[3][local_vplce[3] >> bits]; + + local_vplce[0] = local_vplce[0] + local_vince[0]; + local_vplce[1] = local_vplce[1] + local_vince[1]; + local_vplce[2] = local_vplce[2] + local_vince[2]; + local_vplce[3] = local_vplce[3] + local_vince[3]; + + __m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0); + VEC_CALC_BLEND_ALPHA(fg, msrc_alpha, mdest_alpha); + VEC_SHADE_SIMPLE(fg); + + __m128i bg = _mm_loadu_si128((const __m128i*)dest); + + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); + + __m128i out_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8); + __m128i out_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8); + __m128i out = _mm_packus_epi16(out_lo, out_hi); + + _mm_storeu_si128((__m128i*)dest, out); + dest += pitch; + } while (--count); + } + else + { + VEC_SHADE_INIT4(light[3], light[2], light[1], light[0], shade_constants); + + __m128i msrc_alpha = _mm_set1_epi16(src_alpha); + __m128i mdest_alpha = _mm_set1_epi16(dest_alpha); + + do + { + uint32_t pix0 = bufplce[0][local_vplce[0] >> bits]; + uint32_t pix1 = bufplce[1][local_vplce[1] >> bits]; + uint32_t pix2 = bufplce[2][local_vplce[2] >> bits]; + uint32_t pix3 = bufplce[3][local_vplce[3] >> bits]; + + local_vplce[0] = local_vplce[0] + local_vince[0]; + local_vplce[1] = local_vplce[1] + local_vince[1]; + local_vplce[2] = local_vplce[2] + local_vince[2]; + local_vplce[3] = local_vplce[3] + local_vince[3]; + + __m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0); + VEC_CALC_BLEND_ALPHA(fg, msrc_alpha, mdest_alpha); + VEC_SHADE(fg, shade_constants); + + __m128i bg = _mm_loadu_si128((const __m128i*)dest); + + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); + + __m128i out_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8); + __m128i out_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8); + __m128i out = _mm_packus_epi16(out_lo, out_hi); + + _mm_storeu_si128((__m128i*)dest, out); + dest += pitch; + } while (--count); + } + } +}; + +class VecCommand(Tmvline4SubClampRGBA) : public DrawerCommand +{ + BYTE * RESTRICT _dest; + int _count; + int _pitch; + ShadeConstants _shade_constants; + fixed_t _srcalpha; + fixed_t _destalpha; + int tmvlinebits; + fixed_t palookuplight[4]; + DWORD vplce[4]; + DWORD vince[4]; + const uint32 *RESTRICT bufplce[4]; + +public: + VecCommand(Tmvline4SubClampRGBA)() + { + _dest = dc_dest; + _count = dc_count; + _pitch = dc_pitch; + _shade_constants = dc_shade_constants; + _srcalpha = dc_srcalpha; + _destalpha = dc_destalpha; + tmvlinebits = ::tmvlinebits; + for (int i = 0; i < 4; i++) + { + palookuplight[i] = ::palookuplight[i]; + vplce[i] = ::vplce[i]; + vince[i] = ::vince[i]; + bufplce[i] = (const uint32 *)::bufplce[i]; + } + } + + void Execute(DrawerThread *thread) override + { + int count = thread->count_for_thread(_dest_y, _count); + if (count <= 0) + return; + + uint32_t *dest = thread->dest_for_thread(_dest_y, _pitch, (uint32_t*)_dest); + int pitch = _pitch * thread->num_cores; + int bits = tmvlinebits; + + uint32_t light[4]; + light[0] = calc_light_multiplier(palookuplight[0]); + light[1] = calc_light_multiplier(palookuplight[1]); + light[2] = calc_light_multiplier(palookuplight[2]); + light[3] = calc_light_multiplier(palookuplight[3]); + + ShadeConstants shade_constants = _shade_constants; + + uint32_t src_alpha = _srcalpha >> (FRACBITS - 8); + uint32_t dest_alpha = _destalpha >> (FRACBITS - 8); + + DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] }; + DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] }; + int skipped = thread->skipped_by_thread(_dest_y); + for (int i = 0; i < 4; i++) + { + local_vplce[i] += local_vince[i] * skipped; + local_vince[i] *= thread->num_cores; + } + + if (shade_constants.simple_shade) + { + VEC_SHADE_SIMPLE_INIT4(light[3], light[2], light[1], light[0]); + + __m128i msrc_alpha = _mm_set1_epi16(src_alpha); + __m128i mdest_alpha = _mm_set1_epi16(dest_alpha); + + do + { + uint32_t pix0 = bufplce[0][local_vplce[0] >> bits]; + uint32_t pix1 = bufplce[1][local_vplce[1] >> bits]; + uint32_t pix2 = bufplce[2][local_vplce[2] >> bits]; + uint32_t pix3 = bufplce[3][local_vplce[3] >> bits]; + + local_vplce[0] = local_vplce[0] + local_vince[0]; + local_vplce[1] = local_vplce[1] + local_vince[1]; + local_vplce[2] = local_vplce[2] + local_vince[2]; + local_vplce[3] = local_vplce[3] + local_vince[3]; + + __m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0); + VEC_CALC_BLEND_ALPHA(fg, msrc_alpha, mdest_alpha); + VEC_SHADE_SIMPLE(fg); + + __m128i bg = _mm_loadu_si128((const __m128i*)dest); + + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); + + __m128i out_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_hi, bg_alpha_hi), _mm_mullo_epi16(fg_hi, fg_alpha_hi)), 8); + __m128i out_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_lo, bg_alpha_lo), _mm_mullo_epi16(fg_lo, fg_alpha_lo)), 8); + __m128i out = _mm_packus_epi16(out_lo, out_hi); + + _mm_storeu_si128((__m128i*)dest, out); + dest += pitch; + } while (--count); + } + else + { + VEC_SHADE_INIT4(light[3], light[2], light[1], light[0], shade_constants); + + __m128i msrc_alpha = _mm_set1_epi16(src_alpha); + __m128i mdest_alpha = _mm_set1_epi16(dest_alpha); + + do + { + uint32_t pix0 = bufplce[0][local_vplce[0] >> bits]; + uint32_t pix1 = bufplce[1][local_vplce[1] >> bits]; + uint32_t pix2 = bufplce[2][local_vplce[2] >> bits]; + uint32_t pix3 = bufplce[3][local_vplce[3] >> bits]; + + local_vplce[0] = local_vplce[0] + local_vince[0]; + local_vplce[1] = local_vplce[1] + local_vince[1]; + local_vplce[2] = local_vplce[2] + local_vince[2]; + local_vplce[3] = local_vplce[3] + local_vince[3]; + + __m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0); + VEC_CALC_BLEND_ALPHA(fg, msrc_alpha, mdest_alpha); + VEC_SHADE(fg, shade_constants); + + __m128i bg = _mm_loadu_si128((const __m128i*)dest); + + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); + + __m128i out_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_hi, bg_alpha_hi), _mm_mullo_epi16(fg_hi, fg_alpha_hi)), 8); + __m128i out_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_lo, bg_alpha_lo), _mm_mullo_epi16(fg_lo, fg_alpha_lo)), 8); + __m128i out = _mm_packus_epi16(out_lo, out_hi); + + _mm_storeu_si128((__m128i*)dest, out); + dest += pitch; + } while (--count); + } + } +}; + +class VecCommand(Tmvline4RevSubClampRGBA) : public DrawerCommand +{ + BYTE * RESTRICT _dest; + int _count; + int _pitch; + ShadeConstants _shade_constants; + fixed_t _srcalpha; + fixed_t _destalpha; + int tmvlinebits; + fixed_t palookuplight[4]; + DWORD vplce[4]; + DWORD vince[4]; + const uint32 *RESTRICT bufplce[4]; + +public: + VecCommand(Tmvline4RevSubClampRGBA)() + { + _dest = dc_dest; + _count = dc_count; + _pitch = dc_pitch; + _shade_constants = dc_shade_constants; + _srcalpha = dc_srcalpha; + _destalpha = dc_destalpha; + tmvlinebits = ::tmvlinebits; + for (int i = 0; i < 4; i++) + { + palookuplight[i] = ::palookuplight[i]; + vplce[i] = ::vplce[i]; + vince[i] = ::vince[i]; + bufplce[i] = (const uint32 *)::bufplce[i]; + } + } + + void Execute(DrawerThread *thread) override + { + int count = thread->count_for_thread(_dest_y, _count); + if (count <= 0) + return; + + uint32_t *dest = thread->dest_for_thread(_dest_y, _pitch, (uint32_t*)_dest); + int pitch = _pitch * thread->num_cores; + int bits = tmvlinebits; + + uint32_t light[4]; + light[0] = calc_light_multiplier(palookuplight[0]); + light[1] = calc_light_multiplier(palookuplight[1]); + light[2] = calc_light_multiplier(palookuplight[2]); + light[3] = calc_light_multiplier(palookuplight[3]); + + ShadeConstants shade_constants = _shade_constants; + + uint32_t src_alpha = _srcalpha >> (FRACBITS - 8); + uint32_t dest_alpha = _destalpha >> (FRACBITS - 8); + + DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] }; + DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] }; + int skipped = thread->skipped_by_thread(_dest_y); + for (int i = 0; i < 4; i++) + { + local_vplce[i] += local_vince[i] * skipped; + local_vince[i] *= thread->num_cores; + } + + if (shade_constants.simple_shade) + { + VEC_SHADE_SIMPLE_INIT4(light[3], light[2], light[1], light[0]); + + __m128i msrc_alpha = _mm_set1_epi16(src_alpha); + __m128i mdest_alpha = _mm_set1_epi16(dest_alpha); + + do + { + uint32_t pix0 = bufplce[0][local_vplce[0] >> bits]; + uint32_t pix1 = bufplce[1][local_vplce[1] >> bits]; + uint32_t pix2 = bufplce[2][local_vplce[2] >> bits]; + uint32_t pix3 = bufplce[3][local_vplce[3] >> bits]; + + local_vplce[0] = local_vplce[0] + local_vince[0]; + local_vplce[1] = local_vplce[1] + local_vince[1]; + local_vplce[2] = local_vplce[2] + local_vince[2]; + local_vplce[3] = local_vplce[3] + local_vince[3]; + + __m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0); + VEC_CALC_BLEND_ALPHA(fg, msrc_alpha, mdest_alpha); + VEC_SHADE_SIMPLE(fg); + + __m128i bg = _mm_loadu_si128((const __m128i*)dest); + + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); + + __m128i out_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8); + __m128i out_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8); + __m128i out = _mm_packus_epi16(out_lo, out_hi); + + _mm_storeu_si128((__m128i*)dest, out); + dest += pitch; + } while (--count); + } + else + { + VEC_SHADE_INIT4(light[3], light[2], light[1], light[0], shade_constants); + + __m128i msrc_alpha = _mm_set1_epi16(src_alpha); + __m128i mdest_alpha = _mm_set1_epi16(dest_alpha); + + do + { + uint32_t pix0 = bufplce[0][local_vplce[0] >> bits]; + uint32_t pix1 = bufplce[1][local_vplce[1] >> bits]; + uint32_t pix2 = bufplce[2][local_vplce[2] >> bits]; + uint32_t pix3 = bufplce[3][local_vplce[3] >> bits]; + + local_vplce[0] = local_vplce[0] + local_vince[0]; + local_vplce[1] = local_vplce[1] + local_vince[1]; + local_vplce[2] = local_vplce[2] + local_vince[2]; + local_vplce[3] = local_vplce[3] + local_vince[3]; + + __m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0); + VEC_CALC_BLEND_ALPHA(fg, msrc_alpha, mdest_alpha); + VEC_SHADE(fg, shade_constants); + + __m128i bg = _mm_loadu_si128((const __m128i*)dest); + + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); + + __m128i out_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8); + __m128i out_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8); + __m128i out = _mm_packus_epi16(out_lo, out_hi); + + _mm_storeu_si128((__m128i*)dest, out); dest += pitch; } while (--count); } diff --git a/src/r_drawt_rgba.cpp b/src/r_drawt_rgba.cpp index 4da963430e..1e1236f0e6 100644 --- a/src/r_drawt_rgba.cpp +++ b/src/r_drawt_rgba.cpp @@ -1436,7 +1436,14 @@ void rt_subclamp1col_rgba (int hx, int sx, int yl, int yh) // Subtracts all four spans to the screen starting at sx with clamping. void rt_subclamp4cols_rgba (int sx, int yl, int yh) { +#ifdef NO_SSE DrawerCommandQueue::QueueCommand(sx, yl, yh); +#else + if (!r_linearlight) + DrawerCommandQueue::QueueCommand(sx, yl, yh); + else + DrawerCommandQueue::QueueCommand(sx, yl, yh); +#endif } // Translates and subtracts one span at hx to the screen at sx with clamping. @@ -1462,7 +1469,14 @@ void rt_revsubclamp1col_rgba (int hx, int sx, int yl, int yh) // Subtracts all four spans from the screen starting at sx with clamping. void rt_revsubclamp4cols_rgba (int sx, int yl, int yh) { +#ifdef NO_SSE DrawerCommandQueue::QueueCommand(sx, yl, yh); +#else + if (!r_linearlight) + DrawerCommandQueue::QueueCommand(sx, yl, yh); + else + DrawerCommandQueue::QueueCommand(sx, yl, yh); +#endif } // Translates and subtracts one span at hx from the screen at sx with clamping. diff --git a/src/r_drawt_rgba_sse.h b/src/r_drawt_rgba_sse.h index 5b8ae8081d..684be2b6ae 100644 --- a/src/r_drawt_rgba_sse.h +++ b/src/r_drawt_rgba_sse.h @@ -493,3 +493,255 @@ public: } } }; + +class VecCommand(RtSubClamp4colsRGBA) : public DrawerCommand +{ + int sx; + int yl; + int yh; + BYTE * RESTRICT _destorg; + int _pitch; + fixed_t _light; + fixed_t _srcalpha; + fixed_t _destalpha; + ShadeConstants _shade_constants; + +public: + VecCommand(RtSubClamp4colsRGBA)(int sx, int yl, int yh) + { + this->sx = sx; + this->yl = yl; + this->yh = yh; + + _destorg = dc_destorg; + _pitch = dc_pitch; + _light = dc_light; + _srcalpha = dc_srcalpha; + _destalpha = dc_destalpha; + _shade_constants = dc_shade_constants; + } + + void Execute(DrawerThread *thread) override + { + uint32_t *source; + uint32_t *dest; + int count; + int pitch; + int sincr; + + count = thread->count_for_thread(yl, yh - yl + 1); + if (count <= 0) + return; + + dest = thread->dest_for_thread(yl, _pitch, ylookup[yl] + sx + (uint32_t*)_destorg); + source = &thread->dc_temp_rgba[yl * 4] + thread->skipped_by_thread(yl) * 4; + pitch = _pitch * thread->num_cores; + sincr = 4 * thread->num_cores; + + uint32_t light = calc_light_multiplier(_light); + uint32_t *palette = (uint32_t*)GPalette.BaseColors; + ShadeConstants shade_constants = _shade_constants; + + uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8); + uint32_t bg_alpha = _destalpha >> (FRACBITS - 8); + + if (shade_constants.simple_shade) + { + VEC_SHADE_SIMPLE_INIT(light); + + __m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha); + __m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha); + + do { + uint32_t p0 = source[0]; + uint32_t p1 = source[1]; + uint32_t p2 = source[2]; + uint32_t p3 = source[3]; + + // shade_pal_index: + __m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]); + VEC_SHADE_SIMPLE(fg); + + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); + + // unpack bg: + __m128i bg = _mm_loadu_si128((const __m128i*)dest); + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); + + // (bg_red * bg_alpha - fg_red * fg_alpha) / 256: + __m128i color_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_hi, mbg_alpha), _mm_mullo_epi16(fg_hi, mfg_alpha)), 8); + __m128i color_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_lo, mbg_alpha), _mm_mullo_epi16(fg_lo, mfg_alpha)), 8); + + __m128i color = _mm_packus_epi16(color_lo, color_hi); + _mm_storeu_si128((__m128i*)dest, color); + + source += sincr; + dest += pitch; + } while (--count); + } + else + { + VEC_SHADE_INIT(light, shade_constants); + + __m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha); + __m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha); + + do { + uint32_t p0 = source[0]; + uint32_t p1 = source[1]; + uint32_t p2 = source[2]; + uint32_t p3 = source[3]; + + // shade_pal_index: + __m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]); + VEC_SHADE(fg, shade_constants); + + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); + + // unpack bg: + __m128i bg = _mm_loadu_si128((const __m128i*)dest); + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); + + // (bg_red * bg_alpha - fg_red * fg_alpha) / 256: + __m128i color_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_hi, mbg_alpha), _mm_mullo_epi16(fg_hi, mfg_alpha)), 8); + __m128i color_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_lo, mbg_alpha), _mm_mullo_epi16(fg_lo, mfg_alpha)), 8); + + __m128i color = _mm_packus_epi16(color_lo, color_hi); + _mm_storeu_si128((__m128i*)dest, color); + + source += sincr; + dest += pitch; + } while (--count); + } + } +}; + +class VecCommand(RtRevSubClamp4colsRGBA) : public DrawerCommand +{ + int sx; + int yl; + int yh; + BYTE * RESTRICT _destorg; + int _pitch; + fixed_t _light; + fixed_t _srcalpha; + fixed_t _destalpha; + ShadeConstants _shade_constants; + +public: + VecCommand(RtRevSubClamp4colsRGBA)(int sx, int yl, int yh) + { + this->sx = sx; + this->yl = yl; + this->yh = yh; + + _destorg = dc_destorg; + _pitch = dc_pitch; + _light = dc_light; + _srcalpha = dc_srcalpha; + _destalpha = dc_destalpha; + _shade_constants = dc_shade_constants; + } + + void Execute(DrawerThread *thread) override + { + uint32_t *source; + uint32_t *dest; + int count; + int pitch; + int sincr; + + count = thread->count_for_thread(yl, yh - yl + 1); + if (count <= 0) + return; + + dest = thread->dest_for_thread(yl, _pitch, ylookup[yl] + sx + (uint32_t*)_destorg); + source = &thread->dc_temp_rgba[yl * 4] + thread->skipped_by_thread(yl) * 4; + pitch = _pitch * thread->num_cores; + sincr = 4 * thread->num_cores; + + uint32_t light = calc_light_multiplier(_light); + uint32_t *palette = (uint32_t*)GPalette.BaseColors; + ShadeConstants shade_constants = _shade_constants; + + uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8); + uint32_t bg_alpha = _destalpha >> (FRACBITS - 8); + + if (shade_constants.simple_shade) + { + VEC_SHADE_SIMPLE_INIT(light); + + __m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha); + __m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha); + + do { + uint32_t p0 = source[0]; + uint32_t p1 = source[1]; + uint32_t p2 = source[2]; + uint32_t p3 = source[3]; + + // shade_pal_index: + __m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]); + VEC_SHADE_SIMPLE(fg); + + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); + + // unpack bg: + __m128i bg = _mm_loadu_si128((const __m128i*)dest); + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); + + // (fg_red * fg_alpha - bg_red * bg_alpha) / 256: + __m128i color_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_hi, mfg_alpha), _mm_mullo_epi16(bg_hi, mbg_alpha)), 8); + __m128i color_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_lo, mfg_alpha), _mm_mullo_epi16(bg_lo, mbg_alpha)), 8); + + __m128i color = _mm_packus_epi16(color_lo, color_hi); + _mm_storeu_si128((__m128i*)dest, color); + + source += sincr; + dest += pitch; + } while (--count); + } + else + { + VEC_SHADE_INIT(light, shade_constants); + + __m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha); + __m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha); + + do { + uint32_t p0 = source[0]; + uint32_t p1 = source[1]; + uint32_t p2 = source[2]; + uint32_t p3 = source[3]; + + // shade_pal_index: + __m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]); + VEC_SHADE(fg, shade_constants); + + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); + + // unpack bg: + __m128i bg = _mm_loadu_si128((const __m128i*)dest); + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); + + // (fg_red * fg_alpha - bg_red * bg_alpha) / 256: + __m128i color_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_hi, mfg_alpha), _mm_mullo_epi16(bg_hi, mbg_alpha)), 8); + __m128i color_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_lo, mfg_alpha), _mm_mullo_epi16(bg_lo, mbg_alpha)), 8); + + __m128i color = _mm_packus_epi16(color_lo, color_hi); + _mm_storeu_si128((__m128i*)dest, color); + + source += sincr; + dest += pitch; + } while (--count); + } + } +};