Added more SSE drawers

2025-01-31 10:40:33 +00:00 · 2016-06-19 23:11:41 +02:00 · 2016-06-19 23:11:41 +02:00 · 38aba81dcc
commit 38aba81dcc
parent 3f905197d0
5 changed files with 1072 additions and 148 deletions
--- a/src/r_draw_rgba.cpp
+++ b/src/r_draw_rgba.cpp
@ -2411,10 +2411,7 @@ public:
 		do
 		{
 			uint32_t pix = source[frac >> bits];
-			if (pix != 0)
-			{
-				*dest = shade_bgra(pix, light, shade_constants);
-			}
+			*dest = alpha_blend(shade_bgra(pix, light, shade_constants), *dest);
 			frac += fracstep;
 			dest += pitch;
 		} while (--count);
@ -2480,10 +2477,10 @@ public:
 		do
 		{
 			uint32_t pix;
-			pix = bufplce[0][(place = local_vplce[0]) >> bits]; if (pix) dest[0] = shade_bgra(pix, light0, shade_constants); local_vplce[0] = place + local_vince[0];
-			pix = bufplce[1][(place = local_vplce[1]) >> bits]; if (pix) dest[1] = shade_bgra(pix, light1, shade_constants); local_vplce[1] = place + local_vince[1];
-			pix = bufplce[2][(place = local_vplce[2]) >> bits]; if (pix) dest[2] = shade_bgra(pix, light2, shade_constants); local_vplce[2] = place + local_vince[2];
-			pix = bufplce[3][(place = local_vplce[3]) >> bits]; if (pix) dest[3] = shade_bgra(pix, light3, shade_constants); local_vplce[3] = place + local_vince[3];
+			pix = bufplce[0][(place = local_vplce[0]) >> bits]; dest[0] = alpha_blend(shade_bgra(pix, light0, shade_constants), dest[0]); local_vplce[0] = place + local_vince[0];
+			pix = bufplce[1][(place = local_vplce[1]) >> bits]; dest[1] = alpha_blend(shade_bgra(pix, light1, shade_constants), dest[1]); local_vplce[1] = place + local_vince[1];
+			pix = bufplce[2][(place = local_vplce[2]) >> bits]; dest[2] = alpha_blend(shade_bgra(pix, light2, shade_constants), dest[2]); local_vplce[2] = place + local_vince[2];
+			pix = bufplce[3][(place = local_vplce[3]) >> bits]; dest[3] = alpha_blend(shade_bgra(pix, light3, shade_constants), dest[3]); local_vplce[3] = place + local_vince[3];
 			dest += pitch;
 		} while (--count);
 	}
@ -2535,29 +2532,31 @@ public:
 		uint32_t light = calc_light_multiplier(_light);
 		ShadeConstants shade_constants = _shade_constants;

-		uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8);
-		uint32_t bg_alpha = _destalpha >> (FRACBITS - 8);
+		uint32_t src_alpha = _srcalpha >> (FRACBITS - 8);
+		uint32_t dest_alpha = _destalpha >> (FRACBITS - 8);

 		do
 		{
 			uint32_t pix = source[frac >> bits];
-			if (pix != 0)
-			{
-				uint32_t fg = shade_bgra(pix, light, shade_constants);
-				uint32_t fg_red = (fg >> 16) & 0xff;
-				uint32_t fg_green = (fg >> 8) & 0xff;
-				uint32_t fg_blue = fg & 0xff;

-				uint32_t bg_red = (*dest >> 16) & 0xff;
-				uint32_t bg_green = (*dest >> 8) & 0xff;
-				uint32_t bg_blue = (*dest) & 0xff;
+			uint32_t fg_alpha, bg_alpha;
+			calc_blend_alpha(pix, src_alpha, dest_alpha, fg_alpha, bg_alpha);

-				uint32_t red = clamp<uint32_t>((fg_red * fg_alpha + bg_red * bg_alpha) / 256, 0, 255);
-				uint32_t green = clamp<uint32_t>((fg_green * fg_alpha + bg_green * bg_alpha) / 256, 0, 255);
-				uint32_t blue = clamp<uint32_t>((fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 0, 255);
+			uint32_t fg = shade_bgra(pix, light, shade_constants);
+			uint32_t fg_red = (fg >> 16) & 0xff;
+			uint32_t fg_green = (fg >> 8) & 0xff;
+			uint32_t fg_blue = fg & 0xff;
+
+			uint32_t bg_red = (*dest >> 16) & 0xff;
+			uint32_t bg_green = (*dest >> 8) & 0xff;
+			uint32_t bg_blue = (*dest) & 0xff;
+
+			uint32_t red = clamp<uint32_t>((fg_red * fg_alpha + bg_red * bg_alpha) / 256, 0, 255);
+			uint32_t green = clamp<uint32_t>((fg_green * fg_alpha + bg_green * bg_alpha) / 256, 0, 255);
+			uint32_t blue = clamp<uint32_t>((fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 0, 255);
+
+			*dest = 0xff000000 | (red << 16) | (green << 8) | blue;

-				*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
-			}
 			frac += fracstep;
 			dest += pitch;
 		} while (--count);
@ -2615,8 +2614,8 @@ public:

 		ShadeConstants shade_constants = _shade_constants;

-		uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8);
-		uint32_t bg_alpha = _destalpha >> (FRACBITS - 8);
+		uint32_t src_alpha = _srcalpha >> (FRACBITS - 8);
+		uint32_t dest_alpha = _destalpha >> (FRACBITS - 8);

 		DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] };
 		DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] };
@ -2632,23 +2631,25 @@ public:
 			for (int i = 0; i < 4; ++i)
 			{
 				uint32_t pix = bufplce[i][local_vplce[i] >> bits];
-				if (pix != 0)
-				{
-					uint32_t fg = shade_bgra(pix, light[i], shade_constants);
-					uint32_t fg_red = (fg >> 16) & 0xff;
-					uint32_t fg_green = (fg >> 8) & 0xff;
-					uint32_t fg_blue = fg & 0xff;

-					uint32_t bg_red = (*dest >> 16) & 0xff;
-					uint32_t bg_green = (*dest >> 8) & 0xff;
-					uint32_t bg_blue = (*dest) & 0xff;
+				uint32_t fg_alpha, bg_alpha;
+				calc_blend_alpha(pix, src_alpha, dest_alpha, fg_alpha, bg_alpha);

-					uint32_t red = clamp<uint32_t>((fg_red * fg_alpha + bg_red * bg_alpha) / 256, 0, 255);
-					uint32_t green = clamp<uint32_t>((fg_green * fg_alpha + bg_green * bg_alpha) / 256, 0, 255);
-					uint32_t blue = clamp<uint32_t>((fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 0, 255);
+				uint32_t fg = shade_bgra(pix, light[i], shade_constants);
+				uint32_t fg_red = (fg >> 16) & 0xff;
+				uint32_t fg_green = (fg >> 8) & 0xff;
+				uint32_t fg_blue = fg & 0xff;
+
+				uint32_t bg_red = (*dest >> 16) & 0xff;
+				uint32_t bg_green = (*dest >> 8) & 0xff;
+				uint32_t bg_blue = (*dest) & 0xff;
+
+				uint32_t red = clamp<uint32_t>((fg_red * fg_alpha + bg_red * bg_alpha) / 256, 0, 255);
+				uint32_t green = clamp<uint32_t>((fg_green * fg_alpha + bg_green * bg_alpha) / 256, 0, 255);
+				uint32_t blue = clamp<uint32_t>((fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 0, 255);
+
+				dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue;

-					dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue;
-				}
 				local_vplce[i] += local_vince[i];
 			}
 			dest += pitch;
@ -2702,29 +2703,31 @@ public:
 		uint32_t light = calc_light_multiplier(_light);
 		ShadeConstants shade_constants = _shade_constants;

-		uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8);
-		uint32_t bg_alpha = _destalpha >> (FRACBITS - 8);
+		uint32_t src_alpha = _srcalpha >> (FRACBITS - 8);
+		uint32_t dest_alpha = _destalpha >> (FRACBITS - 8);

 		do
 		{
 			uint32_t pix = source[frac >> bits];
-			if (pix != 0)
-			{
-				uint32_t fg = shade_bgra(pix, light, shade_constants);
-				uint32_t fg_red = (fg >> 16) & 0xff;
-				uint32_t fg_green = (fg >> 8) & 0xff;
-				uint32_t fg_blue = fg & 0xff;

-				uint32_t bg_red = (*dest >> 16) & 0xff;
-				uint32_t bg_green = (*dest >> 8) & 0xff;
-				uint32_t bg_blue = (*dest) & 0xff;
+			uint32_t fg_alpha, bg_alpha;
+			calc_blend_alpha(pix, src_alpha, dest_alpha, fg_alpha, bg_alpha);

-				uint32_t red = clamp<uint32_t>((fg_red * fg_alpha + bg_red * bg_alpha) / 256, 0, 255);
-				uint32_t green = clamp<uint32_t>((fg_green * fg_alpha + bg_green * bg_alpha) / 256, 0, 255);
-				uint32_t blue = clamp<uint32_t>((fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 0, 255);
+			uint32_t fg = shade_bgra(pix, light, shade_constants);
+			uint32_t fg_red = (fg >> 16) & 0xff;
+			uint32_t fg_green = (fg >> 8) & 0xff;
+			uint32_t fg_blue = fg & 0xff;
+
+			uint32_t bg_red = (*dest >> 16) & 0xff;
+			uint32_t bg_green = (*dest >> 8) & 0xff;
+			uint32_t bg_blue = (*dest) & 0xff;
+
+			uint32_t red = clamp<uint32_t>((fg_red * fg_alpha + bg_red * bg_alpha) / 256, 0, 255);
+			uint32_t green = clamp<uint32_t>((fg_green * fg_alpha + bg_green * bg_alpha) / 256, 0, 255);
+			uint32_t blue = clamp<uint32_t>((fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 0, 255);
+
+			*dest = 0xff000000 | (red << 16) | (green << 8) | blue;

-				*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
-			}
 			frac += fracstep;
 			dest += pitch;
 		} while (--count);
@ -2782,8 +2785,8 @@ public:

 		ShadeConstants shade_constants = _shade_constants;

-		uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8);
-		uint32_t bg_alpha = _destalpha >> (FRACBITS - 8);
+		uint32_t src_alpha = _srcalpha >> (FRACBITS - 8);
+		uint32_t dest_alpha = _destalpha >> (FRACBITS - 8);

 		DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] };
 		DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] };
@ -2799,23 +2802,25 @@ public:
 			for (int i = 0; i < 4; ++i)
 			{
 				uint32_t pix = bufplce[i][local_vplce[i] >> bits];
-				if (pix != 0)
-				{
-					uint32_t fg = shade_bgra(pix, light[i], shade_constants);
-					uint32_t fg_red = (fg >> 16) & 0xff;
-					uint32_t fg_green = (fg >> 8) & 0xff;
-					uint32_t fg_blue = fg & 0xff;

-					uint32_t bg_red = (dest[i] >> 16) & 0xff;
-					uint32_t bg_green = (dest[i] >> 8) & 0xff;
-					uint32_t bg_blue = (dest[i]) & 0xff;
+				uint32_t fg_alpha, bg_alpha;
+				calc_blend_alpha(pix, src_alpha, dest_alpha, fg_alpha, bg_alpha);

-					uint32_t red = clamp<uint32_t>((fg_red * fg_alpha + bg_red * bg_alpha) / 256, 0, 255);
-					uint32_t green = clamp<uint32_t>((fg_green * fg_alpha + bg_green * bg_alpha) / 256, 0, 255);
-					uint32_t blue = clamp<uint32_t>((fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 0, 255);
+				uint32_t fg = shade_bgra(pix, light[i], shade_constants);
+				uint32_t fg_red = (fg >> 16) & 0xff;
+				uint32_t fg_green = (fg >> 8) & 0xff;
+				uint32_t fg_blue = fg & 0xff;
+
+				uint32_t bg_red = (dest[i] >> 16) & 0xff;
+				uint32_t bg_green = (dest[i] >> 8) & 0xff;
+				uint32_t bg_blue = (dest[i]) & 0xff;
+
+				uint32_t red = clamp<uint32_t>((fg_red * fg_alpha + bg_red * bg_alpha) / 256, 0, 255);
+				uint32_t green = clamp<uint32_t>((fg_green * fg_alpha + bg_green * bg_alpha) / 256, 0, 255);
+				uint32_t blue = clamp<uint32_t>((fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 0, 255);
+
+				dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue;

-					dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue;
-				}
 				local_vplce[i] += local_vince[i];
 			}
 			dest += pitch;
@ -2869,29 +2874,31 @@ public:
 		uint32_t light = calc_light_multiplier(_light);
 		ShadeConstants shade_constants = _shade_constants;

-		uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8);
-		uint32_t bg_alpha = _destalpha >> (FRACBITS - 8);
+		uint32_t src_alpha = _srcalpha >> (FRACBITS - 8);
+		uint32_t dest_alpha = _destalpha >> (FRACBITS - 8);

 		do
 		{
 			uint32_t pix = source[frac >> bits];
-			if (pix != 0)
-			{
-				uint32_t fg = shade_bgra(pix, light, shade_constants);
-				uint32_t fg_red = (fg >> 16) & 0xff;
-				uint32_t fg_green = (fg >> 8) & 0xff;
-				uint32_t fg_blue = fg & 0xff;

-				uint32_t bg_red = (*dest >> 16) & 0xff;
-				uint32_t bg_green = (*dest >> 8) & 0xff;
-				uint32_t bg_blue = (*dest) & 0xff;
+			uint32_t fg_alpha, bg_alpha;
+			calc_blend_alpha(pix, src_alpha, dest_alpha, fg_alpha, bg_alpha);

-				uint32_t red = clamp<uint32_t>((0x10000 - fg_red * fg_alpha + bg_red * bg_alpha) / 256, 256, 256 + 255) - 256;
-				uint32_t green = clamp<uint32_t>((0x10000 - fg_green * fg_alpha + bg_green * bg_alpha) / 256, 256, 256 + 255) - 256;
-				uint32_t blue = clamp<uint32_t>((0x10000 - fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 256, 256 + 255) - 256;
+			uint32_t fg = shade_bgra(pix, light, shade_constants);
+			uint32_t fg_red = (fg >> 16) & 0xff;
+			uint32_t fg_green = (fg >> 8) & 0xff;
+			uint32_t fg_blue = fg & 0xff;
+
+			uint32_t bg_red = (*dest >> 16) & 0xff;
+			uint32_t bg_green = (*dest >> 8) & 0xff;
+			uint32_t bg_blue = (*dest) & 0xff;
+
+			uint32_t red = clamp<uint32_t>((0x10000 - fg_red * fg_alpha + bg_red * bg_alpha) / 256, 256, 256 + 255) - 256;
+			uint32_t green = clamp<uint32_t>((0x10000 - fg_green * fg_alpha + bg_green * bg_alpha) / 256, 256, 256 + 255) - 256;
+			uint32_t blue = clamp<uint32_t>((0x10000 - fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 256, 256 + 255) - 256;
+
+			*dest = 0xff000000 | (red << 16) | (green << 8) | blue;

-				*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
-			}
 			frac += fracstep;
 			dest += pitch;
 		} while (--count);
@ -2949,8 +2956,8 @@ public:

 		ShadeConstants shade_constants = _shade_constants;

-		uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8);
-		uint32_t bg_alpha = _destalpha >> (FRACBITS - 8);
+		uint32_t src_alpha = _srcalpha >> (FRACBITS - 8);
+		uint32_t dest_alpha = _destalpha >> (FRACBITS - 8);

 		DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] };
 		DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] };
@ -2966,23 +2973,25 @@ public:
 			for (int i = 0; i < 4; ++i)
 			{
 				uint32_t pix = bufplce[i][local_vplce[i] >> bits];
-				if (pix != 0)
-				{
-					uint32_t fg = shade_bgra(pix, light[i], shade_constants);
-					uint32_t fg_red = (fg >> 16) & 0xff;
-					uint32_t fg_green = (fg >> 8) & 0xff;
-					uint32_t fg_blue = fg & 0xff;

-					uint32_t bg_red = (dest[i] >> 16) & 0xff;
-					uint32_t bg_green = (dest[i] >> 8) & 0xff;
-					uint32_t bg_blue = (dest[i]) & 0xff;
+				uint32_t fg_alpha, bg_alpha;
+				calc_blend_alpha(pix, src_alpha, dest_alpha, fg_alpha, bg_alpha);

-					uint32_t red = clamp<uint32_t>((0x10000 - fg_red * fg_alpha + bg_red * bg_alpha) / 256, 256, 256 + 255) - 256;
-					uint32_t green = clamp<uint32_t>((0x10000 - fg_green * fg_alpha + bg_green * bg_alpha) / 256, 256, 256 + 255) - 256;
-					uint32_t blue = clamp<uint32_t>((0x10000 - fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 256, 256 + 255) - 256;
+				uint32_t fg = shade_bgra(pix, light[i], shade_constants);
+				uint32_t fg_red = (fg >> 16) & 0xff;
+				uint32_t fg_green = (fg >> 8) & 0xff;
+				uint32_t fg_blue = fg & 0xff;
+
+				uint32_t bg_red = (dest[i] >> 16) & 0xff;
+				uint32_t bg_green = (dest[i] >> 8) & 0xff;
+				uint32_t bg_blue = (dest[i]) & 0xff;
+
+				uint32_t red = clamp<uint32_t>((0x10000 - fg_red * fg_alpha + bg_red * bg_alpha) / 256, 256, 256 + 255) - 256;
+				uint32_t green = clamp<uint32_t>((0x10000 - fg_green * fg_alpha + bg_green * bg_alpha) / 256, 256, 256 + 255) - 256;
+				uint32_t blue = clamp<uint32_t>((0x10000 - fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 256, 256 + 255) - 256;
+
+				dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue;

-					dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue;
-				}
 				local_vplce[i] += local_vince[i];
 			}
 			dest += pitch;
@ -3036,29 +3045,31 @@ public:
 		uint32_t light = calc_light_multiplier(_light);
 		ShadeConstants shade_constants = _shade_constants;

-		uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8);
-		uint32_t bg_alpha = _destalpha >> (FRACBITS - 8);
+		uint32_t src_alpha = _srcalpha >> (FRACBITS - 8);
+		uint32_t dest_alpha = _destalpha >> (FRACBITS - 8);

 		do
 		{
 			uint32_t pix = source[frac >> bits];
-			if (pix != 0)
-			{
-				uint32_t fg = shade_bgra(pix, light, shade_constants);
-				uint32_t fg_red = (fg >> 16) & 0xff;
-				uint32_t fg_green = (fg >> 8) & 0xff;
-				uint32_t fg_blue = fg & 0xff;

-				uint32_t bg_red = (*dest >> 16) & 0xff;
-				uint32_t bg_green = (*dest >> 8) & 0xff;
-				uint32_t bg_blue = (*dest) & 0xff;
+			uint32_t fg_alpha, bg_alpha;
+			calc_blend_alpha(pix, src_alpha, dest_alpha, fg_alpha, bg_alpha);

-				uint32_t red = clamp<uint32_t>((0x10000 + fg_red * fg_alpha - bg_red * bg_alpha) / 256, 256, 256 + 255) - 256;
-				uint32_t green = clamp<uint32_t>((0x10000 + fg_green * fg_alpha - bg_green * bg_alpha) / 256, 256, 256 + 255) - 256;
-				uint32_t blue = clamp<uint32_t>((0x10000 + fg_blue * fg_alpha - bg_blue * bg_alpha) / 256, 256, 256 + 255) - 256;
+			uint32_t fg = shade_bgra(pix, light, shade_constants);
+			uint32_t fg_red = (fg >> 16) & 0xff;
+			uint32_t fg_green = (fg >> 8) & 0xff;
+			uint32_t fg_blue = fg & 0xff;
+
+			uint32_t bg_red = (*dest >> 16) & 0xff;
+			uint32_t bg_green = (*dest >> 8) & 0xff;
+			uint32_t bg_blue = (*dest) & 0xff;
+
+			uint32_t red = clamp<uint32_t>((0x10000 + fg_red * fg_alpha - bg_red * bg_alpha) / 256, 256, 256 + 255) - 256;
+			uint32_t green = clamp<uint32_t>((0x10000 + fg_green * fg_alpha - bg_green * bg_alpha) / 256, 256, 256 + 255) - 256;
+			uint32_t blue = clamp<uint32_t>((0x10000 + fg_blue * fg_alpha - bg_blue * bg_alpha) / 256, 256, 256 + 255) - 256;
+
+			*dest = 0xff000000 | (red << 16) | (green << 8) | blue;

-				*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
-			}
 			frac += fracstep;
 			dest += pitch;
 		} while (--count);
@ -3116,8 +3127,8 @@ public:

 		ShadeConstants shade_constants = _shade_constants;

-		uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8);
-		uint32_t bg_alpha = _destalpha >> (FRACBITS - 8);
+		uint32_t src_alpha = _srcalpha >> (FRACBITS - 8);
+		uint32_t dest_alpha = _destalpha >> (FRACBITS - 8);

 		DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] };
 		DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] };
@ -3133,23 +3144,25 @@ public:
 			for (int i = 0; i < 4; ++i)
 			{
 				uint32_t pix = bufplce[i][local_vplce[i] >> bits];
-				if (pix != 0)
-				{
-					uint32_t fg = shade_bgra(pix, light[i], shade_constants);
-					uint32_t fg_red = (fg >> 16) & 0xff;
-					uint32_t fg_green = (fg >> 8) & 0xff;
-					uint32_t fg_blue = fg & 0xff;

-					uint32_t bg_red = (dest[i] >> 16) & 0xff;
-					uint32_t bg_green = (dest[i] >> 8) & 0xff;
-					uint32_t bg_blue = (dest[i]) & 0xff;
+				uint32_t fg_alpha, bg_alpha;
+				calc_blend_alpha(pix, src_alpha, dest_alpha, fg_alpha, bg_alpha);

-					uint32_t red = clamp<uint32_t>((0x10000 + fg_red * fg_alpha - bg_red * bg_alpha) / 256, 256, 256 + 255) - 256;
-					uint32_t green = clamp<uint32_t>((0x10000 + fg_green * fg_alpha - bg_green * bg_alpha) / 256, 256, 256 + 255) - 256;
-					uint32_t blue = clamp<uint32_t>((0x10000 + fg_blue * fg_alpha - bg_blue * bg_alpha) / 256, 256, 256 + 255) - 256;
+				uint32_t fg = shade_bgra(pix, light[i], shade_constants);
+				uint32_t fg_red = (fg >> 16) & 0xff;
+				uint32_t fg_green = (fg >> 8) & 0xff;
+				uint32_t fg_blue = fg & 0xff;
+
+				uint32_t bg_red = (dest[i] >> 16) & 0xff;
+				uint32_t bg_green = (dest[i] >> 8) & 0xff;
+				uint32_t bg_blue = (dest[i]) & 0xff;
+
+				uint32_t red = clamp<uint32_t>((0x10000 + fg_red * fg_alpha - bg_red * bg_alpha) / 256, 256, 256 + 255) - 256;
+				uint32_t green = clamp<uint32_t>((0x10000 + fg_green * fg_alpha - bg_green * bg_alpha) / 256, 256, 256 + 255) - 256;
+				uint32_t blue = clamp<uint32_t>((0x10000 + fg_blue * fg_alpha - bg_blue * bg_alpha) / 256, 256, 256 + 255) - 256;
+
+				dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue;

-					dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue;
-				}
 				local_vplce[i] += local_vince[i];
 			}
 			dest += pitch;
@ -3733,7 +3746,14 @@ fixed_t tmvline1_add_rgba()

 void tmvline4_add_rgba()
 {
+#ifdef NO_SSE
 	DrawerCommandQueue::QueueCommand<Tmvline4AddRGBACommand>();
+#else
+	if (!r_linearlight)
+		DrawerCommandQueue::QueueCommand<Tmvline4AddRGBA_SSE_Command>();
+	else
+		DrawerCommandQueue::QueueCommand<Tmvline4AddRGBA_AVX_Command>();
+#endif
 	for (int i = 0; i < 4; i++)
 		vplce[i] += vince[i] * dc_count;
 }
@ -3746,7 +3766,14 @@ fixed_t tmvline1_addclamp_rgba()

 void tmvline4_addclamp_rgba()
 {
+#ifdef NO_SSE
 	DrawerCommandQueue::QueueCommand<Tmvline4AddClampRGBACommand>();
+#else
+	if (!r_linearlight)
+		DrawerCommandQueue::QueueCommand<Tmvline4AddClampRGBA_SSE_Command>();
+	else
+		DrawerCommandQueue::QueueCommand<Tmvline4AddClampRGBA_AVX_Command>();
+#endif
 	for (int i = 0; i < 4; i++)
 		vplce[i] += vince[i] * dc_count;
 }
@ -3759,7 +3786,14 @@ fixed_t tmvline1_subclamp_rgba()

 void tmvline4_subclamp_rgba()
 {
+#ifdef NO_SSE
 	DrawerCommandQueue::QueueCommand<Tmvline4SubClampRGBACommand>();
+#else
+	if (!r_linearlight)
+		DrawerCommandQueue::QueueCommand<Tmvline4SubClampRGBA_SSE_Command>();
+	else
+		DrawerCommandQueue::QueueCommand<Tmvline4SubClampRGBA_AVX_Command>();
+#endif
 	for (int i = 0; i < 4; i++)
 		vplce[i] += vince[i] * dc_count;
 }
@ -3772,7 +3806,14 @@ fixed_t tmvline1_revsubclamp_rgba()

 void tmvline4_revsubclamp_rgba()
 {
+#ifdef NO_SSE
 	DrawerCommandQueue::QueueCommand<Tmvline4RevSubClampRGBACommand>();
+#else
+	if (!r_linearlight)
+		DrawerCommandQueue::QueueCommand<Tmvline4RevSubClampRGBA_SSE_Command>();
+	else
+		DrawerCommandQueue::QueueCommand<Tmvline4RevSubClampRGBA_AVX_Command>();
+#endif
 	for (int i = 0; i < 4; i++)
 		vplce[i] += vince[i] * dc_count;
 }
--- a/src/r_draw_rgba.h
+++ b/src/r_draw_rgba.h
@ -417,9 +417,9 @@ FORCEINLINE uint32_t alpha_blend(uint32_t fg, uint32_t bg)
 	uint32_t bg_green = (bg >> 8) & 0xff;
 	uint32_t bg_blue = bg & 0xff;

-	uint32_t red = ((fg_red * alpha) + (bg_red * inv_alpha)) / 256;
-	uint32_t green = ((fg_green * alpha) + (bg_green * inv_alpha)) / 256;
-	uint32_t blue = ((fg_blue * alpha) + (bg_blue * inv_alpha)) / 256;
+	uint32_t red = clamp<uint32_t>(fg_red + (bg_red * inv_alpha) / 256, 0, 255);
+	uint32_t green = clamp<uint32_t>(fg_green + (bg_green * inv_alpha) / 256, 0, 255);
+	uint32_t blue = clamp<uint32_t>(fg_blue + (bg_blue * inv_alpha) / 256, 0, 255);

 	return 0xff000000 | (red << 16) | (green << 8) | blue;
 }
@ -543,7 +543,7 @@ FORCEINLINE uint32_t alpha_blend(uint32_t fg, uint32_t bg)

 /*
 // Complex shade 8 pixels
-#define AVX2_SHADE(fg, shade_constants) { \
+#define AVX_SHADE(fg, shade_constants) { \
 	__m256i fg_hi = _mm256_unpackhi_epi8(fg, _mm256_setzero_si256()); \
 	__m256i fg_lo = _mm256_unpacklo_epi8(fg, _mm256_setzero_si256()); \
 	 \
@ -566,8 +566,58 @@ FORCEINLINE uint32_t alpha_blend(uint32_t fg, uint32_t bg)
 }
 */

+// Normal premultiplied alpha blend using the alpha from fg
+#define VEC_ALPHA_BLEND(fg,bg) { \
+	__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); \
+	__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); \
+	__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); \
+	__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); \
+	__m128i m255 = _mm_set1_epi16(255); \
+	__m128i inv_alpha_hi = _mm_sub_epi16(m255, _mm_shufflehi_epi16(_mm_shufflelo_epi16(fg_hi, _MM_SHUFFLE(3,3,3,3)), _MM_SHUFFLE(3,3,3,3))); \
+	__m128i inv_alpha_lo = _mm_sub_epi16(m255, _mm_shufflehi_epi16(_mm_shufflelo_epi16(fg_lo, _MM_SHUFFLE(3,3,3,3)), _MM_SHUFFLE(3,3,3,3))); \
+	inv_alpha_hi = _mm_add_epi16(inv_alpha_hi, _mm_srli_epi16(inv_alpha_hi, 7)); \
+	inv_alpha_lo = _mm_add_epi16(inv_alpha_lo, _mm_srli_epi16(inv_alpha_lo, 7)); \
+	bg_hi = _mm_mullo_epi16(bg_hi, inv_alpha_hi); \
+	bg_hi = _mm_srli_epi16(bg_hi, 8); \
+	bg_lo = _mm_mullo_epi16(bg_lo, inv_alpha_lo); \
+	bg_lo = _mm_srli_epi16(bg_lo, 8); \
+	bg = _mm_packus_epi16(bg_lo, bg_hi); \
+	fg = _mm_adds_epu8(fg, bg); \
+}

+/*
+FORCEINLINE void calc_blend_alpha(uint32_t fg, uint32_t src_alpha, uint32_t dest_alpha, uint32_t &fg_alpha, uint32_t &bg_alpha)
+{
+	fg_alpha = src_alpha;
+	bg_alpha = dest_alpha;
+}

+#define VEC_CALC_BLEND_ALPHA(fg, msrc_alpha, mdest_alpha) \
+	__m128i fg_alpha_hi = msrc_alpha; \
+	__m128i fg_alpha_lo = msrc_alpha; \
+	__m128i bg_alpha_hi = mdest_alpha; \
+	__m128i bg_alpha_lo = mdest_alpha;
+*/
+
+// Calculates the final alpha values to be used when combined with the source texture alpha channel
+FORCEINLINE void calc_blend_alpha(uint32_t fg, uint32_t src_alpha, uint32_t dest_alpha, uint32_t &fg_alpha, uint32_t &bg_alpha)
+{
+	fg_alpha = (fg >> 24) & 0xff;
+	fg_alpha += fg_alpha >> 7;
+	bg_alpha = (dest_alpha * (256 - fg_alpha)) >> 8;
+	fg_alpha = (src_alpha * fg_alpha) >> 8;
+}
+
+// Calculates the final alpha values to be used when combined with the source texture alpha channel
+#define VEC_CALC_BLEND_ALPHA(fg, msrc_alpha, mdest_alpha) \
+	__m128i fg_alpha_hi = _mm_shufflehi_epi16(_mm_shufflelo_epi16(_mm_unpackhi_epi8(fg, _mm_setzero_si128()), _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)); \
+	__m128i fg_alpha_lo = _mm_shufflehi_epi16(_mm_shufflelo_epi16(_mm_unpacklo_epi8(fg, _mm_setzero_si128()), _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)); \
+	fg_alpha_hi = _mm_add_epi16(fg_alpha_hi, _mm_srli_epi16(fg_alpha_hi, 7)); \
+	fg_alpha_lo = _mm_add_epi16(fg_alpha_lo, _mm_srli_epi16(fg_alpha_lo, 7)); \
+	__m128i bg_alpha_hi = _mm_srli_epi16(_mm_mullo_epi16(_mm_sub_epi16(_mm_set1_epi16(256), fg_alpha_hi), mdest_alpha), 8); \
+	__m128i bg_alpha_lo = _mm_srli_epi16(_mm_mullo_epi16(_mm_sub_epi16(_mm_set1_epi16(256), fg_alpha_lo), mdest_alpha), 8); \
+	fg_alpha_hi = _mm_srli_epi16(_mm_mullo_epi16(fg_alpha_hi, msrc_alpha), 8); \
+	fg_alpha_lo = _mm_srli_epi16(_mm_mullo_epi16(fg_alpha_lo, msrc_alpha), 8);

 // Calculate constants for a simple shade
 #define SSE_SHADE_SIMPLE_INIT(light) \
--- a/src/r_draw_rgba_sse.h
+++ b/src/r_draw_rgba_sse.h
@ -444,17 +444,16 @@ public:
 				uint32_t pix2 = bufplce[2][place2 >> bits];
 				uint32_t pix3 = bufplce[3][place3 >> bits];

-				// movemask = !(pix == 0)
-				__m128i movemask = _mm_xor_si128(_mm_cmpeq_epi32(_mm_set_epi32(pix3, pix2, pix1, pix0), _mm_setzero_si128()), _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));
-
 				local_vplce[0] = place0 + local_vince[0];
 				local_vplce[1] = place1 + local_vince[1];
 				local_vplce[2] = place2 + local_vince[2];
 				local_vplce[3] = place3 + local_vince[3];

 				__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
+				__m128i bg = _mm_loadu_si128((const __m128i*)dest);
 				VEC_SHADE_SIMPLE(fg);
-				_mm_maskmoveu_si128(fg, movemask, (char*)dest);
+				VEC_ALPHA_BLEND(fg, bg);
+				_mm_storeu_si128((__m128i*)dest, fg);
 				dest += pitch;
 			} while (--count);
 		}
@ -473,17 +472,585 @@ public:
 				uint32_t pix2 = bufplce[2][place2 >> bits];
 				uint32_t pix3 = bufplce[3][place3 >> bits];

-				// movemask = !(pix == 0)
-				__m128i movemask = _mm_xor_si128(_mm_cmpeq_epi32(_mm_set_epi32(pix3, pix2, pix1, pix0), _mm_setzero_si128()), _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));
-
 				local_vplce[0] = place0 + local_vince[0];
 				local_vplce[1] = place1 + local_vince[1];
 				local_vplce[2] = place2 + local_vince[2];
 				local_vplce[3] = place3 + local_vince[3];

 				__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
+				__m128i bg = _mm_loadu_si128((const __m128i*)dest);
 				VEC_SHADE(fg, shade_constants);
-				_mm_maskmoveu_si128(fg, movemask, (char*)dest);
+				VEC_ALPHA_BLEND(fg, bg);
+				_mm_storeu_si128((__m128i*)dest, fg);
+				dest += pitch;
+			} while (--count);
+		}
+	}
+};
+
+class VecCommand(Tmvline4AddRGBA) : public DrawerCommand
+{
+	BYTE * RESTRICT _dest;
+	int _count;
+	int _pitch;
+	ShadeConstants _shade_constants;
+	fixed_t _srcalpha;
+	fixed_t _destalpha;
+	int tmvlinebits;
+	fixed_t palookuplight[4];
+	DWORD vplce[4];
+	DWORD vince[4];
+	const uint32 * RESTRICT bufplce[4];
+
+public:
+	VecCommand(Tmvline4AddRGBA)()
+	{
+		_dest = dc_dest;
+		_count = dc_count;
+		_pitch = dc_pitch;
+		_shade_constants = dc_shade_constants;
+		_srcalpha = dc_srcalpha;
+		_destalpha = dc_destalpha;
+		tmvlinebits = ::tmvlinebits;
+		for (int i = 0; i < 4; i++)
+		{
+			palookuplight[i] = ::palookuplight[i];
+			vplce[i] = ::vplce[i];
+			vince[i] = ::vince[i];
+			bufplce[i] = (const uint32 *)::bufplce[i];
+		}
+	}
+
+	void Execute(DrawerThread *thread) override
+	{
+		int count = thread->count_for_thread(_dest_y, _count);
+		if (count <= 0)
+			return;
+
+		uint32_t *dest = thread->dest_for_thread(_dest_y, _pitch, (uint32_t*)_dest);
+		int pitch = _pitch * thread->num_cores;
+		int bits = tmvlinebits;
+
+		uint32_t light[4];
+		light[0] = calc_light_multiplier(palookuplight[0]);
+		light[1] = calc_light_multiplier(palookuplight[1]);
+		light[2] = calc_light_multiplier(palookuplight[2]);
+		light[3] = calc_light_multiplier(palookuplight[3]);
+
+		ShadeConstants shade_constants = _shade_constants;
+
+		uint32_t src_alpha = _srcalpha >> (FRACBITS - 8);
+		uint32_t dest_alpha = _destalpha >> (FRACBITS - 8);
+
+		DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] };
+		DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] };
+		int skipped = thread->skipped_by_thread(_dest_y);
+		for (int i = 0; i < 4; i++)
+		{
+			local_vplce[i] += local_vince[i] * skipped;
+			local_vince[i] *= thread->num_cores;
+		}
+
+		if (shade_constants.simple_shade)
+		{
+			VEC_SHADE_SIMPLE_INIT4(light[3], light[2], light[1], light[0]);
+
+			__m128i msrc_alpha = _mm_set1_epi16(src_alpha);
+			__m128i mdest_alpha = _mm_set1_epi16(dest_alpha);
+
+			do
+			{
+				uint32_t pix0 = bufplce[0][local_vplce[0] >> bits];
+				uint32_t pix1 = bufplce[1][local_vplce[1] >> bits];
+				uint32_t pix2 = bufplce[2][local_vplce[2] >> bits];
+				uint32_t pix3 = bufplce[3][local_vplce[3] >> bits];
+
+				local_vplce[0] = local_vplce[0] + local_vince[0];
+				local_vplce[1] = local_vplce[1] + local_vince[1];
+				local_vplce[2] = local_vplce[2] + local_vince[2];
+				local_vplce[3] = local_vplce[3] + local_vince[3];
+
+				__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
+
+				VEC_CALC_BLEND_ALPHA(fg, msrc_alpha, mdest_alpha);
+				VEC_SHADE_SIMPLE(fg);
+
+				__m128i bg = _mm_loadu_si128((const __m128i*)dest);
+
+				__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
+				__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
+				__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
+				__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
+
+				__m128i out_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8);
+				__m128i out_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8);
+				__m128i out = _mm_packus_epi16(out_lo, out_hi);
+
+				_mm_storeu_si128((__m128i*)dest, out);
+				dest += pitch;
+			} while (--count);
+		}
+		else
+		{
+			VEC_SHADE_INIT4(light[3], light[2], light[1], light[0], shade_constants);
+
+			__m128i msrc_alpha = _mm_set1_epi16(src_alpha);
+			__m128i mdest_alpha = _mm_set1_epi16(dest_alpha);
+
+			do
+			{
+				uint32_t pix0 = bufplce[0][local_vplce[0] >> bits];
+				uint32_t pix1 = bufplce[1][local_vplce[1] >> bits];
+				uint32_t pix2 = bufplce[2][local_vplce[2] >> bits];
+				uint32_t pix3 = bufplce[3][local_vplce[3] >> bits];
+
+				local_vplce[0] = local_vplce[0] + local_vince[0];
+				local_vplce[1] = local_vplce[1] + local_vince[1];
+				local_vplce[2] = local_vplce[2] + local_vince[2];
+				local_vplce[3] = local_vplce[3] + local_vince[3];
+
+				__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
+				VEC_CALC_BLEND_ALPHA(fg, msrc_alpha, mdest_alpha);
+				VEC_SHADE(fg, shade_constants);
+
+				__m128i bg = _mm_loadu_si128((const __m128i*)dest);
+
+				__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
+				__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
+				__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
+				__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
+
+				__m128i out_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8);
+				__m128i out_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8);
+				__m128i out = _mm_packus_epi16(out_lo, out_hi);
+
+				_mm_storeu_si128((__m128i*)dest, out);
+				dest += pitch;
+			} while (--count);
+		}
+	}
+};
+
+class VecCommand(Tmvline4AddClampRGBA) : public DrawerCommand
+{
+	BYTE * RESTRICT _dest;
+	int _count;
+	int _pitch;
+	ShadeConstants _shade_constants;
+	fixed_t _srcalpha;
+	fixed_t _destalpha;
+	int tmvlinebits;
+	fixed_t palookuplight[4];
+	DWORD vplce[4];
+	DWORD vince[4];
+	const uint32 *RESTRICT bufplce[4];
+
+public:
+	VecCommand(Tmvline4AddClampRGBA)()
+	{
+		_dest = dc_dest;
+		_count = dc_count;
+		_pitch = dc_pitch;
+		_shade_constants = dc_shade_constants;
+		_srcalpha = dc_srcalpha;
+		_destalpha = dc_destalpha;
+		tmvlinebits = ::tmvlinebits;
+		for (int i = 0; i < 4; i++)
+		{
+			palookuplight[i] = ::palookuplight[i];
+			vplce[i] = ::vplce[i];
+			vince[i] = ::vince[i];
+			bufplce[i] = (const uint32 *)::bufplce[i];
+		}
+	}
+
+	void Execute(DrawerThread *thread) override
+	{
+		int count = thread->count_for_thread(_dest_y, _count);
+		if (count <= 0)
+			return;
+
+		uint32_t *dest = thread->dest_for_thread(_dest_y, _pitch, (uint32_t*)_dest);
+		int pitch = _pitch * thread->num_cores;
+		int bits = tmvlinebits;
+
+		uint32_t light[4];
+		light[0] = calc_light_multiplier(palookuplight[0]);
+		light[1] = calc_light_multiplier(palookuplight[1]);
+		light[2] = calc_light_multiplier(palookuplight[2]);
+		light[3] = calc_light_multiplier(palookuplight[3]);
+
+		ShadeConstants shade_constants = _shade_constants;
+
+		uint32_t src_alpha = _srcalpha >> (FRACBITS - 8);
+		uint32_t dest_alpha = _destalpha >> (FRACBITS - 8);
+
+		DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] };
+		DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] };
+		int skipped = thread->skipped_by_thread(_dest_y);
+		for (int i = 0; i < 4; i++)
+		{
+			local_vplce[i] += local_vince[i] * skipped;
+			local_vince[i] *= thread->num_cores;
+		}
+
+		if (shade_constants.simple_shade)
+		{
+			VEC_SHADE_SIMPLE_INIT4(light[3], light[2], light[1], light[0]);
+
+			__m128i msrc_alpha = _mm_set1_epi16(src_alpha);
+			__m128i mdest_alpha = _mm_set1_epi16(dest_alpha);
+
+			do
+			{
+				uint32_t pix0 = bufplce[0][local_vplce[0] >> bits];
+				uint32_t pix1 = bufplce[1][local_vplce[1] >> bits];
+				uint32_t pix2 = bufplce[2][local_vplce[2] >> bits];
+				uint32_t pix3 = bufplce[3][local_vplce[3] >> bits];
+
+				local_vplce[0] = local_vplce[0] + local_vince[0];
+				local_vplce[1] = local_vplce[1] + local_vince[1];
+				local_vplce[2] = local_vplce[2] + local_vince[2];
+				local_vplce[3] = local_vplce[3] + local_vince[3];
+
+				__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
+				VEC_CALC_BLEND_ALPHA(fg, msrc_alpha, mdest_alpha);
+				VEC_SHADE_SIMPLE(fg);
+
+				__m128i bg = _mm_loadu_si128((const __m128i*)dest);
+
+				__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
+				__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
+				__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
+				__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
+
+				__m128i out_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8);
+				__m128i out_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8);
+				__m128i out = _mm_packus_epi16(out_lo, out_hi);
+
+				_mm_storeu_si128((__m128i*)dest, out);
+				dest += pitch;
+			} while (--count);
+		}
+		else
+		{
+			VEC_SHADE_INIT4(light[3], light[2], light[1], light[0], shade_constants);
+
+			__m128i msrc_alpha = _mm_set1_epi16(src_alpha);
+			__m128i mdest_alpha = _mm_set1_epi16(dest_alpha);
+
+			do
+			{
+				uint32_t pix0 = bufplce[0][local_vplce[0] >> bits];
+				uint32_t pix1 = bufplce[1][local_vplce[1] >> bits];
+				uint32_t pix2 = bufplce[2][local_vplce[2] >> bits];
+				uint32_t pix3 = bufplce[3][local_vplce[3] >> bits];
+
+				local_vplce[0] = local_vplce[0] + local_vince[0];
+				local_vplce[1] = local_vplce[1] + local_vince[1];
+				local_vplce[2] = local_vplce[2] + local_vince[2];
+				local_vplce[3] = local_vplce[3] + local_vince[3];
+
+				__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
+				VEC_CALC_BLEND_ALPHA(fg, msrc_alpha, mdest_alpha);
+				VEC_SHADE(fg, shade_constants);
+
+				__m128i bg = _mm_loadu_si128((const __m128i*)dest);
+
+				__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
+				__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
+				__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
+				__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
+
+				__m128i out_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8);
+				__m128i out_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8);
+				__m128i out = _mm_packus_epi16(out_lo, out_hi);
+
+				_mm_storeu_si128((__m128i*)dest, out);
+				dest += pitch;
+			} while (--count);
+		}
+	}
+};
+
+class VecCommand(Tmvline4SubClampRGBA) : public DrawerCommand
+{
+	BYTE * RESTRICT _dest;
+	int _count;
+	int _pitch;
+	ShadeConstants _shade_constants;
+	fixed_t _srcalpha;
+	fixed_t _destalpha;
+	int tmvlinebits;
+	fixed_t palookuplight[4];
+	DWORD vplce[4];
+	DWORD vince[4];
+	const uint32 *RESTRICT bufplce[4];
+
+public:
+	VecCommand(Tmvline4SubClampRGBA)()
+	{
+		_dest = dc_dest;
+		_count = dc_count;
+		_pitch = dc_pitch;
+		_shade_constants = dc_shade_constants;
+		_srcalpha = dc_srcalpha;
+		_destalpha = dc_destalpha;
+		tmvlinebits = ::tmvlinebits;
+		for (int i = 0; i < 4; i++)
+		{
+			palookuplight[i] = ::palookuplight[i];
+			vplce[i] = ::vplce[i];
+			vince[i] = ::vince[i];
+			bufplce[i] = (const uint32 *)::bufplce[i];
+		}
+	}
+
+	void Execute(DrawerThread *thread) override
+	{
+		int count = thread->count_for_thread(_dest_y, _count);
+		if (count <= 0)
+			return;
+
+		uint32_t *dest = thread->dest_for_thread(_dest_y, _pitch, (uint32_t*)_dest);
+		int pitch = _pitch * thread->num_cores;
+		int bits = tmvlinebits;
+
+		uint32_t light[4];
+		light[0] = calc_light_multiplier(palookuplight[0]);
+		light[1] = calc_light_multiplier(palookuplight[1]);
+		light[2] = calc_light_multiplier(palookuplight[2]);
+		light[3] = calc_light_multiplier(palookuplight[3]);
+
+		ShadeConstants shade_constants = _shade_constants;
+
+		uint32_t src_alpha = _srcalpha >> (FRACBITS - 8);
+		uint32_t dest_alpha = _destalpha >> (FRACBITS - 8);
+
+		DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] };
+		DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] };
+		int skipped = thread->skipped_by_thread(_dest_y);
+		for (int i = 0; i < 4; i++)
+		{
+			local_vplce[i] += local_vince[i] * skipped;
+			local_vince[i] *= thread->num_cores;
+		}
+
+		if (shade_constants.simple_shade)
+		{
+			VEC_SHADE_SIMPLE_INIT4(light[3], light[2], light[1], light[0]);
+
+			__m128i msrc_alpha = _mm_set1_epi16(src_alpha);
+			__m128i mdest_alpha = _mm_set1_epi16(dest_alpha);
+
+			do
+			{
+				uint32_t pix0 = bufplce[0][local_vplce[0] >> bits];
+				uint32_t pix1 = bufplce[1][local_vplce[1] >> bits];
+				uint32_t pix2 = bufplce[2][local_vplce[2] >> bits];
+				uint32_t pix3 = bufplce[3][local_vplce[3] >> bits];
+
+				local_vplce[0] = local_vplce[0] + local_vince[0];
+				local_vplce[1] = local_vplce[1] + local_vince[1];
+				local_vplce[2] = local_vplce[2] + local_vince[2];
+				local_vplce[3] = local_vplce[3] + local_vince[3];
+
+				__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
+				VEC_CALC_BLEND_ALPHA(fg, msrc_alpha, mdest_alpha);
+				VEC_SHADE_SIMPLE(fg);
+
+				__m128i bg = _mm_loadu_si128((const __m128i*)dest);
+
+				__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
+				__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
+				__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
+				__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
+
+				__m128i out_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_hi, bg_alpha_hi), _mm_mullo_epi16(fg_hi, fg_alpha_hi)), 8);
+				__m128i out_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_lo, bg_alpha_lo), _mm_mullo_epi16(fg_lo, fg_alpha_lo)), 8);
+				__m128i out = _mm_packus_epi16(out_lo, out_hi);
+
+				_mm_storeu_si128((__m128i*)dest, out);
+				dest += pitch;
+			} while (--count);
+		}
+		else
+		{
+			VEC_SHADE_INIT4(light[3], light[2], light[1], light[0], shade_constants);
+
+			__m128i msrc_alpha = _mm_set1_epi16(src_alpha);
+			__m128i mdest_alpha = _mm_set1_epi16(dest_alpha);
+
+			do
+			{
+				uint32_t pix0 = bufplce[0][local_vplce[0] >> bits];
+				uint32_t pix1 = bufplce[1][local_vplce[1] >> bits];
+				uint32_t pix2 = bufplce[2][local_vplce[2] >> bits];
+				uint32_t pix3 = bufplce[3][local_vplce[3] >> bits];
+
+				local_vplce[0] = local_vplce[0] + local_vince[0];
+				local_vplce[1] = local_vplce[1] + local_vince[1];
+				local_vplce[2] = local_vplce[2] + local_vince[2];
+				local_vplce[3] = local_vplce[3] + local_vince[3];
+
+				__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
+				VEC_CALC_BLEND_ALPHA(fg, msrc_alpha, mdest_alpha);
+				VEC_SHADE(fg, shade_constants);
+
+				__m128i bg = _mm_loadu_si128((const __m128i*)dest);
+
+				__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
+				__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
+				__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
+				__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
+
+				__m128i out_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_hi, bg_alpha_hi), _mm_mullo_epi16(fg_hi, fg_alpha_hi)), 8);
+				__m128i out_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_lo, bg_alpha_lo), _mm_mullo_epi16(fg_lo, fg_alpha_lo)), 8);
+				__m128i out = _mm_packus_epi16(out_lo, out_hi);
+
+				_mm_storeu_si128((__m128i*)dest, out);
+				dest += pitch;
+			} while (--count);
+		}
+	}
+};
+
+class VecCommand(Tmvline4RevSubClampRGBA) : public DrawerCommand
+{
+	BYTE * RESTRICT _dest;
+	int _count;
+	int _pitch;
+	ShadeConstants _shade_constants;
+	fixed_t _srcalpha;
+	fixed_t _destalpha;
+	int tmvlinebits;
+	fixed_t palookuplight[4];
+	DWORD vplce[4];
+	DWORD vince[4];
+	const uint32 *RESTRICT bufplce[4];
+
+public:
+	VecCommand(Tmvline4RevSubClampRGBA)()
+	{
+		_dest = dc_dest;
+		_count = dc_count;
+		_pitch = dc_pitch;
+		_shade_constants = dc_shade_constants;
+		_srcalpha = dc_srcalpha;
+		_destalpha = dc_destalpha;
+		tmvlinebits = ::tmvlinebits;
+		for (int i = 0; i < 4; i++)
+		{
+			palookuplight[i] = ::palookuplight[i];
+			vplce[i] = ::vplce[i];
+			vince[i] = ::vince[i];
+			bufplce[i] = (const uint32 *)::bufplce[i];
+		}
+	}
+
+	void Execute(DrawerThread *thread) override
+	{
+		int count = thread->count_for_thread(_dest_y, _count);
+		if (count <= 0)
+			return;
+
+		uint32_t *dest = thread->dest_for_thread(_dest_y, _pitch, (uint32_t*)_dest);
+		int pitch = _pitch * thread->num_cores;
+		int bits = tmvlinebits;
+
+		uint32_t light[4];
+		light[0] = calc_light_multiplier(palookuplight[0]);
+		light[1] = calc_light_multiplier(palookuplight[1]);
+		light[2] = calc_light_multiplier(palookuplight[2]);
+		light[3] = calc_light_multiplier(palookuplight[3]);
+
+		ShadeConstants shade_constants = _shade_constants;
+
+		uint32_t src_alpha = _srcalpha >> (FRACBITS - 8);
+		uint32_t dest_alpha = _destalpha >> (FRACBITS - 8);
+
+		DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] };
+		DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] };
+		int skipped = thread->skipped_by_thread(_dest_y);
+		for (int i = 0; i < 4; i++)
+		{
+			local_vplce[i] += local_vince[i] * skipped;
+			local_vince[i] *= thread->num_cores;
+		}
+
+		if (shade_constants.simple_shade)
+		{
+			VEC_SHADE_SIMPLE_INIT4(light[3], light[2], light[1], light[0]);
+
+			__m128i msrc_alpha = _mm_set1_epi16(src_alpha);
+			__m128i mdest_alpha = _mm_set1_epi16(dest_alpha);
+
+			do
+			{
+				uint32_t pix0 = bufplce[0][local_vplce[0] >> bits];
+				uint32_t pix1 = bufplce[1][local_vplce[1] >> bits];
+				uint32_t pix2 = bufplce[2][local_vplce[2] >> bits];
+				uint32_t pix3 = bufplce[3][local_vplce[3] >> bits];
+
+				local_vplce[0] = local_vplce[0] + local_vince[0];
+				local_vplce[1] = local_vplce[1] + local_vince[1];
+				local_vplce[2] = local_vplce[2] + local_vince[2];
+				local_vplce[3] = local_vplce[3] + local_vince[3];
+
+				__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
+				VEC_CALC_BLEND_ALPHA(fg, msrc_alpha, mdest_alpha);
+				VEC_SHADE_SIMPLE(fg);
+
+				__m128i bg = _mm_loadu_si128((const __m128i*)dest);
+
+				__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
+				__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
+				__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
+				__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
+
+				__m128i out_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8);
+				__m128i out_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8);
+				__m128i out = _mm_packus_epi16(out_lo, out_hi);
+
+				_mm_storeu_si128((__m128i*)dest, out);
+				dest += pitch;
+			} while (--count);
+		}
+		else
+		{
+			VEC_SHADE_INIT4(light[3], light[2], light[1], light[0], shade_constants);
+
+			__m128i msrc_alpha = _mm_set1_epi16(src_alpha);
+			__m128i mdest_alpha = _mm_set1_epi16(dest_alpha);
+
+			do
+			{
+				uint32_t pix0 = bufplce[0][local_vplce[0] >> bits];
+				uint32_t pix1 = bufplce[1][local_vplce[1] >> bits];
+				uint32_t pix2 = bufplce[2][local_vplce[2] >> bits];
+				uint32_t pix3 = bufplce[3][local_vplce[3] >> bits];
+
+				local_vplce[0] = local_vplce[0] + local_vince[0];
+				local_vplce[1] = local_vplce[1] + local_vince[1];
+				local_vplce[2] = local_vplce[2] + local_vince[2];
+				local_vplce[3] = local_vplce[3] + local_vince[3];
+
+				__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
+				VEC_CALC_BLEND_ALPHA(fg, msrc_alpha, mdest_alpha);
+				VEC_SHADE(fg, shade_constants);
+
+				__m128i bg = _mm_loadu_si128((const __m128i*)dest);
+
+				__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
+				__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
+				__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
+				__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
+
+				__m128i out_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8);
+				__m128i out_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8);
+				__m128i out = _mm_packus_epi16(out_lo, out_hi);
+
+				_mm_storeu_si128((__m128i*)dest, out);
 				dest += pitch;
 			} while (--count);
 		}
--- a/src/r_drawt_rgba.cpp
+++ b/src/r_drawt_rgba.cpp
@ -1436,7 +1436,14 @@ void rt_subclamp1col_rgba (int hx, int sx, int yl, int yh)
 // Subtracts all four spans to the screen starting at sx with clamping.
 void rt_subclamp4cols_rgba (int sx, int yl, int yh)
 {
+#ifdef NO_SSE
 	DrawerCommandQueue::QueueCommand<RtSubClamp4colsRGBACommand>(sx, yl, yh);
+#else
+	if (!r_linearlight)
+		DrawerCommandQueue::QueueCommand<RtSubClamp4colsRGBA_SSE_Command>(sx, yl, yh);
+	else
+		DrawerCommandQueue::QueueCommand<RtSubClamp4colsRGBA_AVX_Command>(sx, yl, yh);
+#endif
 }

 // Translates and subtracts one span at hx to the screen at sx with clamping.
@ -1462,7 +1469,14 @@ void rt_revsubclamp1col_rgba (int hx, int sx, int yl, int yh)
 // Subtracts all four spans from the screen starting at sx with clamping.
 void rt_revsubclamp4cols_rgba (int sx, int yl, int yh)
 {
+#ifdef NO_SSE
 	DrawerCommandQueue::QueueCommand<RtRevSubClamp4colsRGBACommand>(sx, yl, yh);
+#else
+	if (!r_linearlight)
+		DrawerCommandQueue::QueueCommand<RtRevSubClamp4colsRGBA_SSE_Command>(sx, yl, yh);
+	else
+		DrawerCommandQueue::QueueCommand<RtRevSubClamp4colsRGBA_AVX_Command>(sx, yl, yh);
+#endif
 }

 // Translates and subtracts one span at hx from the screen at sx with clamping.
--- a/src/r_drawt_rgba_sse.h
+++ b/src/r_drawt_rgba_sse.h
@ -493,3 +493,255 @@ public:
 		}
 	}
 };
+
+class VecCommand(RtSubClamp4colsRGBA) : public DrawerCommand
+{
+	int sx;
+	int yl;
+	int yh;
+	BYTE * RESTRICT _destorg;
+	int _pitch;
+	fixed_t _light;
+	fixed_t _srcalpha;
+	fixed_t _destalpha;
+	ShadeConstants _shade_constants;
+
+public:
+	VecCommand(RtSubClamp4colsRGBA)(int sx, int yl, int yh)
+	{
+		this->sx = sx;
+		this->yl = yl;
+		this->yh = yh;
+
+		_destorg = dc_destorg;
+		_pitch = dc_pitch;
+		_light = dc_light;
+		_srcalpha = dc_srcalpha;
+		_destalpha = dc_destalpha;
+		_shade_constants = dc_shade_constants;
+	}
+
+	void Execute(DrawerThread *thread) override
+	{
+		uint32_t *source;
+		uint32_t *dest;
+		int count;
+		int pitch;
+		int sincr;
+
+		count = thread->count_for_thread(yl, yh - yl + 1);
+		if (count <= 0)
+			return;
+
+		dest = thread->dest_for_thread(yl, _pitch, ylookup[yl] + sx + (uint32_t*)_destorg);
+		source = &thread->dc_temp_rgba[yl * 4] + thread->skipped_by_thread(yl) * 4;
+		pitch = _pitch * thread->num_cores;
+		sincr = 4 * thread->num_cores;
+
+		uint32_t light = calc_light_multiplier(_light);
+		uint32_t *palette = (uint32_t*)GPalette.BaseColors;
+		ShadeConstants shade_constants = _shade_constants;
+
+		uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8);
+		uint32_t bg_alpha = _destalpha >> (FRACBITS - 8);
+
+		if (shade_constants.simple_shade)
+		{
+			VEC_SHADE_SIMPLE_INIT(light);
+
+			__m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha);
+			__m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha);
+
+			do {
+				uint32_t p0 = source[0];
+				uint32_t p1 = source[1];
+				uint32_t p2 = source[2];
+				uint32_t p3 = source[3];
+
+				// shade_pal_index:
+				__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
+				VEC_SHADE_SIMPLE(fg);
+
+				__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
+				__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
+
+				// unpack bg:
+				__m128i bg = _mm_loadu_si128((const __m128i*)dest);
+				__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
+				__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
+
+				// (bg_red * bg_alpha - fg_red * fg_alpha) / 256:
+				__m128i color_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_hi, mbg_alpha), _mm_mullo_epi16(fg_hi, mfg_alpha)), 8);
+				__m128i color_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_lo, mbg_alpha), _mm_mullo_epi16(fg_lo, mfg_alpha)), 8);
+
+				__m128i color = _mm_packus_epi16(color_lo, color_hi);
+				_mm_storeu_si128((__m128i*)dest, color);
+
+				source += sincr;
+				dest += pitch;
+			} while (--count);
+		}
+		else
+		{
+			VEC_SHADE_INIT(light, shade_constants);
+
+			__m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha);
+			__m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha);
+
+			do {
+				uint32_t p0 = source[0];
+				uint32_t p1 = source[1];
+				uint32_t p2 = source[2];
+				uint32_t p3 = source[3];
+
+				// shade_pal_index:
+				__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
+				VEC_SHADE(fg, shade_constants);
+
+				__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
+				__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
+
+				// unpack bg:
+				__m128i bg = _mm_loadu_si128((const __m128i*)dest);
+				__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
+				__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
+
+				// (bg_red * bg_alpha - fg_red * fg_alpha) / 256:
+				__m128i color_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_hi, mbg_alpha), _mm_mullo_epi16(fg_hi, mfg_alpha)), 8);
+				__m128i color_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_lo, mbg_alpha), _mm_mullo_epi16(fg_lo, mfg_alpha)), 8);
+
+				__m128i color = _mm_packus_epi16(color_lo, color_hi);
+				_mm_storeu_si128((__m128i*)dest, color);
+
+				source += sincr;
+				dest += pitch;
+			} while (--count);
+		}
+	}
+};
+
+class VecCommand(RtRevSubClamp4colsRGBA) : public DrawerCommand
+{
+	int sx;
+	int yl;
+	int yh;
+	BYTE * RESTRICT _destorg;
+	int _pitch;
+	fixed_t _light;
+	fixed_t _srcalpha;
+	fixed_t _destalpha;
+	ShadeConstants _shade_constants;
+
+public:
+	VecCommand(RtRevSubClamp4colsRGBA)(int sx, int yl, int yh)
+	{
+		this->sx = sx;
+		this->yl = yl;
+		this->yh = yh;
+
+		_destorg = dc_destorg;
+		_pitch = dc_pitch;
+		_light = dc_light;
+		_srcalpha = dc_srcalpha;
+		_destalpha = dc_destalpha;
+		_shade_constants = dc_shade_constants;
+	}
+
+	void Execute(DrawerThread *thread) override
+	{
+		uint32_t *source;
+		uint32_t *dest;
+		int count;
+		int pitch;
+		int sincr;
+
+		count = thread->count_for_thread(yl, yh - yl + 1);
+		if (count <= 0)
+			return;
+
+		dest = thread->dest_for_thread(yl, _pitch, ylookup[yl] + sx + (uint32_t*)_destorg);
+		source = &thread->dc_temp_rgba[yl * 4] + thread->skipped_by_thread(yl) * 4;
+		pitch = _pitch * thread->num_cores;
+		sincr = 4 * thread->num_cores;
+
+		uint32_t light = calc_light_multiplier(_light);
+		uint32_t *palette = (uint32_t*)GPalette.BaseColors;
+		ShadeConstants shade_constants = _shade_constants;
+
+		uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8);
+		uint32_t bg_alpha = _destalpha >> (FRACBITS - 8);
+
+		if (shade_constants.simple_shade)
+		{
+			VEC_SHADE_SIMPLE_INIT(light);
+
+			__m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha);
+			__m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha);
+
+			do {
+				uint32_t p0 = source[0];
+				uint32_t p1 = source[1];
+				uint32_t p2 = source[2];
+				uint32_t p3 = source[3];
+
+				// shade_pal_index:
+				__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
+				VEC_SHADE_SIMPLE(fg);
+
+				__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
+				__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
+
+				// unpack bg:
+				__m128i bg = _mm_loadu_si128((const __m128i*)dest);
+				__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
+				__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
+
+				// (fg_red * fg_alpha - bg_red * bg_alpha) / 256:
+				__m128i color_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_hi, mfg_alpha), _mm_mullo_epi16(bg_hi, mbg_alpha)), 8);
+				__m128i color_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_lo, mfg_alpha), _mm_mullo_epi16(bg_lo, mbg_alpha)), 8);
+
+				__m128i color = _mm_packus_epi16(color_lo, color_hi);
+				_mm_storeu_si128((__m128i*)dest, color);
+
+				source += sincr;
+				dest += pitch;
+			} while (--count);
+		}
+		else
+		{
+			VEC_SHADE_INIT(light, shade_constants);
+
+			__m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha);
+			__m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha);
+
+			do {
+				uint32_t p0 = source[0];
+				uint32_t p1 = source[1];
+				uint32_t p2 = source[2];
+				uint32_t p3 = source[3];
+
+				// shade_pal_index:
+				__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
+				VEC_SHADE(fg, shade_constants);
+
+				__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
+				__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
+
+				// unpack bg:
+				__m128i bg = _mm_loadu_si128((const __m128i*)dest);
+				__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
+				__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
+
+				// (fg_red * fg_alpha - bg_red * bg_alpha) / 256:
+				__m128i color_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_hi, mfg_alpha), _mm_mullo_epi16(bg_hi, mbg_alpha)), 8);
+				__m128i color_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_lo, mfg_alpha), _mm_mullo_epi16(bg_lo, mbg_alpha)), 8);
+
+				__m128i color = _mm_packus_epi16(color_lo, color_hi);
+				_mm_storeu_si128((__m128i*)dest, color);
+
+				source += sincr;
+				dest += pitch;
+			} while (--count);
+		}
+	}
+};