Removed old SSE macros and drawers

2024-12-02 00:42:08 +00:00 · 2016-10-07 06:56:20 +02:00 · 2016-10-07 06:56:20 +02:00 · 78415461b9
commit 78415461b9
parent 5a0f67308f
4 changed files with 31 additions and 1757 deletions
--- a/src/r_draw_rgba.cpp
+++ b/src/r_draw_rgba.cpp
@ -63,9 +63,6 @@ CVAR(Bool, r_mipmap, true, CVAR_ARCHIVE | CVAR_GLOBALCONFIG);

 class DrawSpanLLVMCommand : public DrawerCommand
 {
-protected:
-	DrawSpanArgs args;
-
 public:
 	DrawSpanLLVMCommand()
 	{
@ -96,7 +93,7 @@ public:
 		args.flags = 0;
 		if (ds_shade_constants.simple_shade)
 			args.flags |= DrawSpanArgs::simple_shade;
-		if (!SampleBgra::span_sampler_setup(args.source, args.xbits, args.ybits, args.xstep, args.ystep, ds_source_mipmapped))
+		if (!sampler_setup(args.source, args.xbits, args.ybits, args.xstep, args.ystep, ds_source_mipmapped))
 			args.flags |= DrawSpanArgs::nearest_filter;
 	}

@ -106,6 +103,36 @@ public:
 			return;
 		LLVMDrawers::Instance()->DrawSpan(&args);
 	}
+
+protected:
+	DrawSpanArgs args;
+
+private:
+	inline static bool sampler_setup(const uint32_t * &source, int &xbits, int &ybits, fixed_t xstep, fixed_t ystep, bool mipmapped)
+	{
+		// Is this a magfilter or minfilter?
+		fixed_t xmagnitude = abs(xstep) >> (32 - xbits - FRACBITS);
+		fixed_t ymagnitude = abs(ystep) >> (32 - ybits - FRACBITS);
+		fixed_t magnitude = (xmagnitude + ymagnitude) * 2 + (1 << (FRACBITS - 1));
+		bool magnifying = (magnitude >> FRACBITS == 0);
+
+		if (r_mipmap && mipmapped)
+		{
+			int level = magnitude >> (FRACBITS + 1);
+			while (level != 0)
+			{
+				if (xbits <= 2 || ybits <= 2)
+					break;
+
+				source += (1 << (xbits)) * (1 << (ybits));
+				xbits -= 1;
+				ybits -= 1;
+				level >>= 1;
+			}
+		}
+
+		return (magnifying && r_magfilter) || (!magnifying && r_minfilter);
+	}
 };

 class DrawSpanMaskedLLVMCommand : public DrawSpanLLVMCommand
--- a/src/r_draw_rgba.h
+++ b/src/r_draw_rgba.h
@ -283,554 +283,4 @@ public:
 	}
 };

-class BlendBgra
-{
-public:
-	FORCEINLINE static uint32_t copy(uint32_t fg)
-	{
-		return fg;
-	}
-
-	FORCEINLINE static uint32_t add(uint32_t fg, uint32_t bg, uint32_t srcalpha, uint32_t destalpha)
-	{
-		uint32_t red = MIN<uint32_t>((RPART(fg) * srcalpha + RPART(bg) * destalpha) >> 8, 255);
-		uint32_t green = MIN<uint32_t>((GPART(fg) * srcalpha + GPART(bg) * destalpha) >> 8, 255);
-		uint32_t blue = MIN<uint32_t>((BPART(fg) * srcalpha + BPART(bg) * destalpha) >> 8, 255);
-		return 0xff000000 | (red << 16) | (green << 8) | blue;
-	}
-
-	FORCEINLINE static uint32_t sub(uint32_t fg, uint32_t bg, uint32_t srcalpha, uint32_t destalpha)
-	{
-		uint32_t red = clamp<uint32_t>((0x10000 - RPART(fg) * srcalpha + RPART(bg) * destalpha) >> 8, 256, 256 + 255) - 256;
-		uint32_t green = clamp<uint32_t>((0x10000 - GPART(fg) * srcalpha + GPART(bg) * destalpha) >> 8, 256, 256 + 255) - 256;
-		uint32_t blue = clamp<uint32_t>((0x10000 - BPART(fg) * srcalpha + BPART(bg) * destalpha) >> 8, 256, 256 + 255) - 256;
-		return 0xff000000 | (red << 16) | (green << 8) | blue;
-	}
-
-	FORCEINLINE static uint32_t revsub(uint32_t fg, uint32_t bg, uint32_t srcalpha, uint32_t destalpha)
-	{
-		uint32_t red = clamp<uint32_t>((0x10000 + RPART(fg) * srcalpha - RPART(bg) * destalpha) >> 8, 256, 256 + 255) - 256;
-		uint32_t green = clamp<uint32_t>((0x10000 + GPART(fg) * srcalpha - GPART(bg) * destalpha) >> 8, 256, 256 + 255) - 256;
-		uint32_t blue = clamp<uint32_t>((0x10000 + BPART(fg) * srcalpha - BPART(bg) * destalpha) >> 8, 256, 256 + 255) - 256;
-		return 0xff000000 | (red << 16) | (green << 8) | blue;
-	}
-
-	FORCEINLINE static uint32_t alpha_blend(uint32_t fg, uint32_t bg)
-	{
-		uint32_t alpha = APART(fg) + (APART(fg) >> 7); // 255 -> 256
-		uint32_t inv_alpha = 256 - alpha;
-		uint32_t red = MIN<uint32_t>(RPART(fg) * alpha + (RPART(bg) * inv_alpha) / 256, 255);
-		uint32_t green = MIN<uint32_t>(GPART(fg) * alpha + (GPART(bg) * inv_alpha) / 256, 255);
-		uint32_t blue = MIN<uint32_t>(BPART(fg) * alpha + (BPART(bg) * inv_alpha) / 256, 255);
-		return 0xff000000 | (red << 16) | (green << 8) | blue;
-	}
-};
-
-class SampleBgra
-{
-public:
-	inline static bool span_sampler_setup(const uint32_t * RESTRICT &source, int &xbits, int &ybits, fixed_t xstep, fixed_t ystep, bool mipmapped)
-	{
-		// Is this a magfilter or minfilter?
-		fixed_t xmagnitude = abs(xstep) >> (32 - xbits - FRACBITS);
-		fixed_t ymagnitude = abs(ystep) >> (32 - ybits - FRACBITS);
-		fixed_t magnitude = (xmagnitude + ymagnitude) * 2 + (1 << (FRACBITS - 1));
-		bool magnifying = (magnitude >> FRACBITS == 0);
-
-		if (r_mipmap && mipmapped)
-		{
-			int level = magnitude >> (FRACBITS + 1);
-			while (level != 0)
-			{
-				if (xbits <= 2 || ybits <= 2)
-					break;
-
-				source += (1 << (xbits)) * (1 << (ybits));
-				xbits -= 1;
-				ybits -= 1;
-				level >>= 1;
-			}
-		}
-
-		return (magnifying && r_magfilter) || (!magnifying && r_minfilter);
-	}
-
-	FORCEINLINE static uint32_t sample_bilinear(const uint32_t *col0, const uint32_t *col1, uint32_t texturefracx, uint32_t texturefracy, uint32_t one, uint32_t height)
-	{
-		uint32_t frac_y0 = (texturefracy >> FRACBITS) * height;
-		uint32_t frac_y1 = ((texturefracy + one) >> FRACBITS) * height;
-		uint32_t y0 = frac_y0 >> FRACBITS;
-		uint32_t y1 = frac_y1 >> FRACBITS;
-
-		uint32_t p00 = col0[y0];
-		uint32_t p01 = col0[y1];
-		uint32_t p10 = col1[y0];
-		uint32_t p11 = col1[y1];
-
-		uint32_t inv_b = texturefracx;
-		uint32_t inv_a = (frac_y1 >> (FRACBITS - 4)) & 15;
-		uint32_t a = 16 - inv_a;
-		uint32_t b = 16 - inv_b;
-
-		uint32_t red = (RPART(p00) * a * b + RPART(p01) * inv_a * b + RPART(p10) * a * inv_b + RPART(p11) * inv_a * inv_b + 127) >> 8;
-		uint32_t green = (GPART(p00) * a * b + GPART(p01) * inv_a * b + GPART(p10) * a * inv_b + GPART(p11) * inv_a * inv_b + 127) >> 8;
-		uint32_t blue = (BPART(p00) * a * b + BPART(p01) * inv_a * b + BPART(p10) * a * inv_b + BPART(p11) * inv_a * inv_b + 127) >> 8;
-		uint32_t alpha = (APART(p00) * a * b + APART(p01) * inv_a * b + APART(p10) * a * inv_b + APART(p11) * inv_a * inv_b + 127) >> 8;
-
-		return (alpha << 24) | (red << 16) | (green << 8) | blue;
-	}
-
-	FORCEINLINE static uint32_t sample_bilinear(const uint32_t *texture, dsfixed_t xfrac, dsfixed_t yfrac, int xbits, int ybits)
-	{
-		int xshift = (32 - xbits);
-		int yshift = (32 - ybits);
-		int xmask = (1 << xshift) - 1;
-		int ymask = (1 << yshift) - 1;
-		uint32_t x = xfrac >> xbits;
-		uint32_t y = yfrac >> ybits;
-
-		uint32_t p00 = texture[(y & ymask) + ((x & xmask) << yshift)];
-		uint32_t p01 = texture[((y + 1) & ymask) + ((x & xmask) << yshift)];
-		uint32_t p10 = texture[(y & ymask) + (((x + 1) & xmask) << yshift)];
-		uint32_t p11 = texture[((y + 1) & ymask) + (((x + 1) & xmask) << yshift)];
-
-		uint32_t inv_b = (xfrac >> (xbits - 4)) & 15;
-		uint32_t inv_a = (yfrac >> (ybits - 4)) & 15;
-		uint32_t a = 16 - inv_a;
-		uint32_t b = 16 - inv_b;
-
-		uint32_t red = (RPART(p00) * a * b + RPART(p01) * inv_a * b + RPART(p10) * a * inv_b + RPART(p11) * inv_a * inv_b + 127) >> 8;
-		uint32_t green = (GPART(p00) * a * b + GPART(p01) * inv_a * b + GPART(p10) * a * inv_b + GPART(p11) * inv_a * inv_b + 127) >> 8;
-		uint32_t blue = (BPART(p00) * a * b + BPART(p01) * inv_a * b + BPART(p10) * a * inv_b + BPART(p11) * inv_a * inv_b + 127) >> 8;
-		uint32_t alpha = (APART(p00) * a * b + APART(p01) * inv_a * b + APART(p10) * a * inv_b + APART(p11) * inv_a * inv_b + 127) >> 8;
-
-		return (alpha << 24) | (red << 16) | (green << 8) | blue;
-	}
-};
-
-/////////////////////////////////////////////////////////////////////////////
-// SSE/AVX shading macros:
-
-#define AVX2_SAMPLE_BILINEAR4_COLUMN_INIT(col0, col1, one, height, texturefracx) \
-	const uint32_t *baseptr = col0[0]; \
-	__m128i coloffsets0 = _mm_setr_epi32(col0[0] - baseptr, col0[1] - baseptr, col0[2] - baseptr, col0[3] - baseptr); \
-	__m128i coloffsets1 = _mm_setr_epi32(col1[0] - baseptr, col1[1] - baseptr, col1[2] - baseptr, col1[3] - baseptr); \
-	__m128i mone = _mm_loadu_si128((const __m128i*)one); \
-	__m128i m127 = _mm_set1_epi16(127); \
-	__m128i m16 = _mm_set1_epi32(16); \
-	__m128i m15 = _mm_set1_epi32(15); \
-	__m128i mheight = _mm_loadu_si128((const __m128i*)height); \
-	__m128i mtexturefracx = _mm_loadu_si128((const __m128i*)texturefracx);
-
-#define AVX2_SAMPLE_BILINEAR4_COLUMN(fg, texturefracy) { \
-	__m128i mtexturefracy = _mm_loadu_si128((const __m128i*)texturefracy); \
-	__m128i multmp0 = _mm_srli_epi32(mtexturefracy, FRACBITS); \
-	__m128i multmp1 = _mm_srli_epi32(_mm_add_epi32(mtexturefracy, mone), FRACBITS); \
-	__m128i frac_y0 = _mm_or_si128(_mm_mul_epu32(multmp0, mheight), _mm_slli_si128(_mm_mul_epu32(_mm_srli_si128(multmp0, 4), _mm_srli_si128(mheight, 4)), 4)); \
-	__m128i frac_y1 = _mm_or_si128(_mm_mul_epu32(multmp1, mheight), _mm_slli_si128(_mm_mul_epu32(_mm_srli_si128(multmp1, 4), _mm_srli_si128(mheight, 4)), 4)); \
-	__m128i y0 = _mm_srli_epi32(frac_y0, FRACBITS); \
-	__m128i y1 = _mm_srli_epi32(frac_y1, FRACBITS); \
-	__m128i inv_b = mtexturefracx; \
-	__m128i inv_a = _mm_and_si128(_mm_srli_epi32(frac_y1, FRACBITS - 4), m15); \
-	__m128i a = _mm_sub_epi32(m16, inv_a); \
-	__m128i b = _mm_sub_epi32(m16, inv_b); \
-	__m128i ab = _mm_mullo_epi16(a, b); \
-	__m128i invab = _mm_mullo_epi16(inv_a, b); \
-	__m128i ainvb = _mm_mullo_epi16(a, inv_b); \
-	__m128i invainvb = _mm_mullo_epi16(inv_a, inv_b); \
-	__m128i ab_lo = _mm_shuffle_epi32(ab, _MM_SHUFFLE(1, 1, 0, 0)); \
-	__m128i ab_hi = _mm_shuffle_epi32(ab, _MM_SHUFFLE(3, 3, 2, 2)); \
-	__m128i invab_lo = _mm_shuffle_epi32(invab, _MM_SHUFFLE(1, 1, 0, 0)); \
-	__m128i invab_hi = _mm_shuffle_epi32(invab, _MM_SHUFFLE(3, 3, 2, 2)); \
-	__m128i ainvb_lo = _mm_shuffle_epi32(ainvb, _MM_SHUFFLE(1, 1, 0, 0)); \
-	__m128i ainvb_hi = _mm_shuffle_epi32(ainvb, _MM_SHUFFLE(3, 3, 2, 2)); \
-	__m128i invainvb_lo = _mm_shuffle_epi32(invainvb, _MM_SHUFFLE(1, 1, 0, 0)); \
-	__m128i invainvb_hi = _mm_shuffle_epi32(invainvb, _MM_SHUFFLE(3, 3, 2, 2)); \
-	ab_lo = _mm_or_si128(ab_lo, _mm_slli_epi32(ab_lo, 16)); \
-	ab_hi = _mm_or_si128(ab_hi, _mm_slli_epi32(ab_hi, 16)); \
-	invab_lo = _mm_or_si128(invab_lo, _mm_slli_epi32(invab_lo, 16)); \
-	invab_hi = _mm_or_si128(invab_hi, _mm_slli_epi32(invab_hi, 16)); \
-	ainvb_lo = _mm_or_si128(ainvb_lo, _mm_slli_epi32(ainvb_lo, 16)); \
-	ainvb_hi = _mm_or_si128(ainvb_hi, _mm_slli_epi32(ainvb_hi, 16)); \
-	invainvb_lo = _mm_or_si128(invainvb_lo, _mm_slli_epi32(invainvb_lo, 16)); \
-	invainvb_hi = _mm_or_si128(invainvb_hi, _mm_slli_epi32(invainvb_hi, 16)); \
-	__m128i p00 = _mm_i32gather_epi32((const int *)baseptr, _mm_add_epi32(y0, coloffsets0), 4); \
-	__m128i p01 = _mm_i32gather_epi32((const int *)baseptr, _mm_add_epi32(y1, coloffsets0), 4); \
-	__m128i p10 = _mm_i32gather_epi32((const int *)baseptr, _mm_add_epi32(y0, coloffsets1), 4); \
-	__m128i p11 = _mm_i32gather_epi32((const int *)baseptr, _mm_add_epi32(y1, coloffsets1), 4); \
-	__m128i p00_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(p00, _mm_setzero_si128()), ab_lo); \
-	__m128i p01_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(p01, _mm_setzero_si128()), invab_lo); \
-	__m128i p10_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(p10, _mm_setzero_si128()), ainvb_lo); \
-	__m128i p11_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(p11, _mm_setzero_si128()), invainvb_lo); \
-	__m128i p00_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(p00, _mm_setzero_si128()), ab_hi); \
-	__m128i p01_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(p01, _mm_setzero_si128()), invab_hi); \
-	__m128i p10_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(p10, _mm_setzero_si128()), ainvb_hi); \
-	__m128i p11_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(p11, _mm_setzero_si128()), invainvb_hi); \
-	__m128i fg_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_adds_epu16(_mm_adds_epu16(p00_lo, p01_lo), _mm_adds_epu16(p10_lo, p11_lo)), m127), 8); \
-	__m128i fg_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_adds_epu16(_mm_adds_epu16(p00_hi, p01_hi), _mm_adds_epu16(p10_hi, p11_hi)), m127), 8); \
-	fg = _mm_packus_epi16(fg_lo, fg_hi); \
-}
-
-#define VEC_SAMPLE_BILINEAR4_COLUMN(fg, col0, col1, texturefracx, texturefracy, one, height) { \
-	__m128i m127 = _mm_set1_epi16(127); \
-	fg = _mm_setzero_si128(); \
-	for (int i = 0; i < 4; i++) \
-	{ \
-		uint32_t frac_y0 = (texturefracy[i] >> FRACBITS) * height[i]; \
-		uint32_t frac_y1 = ((texturefracy[i] + one[i]) >> FRACBITS) * height[i]; \
-		uint32_t y0 = (frac_y0 >> FRACBITS); \
-		uint32_t y1 = (frac_y1 >> FRACBITS); \
-		 \
-		uint32_t inv_b = texturefracx[i]; \
-		uint32_t inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; \
-		 \
-		__m128i ab_invab = _mm_load_si128(SampleBgra::samplertable + inv_b * 32 + inv_a * 2); \
-		__m128i ainvb_invainvb = _mm_load_si128(SampleBgra::samplertable + inv_b * 32 + inv_a * 2 + 1); \
-		 \
-		__m128i gather = _mm_set_epi32(col1[i][y1], col1[i][y0], col0[i][y1], col0[i][y0]); \
-		__m128i p0 = _mm_unpacklo_epi8(gather, _mm_setzero_si128()); \
-		__m128i p1 = _mm_unpackhi_epi8(gather, _mm_setzero_si128()); \
-		 \
-		__m128i tmp = _mm_adds_epu16(_mm_mullo_epi16(p0, ab_invab), _mm_mullo_epi16(p1, ainvb_invainvb)); \
-		__m128i color = _mm_srli_epi16(_mm_adds_epu16(_mm_adds_epu16(_mm_srli_si128(tmp, 8), tmp), m127), 8); \
-		 \
-		fg = _mm_or_si128(_mm_srli_si128(fg, 4), _mm_slli_si128(_mm_packus_epi16(color, _mm_setzero_si128()), 12)); \
-	} \
-}
-
-#define VEC_SAMPLE_MIP_NEAREST4_COLUMN(fg, col0, col1, mipfrac, texturefracy, height0, height1) { \
-	uint32_t y0[4], y1[4]; \
-	for (int i = 0; i < 4; i++) \
-	{ \
-		 y0[i] = (texturefracy[i] >> FRACBITS) * height0[i]; \
-		 y1[i] = (texturefracy[i] >> FRACBITS) * height1[i]; \
-	} \
-	__m128i p0 = _mm_set_epi32(col0[y0[3]], col0[y0[2]], col0[y0[1]], col0[y0[0]]); \
-	__m128i p1 = _mm_set_epi32(col1[y1[3]], col1[y1[2]], col1[y1[1]], col1[y1[0]]); \
-	__m128i t = _mm_loadu_si128((const __m128i*)mipfrac); \
-	__m128i inv_t = _mm_sub_epi32(_mm_set1_epi32(256), mipfrac); \
-	__m128i p0_lo = _mm_unpacklo_epi8(p0, _mm_setzero_si128()); \
-	__m128i p0_hi = _mm_unpackhi_epi8(p0, _mm_setzero_si128()); \
-	__m128i p1_lo = _mm_unpacklo_epi8(p1, _mm_setzero_si128()); \
-	__m128i p1_hi = _mm_unpackhi_epi8(p1, _mm_setzero_si128()); \
-	__m128i fg_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(p0_lo, t), _mm_mullo_epi16(p1_lo, inv_t)), 8); \
-	__m128i fg_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(p0_hi, t), _mm_mullo_epi16(p1_hi, inv_t)), 8); \
-	fg = _mm_packus_epi16(fg_lo, fg_hi); \
-}
-
-#define VEC_SAMPLE_BILINEAR4_SPAN(fg, texture, xfrac, yfrac, xstep, ystep, xbits, ybits) { \
-	int xshift = (32 - xbits); \
-	int yshift = (32 - ybits); \
-	int xmask = (1 << xshift) - 1; \
-	int ymask = (1 << yshift) - 1; \
-	 \
-	__m128i m127 = _mm_set1_epi16(127); \
-	fg = _mm_setzero_si128(); \
-	for (int i = 0; i < 4; i++) \
-	{ \
-		uint32_t x = xfrac >> xbits; \
-		uint32_t y = yfrac >> ybits; \
-		 \
-		uint32_t p00 = texture[(y & ymask) + ((x & xmask) << yshift)]; \
-		uint32_t p01 = texture[((y + 1) & ymask) + ((x & xmask) << yshift)]; \
-		uint32_t p10 = texture[(y & ymask) + (((x + 1) & xmask) << yshift)]; \
-		uint32_t p11 = texture[((y + 1) & ymask) + (((x + 1) & xmask) << yshift)]; \
-		 \
-		uint32_t inv_b = (xfrac >> (xbits - 4)) & 15; \
-		uint32_t inv_a = (yfrac >> (ybits - 4)) & 15; \
-		 \
-		__m128i ab_invab = _mm_load_si128(SampleBgra::samplertable + inv_b * 32 + inv_a * 2); \
-		__m128i ainvb_invainvb = _mm_load_si128(SampleBgra::samplertable + inv_b * 32 + inv_a * 2 + 1); \
-		 \
-		__m128i p0 = _mm_unpacklo_epi8(_mm_set_epi32(0, 0, p01, p00), _mm_setzero_si128()); \
-		__m128i p1 = _mm_unpacklo_epi8(_mm_set_epi32(0, 0, p11, p10), _mm_setzero_si128()); \
-		 \
-		__m128i tmp = _mm_adds_epu16(_mm_mullo_epi16(p0, ab_invab), _mm_mullo_epi16(p1, ainvb_invainvb)); \
-		__m128i color = _mm_srli_epi16(_mm_adds_epu16(_mm_adds_epu16(_mm_srli_si128(tmp, 8), tmp), m127), 8); \
-		 \
-		fg = _mm_or_si128(_mm_srli_si128(fg, 4), _mm_slli_si128(_mm_packus_epi16(color, _mm_setzero_si128()), 12)); \
-		 \
-		xfrac += xstep; \
-		yfrac += ystep; \
-	} \
-}
-
-// Calculate constants for a simple shade with gamma correction
-#define AVX_LINEAR_SHADE_SIMPLE_INIT(light) \
-	__m256 mlight_hi = _mm256_set_ps(1.0f, light * (1.0f/256.0f), light * (1.0f/256.0f), light * (1.0f/256.0f), 1.0f, light * (1.0f/256.0f), light * (1.0f/256.0f), light * (1.0f/256.0f)); \
-	mlight_hi = _mm256_mul_ps(mlight_hi, mlight_hi); \
-	__m256 mlight_lo = mlight_hi; \
-	__m256 mrcp_255 = _mm256_set1_ps(1.0f/255.0f); \
-	__m256 m255 = _mm256_set1_ps(255.0f);
-
-// Calculate constants for a simple shade with different light levels for each pixel and gamma correction
-#define AVX_LINEAR_SHADE_SIMPLE_INIT4(light3, light2, light1, light0) \
-	__m256 mlight_hi = _mm256_set_ps(1.0f, light1 * (1.0f/256.0f), light1 * (1.0f/256.0f), light1 * (1.0f/256.0f), 1.0f, light0 * (1.0f/256.0f), light0 * (1.0f/256.0f), light0 * (1.0f/256.0f)); \
-	__m256 mlight_lo = _mm256_set_ps(1.0f, light3 * (1.0f/256.0f), light3 * (1.0f/256.0f), light3 * (1.0f/256.0f), 1.0f, light2 * (1.0f/256.0f), light2 * (1.0f/256.0f), light2 * (1.0f/256.0f)); \
-	mlight_hi = _mm256_mul_ps(mlight_hi, mlight_hi); \
-	mlight_lo = _mm256_mul_ps(mlight_lo, mlight_lo); \
-	__m256 mrcp_255 = _mm256_set1_ps(1.0f/255.0f); \
-	__m256 m255 = _mm256_set1_ps(255.0f);
-
-// Simple shade 4 pixels with gamma correction
-#define AVX_LINEAR_SHADE_SIMPLE(fg) { \
-	__m256i fg_16 = _mm256_set_m128i(_mm_unpackhi_epi8(fg, _mm_setzero_si128()), _mm_unpacklo_epi8(fg, _mm_setzero_si128())); \
-	__m256 fg_hi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(fg_16, _mm256_setzero_si256())); \
-	__m256 fg_lo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi16(fg_16, _mm256_setzero_si256())); \
-	fg_hi = _mm256_mul_ps(fg_hi, mrcp_255); \
-	fg_hi = _mm256_mul_ps(fg_hi, fg_hi); \
-	fg_hi = _mm256_mul_ps(fg_hi, mlight_hi); \
-	fg_hi = _mm256_sqrt_ps(fg_hi); \
-	fg_hi = _mm256_mul_ps(fg_hi, m255); \
-	fg_lo = _mm256_mul_ps(fg_lo, mrcp_255); \
-	fg_lo = _mm256_mul_ps(fg_lo, fg_lo); \
-	fg_lo = _mm256_mul_ps(fg_lo, mlight_lo); \
-	fg_lo = _mm256_sqrt_ps(fg_lo); \
-	fg_lo = _mm256_mul_ps(fg_lo, m255); \
-	fg_16 = _mm256_packus_epi32(_mm256_cvtps_epi32(fg_lo), _mm256_cvtps_epi32(fg_hi)); \
-	fg = _mm_packus_epi16(_mm256_extractf128_si256(fg_16, 0), _mm256_extractf128_si256(fg_16, 1)); \
-}
-
-// Calculate constants for a complex shade with gamma correction
-#define AVX_LINEAR_SHADE_INIT(light, shade_constants) \
-	__m256 mlight_hi = _mm256_set_ps(1.0f, light * (1.0f/256.0f), light * (1.0f/256.0f), light * (1.0f/256.0f), 1.0f, light * (1.0f/256.0f), light * (1.0f/256.0f), light * (1.0f/256.0f)); \
-	mlight_hi = _mm256_mul_ps(mlight_hi, mlight_hi); \
-	__m256 mlight_lo = mlight_hi; \
-	__m256 mrcp_255 = _mm256_set1_ps(1.0f/255.0f); \
-	__m256 m255 = _mm256_set1_ps(255.0f); \
-	__m256 color = _mm256_set_ps( \
-		1.0f, shade_constants.light_red * (1.0f/256.0f), shade_constants.light_green * (1.0f/256.0f), shade_constants.light_blue * (1.0f/256.0f), \
-		1.0f, shade_constants.light_red * (1.0f/256.0f), shade_constants.light_green * (1.0f/256.0f), shade_constants.light_blue * (1.0f/256.0f)); \
-	__m256 fade = _mm256_set_ps( \
-		0.0f, shade_constants.fade_red * (1.0f/256.0f), shade_constants.fade_green * (1.0f/256.0f), shade_constants.fade_blue * (1.0f/256.0f), \
-		0.0f, shade_constants.fade_red * (1.0f/256.0f), shade_constants.fade_green * (1.0f/256.0f), shade_constants.fade_blue * (1.0f/256.0f)); \
-	__m256 fade_amount_hi = _mm256_mul_ps(fade, _mm256_sub_ps(_mm256_set1_ps(1.0f), mlight_hi)); \
-	__m256 fade_amount_lo = _mm256_mul_ps(fade, _mm256_sub_ps(_mm256_set1_ps(1.0f), mlight_lo)); \
-	__m256 inv_desaturate = _mm256_set1_ps((256 - shade_constants.desaturate) * (1.0f/256.0f)); \
-	__m128 ss_desaturate = _mm_set_ss(shade_constants.desaturate * (1.0f/256.0f)); \
-	__m128 intensity_weight = _mm_set_ps(0.0f, 77.0f/256.0f, 143.0f/256.0f, 37.0f/256.0f);
-
-// Calculate constants for a complex shade with different light levels for each pixel and gamma correction
-#define AVX_LINEAR_SHADE_INIT4(light3, light2, light1, light0, shade_constants) \
-	__m256 mlight_hi = _mm256_set_ps(1.0f, light1 * (1.0f/256.0f), light1 * (1.0f/256.0f), light1 * (1.0f/256.0f), 1.0f, light0 * (1.0f/256.0f), light0 * (1.0f/256.0f), light0 * (1.0f/256.0f)); \
-	__m256 mlight_lo = _mm256_set_ps(1.0f, light3 * (1.0f/256.0f), light3 * (1.0f/256.0f), light3 * (1.0f/256.0f), 1.0f, light2 * (1.0f/256.0f), light2 * (1.0f/256.0f), light2 * (1.0f/256.0f)); \
-	mlight_hi = _mm256_mul_ps(mlight_hi, mlight_hi); \
-	mlight_lo = _mm256_mul_ps(mlight_lo, mlight_lo); \
-	__m256 mrcp_255 = _mm256_set1_ps(1.0f/255.0f); \
-	__m256 m255 = _mm256_set1_ps(255.0f); \
-	__m256 color = _mm256_set_ps( \
-		1.0f, shade_constants.light_red * (1.0f/256.0f), shade_constants.light_green * (1.0f/256.0f), shade_constants.light_blue * (1.0f/256.0f), \
-		1.0f, shade_constants.light_red * (1.0f/256.0f), shade_constants.light_green * (1.0f/256.0f), shade_constants.light_blue * (1.0f/256.0f)); \
-	__m256 fade = _mm256_set_ps( \
-		0.0f, shade_constants.fade_red * (1.0f/256.0f), shade_constants.fade_green * (1.0f/256.0f), shade_constants.fade_blue * (1.0f/256.0f), \
-		0.0f, shade_constants.fade_red * (1.0f/256.0f), shade_constants.fade_green * (1.0f/256.0f), shade_constants.fade_blue * (1.0f/256.0f)); \
-	__m256 fade_amount_hi = _mm256_mul_ps(fade, _mm256_sub_ps(_mm256_set1_ps(1.0f), mlight_hi)); \
-	__m256 fade_amount_lo = _mm256_mul_ps(fade, _mm256_sub_ps(_mm256_set1_ps(1.0f), mlight_lo)); \
-	__m256 inv_desaturate = _mm256_set1_ps((256 - shade_constants.desaturate) * (1.0f/256.0f)); \
-	__m128 ss_desaturate = _mm_set_ss(shade_constants.desaturate * (1.0f/256.0f)); \
-	__m128 intensity_weight = _mm_set_ps(0.0f, 77.0f/256.0f, 143.0f/256.0f, 37.0f/256.0f);
-
-// Complex shade 4 pixels with gamma correction
-#define AVX_LINEAR_SHADE(fg, shade_constants) { \
-	__m256i fg_16 = _mm256_set_m128i(_mm_unpackhi_epi8(fg, _mm_setzero_si128()), _mm_unpacklo_epi8(fg, _mm_setzero_si128())); \
-	__m256 fg_hi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(fg_16, _mm256_setzero_si256())); \
-	__m256 fg_lo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi16(fg_16, _mm256_setzero_si256())); \
-	fg_hi = _mm256_mul_ps(fg_hi, mrcp_255); \
-	fg_hi = _mm256_mul_ps(fg_hi, fg_hi); \
-	fg_lo = _mm256_mul_ps(fg_lo, mrcp_255); \
-	fg_lo = _mm256_mul_ps(fg_lo, fg_lo); \
-	 \
-	__m128 intensity_hi0 = _mm_mul_ps(_mm256_extractf128_ps(fg_hi, 0), intensity_weight); \
-	__m128 intensity_hi1 = _mm_mul_ps(_mm256_extractf128_ps(fg_hi, 1), intensity_weight); \
-	intensity_hi0 = _mm_mul_ss(_mm_add_ss(_mm_add_ss(intensity_hi0, _mm_shuffle_ps(intensity_hi0, intensity_hi0, _MM_SHUFFLE(1,1,1,1))), _mm_shuffle_ps(intensity_hi0, intensity_hi0, _MM_SHUFFLE(2,2,2,2))), ss_desaturate); \
-	intensity_hi0 = _mm_shuffle_ps(intensity_hi0, intensity_hi0, _MM_SHUFFLE(0,0,0,0)); \
-	intensity_hi1 = _mm_mul_ss(_mm_add_ss(_mm_add_ss(intensity_hi1, _mm_shuffle_ps(intensity_hi1, intensity_hi1, _MM_SHUFFLE(1,1,1,1))), _mm_shuffle_ps(intensity_hi1, intensity_hi1, _MM_SHUFFLE(2,2,2,2))), ss_desaturate); \
-	intensity_hi1 = _mm_shuffle_ps(intensity_hi1, intensity_hi1, _MM_SHUFFLE(0,0,0,0)); \
-	__m256 intensity_hi = _mm256_set_m128(intensity_hi1, intensity_hi0); \
-	 \
-	fg_hi = _mm256_add_ps(_mm256_mul_ps(fg_hi, inv_desaturate), intensity_hi); \
-	fg_hi = _mm256_add_ps(_mm256_mul_ps(fg_hi, mlight_hi), fade_amount_hi); \
-	fg_hi = _mm256_mul_ps(fg_hi, color); \
-	 \
-	__m128 intensity_lo0 = _mm_mul_ps(_mm256_extractf128_ps(fg_lo, 0), intensity_weight); \
-	__m128 intensity_lo1 = _mm_mul_ps(_mm256_extractf128_ps(fg_lo, 1), intensity_weight); \
-	intensity_lo0 = _mm_mul_ss(_mm_add_ss(_mm_add_ss(intensity_lo0, _mm_shuffle_ps(intensity_lo0, intensity_lo0, _MM_SHUFFLE(1,1,1,1))), _mm_shuffle_ps(intensity_lo0, intensity_lo0, _MM_SHUFFLE(2,2,2,2))), ss_desaturate); \
-	intensity_lo0 = _mm_shuffle_ps(intensity_lo0, intensity_lo0, _MM_SHUFFLE(0,0,0,0)); \
-	intensity_lo1 = _mm_mul_ss(_mm_add_ss(_mm_add_ss(intensity_lo1, _mm_shuffle_ps(intensity_lo1, intensity_lo1, _MM_SHUFFLE(1,1,1,1))), _mm_shuffle_ps(intensity_lo1, intensity_lo1, _MM_SHUFFLE(2,2,2,2))), ss_desaturate); \
-	intensity_lo1 = _mm_shuffle_ps(intensity_lo1, intensity_lo1, _MM_SHUFFLE(0,0,0,0)); \
-	__m256 intensity_lo = _mm256_set_m128(intensity_lo1, intensity_lo0); \
-	 \
-	fg_lo = _mm256_add_ps(_mm256_mul_ps(fg_lo, inv_desaturate), intensity_lo); \
-	fg_lo = _mm256_add_ps(_mm256_mul_ps(fg_lo, mlight_lo), fade_amount_lo); \
-	fg_lo = _mm256_mul_ps(fg_lo, color); \
-	 \
-	fg_hi = _mm256_sqrt_ps(fg_hi); \
-	fg_hi = _mm256_mul_ps(fg_hi, m255); \
-	fg_lo = _mm256_sqrt_ps(fg_lo); \
-	fg_lo = _mm256_mul_ps(fg_lo, m255); \
-	fg_16 = _mm256_packus_epi32(_mm256_cvtps_epi32(fg_lo), _mm256_cvtps_epi32(fg_hi)); \
-	fg = _mm_packus_epi16(_mm256_extractf128_si256(fg_16, 0), _mm256_extractf128_si256(fg_16, 1)); \
-}
-
-/*
-// Complex shade 8 pixels
-#define AVX_SHADE(fg, shade_constants) { \
-	__m256i fg_hi = _mm256_unpackhi_epi8(fg, _mm256_setzero_si256()); \
-	__m256i fg_lo = _mm256_unpacklo_epi8(fg, _mm256_setzero_si256()); \
-	 \
-	__m256i intensity_hi = _mm256_mullo_epi16(fg_hi, _mm256_set_epi16(0, 77, 143, 37, 0, 77, 143, 37, 0, 77, 143, 37, 0, 77, 143, 37)); \
-	__m256i intensity_lo = _mm256_mullo_epi16(fg_lo, _mm256_set_epi16(0, 77, 143, 37, 0, 77, 143, 37, 0, 77, 143, 37, 0, 77, 143, 37)); \
-	__m256i intensity = _mm256_mullo_epi16(_mm256_srli_epi16(_mm256_hadd_epi16(_mm256_hadd_epi16(intensity_lo, intensity_hi), _mm256_setzero_si256()), 8), desaturate); \
-	intensity = _mm256_unpacklo_epi16(intensity, intensity); \
-	intensity_hi = _mm256_unpackhi_epi32(intensity, intensity); \
-	intensity_lo = _mm256_unpacklo_epi32(intensity, intensity); \
-	 \
-	fg_hi = _mm256_srli_epi16(_mm256_adds_epu16(_mm256_mullo_epi16(fg_hi, inv_desaturate), intensity_hi), 8); \
-	fg_hi = _mm256_srli_epi16(_mm256_adds_epu16(_mm256_mullo_epi16(fg_hi, mlight), fade_amount), 8); \
-	fg_hi = _mm256_srli_epi16(_mm256_mullo_epi16(fg_hi, color), 8); \
-	 \
-	fg_lo = _mm256_srli_epi16(_mm256_adds_epu16(_mm256_mullo_epi16(fg_lo, inv_desaturate), intensity_lo), 8); \
-	fg_lo = _mm256_srli_epi16(_mm256_adds_epu16(_mm256_mullo_epi16(fg_lo, mlight), fade_amount), 8); \
-	fg_lo = _mm256_srli_epi16(_mm256_mullo_epi16(fg_lo, color), 8); \
-	 \
-	fg = _mm256_packus_epi16(fg_lo, fg_hi); \
-}
-*/
-
-// Normal premultiplied alpha blend using the alpha from fg
-#define VEC_ALPHA_BLEND(fg,bg) { \
-	__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); \
-	__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); \
-	__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); \
-	__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); \
-	__m128i m256 = _mm_set1_epi16(256); \
-	__m128i alpha_hi = _mm_shufflehi_epi16(_mm_shufflelo_epi16(fg_hi, _MM_SHUFFLE(3,3,3,3)), _MM_SHUFFLE(3,3,3,3)); \
-	__m128i alpha_lo = _mm_shufflehi_epi16(_mm_shufflelo_epi16(fg_lo, _MM_SHUFFLE(3,3,3,3)), _MM_SHUFFLE(3,3,3,3)); \
-	alpha_hi = _mm_add_epi16(alpha_hi, _mm_srli_epi16(alpha_hi, 7)); \
-	alpha_lo = _mm_add_epi16(alpha_lo, _mm_srli_epi16(alpha_lo, 7)); \
-	__m128i inv_alpha_hi = _mm_sub_epi16(m256, alpha_hi); \
-	__m128i inv_alpha_lo = _mm_sub_epi16(m256, alpha_lo); \
-	fg_hi = _mm_mullo_epi16(fg_hi, alpha_hi); \
-	fg_hi = _mm_srli_epi16(fg_hi, 8); \
-	fg_lo = _mm_mullo_epi16(fg_lo, alpha_lo); \
-	fg_lo = _mm_srli_epi16(fg_lo, 8); \
-	fg = _mm_packus_epi16(fg_lo, fg_hi); \
-	bg_hi = _mm_mullo_epi16(bg_hi, inv_alpha_hi); \
-	bg_hi = _mm_srli_epi16(bg_hi, 8); \
-	bg_lo = _mm_mullo_epi16(bg_lo, inv_alpha_lo); \
-	bg_lo = _mm_srli_epi16(bg_lo, 8); \
-	bg = _mm_packus_epi16(bg_lo, bg_hi); \
-	fg = _mm_adds_epu8(fg, bg); \
-}
-
-// Calculates the final alpha values to be used when combined with the source texture alpha channel
-FORCEINLINE uint32_t calc_blend_bgalpha(uint32_t fg, uint32_t dest_alpha)
-{
-	uint32_t alpha = fg >> 24;
-	alpha += alpha >> 7;
-	uint32_t inv_alpha = 256 - alpha;
-	return (dest_alpha * alpha + 256 * inv_alpha + 128) >> 8;
-}
-
-#define VEC_CALC_BLEND_ALPHA_VARS() __m128i msrc_alpha, mdest_alpha, m256, m255, m128;
-
-#define VEC_CALC_BLEND_ALPHA_INIT(src_alpha, dest_alpha) \
-	msrc_alpha = _mm_set1_epi16(src_alpha); \
-	mdest_alpha = _mm_set1_epi16(dest_alpha * 255 / 256); \
-	m256 = _mm_set1_epi16(256); \
-	m255 = _mm_set1_epi16(255); \
-	m128 = _mm_set1_epi16(128);
-
-// Calculates the final alpha values to be used when combined with the source texture alpha channel
-#define VEC_CALC_BLEND_ALPHA(fg) \
-	__m128i fg_alpha_hi, fg_alpha_lo, bg_alpha_hi, bg_alpha_lo; { \
-		__m128i alpha_hi = _mm_shufflehi_epi16(_mm_shufflelo_epi16(_mm_unpackhi_epi8(fg, _mm_setzero_si128()), _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)); \
-		__m128i alpha_lo = _mm_shufflehi_epi16(_mm_shufflelo_epi16(_mm_unpacklo_epi8(fg, _mm_setzero_si128()), _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)); \
-		alpha_hi = _mm_add_epi16(alpha_hi, _mm_srli_epi16(alpha_hi, 7)); \
-		alpha_lo = _mm_add_epi16(alpha_lo, _mm_srli_epi16(alpha_lo, 7)); \
-		bg_alpha_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_adds_epu16(_mm_mullo_epi16(mdest_alpha, alpha_hi), _mm_mullo_epi16(m255, _mm_sub_epi16(m256, alpha_hi))), m128), 8); \
-		bg_alpha_hi = _mm_add_epi16(bg_alpha_hi, _mm_srli_epi16(bg_alpha_hi, 7)); \
-		bg_alpha_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_adds_epu16(_mm_mullo_epi16(mdest_alpha, alpha_lo), _mm_mullo_epi16(m255, _mm_sub_epi16(m256, alpha_lo))), m128), 8); \
-		bg_alpha_lo = _mm_add_epi16(bg_alpha_lo, _mm_srli_epi16(bg_alpha_lo, 7)); \
-		fg_alpha_hi = msrc_alpha; \
-		fg_alpha_lo = msrc_alpha; \
-	}
-
-#define SSE_SHADE_VARS() __m128i mlight_hi, mlight_lo, color, fade, fade_amount_hi, fade_amount_lo, inv_desaturate;
-
-// Calculate constants for a simple shade
-#define SSE_SHADE_SIMPLE_INIT(light) \
-	mlight_hi = _mm_set_epi16(256, light, light, light, 256, light, light, light); \
-	mlight_lo = mlight_hi;
-
-// Calculate constants for a simple shade with different light levels for each pixel
-#define SSE_SHADE_SIMPLE_INIT4(light3, light2, light1, light0) \
-	mlight_hi = _mm_set_epi16(256, light1, light1, light1, 256, light0, light0, light0); \
-	mlight_lo = _mm_set_epi16(256, light3, light3, light3, 256, light2, light2, light2);
-
-// Simple shade 4 pixels
-#define SSE_SHADE_SIMPLE(fg) { \
-	__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); \
-	__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); \
-	fg_hi = _mm_mullo_epi16(fg_hi, mlight_hi); \
-	fg_hi = _mm_srli_epi16(fg_hi, 8); \
-	fg_lo = _mm_mullo_epi16(fg_lo, mlight_lo); \
-	fg_lo = _mm_srli_epi16(fg_lo, 8); \
-	fg = _mm_packus_epi16(fg_lo, fg_hi); \
-}
-
-// Calculate constants for a complex shade
-#define SSE_SHADE_INIT(light, shade_constants) \
-	mlight_hi = _mm_set_epi16(256, light, light, light, 256, light, light, light); \
-	mlight_lo = mlight_hi; \
-	color = _mm_set_epi16( \
-		256, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, \
-		256, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); \
-	fade = _mm_set_epi16( \
-		0, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, \
-		0, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); \
-	fade_amount_hi = _mm_mullo_epi16(fade, _mm_subs_epu16(_mm_set1_epi16(256), mlight_hi)); \
-	fade_amount_lo = fade_amount_hi; \
-	inv_desaturate = _mm_set1_epi16(256 - shade_constants.desaturate); \
-
-// Calculate constants for a complex shade with different light levels for each pixel
-#define SSE_SHADE_INIT4(light3, light2, light1, light0, shade_constants) \
-	mlight_hi = _mm_set_epi16(256, light1, light1, light1, 256, light0, light0, light0); \
-	mlight_lo = _mm_set_epi16(256, light3, light3, light3, 256, light2, light2, light2); \
-	color = _mm_set_epi16( \
-		256, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, \
-		256, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); \
-	fade = _mm_set_epi16( \
-		0, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, \
-		0, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); \
-	fade_amount_hi = _mm_mullo_epi16(fade, _mm_subs_epu16(_mm_set1_epi16(256), mlight_hi)); \
-	fade_amount_lo = _mm_mullo_epi16(fade, _mm_subs_epu16(_mm_set1_epi16(256), mlight_lo)); \
-	inv_desaturate = _mm_set1_epi16(256 - shade_constants.desaturate); \
-
-// Complex shade 4 pixels
-#define SSE_SHADE(fg, shade_constants) { \
-	__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); \
-	__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); \
-	 \
-	__m128i intensity_hi = _mm_mullo_epi16(fg_hi, _mm_set_epi16(0, 77, 143, 37, 0, 77, 143, 37)); \
-	uint16_t intensity_hi0 = ((_mm_extract_epi16(intensity_hi, 2) + _mm_extract_epi16(intensity_hi, 1) + _mm_extract_epi16(intensity_hi, 0)) >> 8) * shade_constants.desaturate; \
-	uint16_t intensity_hi1 = ((_mm_extract_epi16(intensity_hi, 6) + _mm_extract_epi16(intensity_hi, 5) + _mm_extract_epi16(intensity_hi, 4)) >> 8) * shade_constants.desaturate; \
-	intensity_hi = _mm_set_epi16(intensity_hi1, intensity_hi1, intensity_hi1, intensity_hi1, intensity_hi0, intensity_hi0, intensity_hi0, intensity_hi0); \
-	 \
-	fg_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, inv_desaturate), intensity_hi), 8); \
-	fg_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, mlight_hi), fade_amount_hi), 8); \
-	fg_hi = _mm_srli_epi16(_mm_mullo_epi16(fg_hi, color), 8); \
-	 \
-	__m128i intensity_lo = _mm_mullo_epi16(fg_lo, _mm_set_epi16(0, 77, 143, 37, 0, 77, 143, 37)); \
-	uint16_t intensity_lo0 = ((_mm_extract_epi16(intensity_lo, 2) + _mm_extract_epi16(intensity_lo, 1) + _mm_extract_epi16(intensity_lo, 0)) >> 8) * shade_constants.desaturate; \
-	uint16_t intensity_lo1 = ((_mm_extract_epi16(intensity_lo, 6) + _mm_extract_epi16(intensity_lo, 5) + _mm_extract_epi16(intensity_lo, 4)) >> 8) * shade_constants.desaturate; \
-	intensity_lo = _mm_set_epi16(intensity_lo1, intensity_lo1, intensity_lo1, intensity_lo1, intensity_lo0, intensity_lo0, intensity_lo0, intensity_lo0); \
-	 \
-	fg_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, inv_desaturate), intensity_lo), 8); \
-	fg_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, mlight_lo), fade_amount_lo), 8); \
-	fg_lo = _mm_srli_epi16(_mm_mullo_epi16(fg_lo, color), 8); \
-	 \
-	fg = _mm_packus_epi16(fg_lo, fg_hi); \
-}
-
 #endif
--- a/src/r_drawt_rgba.cpp
+++ b/src/r_drawt_rgba.cpp
@ -44,50 +44,11 @@
 #include "v_video.h"
 #include "r_draw_rgba.h"
 #include "r_compiler/llvmdrawers.h"
-#ifndef NO_SSE
-#include <emmintrin.h>
-#endif

 extern unsigned int dc_tspans[4][MAXHEIGHT];
 extern unsigned int *dc_ctspan[4];
 extern unsigned int *horizspan[4];

-#ifndef NO_SSE
-
-#ifdef _MSC_VER
-#pragma warning(disable: 4101) // warning C4101: unreferenced local variable
-#endif
-
-// Generate SSE drawers:
-#define VecCommand(name) name##_SSE_Command
-#define VEC_SHADE_VARS SSE_SHADE_VARS
-#define VEC_SHADE_SIMPLE_INIT SSE_SHADE_SIMPLE_INIT
-#define VEC_SHADE_SIMPLE_INIT4 SSE_SHADE_SIMPLE_INIT4
-#define VEC_SHADE_SIMPLE SSE_SHADE_SIMPLE
-#define VEC_SHADE_INIT SSE_SHADE_INIT
-#define VEC_SHADE_INIT4 SSE_SHADE_INIT4
-#define VEC_SHADE SSE_SHADE
-#include "r_drawt_rgba_sse.h"
-/*
-// Generate AVX drawers:
-#undef VecCommand
-#undef VEC_SHADE_SIMPLE_INIT
-#undef VEC_SHADE_SIMPLE_INIT4
-#undef VEC_SHADE_SIMPLE
-#undef VEC_SHADE_INIT
-#undef VEC_SHADE_INIT4
-#undef VEC_SHADE
-#define VecCommand(name) name##_AVX_Command
-#define VEC_SHADE_SIMPLE_INIT AVX_LINEAR_SHADE_SIMPLE_INIT
-#define VEC_SHADE_SIMPLE_INIT4 AVX_LINEAR_SHADE_SIMPLE_INIT4
-#define VEC_SHADE_SIMPLE AVX_LINEAR_SHADE_SIMPLE
-#define VEC_SHADE_INIT AVX_LINEAR_SHADE_INIT
-#define VEC_SHADE_INIT4 AVX_LINEAR_SHADE_INIT4
-#define VEC_SHADE AVX_LINEAR_SHADE
-#include "r_drawt_rgba_sse.h"
-*/
-#endif
-
 /////////////////////////////////////////////////////////////////////////////

 class DrawColumnRt1LLVMCommand : public DrawerCommand
@ -173,413 +134,6 @@ DECLARE_DRAW_COMMAND(DrawColumnRt4RevSubClamp, DrawColumnRt4RevSubClamp, DrawCol

 /////////////////////////////////////////////////////////////////////////////

-class DrawerRt1colCommand : public DrawerCommand
-{
-public:
-	int hx;
-	int sx;
-	int yl;
-	int yh;
-	BYTE * RESTRICT _destorg;
-	int _pitch;
-
-	uint32_t _light;
-	ShadeConstants _shade_constants;
-	BYTE * RESTRICT _colormap;
-
-	uint32_t _srcalpha;
-	uint32_t _destalpha;
-
-	DrawerRt1colCommand(int hx, int sx, int yl, int yh)
-	{
-		this->hx = hx;
-		this->sx = sx;
-		this->yl = yl;
-		this->yh = yh;
-
-		_destorg = dc_destorg;
-		_pitch = dc_pitch;
-
-		_light = LightBgra::calc_light_multiplier(dc_light);
-		_shade_constants = dc_shade_constants;
-		_colormap = dc_colormap;
-
-		_srcalpha = dc_srcalpha >> (FRACBITS - 8);
-		_destalpha = dc_destalpha >> (FRACBITS - 8);
-	}
-
-	class LoopIterator
-	{
-	public:
-		uint32_t *source;
-		uint32_t *dest;
-		int count;
-		int pitch, sincr;
-
-		LoopIterator(DrawerRt1colCommand *command, DrawerThread *thread)
-		{
-			count = thread->count_for_thread(command->yl, (command->yh - command->yl + 1));
-			if (count <= 0)
-				return;
-
-			dest = thread->dest_for_thread(command->yl, command->_pitch, ylookup[command->yl] + command->sx + (uint32_t*)command->_destorg);
-			source = &thread->dc_temp_rgba[command->yl * 4 + command->hx] + thread->skipped_by_thread(command->yl) * 4;
-			pitch = command->_pitch * thread->num_cores;
-			sincr = thread->num_cores * 4;
-		}
-
-		explicit operator bool()
-		{
-			return count > 0;
-		}
-
-		bool next()
-		{
-			dest += pitch;
-			source += sincr;
-			return (--count) != 0;
-		}
-	};
-};
-
-class DrawerRt4colsCommand : public DrawerCommand
-{
-public:
-	int sx;
-	int yl;
-	int yh;
-	uint32_t _light;
-	ShadeConstants _shade_constants;
-	BYTE * RESTRICT _destorg;
-	int _pitch;
-	BYTE * RESTRICT _colormap;
-	uint32_t _srcalpha;
-	uint32_t _destalpha;
-
-	DrawerRt4colsCommand(int sx, int yl, int yh)
-	{
-		this->sx = sx;
-		this->yl = yl;
-		this->yh = yh;
-
-		_light = LightBgra::calc_light_multiplier(dc_light);
-		_shade_constants = dc_shade_constants;
-		_destorg = dc_destorg;
-		_pitch = dc_pitch;
-		_colormap = dc_colormap;
-
-		_srcalpha = dc_srcalpha >> (FRACBITS - 8);
-		_destalpha = dc_destalpha >> (FRACBITS - 8);
-	}
-
-	class LoopIterator
-	{
-	public:
-		uint32_t *source;
-		uint32_t *dest;
-		int count;
-		int pitch;
-		int sincr;
-
-		LoopIterator(DrawerRt4colsCommand *command, DrawerThread *thread)
-		{
-			count = thread->count_for_thread(command->yl, command->yh - command->yl + 1);
-			if (count <= 0)
-				return;
-
-			dest = thread->dest_for_thread(command->yl, command->_pitch, ylookup[command->yl] + command->sx + (uint32_t*)command->_destorg);
-			source = &thread->dc_temp_rgba[command->yl * 4] + thread->skipped_by_thread(command->yl) * 4;
-			pitch = command->_pitch * thread->num_cores;
-			sincr = thread->num_cores * 4;
-		}
-
-		explicit operator bool()
-		{
-			return count > 0;
-		}
-
-		bool next()
-		{
-			dest += pitch;
-			source += sincr;
-			return (--count) != 0;
-		}
-	};
-};
-
-class RtCopy1colRGBACommand : public DrawerRt1colCommand
-{
-public:
-	RtCopy1colRGBACommand(int hx, int sx, int yl, int yh) : DrawerRt1colCommand(hx, sx, yl, yh)
-	{
-	}
-
-	void Execute(DrawerThread *thread) override
-	{
-		LoopIterator loop(this, thread);
-		if (!loop) return;
-		do
-		{
-			uint32_t fg = GPalette.BaseColors[*loop.source];
-			*loop.dest = BlendBgra::copy(fg);
-		} while (loop.next());
-	}
-};
-
-class RtMap1colRGBACommand : public DrawerRt1colCommand
-{
-public:
-	RtMap1colRGBACommand(int hx, int sx, int yl, int yh) : DrawerRt1colCommand(hx, sx, yl, yh)
-	{
-	}
-
-	void Execute(DrawerThread *thread) override
-	{
-		LoopIterator loop(this, thread);
-		if (!loop) return;
-		do
-		{
-			uint32_t fg = LightBgra::shade_pal_index(_colormap[*loop.source], _light, _shade_constants);
-			*loop.dest = BlendBgra::copy(fg);
-		} while (loop.next());
-	}
-};
-
-class RtMap4colsRGBACommand : public DrawerRt4colsCommand
-{
-public:
-	RtMap4colsRGBACommand(int sx, int yl, int yh) : DrawerRt4colsCommand(sx, yl, yh)
-	{
-	}
-
-	void Execute(DrawerThread *thread) override
-	{
-		LoopIterator loop(this, thread);
-		if (!loop) return;
-		do
-		{
-			for (int i = 0; i < 4; i++)
-			{
-				uint32_t fg = LightBgra::shade_pal_index(_colormap[loop.source[i]], _light, _shade_constants);
-				loop.dest[i] = BlendBgra::copy(fg);
-			}
-		} while (loop.next());
-	}
-};
-
-class RtAdd1colRGBACommand : public DrawerRt1colCommand
-{
-public:
-	RtAdd1colRGBACommand(int hx, int sx, int yl, int yh) : DrawerRt1colCommand(hx, sx, yl, yh)
-	{
-	}
-
-	void Execute(DrawerThread *thread) override
-	{
-		LoopIterator loop(this, thread);
-		if (!loop) return;
-		do
-		{
-			uint32_t fg = LightBgra::shade_pal_index(_colormap[*loop.source], _light, _shade_constants);
-			*loop.dest = BlendBgra::add(fg, *loop.dest, _srcalpha, _destalpha);
-		} while (loop.next());
-	}
-};
-
-class RtAdd4colsRGBACommand : public DrawerRt4colsCommand
-{
-public:
-	RtAdd4colsRGBACommand(int sx, int yl, int yh) : DrawerRt4colsCommand(sx, yl, yh)
-	{
-	}
-
-	void Execute(DrawerThread *thread) override
-	{
-		LoopIterator loop(this, thread);
-		if (!loop) return;
-		do
-		{
-			for (int i = 0; i < 4; i++)
-			{
-				uint32_t fg = LightBgra::shade_pal_index(_colormap[loop.source[i]], _light, _shade_constants);
-				loop.dest[i] = BlendBgra::add(fg, loop.dest[i], _srcalpha, _destalpha);
-			}
-		} while (loop.next());
-	}
-};
-
-class RtShaded1colRGBACommand : public DrawerRt1colCommand
-{
-	uint32_t _color;
-
-public:
-	RtShaded1colRGBACommand(int hx, int sx, int yl, int yh) : DrawerRt1colCommand(hx, sx, yl, yh)
-	{
-		_color = LightBgra::shade_pal_index(dc_color, _light, _shade_constants);
-	}
-
-	void Execute(DrawerThread *thread) override
-	{
-		LoopIterator loop(this, thread);
-		if (!loop) return;
-		do
-		{
-			uint32_t alpha = _colormap[*loop.source] * 4;
-			uint32_t inv_alpha = 256 - alpha;
-			*loop.dest = BlendBgra::add(_color, *loop.dest, alpha, inv_alpha);
-		} while (loop.next());
-	}
-};
-
-class RtShaded4colsRGBACommand : public DrawerRt4colsCommand
-{
-	uint32_t _color;
-
-public:
-	RtShaded4colsRGBACommand(int sx, int yl, int yh) : DrawerRt4colsCommand(sx, yl, yh)
-	{
-		_color = LightBgra::shade_pal_index(dc_color, _light, _shade_constants);
-	}
-
-	void Execute(DrawerThread *thread) override
-	{
-		LoopIterator loop(this, thread);
-		if (!loop) return;
-		do
-		{
-			for (int i = 0; i < 4; i++)
-			{
-				uint32_t alpha = _colormap[loop.source[i]] * 4;
-				uint32_t inv_alpha = 256 - alpha;
-				loop.dest[i] = BlendBgra::add(_color, loop.dest[i], alpha, inv_alpha);
-			}
-		} while (loop.next());
-	}
-};
-
-class RtAddClamp1colRGBACommand : public DrawerRt1colCommand
-{
-public:
-	RtAddClamp1colRGBACommand(int hx, int sx, int yl, int yh) : DrawerRt1colCommand(hx, sx, yl, yh)
-	{
-	}
-
-	void Execute(DrawerThread *thread) override
-	{
-		LoopIterator loop(this, thread);
-		if (!loop) return;
-		do
-		{
-			uint32_t fg = LightBgra::shade_pal_index(*loop.source, _light, _shade_constants);
-			*loop.dest = BlendBgra::add(fg, *loop.dest, _srcalpha, _destalpha);
-		} while (loop.next());
-	}
-};
-
-class RtAddClamp4colsRGBACommand : public DrawerRt4colsCommand
-{
-public:
-	RtAddClamp4colsRGBACommand(int sx, int yl, int yh) : DrawerRt4colsCommand(sx, yl, yh)
-	{
-	}
-
-	void Execute(DrawerThread *thread) override
-	{
-		LoopIterator loop(this, thread);
-		if (!loop) return;
-		do
-		{
-			for (int i = 0; i < 4; i++)
-			{
-				uint32_t fg = LightBgra::shade_pal_index(loop.source[i], _light, _shade_constants);
-				loop.dest[i] = BlendBgra::add(fg, loop.dest[i], _srcalpha, _destalpha);
-			}
-		} while (loop.next());
-	}
-};
-
-class RtSubClamp1colRGBACommand : public DrawerRt1colCommand
-{
-public:
-	RtSubClamp1colRGBACommand(int hx, int sx, int yl, int yh) : DrawerRt1colCommand(hx, sx, yl, yh)
-	{
-	}
-
-	void Execute(DrawerThread *thread) override
-	{
-		LoopIterator loop(this, thread);
-		if (!loop) return;
-		do
-		{
-			uint32_t fg = LightBgra::shade_pal_index(*loop.source, _light, _shade_constants);
-			*loop.dest = BlendBgra::sub(fg, *loop.dest, _srcalpha, _destalpha);
-		} while (loop.next());
-	}
-};
-
-class RtSubClamp4colsRGBACommand : public DrawerRt4colsCommand
-{
-public:
-	RtSubClamp4colsRGBACommand(int sx, int yl, int yh) : DrawerRt4colsCommand(sx, yl, yh)
-	{
-	}
-
-	void Execute(DrawerThread *thread) override
-	{
-		LoopIterator loop(this, thread);
-		if (!loop) return;
-		do
-		{
-			for (int i = 0; i < 4; i++)
-			{
-				uint32_t fg = LightBgra::shade_pal_index(loop.source[i], _light, _shade_constants);
-				loop.dest[i] = BlendBgra::sub(fg, loop.dest[i], _srcalpha, _destalpha);
-			}
-		} while (loop.next());
-	}
-};
-
-class RtRevSubClamp1colRGBACommand : public DrawerRt1colCommand
-{
-public:
-	RtRevSubClamp1colRGBACommand(int hx, int sx, int yl, int yh) : DrawerRt1colCommand(hx, sx, yl, yh)
-	{
-	}
-
-	void Execute(DrawerThread *thread) override
-	{
-		LoopIterator loop(this, thread);
-		if (!loop) return;
-		do
-		{
-			uint32_t fg = LightBgra::shade_pal_index(*loop.source, _light, _shade_constants);
-			*loop.dest = BlendBgra::revsub(fg, *loop.dest, _srcalpha, _destalpha);
-		} while (loop.next());
-	}
-};
-
-class RtRevSubClamp4colsRGBACommand : public DrawerRt4colsCommand
-{
-public:
-	RtRevSubClamp4colsRGBACommand(int sx, int yl, int yh) : DrawerRt4colsCommand(sx, yl, yh)
-	{
-	}
-
-	void Execute(DrawerThread *thread) override
-	{
-		LoopIterator loop(this, thread);
-		if (!loop) return;
-		do
-		{
-			for (int i = 0; i < 4; i++)
-			{
-				uint32_t fg = LightBgra::shade_pal_index(loop.source[i], _light, _shade_constants);
-				loop.dest[i] = BlendBgra::revsub(fg, loop.dest[i], _srcalpha, _destalpha);
-			}
-		} while (loop.next());
-	}
-};
-
 class RtTranslate1colRGBACommand : public DrawerCommand
 {
 	const BYTE * RESTRICT translation;
--- a/src/r_drawt_rgba_sse.h
+++ b/src/r_drawt_rgba_sse.h
@ -1,757 +0,0 @@
-//
-// SSE/AVX intrinsics based drawers for the r_drawt family of drawers.
-//
-// Note: This header file is intentionally not guarded by a __R_DRAWT_RGBA_SSE__ define.
-//       It is because the code is nearly identical for SSE vs AVX. The file is included
-//       multiple times by r_drawt_rgba.cpp with different defines that changes the class
-//       names outputted and the type of intrinsics used.
-
-#ifdef _MSC_VER
-#pragma warning(disable: 4752) // warning C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX
-#endif
-
-class VecCommand(RtMap4colsRGBA) : public DrawerCommand
-{
-	int sx;
-	int yl;
-	int yh;
-	fixed_t _light;
-	ShadeConstants _shade_constants;
-	BYTE * RESTRICT _destorg;
-	int _pitch;
-	BYTE * RESTRICT _colormap;
-
-public:
-	VecCommand(RtMap4colsRGBA)(int sx, int yl, int yh)
-	{
-		this->sx = sx;
-		this->yl = yl;
-		this->yh = yh;
-
-		_light = dc_light;
-		_shade_constants = dc_shade_constants;
-		_destorg = dc_destorg;
-		_pitch = dc_pitch;
-		_colormap = dc_colormap;
-	}
-
-	void Execute(DrawerThread *thread) override
-	{
-		uint32_t *source;
-		uint32_t *dest;
-		int count;
-		int pitch;
-		int sincr;
-
-		count = thread->count_for_thread(yl, yh - yl + 1);
-		if (count <= 0)
-			return;
-
-		ShadeConstants shade_constants = _shade_constants;
-		uint32_t light = LightBgra::calc_light_multiplier(_light);
-		uint32_t *palette = (uint32_t*)GPalette.BaseColors;
-
-		dest = thread->dest_for_thread(yl, _pitch, ylookup[yl] + sx + (uint32_t*)_destorg);
-		source = &thread->dc_temp_rgba[yl * 4] + thread->skipped_by_thread(yl) * 4;
-		pitch = _pitch * thread->num_cores;
-		sincr = thread->num_cores * 4;
-
-		BYTE *colormap = _colormap;
-
-		if (shade_constants.simple_shade)
-		{
-			VEC_SHADE_VARS();
-			VEC_SHADE_SIMPLE_INIT(light);
-
-			if (count & 1) {
-				uint32_t p0 = colormap[source[0]];
-				uint32_t p1 = colormap[source[1]];
-				uint32_t p2 = colormap[source[2]];
-				uint32_t p3 = colormap[source[3]];
-
-				// shade_pal_index:
-				__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
-				VEC_SHADE_SIMPLE(fg);
-				_mm_storeu_si128((__m128i*)dest, fg);
-
-				source += sincr;
-				dest += pitch;
-			}
-			if (!(count >>= 1))
-				return;
-
-			do {
-				// shade_pal_index 0-3
-				{
-					uint32_t p0 = colormap[source[0]];
-					uint32_t p1 = colormap[source[1]];
-					uint32_t p2 = colormap[source[2]];
-					uint32_t p3 = colormap[source[3]];
-
-					__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
-					VEC_SHADE_SIMPLE(fg);
-					_mm_storeu_si128((__m128i*)dest, fg);
-				}
-
-				// shade_pal_index 4-7 (pitch)
-				{
-					uint32_t p0 = colormap[source[sincr]];
-					uint32_t p1 = colormap[source[sincr + 1]];
-					uint32_t p2 = colormap[source[sincr + 2]];
-					uint32_t p3 = colormap[source[sincr + 3]];
-
-					__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
-					VEC_SHADE_SIMPLE(fg);
-					_mm_storeu_si128((__m128i*)(dest + pitch), fg);
-				}
-
-				source += sincr * 2;
-				dest += pitch * 2;
-			} while (--count);
-		}
-		else
-		{
-			VEC_SHADE_VARS();
-			VEC_SHADE_INIT(light, shade_constants);
-
-			if (count & 1) {
-				uint32_t p0 = colormap[source[0]];
-				uint32_t p1 = colormap[source[1]];
-				uint32_t p2 = colormap[source[2]];
-				uint32_t p3 = colormap[source[3]];
-
-				// shade_pal_index:
-				__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
-				VEC_SHADE(fg, shade_constants);
-				_mm_storeu_si128((__m128i*)dest, fg);
-
-				source += sincr;
-				dest += pitch;
-			}
-			if (!(count >>= 1))
-				return;
-
-			do {
-				// shade_pal_index 0-3
-				{
-					uint32_t p0 = colormap[source[0]];
-					uint32_t p1 = colormap[source[1]];
-					uint32_t p2 = colormap[source[2]];
-					uint32_t p3 = colormap[source[3]];
-
-					__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
-					VEC_SHADE(fg, shade_constants);
-					_mm_storeu_si128((__m128i*)dest, fg);
-				}
-
-				// shade_pal_index 4-7 (pitch)
-				{
-					uint32_t p0 = colormap[source[sincr]];
-					uint32_t p1 = colormap[source[sincr + 1]];
-					uint32_t p2 = colormap[source[sincr + 2]];
-					uint32_t p3 = colormap[source[sincr + 3]];
-
-					__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
-					VEC_SHADE(fg, shade_constants);
-					_mm_storeu_si128((__m128i*)(dest + pitch), fg);
-				}
-
-				source += sincr * 2;
-				dest += pitch * 2;
-			} while (--count);
-		}
-	}
-};
-
-class VecCommand(RtAdd4colsRGBA) : public DrawerCommand
-{
-	int sx;
-	int yl;
-	int yh;
-	BYTE * RESTRICT _destorg;
-	int _pitch;
-	fixed_t _light;
-	ShadeConstants _shade_constants;
-	BYTE * RESTRICT _colormap;
-	fixed_t _srcalpha;
-	fixed_t _destalpha;
-
-public:
-	VecCommand(RtAdd4colsRGBA)(int sx, int yl, int yh)
-	{
-		this->sx = sx;
-		this->yl = yl;
-		this->yh = yh;
-
-		_destorg = dc_destorg;
-		_pitch = dc_pitch;
-		_light = dc_light;
-		_shade_constants = dc_shade_constants;
-		_colormap = dc_colormap;
-		_srcalpha = dc_srcalpha;
-		_destalpha = dc_destalpha;
-	}
-
-	void Execute(DrawerThread *thread) override
-	{
-		uint32_t *source;
-		uint32_t *dest;
-		int count;
-		int pitch;
-		int sincr;
-
-		count = thread->count_for_thread(yl, yh - yl + 1);
-		if (count <= 0)
-			return;
-
-		dest = thread->dest_for_thread(yl, _pitch, ylookup[yl] + sx + (uint32_t*)_destorg);
-		source = &thread->dc_temp_rgba[yl * 4] + thread->skipped_by_thread(yl) * 4;
-		pitch = _pitch * thread->num_cores;
-		sincr = 4 * thread->num_cores;
-
-		uint32_t light = LightBgra::calc_light_multiplier(_light);
-		uint32_t *palette = (uint32_t*)GPalette.BaseColors;
-		BYTE *colormap = _colormap;
-
-		uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8);
-		uint32_t bg_alpha = _destalpha >> (FRACBITS - 8);
-
-		ShadeConstants shade_constants = _shade_constants;
-
-		if (shade_constants.simple_shade)
-		{
-			VEC_SHADE_VARS();
-			VEC_SHADE_SIMPLE_INIT(light);
-
-			__m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha);
-			__m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha);
-
-			do {
-				uint32_t p0 = colormap[source[0]];
-				uint32_t p1 = colormap[source[1]];
-				uint32_t p2 = colormap[source[2]];
-				uint32_t p3 = colormap[source[3]];
-
-				// shade_pal_index:
-				__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
-				VEC_SHADE_SIMPLE(fg);
-
-				__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
-				__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
-
-				// unpack bg:
-				__m128i bg = _mm_loadu_si128((const __m128i*)dest);
-				__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
-				__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
-
-				// (fg_red * fg_alpha + bg_red * bg_alpha) / 256:
-				__m128i color_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, mfg_alpha), _mm_mullo_epi16(bg_hi, mbg_alpha)), 8);
-				__m128i color_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, mfg_alpha), _mm_mullo_epi16(bg_lo, mbg_alpha)), 8);
-
-				__m128i color = _mm_packus_epi16(color_lo, color_hi);
-				_mm_storeu_si128((__m128i*)dest, color);
-
-				source += sincr;
-				dest += pitch;
-			} while (--count);
-		}
-		else
-		{
-			VEC_SHADE_VARS();
-			VEC_SHADE_INIT(light, shade_constants);
-
-			__m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha);
-			__m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha);
-
-			do {
-				uint32_t p0 = colormap[source[0]];
-				uint32_t p1 = colormap[source[1]];
-				uint32_t p2 = colormap[source[2]];
-				uint32_t p3 = colormap[source[3]];
-
-				// shade_pal_index:
-				__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
-				VEC_SHADE(fg, shade_constants);
-
-				__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
-				__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
-
-				// unpack bg:
-				__m128i bg = _mm_loadu_si128((const __m128i*)dest);
-				__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
-				__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
-
-				// (fg_red * fg_alpha + bg_red * bg_alpha) / 256:
-				__m128i color_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, mfg_alpha), _mm_mullo_epi16(bg_hi, mbg_alpha)), 8);
-				__m128i color_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, mfg_alpha), _mm_mullo_epi16(bg_lo, mbg_alpha)), 8);
-
-				__m128i color = _mm_packus_epi16(color_lo, color_hi);
-				_mm_storeu_si128((__m128i*)dest, color);
-
-				source += sincr;
-				dest += pitch;
-			} while (--count);
-		}
-	}
-};
-
-class VecCommand(RtShaded4colsRGBA) : public DrawerCommand
-{
-	int sx;
-	int yl;
-	int yh;
-	lighttable_t * RESTRICT _colormap;
-	int _color;
-	BYTE * RESTRICT _destorg;
-	int _pitch;
-	fixed_t _light;
-
-public:
-	VecCommand(RtShaded4colsRGBA)(int sx, int yl, int yh)
-	{
-		this->sx = sx;
-		this->yl = yl;
-		this->yh = yh;
-
-		_colormap = dc_colormap;
-		_color = dc_color;
-		_destorg = dc_destorg;
-		_pitch = dc_pitch;
-		_light = dc_light;
-	}
-
-	void Execute(DrawerThread *thread) override
-	{
-		BYTE *colormap;
-		uint32_t *source;
-		uint32_t *dest;
-		int count;
-		int pitch;
-		int sincr;
-
-		count = thread->count_for_thread(yl, yh - yl + 1);
-		if (count <= 0)
-			return;
-
-		colormap = _colormap;
-		dest = thread->dest_for_thread(yl, _pitch, ylookup[yl] + sx + (uint32_t*)_destorg);
-		source = &thread->dc_temp_rgba[yl * 4] + thread->skipped_by_thread(yl) * 4;
-		pitch = _pitch * thread->num_cores;
-		sincr = 4 * thread->num_cores;
-
-		__m128i fg = _mm_unpackhi_epi8(_mm_set1_epi32(LightBgra::shade_pal_index_simple(_color, LightBgra::calc_light_multiplier(_light))), _mm_setzero_si128());
-		__m128i alpha_one = _mm_set1_epi16(64);
-
-		do {
-			uint32_t p0 = colormap[source[0]];
-			uint32_t p1 = colormap[source[1]];
-			uint32_t p2 = colormap[source[2]];
-			uint32_t p3 = colormap[source[3]];
-
-			__m128i alpha_hi = _mm_set_epi16(64, p3, p3, p3, 64, p2, p2, p2);
-			__m128i alpha_lo = _mm_set_epi16(64, p1, p1, p1, 64, p0, p0, p0);
-			__m128i inv_alpha_hi = _mm_subs_epu16(alpha_one, alpha_hi);
-			__m128i inv_alpha_lo = _mm_subs_epu16(alpha_one, alpha_lo);
-
-			// unpack bg:
-			__m128i bg = _mm_loadu_si128((const __m128i*)dest);
-			__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
-			__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
-
-			// (fg_red * alpha + bg_red * inv_alpha) / 64:
-			__m128i color_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg, alpha_hi), _mm_mullo_epi16(bg_hi, inv_alpha_hi)), 6);
-			__m128i color_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg, alpha_lo), _mm_mullo_epi16(bg_lo, inv_alpha_lo)), 6);
-
-			__m128i color = _mm_packus_epi16(color_lo, color_hi);
-			_mm_storeu_si128((__m128i*)dest, color);
-
-			source += sincr;
-			dest += pitch;
-		} while (--count);
-	}
-};
-
-class VecCommand(RtAddClamp4colsRGBA) : public DrawerCommand
-{
-	int sx;
-	int yl;
-	int yh;
-	BYTE * RESTRICT _destorg;
-	int _pitch;
-	fixed_t _light;
-	fixed_t _srcalpha;
-	fixed_t _destalpha;
-	ShadeConstants _shade_constants;
-
-public:
-	VecCommand(RtAddClamp4colsRGBA)(int sx, int yl, int yh)
-	{
-		this->sx = sx;
-		this->yl = yl;
-		this->yh = yh;
-
-		_destorg = dc_destorg;
-		_pitch = dc_pitch;
-		_light = dc_light;
-		_srcalpha = dc_srcalpha;
-		_destalpha = dc_destalpha;
-		_shade_constants = dc_shade_constants;
-	}
-
-	void Execute(DrawerThread *thread) override
-	{
-		uint32_t *source;
-		uint32_t *dest;
-		int count;
-		int pitch;
-		int sincr;
-
-		count = thread->count_for_thread(yl, yh - yl + 1);
-		if (count <= 0)
-			return;
-
-		dest = thread->dest_for_thread(yl, _pitch, ylookup[yl] + sx + (uint32_t*)_destorg);
-		source = &thread->dc_temp_rgba[yl * 4] + thread->skipped_by_thread(yl) * 4;
-		pitch = _pitch * thread->num_cores;
-		sincr = 4 * thread->num_cores;
-
-		uint32_t light = LightBgra::calc_light_multiplier(_light);
-		uint32_t *palette = (uint32_t*)GPalette.BaseColors;
-
-		uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8);
-		uint32_t bg_alpha = _destalpha >> (FRACBITS - 8);
-
-		ShadeConstants shade_constants = _shade_constants;
-
-		if (shade_constants.simple_shade)
-		{
-			VEC_SHADE_VARS();
-			VEC_SHADE_SIMPLE_INIT(light);
-
-			__m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha);
-			__m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha);
-
-			do {
-				uint32_t p0 = source[0];
-				uint32_t p1 = source[1];
-				uint32_t p2 = source[2];
-				uint32_t p3 = source[3];
-
-				// shade_pal_index:
-				__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
-				VEC_SHADE_SIMPLE(fg);
-
-				__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
-				__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
-
-				// unpack bg:
-				__m128i bg = _mm_loadu_si128((const __m128i*)dest);
-				__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
-				__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
-
-				// (fg_red * fg_alpha + bg_red * bg_alpha) / 256:
-				__m128i color_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, mfg_alpha), _mm_mullo_epi16(bg_hi, mbg_alpha)), 8);
-				__m128i color_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, mfg_alpha), _mm_mullo_epi16(bg_lo, mbg_alpha)), 8);
-
-				__m128i color = _mm_packus_epi16(color_lo, color_hi);
-				_mm_storeu_si128((__m128i*)dest, color);
-
-				source += sincr;
-				dest += pitch;
-			} while (--count);
-		}
-		else
-		{
-			VEC_SHADE_VARS();
-			VEC_SHADE_INIT(light, shade_constants);
-
-			__m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha);
-			__m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha);
-
-			do {
-				uint32_t p0 = source[0];
-				uint32_t p1 = source[1];
-				uint32_t p2 = source[2];
-				uint32_t p3 = source[3];
-
-				// shade_pal_index:
-				__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
-				VEC_SHADE(fg, shade_constants);
-
-				__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
-				__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
-
-				// unpack bg:
-				__m128i bg = _mm_loadu_si128((const __m128i*)dest);
-				__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
-				__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
-
-				// (fg_red * fg_alpha + bg_red * bg_alpha) / 256:
-				__m128i color_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, mfg_alpha), _mm_mullo_epi16(bg_hi, mbg_alpha)), 8);
-				__m128i color_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, mfg_alpha), _mm_mullo_epi16(bg_lo, mbg_alpha)), 8);
-
-				__m128i color = _mm_packus_epi16(color_lo, color_hi);
-				_mm_storeu_si128((__m128i*)dest, color);
-
-				source += sincr;
-				dest += pitch;
-			} while (--count);
-		}
-	}
-};
-
-class VecCommand(RtSubClamp4colsRGBA) : public DrawerCommand
-{
-	int sx;
-	int yl;
-	int yh;
-	BYTE * RESTRICT _destorg;
-	int _pitch;
-	fixed_t _light;
-	fixed_t _srcalpha;
-	fixed_t _destalpha;
-	ShadeConstants _shade_constants;
-
-public:
-	VecCommand(RtSubClamp4colsRGBA)(int sx, int yl, int yh)
-	{
-		this->sx = sx;
-		this->yl = yl;
-		this->yh = yh;
-
-		_destorg = dc_destorg;
-		_pitch = dc_pitch;
-		_light = dc_light;
-		_srcalpha = dc_srcalpha;
-		_destalpha = dc_destalpha;
-		_shade_constants = dc_shade_constants;
-	}
-
-	void Execute(DrawerThread *thread) override
-	{
-		uint32_t *source;
-		uint32_t *dest;
-		int count;
-		int pitch;
-		int sincr;
-
-		count = thread->count_for_thread(yl, yh - yl + 1);
-		if (count <= 0)
-			return;
-
-		dest = thread->dest_for_thread(yl, _pitch, ylookup[yl] + sx + (uint32_t*)_destorg);
-		source = &thread->dc_temp_rgba[yl * 4] + thread->skipped_by_thread(yl) * 4;
-		pitch = _pitch * thread->num_cores;
-		sincr = 4 * thread->num_cores;
-
-		uint32_t light = LightBgra::calc_light_multiplier(_light);
-		uint32_t *palette = (uint32_t*)GPalette.BaseColors;
-		ShadeConstants shade_constants = _shade_constants;
-
-		uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8);
-		uint32_t bg_alpha = _destalpha >> (FRACBITS - 8);
-
-		if (shade_constants.simple_shade)
-		{
-			VEC_SHADE_VARS();
-			VEC_SHADE_SIMPLE_INIT(light);
-
-			__m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha);
-			__m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha);
-
-			do {
-				uint32_t p0 = source[0];
-				uint32_t p1 = source[1];
-				uint32_t p2 = source[2];
-				uint32_t p3 = source[3];
-
-				// shade_pal_index:
-				__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
-				VEC_SHADE_SIMPLE(fg);
-
-				__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
-				__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
-
-				// unpack bg:
-				__m128i bg = _mm_loadu_si128((const __m128i*)dest);
-				__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
-				__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
-
-				// (bg_red * bg_alpha - fg_red * fg_alpha) / 256:
-				__m128i color_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_hi, mbg_alpha), _mm_mullo_epi16(fg_hi, mfg_alpha)), 8);
-				__m128i color_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_lo, mbg_alpha), _mm_mullo_epi16(fg_lo, mfg_alpha)), 8);
-
-				__m128i color = _mm_packus_epi16(color_lo, color_hi);
-				_mm_storeu_si128((__m128i*)dest, color);
-
-				source += sincr;
-				dest += pitch;
-			} while (--count);
-		}
-		else
-		{
-			VEC_SHADE_VARS();
-			VEC_SHADE_INIT(light, shade_constants);
-
-			__m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha);
-			__m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha);
-
-			do {
-				uint32_t p0 = source[0];
-				uint32_t p1 = source[1];
-				uint32_t p2 = source[2];
-				uint32_t p3 = source[3];
-
-				// shade_pal_index:
-				__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
-				VEC_SHADE(fg, shade_constants);
-
-				__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
-				__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
-
-				// unpack bg:
-				__m128i bg = _mm_loadu_si128((const __m128i*)dest);
-				__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
-				__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
-
-				// (bg_red * bg_alpha - fg_red * fg_alpha) / 256:
-				__m128i color_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_hi, mbg_alpha), _mm_mullo_epi16(fg_hi, mfg_alpha)), 8);
-				__m128i color_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_lo, mbg_alpha), _mm_mullo_epi16(fg_lo, mfg_alpha)), 8);
-
-				__m128i color = _mm_packus_epi16(color_lo, color_hi);
-				_mm_storeu_si128((__m128i*)dest, color);
-
-				source += sincr;
-				dest += pitch;
-			} while (--count);
-		}
-	}
-};
-
-class VecCommand(RtRevSubClamp4colsRGBA) : public DrawerCommand
-{
-	int sx;
-	int yl;
-	int yh;
-	BYTE * RESTRICT _destorg;
-	int _pitch;
-	fixed_t _light;
-	fixed_t _srcalpha;
-	fixed_t _destalpha;
-	ShadeConstants _shade_constants;
-
-public:
-	VecCommand(RtRevSubClamp4colsRGBA)(int sx, int yl, int yh)
-	{
-		this->sx = sx;
-		this->yl = yl;
-		this->yh = yh;
-
-		_destorg = dc_destorg;
-		_pitch = dc_pitch;
-		_light = dc_light;
-		_srcalpha = dc_srcalpha;
-		_destalpha = dc_destalpha;
-		_shade_constants = dc_shade_constants;
-	}
-
-	void Execute(DrawerThread *thread) override
-	{
-		uint32_t *source;
-		uint32_t *dest;
-		int count;
-		int pitch;
-		int sincr;
-
-		count = thread->count_for_thread(yl, yh - yl + 1);
-		if (count <= 0)
-			return;
-
-		dest = thread->dest_for_thread(yl, _pitch, ylookup[yl] + sx + (uint32_t*)_destorg);
-		source = &thread->dc_temp_rgba[yl * 4] + thread->skipped_by_thread(yl) * 4;
-		pitch = _pitch * thread->num_cores;
-		sincr = 4 * thread->num_cores;
-
-		uint32_t light = LightBgra::calc_light_multiplier(_light);
-		uint32_t *palette = (uint32_t*)GPalette.BaseColors;
-		ShadeConstants shade_constants = _shade_constants;
-
-		uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8);
-		uint32_t bg_alpha = _destalpha >> (FRACBITS - 8);
-
-		if (shade_constants.simple_shade)
-		{
-			VEC_SHADE_VARS();
-			VEC_SHADE_SIMPLE_INIT(light);
-
-			__m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha);
-			__m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha);
-
-			do {
-				uint32_t p0 = source[0];
-				uint32_t p1 = source[1];
-				uint32_t p2 = source[2];
-				uint32_t p3 = source[3];
-
-				// shade_pal_index:
-				__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
-				VEC_SHADE_SIMPLE(fg);
-
-				__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
-				__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
-
-				// unpack bg:
-				__m128i bg = _mm_loadu_si128((const __m128i*)dest);
-				__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
-				__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
-
-				// (fg_red * fg_alpha - bg_red * bg_alpha) / 256:
-				__m128i color_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_hi, mfg_alpha), _mm_mullo_epi16(bg_hi, mbg_alpha)), 8);
-				__m128i color_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_lo, mfg_alpha), _mm_mullo_epi16(bg_lo, mbg_alpha)), 8);
-
-				__m128i color = _mm_packus_epi16(color_lo, color_hi);
-				_mm_storeu_si128((__m128i*)dest, color);
-
-				source += sincr;
-				dest += pitch;
-			} while (--count);
-		}
-		else
-		{
-			VEC_SHADE_VARS();
-			VEC_SHADE_INIT(light, shade_constants);
-
-			__m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha);
-			__m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha);
-
-			do {
-				uint32_t p0 = source[0];
-				uint32_t p1 = source[1];
-				uint32_t p2 = source[2];
-				uint32_t p3 = source[3];
-
-				// shade_pal_index:
-				__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
-				VEC_SHADE(fg, shade_constants);
-
-				__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
-				__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
-
-				// unpack bg:
-				__m128i bg = _mm_loadu_si128((const __m128i*)dest);
-				__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
-				__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
-
-				// (fg_red * fg_alpha - bg_red * bg_alpha) / 256:
-				__m128i color_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_hi, mfg_alpha), _mm_mullo_epi16(bg_hi, mbg_alpha)), 8);
-				__m128i color_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_lo, mfg_alpha), _mm_mullo_epi16(bg_lo, mbg_alpha)), 8);
-
-				__m128i color = _mm_packus_epi16(color_lo, color_hi);
-				_mm_storeu_si128((__m128i*)dest, color);
-
-				source += sincr;
-				dest += pitch;
-			} while (--count);
-		}
-	}
-};