From 8aabc26cd94018238ed606b81b3d49fabbe429fd Mon Sep 17 00:00:00 2001
From: Magnus Norddahl <dpjudas@users.noreply.github.com>
Date: Mon, 30 May 2016 05:52:15 +0200
Subject: [PATCH] Created standalone rgba drawing functions

---
 src/CMakeLists.txt   |    1 +
 src/r_draw.cpp       | 2470 ++++++++++++++++++++++++++++++------------
 src/r_draw.h         |  165 ++-
 src/r_drawt.cpp      |  332 +-----
 src/r_drawt_rgba.cpp |  883 +++++++++++++++
 src/r_main.cpp       |    4 +-
 src/r_plane.cpp      |   11 +-
 src/r_plane.h        |    4 +
 src/r_segs.cpp       |    4 +-
 src/r_things.cpp     |   97 +-
 src/r_things.h       |    5 +-
 src/v_draw.cpp       |    2 +-
 12 files changed, 2896 insertions(+), 1082 deletions(-)
 create mode 100644 src/r_drawt_rgba.cpp

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 84d6f06b9..c90756b5d 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -883,6 +883,7 @@ set( FASTMATH_PCH_SOURCES
 	r_bsp.cpp
 	r_draw.cpp
 	r_drawt.cpp
+	r_drawt_rgba.cpp
 	r_main.cpp
 	r_plane.cpp
 	r_segs.cpp
diff --git a/src/r_draw.cpp b/src/r_draw.cpp
index 044910008..d2b694f05 100644
--- a/src/r_draw.cpp
+++ b/src/r_draw.cpp
@@ -38,6 +38,7 @@
 #include "r_data/r_translate.h"
 #include "v_palette.h"
 #include "r_data/colormaps.h"
+#include "r_plane.h"
 
 #include "gi.h"
 #include "stats.h"
@@ -73,6 +74,19 @@ int 			scaledviewwidth;
 //		screen depth and asm/no asm.
 void (*R_DrawColumnHoriz)(void);
 void (*R_DrawColumn)(void);
+void (*R_FillColumn)(void);
+void (*R_FillAddColumn)(void);
+void (*R_FillAddClampColumn)(void);
+void (*R_FillSubClampColumn)(void);
+void (*R_FillRevSubClampColumn)(void);
+void (*R_DrawAddColumn)(void);
+void (*R_DrawTlatedAddColumn)(void);
+void (*R_DrawAddClampColumn)(void);
+void (*R_DrawAddClampTranslatedColumn)(void);
+void (*R_DrawSubClampColumn)(void);
+void (*R_DrawSubClampTranslatedColumn)(void);
+void (*R_DrawRevSubClampColumn)(void);
+void (*R_DrawRevSubClampTranslatedColumn)(void);
 void (*R_DrawFuzzColumn)(void);
 void (*R_DrawTranslatedColumn)(void);
 void (*R_DrawShadedColumn)(void);
@@ -82,7 +96,44 @@ void (*R_DrawSpanTranslucent)(void);
 void (*R_DrawSpanMaskedTranslucent)(void);
 void (*R_DrawSpanAddClamp)(void);
 void (*R_DrawSpanMaskedAddClamp)(void);
-void (*rt_map4cols)(int,int,int);
+void (*R_FillSpan)(void);
+void (*R_FillColumnHoriz)(void);
+void (*R_DrawFogBoundary)(int x1, int x2, short *uclip, short *dclip);
+void (*R_MapColoredPlane)(int y, int x1);
+void (*R_DrawParticle)(vissprite_t *);
+fixed_t (*tmvline1_add)();
+void (*tmvline4_add)();
+fixed_t (*tmvline1_addclamp)();
+void (*tmvline4_addclamp)();
+fixed_t (*tmvline1_subclamp)();
+void (*tmvline4_subclamp)();
+fixed_t (*tmvline1_revsubclamp)();
+void (*tmvline4_revsubclamp)();
+void (*rt_copy1col)(int hx, int sx, int yl, int yh);
+void (*rt_copy4cols)(int sx, int yl, int yh);
+void (*rt_shaded1col)(int hx, int sx, int yl, int yh);
+void (*rt_shaded4cols)(int sx, int yl, int yh);
+void (*rt_map1col)(int hx, int sx, int yl, int yh);
+void (*rt_add1col)(int hx, int sx, int yl, int yh);
+void (*rt_addclamp1col)(int hx, int sx, int yl, int yh);
+void (*rt_subclamp1col)(int hx, int sx, int yl, int yh);
+void (*rt_revsubclamp1col)(int hx, int sx, int yl, int yh);
+void (*rt_tlate1col)(int hx, int sx, int yl, int yh);
+void (*rt_tlateadd1col)(int hx, int sx, int yl, int yh);
+void (*rt_tlateaddclamp1col)(int hx, int sx, int yl, int yh);
+void (*rt_tlatesubclamp1col)(int hx, int sx, int yl, int yh);
+void (*rt_tlaterevsubclamp1col)(int hx, int sx, int yl, int yh);
+void (*rt_map4cols)(int sx, int yl, int yh);
+void (*rt_add4cols)(int sx, int yl, int yh);
+void (*rt_addclamp4cols)(int sx, int yl, int yh);
+void (*rt_subclamp4cols)(int sx, int yl, int yh);
+void (*rt_revsubclamp4cols)(int sx, int yl, int yh);
+void (*rt_tlate4cols)(int sx, int yl, int yh);
+void (*rt_tlateadd4cols)(int sx, int yl, int yh);
+void (*rt_tlateaddclamp4cols)(int sx, int yl, int yh);
+void (*rt_tlatesubclamp4cols)(int sx, int yl, int yh);
+void (*rt_tlaterevsubclamp4cols)(int sx, int yl, int yh);
+void (*rt_initcols)(canvas_pixel_t *buffer);
 
 //
 // R_DrawColumn
@@ -198,10 +249,6 @@ void R_DrawColumnP_C (void)
 	// Framebuffer destination address.
 	dest = dc_dest;
 
-#ifndef PALETTEOUTPUT
-	uint32_t light = calc_light_multiplier(dc_light);
-#endif
-
 	// Determine scaling,
 	//	which is the only mapping to be done.
 	fracstep = dc_iscale; 
@@ -221,11 +268,7 @@ void R_DrawColumnP_C (void)
 		{
 			// Re-map color indices from wall texture column
 			//	using a lighting/special effects LUT.
-#ifndef PALETTEOUTPUT
-			*dest = shade_pal_index(colormap[source[frac>>FRACBITS]], light);
-#else
 			*dest = colormap[source[frac >> FRACBITS]];
-#endif
 
 			dest += pitch;
 			frac += fracstep;
@@ -235,8 +278,78 @@ void R_DrawColumnP_C (void)
 } 
 #endif
 
+void R_DrawColumnP_RGBA_C()
+{
+	int 				count;
+	canvas_pixel_t*		dest;
+	fixed_t 			frac;
+	fixed_t 			fracstep;
+
+	count = dc_count;
+
+	// Zero length, column does not exceed a pixel.
+	if (count <= 0)
+		return;
+
+	// Framebuffer destination address.
+	dest = dc_dest;
+
+	uint32_t light = calc_light_multiplier(dc_light);
+
+	// Determine scaling,
+	//	which is the only mapping to be done.
+	fracstep = dc_iscale;
+	frac = dc_texturefrac;
+
+	{
+		// [RH] Get local copies of these variables so that the compiler
+		//		has a better chance of optimizing this well.
+		BYTE *colormap = dc_colormap;
+		const BYTE *source = dc_source;
+		int pitch = dc_pitch;
+
+		// Inner loop that does the actual texture mapping,
+		//	e.g. a DDA-lile scaling.
+		// This is as fast as it gets.
+		do
+		{
+			// Re-map color indices from wall texture column
+			//	using a lighting/special effects LUT.
+			*dest = shade_pal_index(colormap[source[frac >> FRACBITS]], light);
+
+			dest += pitch;
+			frac += fracstep;
+
+		} while (--count);
+	}
+}
+
 // [RH] Just fills a column with a color
-void R_FillColumnP (void)
+void R_FillColumnP_C (void)
+{
+	int 				count;
+	canvas_pixel_t*		dest;
+
+	count = dc_count;
+
+	if (count <= 0)
+		return;
+
+	dest = dc_dest;
+
+	{
+		int pitch = dc_pitch;
+		BYTE color = dc_color;
+
+		do
+		{
+			*dest = color;
+			dest += pitch;
+		} while (--count);
+	}
+}
+
+void R_FillColumnP_RGBA()
 {
 	int 				count;
 	canvas_pixel_t*		dest;
@@ -248,9 +361,7 @@ void R_FillColumnP (void)
 
 	dest = dc_dest;
 
-#ifndef PALETTEOUTPUT
 	uint32_t light = calc_light_multiplier(dc_light);
-#endif
 
 	{
 		int pitch = dc_pitch;
@@ -258,17 +369,40 @@ void R_FillColumnP (void)
 
 		do
 		{
-#ifndef PALETTEOUTPUT
 			*dest = shade_pal_index(color, light);
-#else
-			*dest = color;
-#endif
 			dest += pitch;
 		} while (--count);
 	}
 }
 
-void R_FillAddColumn (void)
+void R_FillAddColumn_C (void)
+{
+	int count;
+	canvas_pixel_t *dest;
+
+	count = dc_count;
+	if (count <= 0)
+		return;
+
+	dest = dc_dest;
+	int pitch = dc_pitch;
+
+	DWORD *bg2rgb;
+	DWORD fg;
+
+	bg2rgb = dc_destblend;
+	fg = dc_srccolor;
+
+	do
+	{
+		DWORD bg;
+		bg = (fg + bg2rgb[*dest]) | 0x1f07c1f;
+		*dest = RGB32k.All[bg & (bg>>15)];
+		dest += pitch; 
+	} while (--count);
+}
+
+void R_FillAddColumn_RGBA_C()
 {
 	int count;
 	canvas_pixel_t *dest;
@@ -280,7 +414,6 @@ void R_FillAddColumn (void)
 	dest = dc_dest;
 	int pitch = dc_pitch;
 
-#ifndef PALETTEOUTPUT
 	uint32_t fg_red = (dc_srccolor >> 12) & 0xf8;
 	uint32_t fg_green = (dc_srccolor >> 2) & 0xf8;
 	uint32_t fg_blue = (dc_srccolor << 3) & 0xf8;
@@ -298,24 +431,9 @@ void R_FillAddColumn (void)
 		*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
 		dest += pitch;
 	} while (--count);
-#else
-	DWORD *bg2rgb;
-	DWORD fg;
-
-	bg2rgb = dc_destblend;
-	fg = dc_srccolor;
-
-	do
-	{
-		DWORD bg;
-		bg = (fg + bg2rgb[*dest]) | 0x1f07c1f;
-		*dest = RGB32k.All[bg & (bg>>15)];
-		dest += pitch; 
-	} while (--count);
-#endif
 }
 
-void R_FillAddClampColumn (void)
+void R_FillAddClampColumn_C (void)
 {
 	int count;
 	canvas_pixel_t *dest;
@@ -327,25 +445,6 @@ void R_FillAddClampColumn (void)
 	dest = dc_dest;
 	int pitch = dc_pitch;
 
-#ifndef PALETTEOUTPUT
-	uint32_t fg_red = (dc_srccolor >> 12) & 0xf8;
-	uint32_t fg_green = (dc_srccolor >> 2) & 0xf8;
-	uint32_t fg_blue = (dc_srccolor << 3) & 0xf8;
-
-	do
-	{
-		uint32_t bg_red = (*dest >> 16) & 0xff;
-		uint32_t bg_green = (*dest >> 8) & 0xff;
-		uint32_t bg_blue = (*dest) & 0xff;
-
-		uint32_t red = clamp<uint32_t>(fg_red + bg_red, 0, 255);
-		uint32_t green = clamp<uint32_t>(fg_green + bg_green, 0, 255);
-		uint32_t blue = clamp<uint32_t>(fg_blue + bg_blue, 0, 255);
-
-		*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
-		dest += pitch;
-	} while (--count);
-#else
 	DWORD *bg2rgb;
 	DWORD fg;
 
@@ -365,10 +464,9 @@ void R_FillAddClampColumn (void)
 		*dest = RGB32k.All[a & (a>>15)];
 		dest += pitch; 
 	} while (--count);
-#endif
 }
 
-void R_FillSubClampColumn (void)
+void R_FillAddClampColumn_RGBA()
 {
 	int count;
 	canvas_pixel_t *dest;
@@ -380,7 +478,6 @@ void R_FillSubClampColumn (void)
 	dest = dc_dest;
 	int pitch = dc_pitch;
 
-#ifndef PALETTEOUTPUT
 	uint32_t fg_red = (dc_srccolor >> 12) & 0xf8;
 	uint32_t fg_green = (dc_srccolor >> 2) & 0xf8;
 	uint32_t fg_blue = (dc_srccolor << 3) & 0xf8;
@@ -391,14 +488,27 @@ void R_FillSubClampColumn (void)
 		uint32_t bg_green = (*dest >> 8) & 0xff;
 		uint32_t bg_blue = (*dest) & 0xff;
 
-		uint32_t red = clamp<uint32_t>(256 - fg_red + bg_red, 256, 256 + 255) - 255;
-		uint32_t green = clamp<uint32_t>(256 - fg_green + bg_green, 256, 256 + 255) - 255;
-		uint32_t blue = clamp<uint32_t>(256 - fg_blue + bg_blue, 256, 256 + 255) - 255;
+		uint32_t red = clamp<uint32_t>(fg_red + bg_red, 0, 255);
+		uint32_t green = clamp<uint32_t>(fg_green + bg_green, 0, 255);
+		uint32_t blue = clamp<uint32_t>(fg_blue + bg_blue, 0, 255);
 
 		*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
 		dest += pitch;
 	} while (--count);
-#else
+}
+
+void R_FillSubClampColumn_C (void)
+{
+	int count;
+	canvas_pixel_t *dest;
+
+	count = dc_count;
+	if (count <= 0)
+		return;
+
+	dest = dc_dest;
+	int pitch = dc_pitch;
+
 	DWORD *bg2rgb;
 	DWORD fg;
 
@@ -417,10 +527,9 @@ void R_FillSubClampColumn (void)
 		*dest = RGB32k.All[a & (a>>15)];
 		dest += pitch; 
 	} while (--count);
-#endif
 }
 
-void R_FillRevSubClampColumn (void)
+void R_FillSubClampColumn_RGBA()
 {
 	int count;
 	canvas_pixel_t *dest;
@@ -432,7 +541,6 @@ void R_FillRevSubClampColumn (void)
 	dest = dc_dest;
 	int pitch = dc_pitch;
 
-#ifndef PALETTEOUTPUT
 	uint32_t fg_red = (dc_srccolor >> 12) & 0xf8;
 	uint32_t fg_green = (dc_srccolor >> 2) & 0xf8;
 	uint32_t fg_blue = (dc_srccolor << 3) & 0xf8;
@@ -443,14 +551,27 @@ void R_FillRevSubClampColumn (void)
 		uint32_t bg_green = (*dest >> 8) & 0xff;
 		uint32_t bg_blue = (*dest) & 0xff;
 
-		uint32_t red = clamp<uint32_t>(256 + fg_red - bg_red, 256, 256 + 255) - 255;
-		uint32_t green = clamp<uint32_t>(256 + fg_green - bg_green, 256, 256 + 255) - 255;
-		uint32_t blue = clamp<uint32_t>(256 + fg_blue - bg_blue, 256, 256 + 255) - 255;
+		uint32_t red = clamp<uint32_t>(256 - fg_red + bg_red, 256, 256 + 255) - 255;
+		uint32_t green = clamp<uint32_t>(256 - fg_green + bg_green, 256, 256 + 255) - 255;
+		uint32_t blue = clamp<uint32_t>(256 - fg_blue + bg_blue, 256, 256 + 255) - 255;
 
 		*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
 		dest += pitch;
 	} while (--count);
-#else
+}
+
+void R_FillRevSubClampColumn_C (void)
+{
+	int count;
+	canvas_pixel_t *dest;
+
+	count = dc_count;
+	if (count <= 0)
+		return;
+
+	dest = dc_dest;
+	int pitch = dc_pitch;
+
 	DWORD *bg2rgb;
 	DWORD fg;
 
@@ -469,7 +590,37 @@ void R_FillRevSubClampColumn (void)
 		*dest = RGB32k.All[a & (a>>15)];
 		dest += pitch; 
 	} while (--count);
-#endif
+}
+
+void R_FillRevSubClampColumn_RGBA()
+{
+	int count;
+	canvas_pixel_t *dest;
+
+	count = dc_count;
+	if (count <= 0)
+		return;
+
+	dest = dc_dest;
+	int pitch = dc_pitch;
+
+	uint32_t fg_red = (dc_srccolor >> 12) & 0xf8;
+	uint32_t fg_green = (dc_srccolor >> 2) & 0xf8;
+	uint32_t fg_blue = (dc_srccolor << 3) & 0xf8;
+
+	do
+	{
+		uint32_t bg_red = (*dest >> 16) & 0xff;
+		uint32_t bg_green = (*dest >> 8) & 0xff;
+		uint32_t bg_blue = (*dest) & 0xff;
+
+		uint32_t red = clamp<uint32_t>(256 + fg_red - bg_red, 256, 256 + 255) - 255;
+		uint32_t green = clamp<uint32_t>(256 + fg_green - bg_green, 256, 256 + 255) - 255;
+		uint32_t blue = clamp<uint32_t>(256 + fg_blue - bg_blue, 256, 256 + 255) - 255;
+
+		*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
+		dest += pitch;
+	} while (--count);
 }
 
 //
@@ -542,7 +693,77 @@ void R_DrawFuzzColumnP_C (void)
 
 	dest = ylookup[dc_yl] + dc_x + dc_destorg;
 
-#ifndef PALETTEOUTPUT
+	// colormap #6 is used for shading (of 0-31, a bit brighter than average)
+	{
+		// [RH] Make local copies of global vars to try and improve
+		//		the optimizations made by the compiler.
+		int pitch = dc_pitch;
+		int fuzz = fuzzpos;
+		int cnt;
+		BYTE *map = &NormalLight.Maps[6*256];
+
+		// [RH] Split this into three separate loops to minimize
+		// the number of times fuzzpos needs to be clamped.
+		if (fuzz)
+		{
+			cnt = MIN(FUZZTABLE-fuzz,count);
+			count -= cnt;
+			do
+			{
+				*dest = map[dest[fuzzoffset[fuzz++]]];
+				dest += pitch;
+			} while (--cnt);
+		}
+		if (fuzz == FUZZTABLE || count > 0)
+		{
+			while (count >= FUZZTABLE)
+			{
+				fuzz = 0;
+				cnt = FUZZTABLE;
+				count -= FUZZTABLE;
+				do
+				{
+					*dest = map[dest[fuzzoffset[fuzz++]]];
+					dest += pitch;
+				} while (--cnt);
+			}
+			fuzz = 0;
+			if (count > 0)
+			{
+				do
+				{
+					*dest = map[dest[fuzzoffset[fuzz++]]];
+					dest += pitch;
+				} while (--count);
+			}
+		}
+		fuzzpos = fuzz;
+	}
+} 
+#endif
+
+void R_DrawFuzzColumnP_RGBA_C()
+{
+	int count;
+	canvas_pixel_t *dest;
+
+	// Adjust borders. Low...
+	if (dc_yl == 0)
+		dc_yl = 1;
+
+	// .. and high.
+	if (dc_yh > fuzzviewheight)
+		dc_yh = fuzzviewheight;
+
+	count = dc_yh - dc_yl;
+
+	// Zero length.
+	if (count < 0)
+		return;
+
+	count++;
+
+	dest = ylookup[dc_yl] + dc_x + dc_destorg;
 
 	// Note: this implementation assumes this function is only used for the pinky shadow effect (i.e. no other fancy colormap than black)
 	// I'm not sure if this is really always the case or not.
@@ -618,58 +839,7 @@ void R_DrawFuzzColumnP_C (void)
 		}
 		fuzzpos = fuzz;
 	}
-
-#else
-
-	// colormap #6 is used for shading (of 0-31, a bit brighter than average)
-	{
-		// [RH] Make local copies of global vars to try and improve
-		//		the optimizations made by the compiler.
-		int pitch = dc_pitch;
-		int fuzz = fuzzpos;
-		int cnt;
-		BYTE *map = &NormalLight.Maps[6*256];
-
-		// [RH] Split this into three separate loops to minimize
-		// the number of times fuzzpos needs to be clamped.
-		if (fuzz)
-		{
-			cnt = MIN(FUZZTABLE-fuzz,count);
-			count -= cnt;
-			do
-			{
-				*dest = map[dest[fuzzoffset[fuzz++]]];
-				dest += pitch;
-			} while (--cnt);
-		}
-		if (fuzz == FUZZTABLE || count > 0)
-		{
-			while (count >= FUZZTABLE)
-			{
-				fuzz = 0;
-				cnt = FUZZTABLE;
-				count -= FUZZTABLE;
-				do
-				{
-					*dest = map[dest[fuzzoffset[fuzz++]]];
-					dest += pitch;
-				} while (--cnt);
-			}
-			fuzz = 0;
-			if (count > 0)
-			{
-				do
-				{
-					*dest = map[dest[fuzzoffset[fuzz++]]];
-					dest += pitch;
-				} while (--count);
-			}
-		}
-		fuzzpos = fuzz;
-	}
-#endif
-} 
-#endif
+}
 
 //
 // R_DrawTranlucentColumn
@@ -733,7 +903,44 @@ void R_DrawAddColumnP_C (void)
 	fracstep = dc_iscale;
 	frac = dc_texturefrac;
 
-#ifndef PALETTEOUTPUT
+	{
+		DWORD *fg2rgb = dc_srcblend;
+		DWORD *bg2rgb = dc_destblend;
+		BYTE *colormap = dc_colormap;
+		const BYTE *source = dc_source;
+		int pitch = dc_pitch;
+
+		do
+		{
+			DWORD fg = colormap[source[frac>>FRACBITS]];
+			DWORD bg = *dest;
+
+			fg = fg2rgb[fg];
+			bg = bg2rgb[bg];
+			fg = (fg+bg) | 0x1f07c1f;
+			*dest = RGB32k.All[fg & (fg>>15)];
+			dest += pitch;
+			frac += fracstep;
+		} while (--count);
+	}
+}
+
+void R_DrawAddColumnP_RGBA_C()
+{
+	int count;
+	canvas_pixel_t *dest;
+	fixed_t frac;
+	fixed_t fracstep;
+
+	count = dc_count;
+	if (count <= 0)
+		return;
+
+	dest = dc_dest;
+
+	fracstep = dc_iscale;
+	frac = dc_texturefrac;
+
 	{
 		const BYTE *source = dc_source;
 		int pitch = dc_pitch;
@@ -760,28 +967,6 @@ void R_DrawAddColumnP_C (void)
 			frac += fracstep;
 		} while (--count);
 	}
-#else
-	{
-		DWORD *fg2rgb = dc_srcblend;
-		DWORD *bg2rgb = dc_destblend;
-		BYTE *colormap = dc_colormap;
-		const BYTE *source = dc_source;
-		int pitch = dc_pitch;
-
-		do
-		{
-			DWORD fg = colormap[source[frac>>FRACBITS]];
-			DWORD bg = *dest;
-
-			fg = fg2rgb[fg];
-			bg = bg2rgb[bg];
-			fg = (fg+bg) | 0x1f07c1f;
-			*dest = RGB32k.All[fg & (fg>>15)];
-			dest += pitch;
-			frac += fracstep;
-		} while (--count);
-	}
-#endif
 }
 
 //
@@ -803,9 +988,39 @@ void R_DrawTranslatedColumnP_C (void)
 	if (count <= 0) 
 		return;
 
-#ifndef PALETTEOUTPUT
+	dest = dc_dest;
+
+	fracstep = dc_iscale;
+	frac = dc_texturefrac;
+
+	{
+		// [RH] Local copies of global vars to improve compiler optimizations
+		BYTE *colormap = dc_colormap;
+		BYTE *translation = dc_translation;
+		const BYTE *source = dc_source;
+		int pitch = dc_pitch;
+
+		do
+		{
+			*dest = colormap[translation[source[frac>>FRACBITS]]];
+			dest += pitch;
+			frac += fracstep;
+		} while (--count);
+	}
+}
+
+void R_DrawTranslatedColumnP_RGBA_C()
+{
+	int 				count;
+	canvas_pixel_t*		dest;
+	fixed_t 			frac;
+	fixed_t 			fracstep;
+
+	count = dc_count;
+	if (count <= 0)
+		return;
+
 	uint32_t light = calc_light_multiplier(dc_light);
-#endif
 
 	dest = dc_dest;
 
@@ -821,20 +1036,54 @@ void R_DrawTranslatedColumnP_C (void)
 
 		do
 		{
-#ifndef PALETTEOUTPUT
 			*dest = shade_pal_index(colormap[translation[source[frac >> FRACBITS]]], light);
-#else
-			*dest = colormap[translation[source[frac>>FRACBITS]]];
-#endif
 			dest += pitch;
-
 			frac += fracstep;
 		} while (--count);
 	}
 }
 
 // Draw a column that is both translated and translucent
-void R_DrawTlatedAddColumnP_C (void)
+void R_DrawTlatedAddColumnP_C()
+{
+	int count;
+	canvas_pixel_t *dest;
+	fixed_t frac;
+	fixed_t fracstep;
+
+	count = dc_count;
+	if (count <= 0)
+		return;
+
+	dest = dc_dest;
+
+	fracstep = dc_iscale;
+	frac = dc_texturefrac;
+
+	{
+		DWORD *fg2rgb = dc_srcblend;
+		DWORD *bg2rgb = dc_destblend;
+		BYTE *translation = dc_translation;
+		BYTE *colormap = dc_colormap;
+		const BYTE *source = dc_source;
+		int pitch = dc_pitch;
+
+		do
+		{
+			DWORD fg = colormap[translation[source[frac>>FRACBITS]]];
+			DWORD bg = *dest;
+
+			fg = fg2rgb[fg];
+			bg = bg2rgb[bg];
+			fg = (fg + bg) | 0x1f07c1f;
+			*dest = RGB32k.All[fg & (fg >> 15)];
+			dest += pitch;
+			frac += fracstep;
+		} while (--count);
+	}
+}
+
+void R_DrawTlatedAddColumnP_RGBA_C()
 {
 	int count;
 	canvas_pixel_t *dest;
@@ -845,16 +1094,13 @@ void R_DrawTlatedAddColumnP_C (void)
 	if (count <= 0)
 		return;
 
-#ifndef PALETTEOUTPUT
 	uint32_t light = calc_light_multiplier(dc_light);
-#endif
 
 	dest = dc_dest;
 
 	fracstep = dc_iscale;
 	frac = dc_texturefrac;
 
-#ifndef PALETTEOUTPUT
 	{
 		BYTE *translation = dc_translation;
 		BYTE *colormap = dc_colormap;
@@ -882,29 +1128,6 @@ void R_DrawTlatedAddColumnP_C (void)
 			frac += fracstep;
 		} while (--count);
 	}
-#else
-	{
-		DWORD *fg2rgb = dc_srcblend;
-		DWORD *bg2rgb = dc_destblend;
-		BYTE *translation = dc_translation;
-		BYTE *colormap = dc_colormap;
-		const BYTE *source = dc_source;
-		int pitch = dc_pitch;
-
-		do
-		{
-			DWORD fg = colormap[translation[source[frac>>FRACBITS]]];
-			DWORD bg = *dest;
-
-			fg = fg2rgb[fg];
-			bg = bg2rgb[bg];
-			fg = (fg + bg) | 0x1f07c1f;
-			*dest = RGB32k.All[fg & (fg >> 15)];
-			dest += pitch;
-			frac += fracstep;
-		} while (--count);
-	}
-#endif
 }
 
 // Draw a column whose "color" values are actually translucency
@@ -925,7 +1148,41 @@ void R_DrawShadedColumnP_C (void)
 	fracstep = dc_iscale; 
 	frac = dc_texturefrac;
 
-#ifndef PALETTEOUTPUT
+	{
+		const BYTE *source = dc_source;
+		BYTE *colormap = dc_colormap;
+		int pitch = dc_pitch;
+		DWORD *fgstart = &Col2RGB8[0][dc_color];
+
+		do
+		{
+			DWORD val = colormap[source[frac>>FRACBITS]];
+			DWORD fg = fgstart[val<<8];
+			val = (Col2RGB8[64-val][*dest] + fg) | 0x1f07c1f;
+			*dest = RGB32k.All[val & (val>>15)];
+
+			dest += pitch;
+			frac += fracstep;
+		} while (--count);
+	}
+} 
+
+void R_DrawShadedColumnP_RGBA_C()
+{
+	int  count;
+	canvas_pixel_t *dest;
+	fixed_t frac, fracstep;
+
+	count = dc_count;
+
+	if (count <= 0)
+		return;
+
+	dest = dc_dest;
+
+	fracstep = dc_iscale;
+	frac = dc_texturefrac;
+
 	uint32_t fg = shade_pal_index(dc_color, calc_light_multiplier(dc_light));
 	uint32_t fg_red = (fg >> 16) & 0xff;
 	uint32_t fg_green = (fg >> 8) & 0xff;
@@ -954,26 +1211,7 @@ void R_DrawShadedColumnP_C (void)
 			frac += fracstep;
 		} while (--count);
 	}
-#else
-	{
-		const BYTE *source = dc_source;
-		BYTE *colormap = dc_colormap;
-		int pitch = dc_pitch;
-		DWORD *fgstart = &Col2RGB8[0][dc_color];
-
-		do
-		{
-			DWORD val = colormap[source[frac>>FRACBITS]];
-			DWORD fg = fgstart[val<<8];
-			val = (Col2RGB8[64-val][*dest] + fg) | 0x1f07c1f;
-			*dest = RGB32k.All[val & (val>>15)];
-
-			dest += pitch;
-			frac += fracstep;
-		} while (--count);
-	}
-#endif
-} 
+}
 
 // Add source to destination, clamping it to white
 void R_DrawAddClampColumnP_C ()
@@ -992,7 +1230,6 @@ void R_DrawAddClampColumnP_C ()
 	fracstep = dc_iscale;
 	frac = dc_texturefrac;
 
-#ifndef PALETTEOUTPUT
 	{
 		const BYTE *source = dc_source;
 		BYTE *colormap = dc_colormap;
@@ -1019,30 +1256,50 @@ void R_DrawAddClampColumnP_C ()
 			frac += fracstep;
 		} while (--count);
 	}
-#else
+}
+
+void R_DrawAddClampColumnP_RGBA_C()
+{
+	int count;
+	canvas_pixel_t *dest;
+	fixed_t frac;
+	fixed_t fracstep;
+
+	count = dc_count;
+	if (count <= 0)
+		return;
+
+	dest = dc_dest;
+
+	fracstep = dc_iscale;
+	frac = dc_texturefrac;
+
 	{
-		BYTE *colormap = dc_colormap;
 		const BYTE *source = dc_source;
+		BYTE *colormap = dc_colormap;
 		int pitch = dc_pitch;
-		DWORD *fg2rgb = dc_srcblend;
-		DWORD *bg2rgb = dc_destblend;
+		uint32_t light = calc_light_multiplier(dc_light);
 
 		do
 		{
-			DWORD a = fg2rgb[colormap[source[frac>>FRACBITS]]] + bg2rgb[*dest];
-			DWORD b = a;
+			uint32_t fg = shade_pal_index(colormap[source[frac >> FRACBITS]], light);
+			uint32_t fg_red = (fg >> 16) & 0xff;
+			uint32_t fg_green = (fg >> 8) & 0xff;
+			uint32_t fg_blue = fg & 0xff;
 
-			a |= 0x01f07c1f;
-			b &= 0x40100400;
-			a &= 0x3fffffff;
-			b = b - (b >> 5);
-			a |= b;
-			*dest = RGB32k.All[a & (a>>15)];
+			uint32_t bg_red = (*dest >> 16) & 0xff;
+			uint32_t bg_green = (*dest >> 8) & 0xff;
+			uint32_t bg_blue = (*dest) & 0xff;
+
+			uint32_t red = clamp<uint32_t>(fg_red + bg_red, 0, 255);
+			uint32_t green = clamp<uint32_t>(fg_green + bg_green, 0, 255);
+			uint32_t blue = clamp<uint32_t>(fg_blue + bg_blue, 0, 255);
+
+			*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
 			dest += pitch;
 			frac += fracstep;
 		} while (--count);
 	}
-#endif
 }
 
 // Add translated source to destination, clamping it to white
@@ -1062,35 +1319,6 @@ void R_DrawAddClampTranslatedColumnP_C ()
 	fracstep = dc_iscale;
 	frac = dc_texturefrac;
 
-#ifndef PALETTEOUTPUT
-	{
-		BYTE *translation = dc_translation;
-		BYTE *colormap = dc_colormap;
-		const BYTE *source = dc_source;
-		int pitch = dc_pitch;
-		uint32_t light = calc_light_multiplier(dc_light);
-
-		do
-		{
-			uint32_t fg = shade_pal_index(colormap[translation[source[frac >> FRACBITS]]], light);
-			uint32_t fg_red = (fg >> 16) & 0xff;
-			uint32_t fg_green = (fg >> 8) & 0xff;
-			uint32_t fg_blue = fg & 0xff;
-
-			uint32_t bg_red = (*dest >> 16) & 0xff;
-			uint32_t bg_green = (*dest >> 8) & 0xff;
-			uint32_t bg_blue = (*dest) & 0xff;
-
-			uint32_t red = clamp<uint32_t>(fg_red + bg_red, 0, 255);
-			uint32_t green = clamp<uint32_t>(fg_green + bg_green, 0, 255);
-			uint32_t blue = clamp<uint32_t>(fg_blue + bg_blue, 0, 255);
-
-			*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
-			dest += pitch;
-			frac += fracstep;
-		} while (--count);
-	}
-#else
 	{
 		BYTE *translation = dc_translation;
 		BYTE *colormap = dc_colormap;
@@ -1114,7 +1342,51 @@ void R_DrawAddClampTranslatedColumnP_C ()
 			frac += fracstep;
 		} while (--count);
 	}
-#endif
+}
+
+void R_DrawAddClampTranslatedColumnP_RGBA_C()
+{
+	int count;
+	canvas_pixel_t *dest;
+	fixed_t frac;
+	fixed_t fracstep;
+
+	count = dc_count;
+	if (count <= 0)
+		return;
+
+	dest = dc_dest;
+
+	fracstep = dc_iscale;
+	frac = dc_texturefrac;
+
+	{
+		BYTE *translation = dc_translation;
+		BYTE *colormap = dc_colormap;
+		const BYTE *source = dc_source;
+		int pitch = dc_pitch;
+		uint32_t light = calc_light_multiplier(dc_light);
+
+		do
+		{
+			uint32_t fg = shade_pal_index(colormap[translation[source[frac >> FRACBITS]]], light);
+			uint32_t fg_red = (fg >> 16) & 0xff;
+			uint32_t fg_green = (fg >> 8) & 0xff;
+			uint32_t fg_blue = fg & 0xff;
+
+			uint32_t bg_red = (*dest >> 16) & 0xff;
+			uint32_t bg_green = (*dest >> 8) & 0xff;
+			uint32_t bg_blue = (*dest) & 0xff;
+
+			uint32_t red = clamp<uint32_t>(fg_red + bg_red, 0, 255);
+			uint32_t green = clamp<uint32_t>(fg_green + bg_green, 0, 255);
+			uint32_t blue = clamp<uint32_t>(fg_blue + bg_blue, 0, 255);
+
+			*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
+			dest += pitch;
+			frac += fracstep;
+		} while (--count);
+	}
 }
 
 // Subtract destination from source, clamping it to black
@@ -1134,7 +1406,45 @@ void R_DrawSubClampColumnP_C ()
 	fracstep = dc_iscale;
 	frac = dc_texturefrac;
 
-#ifndef PALETTEOUTPUT
+	{
+		BYTE *colormap = dc_colormap;
+		const BYTE *source = dc_source;
+		int pitch = dc_pitch;
+		DWORD *fg2rgb = dc_srcblend;
+		DWORD *bg2rgb = dc_destblend;
+
+		do
+		{
+			DWORD a = (fg2rgb[colormap[source[frac>>FRACBITS]]] | 0x40100400) - bg2rgb[*dest];
+			DWORD b = a;
+
+			b &= 0x40100400;
+			b = b - (b >> 5);
+			a &= b;
+			a |= 0x01f07c1f;
+			*dest = RGB32k.All[a & (a>>15)];
+			dest += pitch;
+			frac += fracstep;
+		} while (--count);
+	}
+}
+
+void R_DrawSubClampColumnP_RGBA_C()
+{
+	int count;
+	canvas_pixel_t *dest;
+	fixed_t frac;
+	fixed_t fracstep;
+
+	count = dc_count;
+	if (count <= 0)
+		return;
+
+	dest = dc_dest;
+
+	fracstep = dc_iscale;
+	frac = dc_texturefrac;
+
 	{
 		BYTE *colormap = dc_colormap;
 		const BYTE *source = dc_source;
@@ -1161,29 +1471,6 @@ void R_DrawSubClampColumnP_C ()
 			frac += fracstep;
 		} while (--count);
 	}
-#else
-	{
-		BYTE *colormap = dc_colormap;
-		const BYTE *source = dc_source;
-		int pitch = dc_pitch;
-		DWORD *fg2rgb = dc_srcblend;
-		DWORD *bg2rgb = dc_destblend;
-
-		do
-		{
-			DWORD a = (fg2rgb[colormap[source[frac>>FRACBITS]]] | 0x40100400) - bg2rgb[*dest];
-			DWORD b = a;
-
-			b &= 0x40100400;
-			b = b - (b >> 5);
-			a &= b;
-			a |= 0x01f07c1f;
-			*dest = RGB32k.All[a & (a>>15)];
-			dest += pitch;
-			frac += fracstep;
-		} while (--count);
-	}
-#endif
 }
 
 // Subtract destination from source, clamping it to black
@@ -1203,35 +1490,6 @@ void R_DrawSubClampTranslatedColumnP_C ()
 	fracstep = dc_iscale;
 	frac = dc_texturefrac;
 
-#ifndef PALETTEOUTPUT
-	{
-		BYTE *translation = dc_translation;
-		BYTE *colormap = dc_colormap;
-		const BYTE *source = dc_source;
-		int pitch = dc_pitch;
-		uint32_t light = calc_light_multiplier(dc_light);
-
-		do
-		{
-			uint32_t fg = shade_pal_index(colormap[translation[source[frac >> FRACBITS]]], light);
-			uint32_t fg_red = (fg >> 16) & 0xff;
-			uint32_t fg_green = (fg >> 8) & 0xff;
-			uint32_t fg_blue = fg & 0xff;
-
-			uint32_t bg_red = (*dest >> 16) & 0xff;
-			uint32_t bg_green = (*dest >> 8) & 0xff;
-			uint32_t bg_blue = (*dest) & 0xff;
-
-			uint32_t red = clamp<uint32_t>(256 - fg_red + bg_red, 256, 256 + 255) - 256;
-			uint32_t green = clamp<uint32_t>(256 - fg_green + bg_green, 256, 256 + 255) - 256;
-			uint32_t blue = clamp<uint32_t>(256 - fg_blue + bg_blue, 256, 256 + 255) - 256;
-
-			*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
-			dest += pitch;
-			frac += fracstep;
-		} while (--count);
-	}
-#else
 	{
 		BYTE *translation = dc_translation;
 		BYTE *colormap = dc_colormap;
@@ -1254,7 +1512,51 @@ void R_DrawSubClampTranslatedColumnP_C ()
 			frac += fracstep;
 		} while (--count);
 	}
-#endif
+}
+
+void R_DrawSubClampTranslatedColumnP_RGBA_C()
+{
+	int count;
+	canvas_pixel_t *dest;
+	fixed_t frac;
+	fixed_t fracstep;
+
+	count = dc_count;
+	if (count <= 0)
+		return;
+
+	dest = dc_dest;
+
+	fracstep = dc_iscale;
+	frac = dc_texturefrac;
+
+	{
+		BYTE *translation = dc_translation;
+		BYTE *colormap = dc_colormap;
+		const BYTE *source = dc_source;
+		int pitch = dc_pitch;
+		uint32_t light = calc_light_multiplier(dc_light);
+
+		do
+		{
+			uint32_t fg = shade_pal_index(colormap[translation[source[frac >> FRACBITS]]], light);
+			uint32_t fg_red = (fg >> 16) & 0xff;
+			uint32_t fg_green = (fg >> 8) & 0xff;
+			uint32_t fg_blue = fg & 0xff;
+
+			uint32_t bg_red = (*dest >> 16) & 0xff;
+			uint32_t bg_green = (*dest >> 8) & 0xff;
+			uint32_t bg_blue = (*dest) & 0xff;
+
+			uint32_t red = clamp<uint32_t>(256 - fg_red + bg_red, 256, 256 + 255) - 256;
+			uint32_t green = clamp<uint32_t>(256 - fg_green + bg_green, 256, 256 + 255) - 256;
+			uint32_t blue = clamp<uint32_t>(256 - fg_blue + bg_blue, 256, 256 + 255) - 256;
+
+			*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
+			dest += pitch;
+			frac += fracstep;
+		} while (--count);
+	}
 }
 
 // Subtract source from destination, clamping it to black
@@ -1274,7 +1576,45 @@ void R_DrawRevSubClampColumnP_C ()
 	fracstep = dc_iscale;
 	frac = dc_texturefrac;
 
-#ifndef PALETTEOUTPUT
+	{
+		BYTE *colormap = dc_colormap;
+		const BYTE *source = dc_source;
+		int pitch = dc_pitch;
+		DWORD *fg2rgb = dc_srcblend;
+		DWORD *bg2rgb = dc_destblend;
+
+		do
+		{
+			DWORD a = (bg2rgb[*dest] | 0x40100400) - fg2rgb[colormap[source[frac>>FRACBITS]]];
+			DWORD b = a;
+
+			b &= 0x40100400;
+			b = b - (b >> 5);
+			a &= b;
+			a |= 0x01f07c1f;
+			*dest = RGB32k.All[a & (a>>15)];
+			dest += pitch;
+			frac += fracstep;
+		} while (--count);
+	}
+}
+
+void R_DrawRevSubClampColumnP_RGBA_C()
+{
+	int count;
+	canvas_pixel_t *dest;
+	fixed_t frac;
+	fixed_t fracstep;
+
+	count = dc_count;
+	if (count <= 0)
+		return;
+
+	dest = dc_dest;
+
+	fracstep = dc_iscale;
+	frac = dc_texturefrac;
+
 	{
 		BYTE *colormap = dc_colormap;
 		const BYTE *source = dc_source;
@@ -1301,29 +1641,6 @@ void R_DrawRevSubClampColumnP_C ()
 			frac += fracstep;
 		} while (--count);
 	}
-#else
-	{
-		BYTE *colormap = dc_colormap;
-		const BYTE *source = dc_source;
-		int pitch = dc_pitch;
-		DWORD *fg2rgb = dc_srcblend;
-		DWORD *bg2rgb = dc_destblend;
-
-		do
-		{
-			DWORD a = (bg2rgb[*dest] | 0x40100400) - fg2rgb[colormap[source[frac>>FRACBITS]]];
-			DWORD b = a;
-
-			b &= 0x40100400;
-			b = b - (b >> 5);
-			a &= b;
-			a |= 0x01f07c1f;
-			*dest = RGB32k.All[a & (a>>15)];
-			dest += pitch;
-			frac += fracstep;
-		} while (--count);
-	}
-#endif
 }
 
 // Subtract source from destination, clamping it to black
@@ -1343,7 +1660,46 @@ void R_DrawRevSubClampTranslatedColumnP_C ()
 	fracstep = dc_iscale;
 	frac = dc_texturefrac;
 
-#ifndef PALETTEOUTPUT
+	{
+		BYTE *translation = dc_translation;
+		BYTE *colormap = dc_colormap;
+		const BYTE *source = dc_source;
+		int pitch = dc_pitch;
+		DWORD *fg2rgb = dc_srcblend;
+		DWORD *bg2rgb = dc_destblend;
+
+		do
+		{
+			DWORD a = (bg2rgb[*dest] | 0x40100400) - fg2rgb[colormap[translation[source[frac>>FRACBITS]]]];
+			DWORD b = a;
+
+			b &= 0x40100400;
+			b = b - (b >> 5);
+			a &= b;
+			a |= 0x01f07c1f;
+			*dest = RGB32k.All[(a>>15) & a];
+			dest += pitch;
+			frac += fracstep;
+		} while (--count);
+	}
+}
+
+void R_DrawRevSubClampTranslatedColumnP_RGBA_C()
+{
+	int count;
+	canvas_pixel_t *dest;
+	fixed_t frac;
+	fixed_t fracstep;
+
+	count = dc_count;
+	if (count <= 0)
+		return;
+
+	dest = dc_dest;
+
+	fracstep = dc_iscale;
+	frac = dc_texturefrac;
+
 	{
 		BYTE *translation = dc_translation;
 		BYTE *colormap = dc_colormap;
@@ -1371,34 +1727,9 @@ void R_DrawRevSubClampTranslatedColumnP_C ()
 			frac += fracstep;
 		} while (--count);
 	}
-#else
-	{
-		BYTE *translation = dc_translation;
-		BYTE *colormap = dc_colormap;
-		const BYTE *source = dc_source;
-		int pitch = dc_pitch;
-		DWORD *fg2rgb = dc_srcblend;
-		DWORD *bg2rgb = dc_destblend;
-
-		do
-		{
-			DWORD a = (bg2rgb[*dest] | 0x40100400) - fg2rgb[colormap[translation[source[frac>>FRACBITS]]]];
-			DWORD b = a;
-
-			b &= 0x40100400;
-			b = b - (b >> 5);
-			a &= b;
-			a |= 0x01f07c1f;
-			*dest = RGB32k.All[(a>>15) & a];
-			dest += pitch;
-			frac += fracstep;
-		} while (--count);
-	}
-#endif
 }
 
 
-
 //
 // R_DrawSpan 
 // With DOOM style restrictions on view orientation,
@@ -1549,15 +1880,84 @@ void R_DrawSpanP_C (void)
 	xstep = ds_xstep;
 	ystep = ds_ystep;
 
-#ifndef PALETTEOUTPUT
-	uint32_t light = calc_light_multiplier(ds_light);
+	if (ds_xbits == 6 && ds_ybits == 6)
+	{
+		// 64x64 is the most common case by far, so special case it.
+
+		do
+		{
+			// Current texture index in u,v.
+			spot = ((xfrac>>(32-6-6))&(63*64)) + (yfrac>>(32-6));
+
+			// Lookup pixel from flat texture tile,
+			//  re-index using light/colormap.
+			*dest++ = colormap[source[spot]];
+
+			// Next step in u,v.
+			xfrac += xstep;
+			yfrac += ystep;
+		} while (--count);
+	}
+	else
+	{
+		BYTE yshift = 32 - ds_ybits;
+		BYTE xshift = yshift - ds_xbits;
+		int xmask = ((1 << ds_xbits) - 1) << ds_ybits;
+
+		do
+		{
+			// Current texture index in u,v.
+			spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
+
+			// Lookup pixel from flat texture tile,
+			//  re-index using light/colormap.
+			*dest++ = colormap[source[spot]];
+
+			// Next step in u,v.
+			xfrac += xstep;
+			yfrac += ystep;
+		} while (--count);
+	}
+}
 #endif
 
+void R_DrawSpanP_RGBA_C()
+{
+	dsfixed_t			xfrac;
+	dsfixed_t			yfrac;
+	dsfixed_t			xstep;
+	dsfixed_t			ystep;
+	canvas_pixel_t*		dest;
+	const BYTE*			source = ds_source;
+	const BYTE*			colormap = ds_colormap;
+	int 				count;
+	int 				spot;
+
+#ifdef RANGECHECK 
+	if (ds_x2 < ds_x1 || ds_x1 < 0
+		|| ds_x2 >= screen->width || ds_y > screen->height)
+	{
+		I_Error("R_DrawSpan: %i to %i at %i", ds_x1, ds_x2, ds_y);
+	}
+	//		dscount++;
+#endif
+
+	xfrac = ds_xfrac;
+	yfrac = ds_yfrac;
+
+	dest = ylookup[ds_y] + ds_x1 + dc_destorg;
+
+	count = ds_x2 - ds_x1 + 1;
+
+	xstep = ds_xstep;
+	ystep = ds_ystep;
+
+	uint32_t light = calc_light_multiplier(ds_light);
+
 	if (ds_xbits == 6 && ds_ybits == 6)
 	{
 		// 64x64 is the most common case by far, so special case it.
 
-#ifndef PALETTEOUTPUT
 #ifndef NO_SSE
 		__m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light);
 		uint32_t *palette = (uint32_t*)GPalette.BaseColors;
@@ -1589,14 +1989,14 @@ void R_DrawSpanP_C (void)
 
 			// Lookup pixel from flat texture tile,
 			//  re-index using light/colormap.
-			__m128i fg = _mm_set_epi32(palette[p0], palette[p1], palette[p2], palette[p3]);
+			__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
 			__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
 			__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
 			fg_hi = _mm_mullo_epi16(fg_hi, mlight);
 			fg_hi = _mm_srli_epi16(fg_hi, 8);
 			fg_lo = _mm_mullo_epi16(fg_lo, mlight);
 			fg_lo = _mm_srli_epi16(fg_lo, 8);
-			fg = _mm_packus_epi16(fg_hi, fg_lo);
+			fg = _mm_packus_epi16(fg_lo, fg_hi);
 			_mm_storeu_si128((__m128i*)dest, fg);
 
 			// Next step in u,v.
@@ -1604,21 +2004,16 @@ void R_DrawSpanP_C (void)
 		}
 		if (count == 0)
 			return;
-#endif
 #endif
 
 		do
 		{
 			// Current texture index in u,v.
-			spot = ((xfrac>>(32-6-6))&(63*64)) + (yfrac>>(32-6));
+			spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
 
 			// Lookup pixel from flat texture tile,
 			//  re-index using light/colormap.
-#ifndef PALETTEOUTPUT
 			*dest++ = shade_pal_index(colormap[source[spot]], light);
-#else
-			*dest++ = colormap[source[spot]];
-#endif
 
 			// Next step in u,v.
 			xfrac += xstep;
@@ -1638,11 +2033,7 @@ void R_DrawSpanP_C (void)
 
 			// Lookup pixel from flat texture tile,
 			//  re-index using light/colormap.
-#ifndef PALETTEOUTPUT
 			*dest++ = shade_pal_index(colormap[source[spot]], light);
-#else
-			*dest++ = colormap[source[spot]];
-#endif
 
 			// Next step in u,v.
 			xfrac += xstep;
@@ -1651,6 +2042,8 @@ void R_DrawSpanP_C (void)
 	}
 }
 
+#ifndef X86_ASM
+
 // [RH] Draw a span with holes
 void R_DrawSpanMaskedP_C (void)
 {
@@ -1664,10 +2057,6 @@ void R_DrawSpanMaskedP_C (void)
 	int 				count;
 	int 				spot;
 
-#ifndef PALETTEOUTPUT
-	uint32_t light = calc_light_multiplier(ds_light);
-#endif
-
 	xfrac = ds_xfrac;
 	yfrac = ds_yfrac;
 
@@ -1689,11 +2078,7 @@ void R_DrawSpanMaskedP_C (void)
 			texdata = source[spot];
 			if (texdata != 0)
 			{
-#ifndef PALETTEOUTPUT
-				*dest = shade_pal_index(colormap[texdata], light);
-#else
 				*dest = colormap[texdata];
-#endif
 			}
 			dest++;
 			xfrac += xstep;
@@ -1713,11 +2098,7 @@ void R_DrawSpanMaskedP_C (void)
 			texdata = source[spot];
 			if (texdata != 0)
 			{
-#ifndef PALETTEOUTPUT
-				*dest = shade_pal_index(colormap[texdata], light);
-#else
 				*dest = colormap[texdata];
-#endif
 			}
 			dest++;
 			xfrac += xstep;
@@ -1727,6 +2108,71 @@ void R_DrawSpanMaskedP_C (void)
 }
 #endif
 
+void R_DrawSpanMaskedP_RGBA_C()
+{
+	dsfixed_t			xfrac;
+	dsfixed_t			yfrac;
+	dsfixed_t			xstep;
+	dsfixed_t			ystep;
+	canvas_pixel_t*		dest;
+	const BYTE*			source = ds_source;
+	const BYTE*			colormap = ds_colormap;
+	int 				count;
+	int 				spot;
+
+	uint32_t light = calc_light_multiplier(ds_light);
+
+	xfrac = ds_xfrac;
+	yfrac = ds_yfrac;
+
+	dest = ylookup[ds_y] + ds_x1 + dc_destorg;
+
+	count = ds_x2 - ds_x1 + 1;
+
+	xstep = ds_xstep;
+	ystep = ds_ystep;
+
+	if (ds_xbits == 6 && ds_ybits == 6)
+	{
+		// 64x64 is the most common case by far, so special case it.
+		do
+		{
+			BYTE texdata;
+
+			spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
+			texdata = source[spot];
+			if (texdata != 0)
+			{
+				*dest = shade_pal_index(colormap[texdata], light);
+			}
+			dest++;
+			xfrac += xstep;
+			yfrac += ystep;
+		} while (--count);
+	}
+	else
+	{
+		BYTE yshift = 32 - ds_ybits;
+		BYTE xshift = yshift - ds_xbits;
+		int xmask = ((1 << ds_xbits) - 1) << ds_ybits;
+		do
+		{
+			BYTE texdata;
+
+			spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
+			texdata = source[spot];
+			if (texdata != 0)
+			{
+				*dest = shade_pal_index(colormap[texdata], light);
+			}
+			dest++;
+			xfrac += xstep;
+			yfrac += ystep;
+		} while (--count);
+	}
+}
+
+
 void R_DrawSpanTranslucentP_C (void)
 {
 	dsfixed_t			xfrac;
@@ -1756,7 +2202,68 @@ void R_DrawSpanTranslucentP_C (void)
 	if (ds_xbits == 6 && ds_ybits == 6)
 	{
 		// 64x64 is the most common case by far, so special case it.
-#ifndef PALETTEOUTPUT
+		do
+		{
+			spot = ((xfrac>>(32-6-6))&(63*64)) + (yfrac>>(32-6));
+			DWORD fg = colormap[source[spot]];
+			DWORD bg = *dest;
+			fg = fg2rgb[fg];
+			bg = bg2rgb[bg];
+			fg = (fg+bg) | 0x1f07c1f;
+			*dest++ = RGB32k.All[fg & (fg>>15)];
+			xfrac += xstep;
+			yfrac += ystep;
+		} while (--count);
+	}
+	else
+	{
+		BYTE yshift = 32 - ds_ybits;
+		BYTE xshift = yshift - ds_xbits;
+		int xmask = ((1 << ds_xbits) - 1) << ds_ybits;
+		do
+		{
+			spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
+			DWORD fg = colormap[source[spot]];
+			DWORD bg = *dest;
+			fg = fg2rgb[fg];
+			bg = bg2rgb[bg];
+			fg = (fg+bg) | 0x1f07c1f;
+			*dest++ = RGB32k.All[fg & (fg>>15)];
+			xfrac += xstep;
+			yfrac += ystep;
+		} while (--count);
+	}
+}
+
+void R_DrawSpanTranslucentP_RGBA_C()
+{
+	dsfixed_t			xfrac;
+	dsfixed_t			yfrac;
+	dsfixed_t			xstep;
+	dsfixed_t			ystep;
+	canvas_pixel_t*		dest;
+	const BYTE*			source = ds_source;
+	const BYTE*			colormap = ds_colormap;
+	int 				count;
+	int 				spot;
+	DWORD *fg2rgb = dc_srcblend;
+	DWORD *bg2rgb = dc_destblend;
+
+	xfrac = ds_xfrac;
+	yfrac = ds_yfrac;
+
+	dest = ylookup[ds_y] + ds_x1 + dc_destorg;
+
+	count = ds_x2 - ds_x1 + 1;
+
+	xstep = ds_xstep;
+	ystep = ds_ystep;
+
+	uint32_t light = calc_light_multiplier(ds_light);
+
+	if (ds_xbits == 6 && ds_ybits == 6)
+	{
+		// 64x64 is the most common case by far, so special case it.
 		do
 		{
 			spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
@@ -1779,24 +2286,9 @@ void R_DrawSpanTranslucentP_C (void)
 			xfrac += xstep;
 			yfrac += ystep;
 		} while (--count);
-#else
-		do
-		{
-			spot = ((xfrac>>(32-6-6))&(63*64)) + (yfrac>>(32-6));
-			DWORD fg = colormap[source[spot]];
-			DWORD bg = *dest;
-			fg = fg2rgb[fg];
-			bg = bg2rgb[bg];
-			fg = (fg+bg) | 0x1f07c1f;
-			*dest++ = RGB32k.All[fg & (fg>>15)];
-			xfrac += xstep;
-			yfrac += ystep;
-		} while (--count);
-#endif
 	}
 	else
 	{
-#ifndef PALETTEOUTPUT
 		BYTE yshift = 32 - ds_ybits;
 		BYTE xshift = yshift - ds_xbits;
 		int xmask = ((1 << ds_xbits) - 1) << ds_ybits;
@@ -1822,23 +2314,6 @@ void R_DrawSpanTranslucentP_C (void)
 			xfrac += xstep;
 			yfrac += ystep;
 		} while (--count);
-#else
-		BYTE yshift = 32 - ds_ybits;
-		BYTE xshift = yshift - ds_xbits;
-		int xmask = ((1 << ds_xbits) - 1) << ds_ybits;
-		do
-		{
-			spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
-			DWORD fg = colormap[source[spot]];
-			DWORD bg = *dest;
-			fg = fg2rgb[fg];
-			bg = bg2rgb[bg];
-			fg = (fg+bg) | 0x1f07c1f;
-			*dest++ = RGB32k.All[fg & (fg>>15)];
-			xfrac += xstep;
-			yfrac += ystep;
-		} while (--count);
-#endif
 	}
 }
 
@@ -1879,29 +2354,12 @@ void R_DrawSpanMaskedTranslucentP_C (void)
 			texdata = source[spot];
 			if (texdata != 0)
 			{
-#ifndef PALETTEOUTPUT
-				uint32_t fg = shade_pal_index(colormap[texdata], light);
-				uint32_t fg_red = (fg >> 16) & 0xff;
-				uint32_t fg_green = (fg >> 8) & 0xff;
-				uint32_t fg_blue = (fg) & 0xff;
-
-				uint32_t bg_red = (*dest >> 16) & 0xff;
-				uint32_t bg_green = (*dest >> 8) & 0xff;
-				uint32_t bg_blue = (*dest) & 0xff;
-
-				uint32_t red = (fg_red + bg_red + 1) / 2;
-				uint32_t green = (fg_green + bg_green + 1) / 2;
-				uint32_t blue = (fg_blue + bg_blue + 1) / 2;
-
-				*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
-#else
 				DWORD fg = colormap[texdata];
 				DWORD bg = *dest;
 				fg = fg2rgb[fg];
 				bg = bg2rgb[bg];
 				fg = (fg+bg) | 0x1f07c1f;
 				*dest = RGB32k.All[fg & (fg>>15)];
-#endif
 			}
 			dest++;
 			xfrac += xstep;
@@ -1921,29 +2379,12 @@ void R_DrawSpanMaskedTranslucentP_C (void)
 			texdata = source[spot];
 			if (texdata != 0)
 			{
-#ifndef PALETTEOUTPUT
-				uint32_t fg = shade_pal_index(colormap[texdata], light);
-				uint32_t fg_red = (fg >> 16) & 0xff;
-				uint32_t fg_green = (fg >> 8) & 0xff;
-				uint32_t fg_blue = (fg) & 0xff;
-
-				uint32_t bg_red = (*dest >> 16) & 0xff;
-				uint32_t bg_green = (*dest >> 8) & 0xff;
-				uint32_t bg_blue = (*dest) & 0xff;
-
-				uint32_t red = (fg_red + bg_red + 1) / 2;
-				uint32_t green = (fg_green + bg_green + 1) / 2;
-				uint32_t blue = (fg_blue + bg_blue + 1) / 2;
-
-				*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
-#else
 				DWORD fg = colormap[texdata];
 				DWORD bg = *dest;
 				fg = fg2rgb[fg];
 				bg = bg2rgb[bg];
 				fg = (fg+bg) | 0x1f07c1f;
 				*dest = RGB32k.All[fg & (fg>>15)];
-#endif
 			}
 			dest++;
 			xfrac += xstep;
@@ -1952,7 +2393,7 @@ void R_DrawSpanMaskedTranslucentP_C (void)
 	}
 }
 
-void R_DrawSpanAddClampP_C (void)
+void R_DrawSpanMaskedTranslucentP_RGBA_C()
 {
 	dsfixed_t			xfrac;
 	dsfixed_t			yfrac;
@@ -1978,6 +2419,96 @@ void R_DrawSpanAddClampP_C (void)
 	xstep = ds_xstep;
 	ystep = ds_ystep;
 
+	if (ds_xbits == 6 && ds_ybits == 6)
+	{
+		// 64x64 is the most common case by far, so special case it.
+		do
+		{
+			BYTE texdata;
+
+			spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
+			texdata = source[spot];
+			if (texdata != 0)
+			{
+				uint32_t fg = shade_pal_index(colormap[texdata], light);
+				uint32_t fg_red = (fg >> 16) & 0xff;
+				uint32_t fg_green = (fg >> 8) & 0xff;
+				uint32_t fg_blue = (fg) & 0xff;
+
+				uint32_t bg_red = (*dest >> 16) & 0xff;
+				uint32_t bg_green = (*dest >> 8) & 0xff;
+				uint32_t bg_blue = (*dest) & 0xff;
+
+				uint32_t red = (fg_red + bg_red + 1) / 2;
+				uint32_t green = (fg_green + bg_green + 1) / 2;
+				uint32_t blue = (fg_blue + bg_blue + 1) / 2;
+
+				*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
+			}
+			dest++;
+			xfrac += xstep;
+			yfrac += ystep;
+		} while (--count);
+	}
+	else
+	{
+		BYTE yshift = 32 - ds_ybits;
+		BYTE xshift = yshift - ds_xbits;
+		int xmask = ((1 << ds_xbits) - 1) << ds_ybits;
+		do
+		{
+			BYTE texdata;
+
+			spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
+			texdata = source[spot];
+			if (texdata != 0)
+			{
+				uint32_t fg = shade_pal_index(colormap[texdata], light);
+				uint32_t fg_red = (fg >> 16) & 0xff;
+				uint32_t fg_green = (fg >> 8) & 0xff;
+				uint32_t fg_blue = (fg) & 0xff;
+
+				uint32_t bg_red = (*dest >> 16) & 0xff;
+				uint32_t bg_green = (*dest >> 8) & 0xff;
+				uint32_t bg_blue = (*dest) & 0xff;
+
+				uint32_t red = (fg_red + bg_red + 1) / 2;
+				uint32_t green = (fg_green + bg_green + 1) / 2;
+				uint32_t blue = (fg_blue + bg_blue + 1) / 2;
+
+				*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
+			}
+			dest++;
+			xfrac += xstep;
+			yfrac += ystep;
+		} while (--count);
+	}
+}
+
+void R_DrawSpanAddClampP_C (void)
+{
+	dsfixed_t			xfrac;
+	dsfixed_t			yfrac;
+	dsfixed_t			xstep;
+	dsfixed_t			ystep;
+	canvas_pixel_t*		dest;
+	const BYTE*			source = ds_source;
+	const BYTE*			colormap = ds_colormap;
+	int 				count;
+	int 				spot;
+	DWORD *fg2rgb = dc_srcblend;
+	DWORD *bg2rgb = dc_destblend;
+
+	xfrac = ds_xfrac;
+	yfrac = ds_yfrac;
+
+	dest = ylookup[ds_y] + ds_x1 + dc_destorg;
+
+	count = ds_x2 - ds_x1 + 1;
+
+	xstep = ds_xstep;
+	ystep = ds_ystep;
+
 	if (ds_xbits == 6 && ds_ybits == 6)
 	{
 		// 64x64 is the most common case by far, so special case it.
@@ -1985,22 +2516,6 @@ void R_DrawSpanAddClampP_C (void)
 		{
 			spot = ((xfrac>>(32-6-6))&(63*64)) + (yfrac>>(32-6));
 
-#ifndef PALETTEOUTPUT
-			uint32_t fg = shade_pal_index(colormap[source[spot]], light);
-			uint32_t fg_red = (fg >> 16) & 0xff;
-			uint32_t fg_green = (fg >> 8) & 0xff;
-			uint32_t fg_blue = (fg) & 0xff;
-
-			uint32_t bg_red = (*dest >> 16) & 0xff;
-			uint32_t bg_green = (*dest >> 8) & 0xff;
-			uint32_t bg_blue = (*dest) & 0xff;
-
-			uint32_t red = clamp<uint32_t>(fg_red + bg_red, 0, 255);
-			uint32_t green = clamp<uint32_t>(fg_green + bg_green, 0, 255);
-			uint32_t blue = clamp<uint32_t>(fg_blue + bg_blue, 0, 255);
-
-			*dest++ = 0xff000000 | (red << 16) | (green << 8) | blue;
-#else
 			DWORD a = fg2rgb[colormap[source[spot]]] + bg2rgb[*dest];
 			DWORD b = a;
 
@@ -2010,7 +2525,6 @@ void R_DrawSpanAddClampP_C (void)
 			b = b - (b >> 5);
 			a |= b;
 			*dest++ = RGB32k.All[a & (a>>15)];
-#endif
 
 			xfrac += xstep;
 			yfrac += ystep;
@@ -2025,7 +2539,55 @@ void R_DrawSpanAddClampP_C (void)
 		{
 			spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
 
-#ifndef PALETTEOUTPUT
+			DWORD a = fg2rgb[colormap[source[spot]]] + bg2rgb[*dest];
+			DWORD b = a;
+
+			a |= 0x01f07c1f;
+			b &= 0x40100400;
+			a &= 0x3fffffff;
+			b = b - (b >> 5);
+			a |= b;
+			*dest++ = RGB32k.All[a & (a>>15)];
+
+			xfrac += xstep;
+			yfrac += ystep;
+		} while (--count);
+	}
+}
+
+void R_DrawSpanAddClampP_RGBA_C()
+{
+	dsfixed_t			xfrac;
+	dsfixed_t			yfrac;
+	dsfixed_t			xstep;
+	dsfixed_t			ystep;
+	canvas_pixel_t*		dest;
+	const BYTE*			source = ds_source;
+	const BYTE*			colormap = ds_colormap;
+	int 				count;
+	int 				spot;
+	DWORD *fg2rgb = dc_srcblend;
+	DWORD *bg2rgb = dc_destblend;
+
+	uint32_t light = calc_light_multiplier(ds_light);
+
+	xfrac = ds_xfrac;
+	yfrac = ds_yfrac;
+
+	dest = ylookup[ds_y] + ds_x1 + dc_destorg;
+
+	count = ds_x2 - ds_x1 + 1;
+
+	xstep = ds_xstep;
+	ystep = ds_ystep;
+
+	if (ds_xbits == 6 && ds_ybits == 6)
+	{
+		// 64x64 is the most common case by far, so special case it.
+		do
+		{
+			spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
+
 			uint32_t fg = shade_pal_index(colormap[source[spot]], light);
 			uint32_t fg_red = (fg >> 16) & 0xff;
 			uint32_t fg_green = (fg >> 8) & 0xff;
@@ -2040,17 +2602,34 @@ void R_DrawSpanAddClampP_C (void)
 			uint32_t blue = clamp<uint32_t>(fg_blue + bg_blue, 0, 255);
 
 			*dest++ = 0xff000000 | (red << 16) | (green << 8) | blue;
-#else
-			DWORD a = fg2rgb[colormap[source[spot]]] + bg2rgb[*dest];
-			DWORD b = a;
 
-			a |= 0x01f07c1f;
-			b &= 0x40100400;
-			a &= 0x3fffffff;
-			b = b - (b >> 5);
-			a |= b;
-			*dest++ = RGB32k.All[a & (a>>15)];
-#endif
+			xfrac += xstep;
+			yfrac += ystep;
+		} while (--count);
+	}
+	else
+	{
+		BYTE yshift = 32 - ds_ybits;
+		BYTE xshift = yshift - ds_xbits;
+		int xmask = ((1 << ds_xbits) - 1) << ds_ybits;
+		do
+		{
+			spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
+
+			uint32_t fg = shade_pal_index(colormap[source[spot]], light);
+			uint32_t fg_red = (fg >> 16) & 0xff;
+			uint32_t fg_green = (fg >> 8) & 0xff;
+			uint32_t fg_blue = (fg) & 0xff;
+
+			uint32_t bg_red = (*dest >> 16) & 0xff;
+			uint32_t bg_green = (*dest >> 8) & 0xff;
+			uint32_t bg_blue = (*dest) & 0xff;
+
+			uint32_t red = clamp<uint32_t>(fg_red + bg_red, 0, 255);
+			uint32_t green = clamp<uint32_t>(fg_green + bg_green, 0, 255);
+			uint32_t blue = clamp<uint32_t>(fg_blue + bg_blue, 0, 255);
+
+			*dest++ = 0xff000000 | (red << 16) | (green << 8) | blue;
 
 			xfrac += xstep;
 			yfrac += ystep;
@@ -2095,22 +2674,6 @@ void R_DrawSpanMaskedAddClampP_C (void)
 			texdata = source[spot];
 			if (texdata != 0)
 			{
-#ifndef PALETTEOUTPUT
-				uint32_t fg = shade_pal_index(colormap[texdata], light);
-				uint32_t fg_red = (fg >> 16) & 0xff;
-				uint32_t fg_green = (fg >> 8) & 0xff;
-				uint32_t fg_blue = (fg) & 0xff;
-
-				uint32_t bg_red = (*dest >> 16) & 0xff;
-				uint32_t bg_green = (*dest >> 8) & 0xff;
-				uint32_t bg_blue = (*dest) & 0xff;
-
-				uint32_t red = (fg_red + bg_red + 1) / 2;
-				uint32_t green = (fg_green + bg_green + 1) / 2;
-				uint32_t blue = (fg_blue + bg_blue + 1) / 2;
-
-				*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
-#else
 				DWORD a = fg2rgb[colormap[texdata]] + bg2rgb[*dest];
 				DWORD b = a;
 
@@ -2120,7 +2683,6 @@ void R_DrawSpanMaskedAddClampP_C (void)
 				b = b - (b >> 5);
 				a |= b;
 				*dest = RGB32k.All[a & (a>>15)];
-#endif
 			}
 			dest++;
 			xfrac += xstep;
@@ -2140,7 +2702,60 @@ void R_DrawSpanMaskedAddClampP_C (void)
 			texdata = source[spot];
 			if (texdata != 0)
 			{
-#ifndef PALETTEOUTPUT
+				DWORD a = fg2rgb[colormap[texdata]] + bg2rgb[*dest];
+				DWORD b = a;
+
+				a |= 0x01f07c1f;
+				b &= 0x40100400;
+				a &= 0x3fffffff;
+				b = b - (b >> 5);
+				a |= b;
+				*dest = RGB32k.All[a & (a>>15)];
+			}
+			dest++;
+			xfrac += xstep;
+			yfrac += ystep;
+		} while (--count);
+	}
+}
+
+void R_DrawSpanMaskedAddClampP_RGBA_C()
+{
+	dsfixed_t			xfrac;
+	dsfixed_t			yfrac;
+	dsfixed_t			xstep;
+	dsfixed_t			ystep;
+	canvas_pixel_t*		dest;
+	const BYTE*			source = ds_source;
+	const BYTE*			colormap = ds_colormap;
+	int 				count;
+	int 				spot;
+	DWORD *fg2rgb = dc_srcblend;
+	DWORD *bg2rgb = dc_destblend;
+
+	uint32_t light = calc_light_multiplier(ds_light);
+
+	xfrac = ds_xfrac;
+	yfrac = ds_yfrac;
+
+	dest = ylookup[ds_y] + ds_x1 + dc_destorg;
+
+	count = ds_x2 - ds_x1 + 1;
+
+	xstep = ds_xstep;
+	ystep = ds_ystep;
+
+	if (ds_xbits == 6 && ds_ybits == 6)
+	{
+		// 64x64 is the most common case by far, so special case it.
+		do
+		{
+			BYTE texdata;
+
+			spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
+			texdata = source[spot];
+			if (texdata != 0)
+			{
 				uint32_t fg = shade_pal_index(colormap[texdata], light);
 				uint32_t fg_red = (fg >> 16) & 0xff;
 				uint32_t fg_green = (fg >> 8) & 0xff;
@@ -2155,17 +2770,39 @@ void R_DrawSpanMaskedAddClampP_C (void)
 				uint32_t blue = (fg_blue + bg_blue + 1) / 2;
 
 				*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
-#else
-				DWORD a = fg2rgb[colormap[texdata]] + bg2rgb[*dest];
-				DWORD b = a;
+			}
+			dest++;
+			xfrac += xstep;
+			yfrac += ystep;
+		} while (--count);
+	}
+	else
+	{
+		BYTE yshift = 32 - ds_ybits;
+		BYTE xshift = yshift - ds_xbits;
+		int xmask = ((1 << ds_xbits) - 1) << ds_ybits;
+		do
+		{
+			BYTE texdata;
 
-				a |= 0x01f07c1f;
-				b &= 0x40100400;
-				a &= 0x3fffffff;
-				b = b - (b >> 5);
-				a |= b;
-				*dest = RGB32k.All[a & (a>>15)];
-#endif
+			spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
+			texdata = source[spot];
+			if (texdata != 0)
+			{
+				uint32_t fg = shade_pal_index(colormap[texdata], light);
+				uint32_t fg_red = (fg >> 16) & 0xff;
+				uint32_t fg_green = (fg >> 8) & 0xff;
+				uint32_t fg_blue = (fg) & 0xff;
+
+				uint32_t bg_red = (*dest >> 16) & 0xff;
+				uint32_t bg_green = (*dest >> 8) & 0xff;
+				uint32_t bg_blue = (*dest) & 0xff;
+
+				uint32_t red = (fg_red + bg_red + 1) / 2;
+				uint32_t green = (fg_green + bg_green + 1) / 2;
+				uint32_t blue = (fg_blue + bg_blue + 1) / 2;
+
+				*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
 			}
 			dest++;
 			xfrac += xstep;
@@ -2175,18 +2812,19 @@ void R_DrawSpanMaskedAddClampP_C (void)
 }
 
 // [RH] Just fill a span with a color
-void R_FillSpan (void)
+void R_FillSpan_C (void)
+{
+	memset (ylookup[ds_y] + ds_x1 + dc_destorg, ds_color, (ds_x2 - ds_x1 + 1));
+}
+
+void R_FillSpan_RGBA()
 {
-#ifndef PALETTEOUTPUT
 	canvas_pixel_t *dest = ylookup[ds_y] + ds_x1 + dc_destorg;
 	int count = (ds_x2 - ds_x1 + 1);
 	uint32_t light = calc_light_multiplier(ds_light);
 	uint32_t color = shade_pal_index(ds_color, light);
 	for (int i = 0; i < count; i++)
 		dest[i] = color;
-#else
-	memset (ylookup[ds_y] + ds_x1 + dc_destorg, ds_color, (ds_x2 - ds_x1 + 1) * sizeof(canvas_pixel_t));
-#endif
 }
 
 // Draw a voxel slab
@@ -2383,17 +3021,33 @@ DWORD vlinec1 ()
 	int bits = vlinebits;
 	int pitch = dc_pitch;
 
-#ifndef PALETTEOUTPUT
-	uint32_t light = calc_light_multiplier(dc_light);
+	do
+	{
+		*dest = colormap[source[frac >> bits]];
+		frac += fracstep;
+		dest += pitch;
+	} while (--count);
+
+	return frac;
+}
 #endif
 
+DWORD vlinec1_RGBA()
+{
+	DWORD fracstep = dc_iscale;
+	DWORD frac = dc_texturefrac;
+	BYTE *colormap = dc_colormap;
+	int count = dc_count;
+	const BYTE *source = dc_source;
+	canvas_pixel_t *dest = dc_dest;
+	int bits = vlinebits;
+	int pitch = dc_pitch;
+
+	uint32_t light = calc_light_multiplier(dc_light);
+
 	do
 	{
-#ifndef PALETTEOUTPUT
-		*dest = shade_pal_index(colormap[source[frac>>bits]], light);
-#else
-		*dest = colormap[source[frac >> bits]];
-#endif
+		*dest = shade_pal_index(colormap[source[frac >> bits]], light);
 		frac += fracstep;
 		dest += pitch;
 	} while (--count);
@@ -2401,6 +3055,7 @@ DWORD vlinec1 ()
 	return frac;
 }
 
+#if !defined(X86_ASM)
 void vlinec4 ()
 {
 	canvas_pixel_t *dest = dc_dest;
@@ -2408,23 +3063,37 @@ void vlinec4 ()
 	int bits = vlinebits;
 	DWORD place;
 
-#ifndef PALETTEOUTPUT
+	do
+	{
+		dest[0] = palookupoffse[0][bufplce[0][(place=vplce[0])>>bits]]; vplce[0] = place+vince[0];
+		dest[1] = palookupoffse[1][bufplce[1][(place=vplce[1])>>bits]]; vplce[1] = place+vince[1];
+		dest[2] = palookupoffse[2][bufplce[2][(place=vplce[2])>>bits]]; vplce[2] = place+vince[2];
+		dest[3] = palookupoffse[3][bufplce[3][(place=vplce[3])>>bits]]; vplce[3] = place+vince[3];
+		dest += dc_pitch;
+	} while (--count);
+}
+#endif
+
+void vlinec4_RGBA()
+{
+	canvas_pixel_t *dest = dc_dest;
+	int count = dc_count;
+	int bits = vlinebits;
+
 	uint32_t light0 = calc_light_multiplier(palookuplight[0]);
 	uint32_t light1 = calc_light_multiplier(palookuplight[1]);
 	uint32_t light2 = calc_light_multiplier(palookuplight[2]);
 	uint32_t light3 = calc_light_multiplier(palookuplight[3]);
 #ifndef NO_SSE
-	__m128i mlight_hi = _mm_set_epi16(256, light0, light0, light0, 256, light1, light1, light1);
-	__m128i mlight_lo = _mm_set_epi16(256, light2, light2, light2, 256, light3, light3, light3);
+	__m128i mlight_hi = _mm_set_epi16(256, light1, light1, light1, 256, light0, light0, light0);
+	__m128i mlight_lo = _mm_set_epi16(256, light3, light3, light3, 256, light2, light2, light2);
 	uint32_t *palette = (uint32_t*)GPalette.BaseColors;
 	DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] };
 	DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] };
-#endif
 #endif
 
 	do
 	{
-#ifndef PALETTEOUTPUT
 #ifndef NO_SSE
 
 		DWORD place0 = local_vplce[0];
@@ -2442,14 +3111,14 @@ void vlinec4 ()
 		local_vplce[2] = place2 + local_vince[2];
 		local_vplce[3] = place3 + local_vince[3];
 
-		__m128i fg = _mm_set_epi32(palette[p0], palette[p1], palette[p2], palette[p3]);
+		__m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]);
 		__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
 		__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
 		fg_hi = _mm_mullo_epi16(fg_hi, mlight_hi);
 		fg_hi = _mm_srli_epi16(fg_hi, 8);
 		fg_lo = _mm_mullo_epi16(fg_lo, mlight_lo);
 		fg_lo = _mm_srli_epi16(fg_lo, 8);
-		fg = _mm_packus_epi16(fg_hi, fg_lo);
+		fg = _mm_packus_epi16(fg_lo, fg_hi);
 		_mm_storeu_si128((__m128i*)dest, fg);
 
 #else
@@ -2457,17 +3126,10 @@ void vlinec4 ()
 		dest[1] = shade_pal_index(palookupoffse[1][bufplce[1][(place = vplce[1]) >> bits]], light1); vplce[1] = place + vince[1];
 		dest[2] = shade_pal_index(palookupoffse[2][bufplce[2][(place = vplce[2]) >> bits]], light2); vplce[2] = place + vince[2];
 		dest[3] = shade_pal_index(palookupoffse[3][bufplce[3][(place = vplce[3]) >> bits]], light3); vplce[3] = place + vince[3];
-#endif
-#else
-		dest[0] = palookupoffse[0][bufplce[0][(place=vplce[0])>>bits]]; vplce[0] = place+vince[0];
-		dest[1] = palookupoffse[1][bufplce[1][(place=vplce[1])>>bits]]; vplce[1] = place+vince[1];
-		dest[2] = palookupoffse[2][bufplce[2][(place=vplce[2])>>bits]]; vplce[2] = place+vince[2];
-		dest[3] = palookupoffse[3][bufplce[3][(place=vplce[3])>>bits]]; vplce[3] = place+vince[3];
 #endif
 		dest += dc_pitch;
 	} while (--count);
 
-#ifndef PALETTEOUTPUT
 #ifndef NO_SSE
 	// Is this needed? Global variables makes it tricky to know..
 	vplce[0] = local_vplce[0];
@@ -2479,9 +3141,7 @@ void vlinec4 ()
 	vince[2] = local_vince[2];
 	vince[3] = local_vince[3];
 #endif
-#endif
 }
-#endif
 
 void setupmvline (int fracbits)
 {
@@ -2506,20 +3166,40 @@ DWORD mvlinec1 ()
 	int bits = mvlinebits;
 	int pitch = dc_pitch;
 
-#ifndef PALETTEOUTPUT
-	uint32_t light = calc_light_multiplier(dc_light);
-#endif
-
 	do
 	{
 		BYTE pix = source[frac>>bits];
 		if (pix != 0)
 		{
-#ifndef PALETTEOUTPUT
-			*dest = shade_pal_index(colormap[pix], light);
-#else
 			*dest = colormap[pix];
+		}
+		frac += fracstep;
+		dest += pitch;
+	} while (--count);
+
+	return frac;
+}
 #endif
+
+DWORD mvlinec1_RGBA()
+{
+	DWORD fracstep = dc_iscale;
+	DWORD frac = dc_texturefrac;
+	BYTE *colormap = dc_colormap;
+	int count = dc_count;
+	const BYTE *source = dc_source;
+	canvas_pixel_t *dest = dc_dest;
+	int bits = mvlinebits;
+	int pitch = dc_pitch;
+
+	uint32_t light = calc_light_multiplier(dc_light);
+
+	do
+	{
+		BYTE pix = source[frac >> bits];
+		if (pix != 0)
+		{
+			*dest = shade_pal_index(colormap[pix], light);
 		}
 		frac += fracstep;
 		dest += pitch;
@@ -2528,6 +3208,7 @@ DWORD mvlinec1 ()
 	return frac;
 }
 
+#if !defined(X86_ASM)
 void mvlinec4 ()
 {
 	canvas_pixel_t *dest = dc_dest;
@@ -2535,33 +3216,42 @@ void mvlinec4 ()
 	int bits = mvlinebits;
 	DWORD place;
 
-#ifndef PALETTEOUTPUT
-	uint32_t light0 = calc_light_multiplier(palookuplight[0]);
-	uint32_t light1 = calc_light_multiplier(palookuplight[1]);
-	uint32_t light2 = calc_light_multiplier(palookuplight[2]);
-	uint32_t light3 = calc_light_multiplier(palookuplight[3]);
-#endif
-
 	do
 	{
 		BYTE pix;
-
-#ifndef PALETTEOUTPUT
-		pix = bufplce[0][(place = vplce[0]) >> bits]; if (pix) dest[0] = shade_pal_index(palookupoffse[0][pix], light0); vplce[0] = place + vince[0];
-		pix = bufplce[1][(place = vplce[1]) >> bits]; if (pix) dest[1] = shade_pal_index(palookupoffse[1][pix], light1); vplce[1] = place + vince[1];
-		pix = bufplce[2][(place = vplce[2]) >> bits]; if (pix) dest[2] = shade_pal_index(palookupoffse[2][pix], light2); vplce[2] = place + vince[2];
-		pix = bufplce[3][(place = vplce[3]) >> bits]; if (pix) dest[3] = shade_pal_index(palookupoffse[3][pix], light3); vplce[3] = place + vince[3];
-#else
 		pix = bufplce[0][(place=vplce[0])>>bits]; if(pix) dest[0] = palookupoffse[0][pix]; vplce[0] = place+vince[0];
 		pix = bufplce[1][(place=vplce[1])>>bits]; if(pix) dest[1] = palookupoffse[1][pix]; vplce[1] = place+vince[1];
 		pix = bufplce[2][(place=vplce[2])>>bits]; if(pix) dest[2] = palookupoffse[2][pix]; vplce[2] = place+vince[2];
 		pix = bufplce[3][(place=vplce[3])>>bits]; if(pix) dest[3] = palookupoffse[3][pix]; vplce[3] = place+vince[3];
-#endif
 		dest += dc_pitch;
 	} while (--count);
 }
 #endif
 
+void mvlinec4_RGBA()
+{
+	canvas_pixel_t *dest = dc_dest;
+	int count = dc_count;
+	int bits = mvlinebits;
+	DWORD place;
+
+	uint32_t light0 = calc_light_multiplier(palookuplight[0]);
+	uint32_t light1 = calc_light_multiplier(palookuplight[1]);
+	uint32_t light2 = calc_light_multiplier(palookuplight[2]);
+	uint32_t light3 = calc_light_multiplier(palookuplight[3]);
+
+	do
+	{
+		BYTE pix;
+		pix = bufplce[0][(place = vplce[0]) >> bits]; if (pix) dest[0] = shade_pal_index(palookupoffse[0][pix], light0); vplce[0] = place + vince[0];
+		pix = bufplce[1][(place = vplce[1]) >> bits]; if (pix) dest[1] = shade_pal_index(palookupoffse[1][pix], light1); vplce[1] = place + vince[1];
+		pix = bufplce[2][(place = vplce[2]) >> bits]; if (pix) dest[2] = shade_pal_index(palookupoffse[2][pix], light2); vplce[2] = place + vince[2];
+		pix = bufplce[3][(place = vplce[3]) >> bits]; if (pix) dest[3] = shade_pal_index(palookupoffse[3][pix], light3); vplce[3] = place + vince[3];
+		dest += dc_pitch;
+	} while (--count);
+}
+
+
 extern "C" short spanend[MAXHEIGHT];
 extern float rw_light;
 extern float rw_lightstep;
@@ -2572,21 +3262,13 @@ static void R_DrawFogBoundarySection (int y, int y2, int x1)
 	BYTE *colormap = dc_colormap;
 	canvas_pixel_t *dest = ylookup[y] + dc_destorg;
 
-#ifndef PALETTEOUTPUT
-	uint32_t light = calc_light_multiplier(dc_light);
-#endif
-
 	for (; y < y2; ++y)
 	{
 		int x2 = spanend[y];
 		int x = x1;
 		do
 		{
-#ifndef PALETTEOUTPUT
-			dest[x] = shade_pal_index(colormap[dest[x]], light);
-#else
 			dest[x] = colormap[dest[x]];
-#endif
 		} while (++x <= x2);
 		dest += dc_pitch;
 	}
@@ -2598,21 +3280,13 @@ static void R_DrawFogBoundaryLine (int y, int x)
 	BYTE *colormap = dc_colormap;
 	canvas_pixel_t *dest = ylookup[y] + dc_destorg;
 
-#ifndef PALETTEOUTPUT
-	uint32_t light = calc_light_multiplier(dc_light);
-#endif
-
 	do
 	{
-#ifndef PALETTEOUTPUT
-		dest[x] = shade_pal_index(colormap[dest[x]], light);
-#else
 		dest[x] = colormap[dest[x]];
-#endif
 	} while (++x <= x2);
 }
 
-void R_DrawFogBoundary (int x1, int x2, short *uclip, short *dclip)
+void R_DrawFogBoundary_C (int x1, int x2, short *uclip, short *dclip)
 {
 	// This is essentially the same as R_MapVisPlane but with an extra step
 	// to create new horizontal spans whenever the light changes enough that
@@ -2703,6 +3377,133 @@ void R_DrawFogBoundary (int x1, int x2, short *uclip, short *dclip)
 	}
 }
 
+static void R_DrawFogBoundarySection_RGBA(int y, int y2, int x1)
+{
+	BYTE *colormap = dc_colormap;
+	canvas_pixel_t *dest = ylookup[y] + dc_destorg;
+
+	uint32_t light = calc_light_multiplier(dc_light);
+
+	for (; y < y2; ++y)
+	{
+		int x2 = spanend[y];
+		int x = x1;
+		do
+		{
+			dest[x] = shade_pal_index(colormap[dest[x]], light);
+		} while (++x <= x2);
+		dest += dc_pitch;
+	}
+}
+
+static void R_DrawFogBoundaryLine_RGBA(int y, int x)
+{
+	int x2 = spanend[y];
+	BYTE *colormap = dc_colormap;
+	canvas_pixel_t *dest = ylookup[y] + dc_destorg;
+
+	uint32_t light = calc_light_multiplier(dc_light);
+
+	do
+	{
+		dest[x] = shade_pal_index(colormap[dest[x]], light);
+	} while (++x <= x2);
+}
+
+void R_DrawFogBoundary_RGBA(int x1, int x2, short *uclip, short *dclip)
+{
+	// To do: we do not need to create new spans when using rgba output - instead we should calculate light on a per pixel basis
+
+	// This is essentially the same as R_MapVisPlane but with an extra step
+	// to create new horizontal spans whenever the light changes enough that
+	// we need to use a new colormap.
+
+	double lightstep = rw_lightstep;
+	double light = rw_light + rw_lightstep*(x2 - x1 - 1);
+	int x = x2 - 1;
+	int t2 = uclip[x];
+	int b2 = dclip[x];
+	int rcolormap = GETPALOOKUP(light, wallshade);
+	int lcolormap;
+	BYTE *basecolormapdata = basecolormap->Maps;
+
+	if (b2 > t2)
+	{
+		clearbufshort(spanend + t2, b2 - t2, x);
+	}
+
+	dc_colormap = basecolormapdata + (rcolormap << COLORMAPSHIFT);
+	dc_light = 0;
+
+	for (--x; x >= x1; --x)
+	{
+		int t1 = uclip[x];
+		int b1 = dclip[x];
+		const int xr = x + 1;
+		int stop;
+
+		light -= rw_lightstep;
+		lcolormap = GETPALOOKUP(light, wallshade);
+		if (lcolormap != rcolormap)
+		{
+			if (t2 < b2 && rcolormap != 0)
+			{ // Colormap 0 is always the identity map, so rendering it is
+			  // just a waste of time.
+				R_DrawFogBoundarySection_RGBA(t2, b2, xr);
+			}
+			if (t1 < t2) t2 = t1;
+			if (b1 > b2) b2 = b1;
+			if (t2 < b2)
+			{
+				clearbufshort(spanend + t2, b2 - t2, x);
+			}
+			rcolormap = lcolormap;
+			dc_colormap = basecolormapdata + (lcolormap << COLORMAPSHIFT);
+			dc_light = 0;
+		}
+		else
+		{
+			if (dc_colormap != basecolormapdata)
+			{
+				stop = MIN(t1, b2);
+				while (t2 < stop)
+				{
+					R_DrawFogBoundaryLine_RGBA(t2++, xr);
+				}
+				stop = MAX(b1, t2);
+				while (b2 > stop)
+				{
+					R_DrawFogBoundaryLine_RGBA(--b2, xr);
+				}
+			}
+			else
+			{
+				t2 = MAX(t2, MIN(t1, b2));
+				b2 = MIN(b2, MAX(b1, t2));
+			}
+
+			stop = MIN(t2, b1);
+			while (t1 < stop)
+			{
+				spanend[t1++] = x;
+			}
+			stop = MAX(b2, t2);
+			while (b1 > stop)
+			{
+				spanend[--b1] = x;
+			}
+		}
+
+		t2 = uclip[x];
+		b2 = dclip[x];
+	}
+	if (t2 < b2 && rcolormap != 0)
+	{
+		R_DrawFogBoundarySection_RGBA(t2, b2, x1);
+	}
+}
+
+
 int tmvlinebits;
 
 void setuptmvline (int bits)
@@ -2710,7 +3511,40 @@ void setuptmvline (int bits)
 	tmvlinebits = bits;
 }
 
-fixed_t tmvline1_add ()
+fixed_t tmvline1_add_C ()
+{
+	DWORD fracstep = dc_iscale;
+	DWORD frac = dc_texturefrac;
+	BYTE *colormap = dc_colormap;
+	int count = dc_count;
+	const BYTE *source = dc_source;
+	canvas_pixel_t *dest = dc_dest;
+	int bits = tmvlinebits;
+	int pitch = dc_pitch;
+
+	DWORD *fg2rgb = dc_srcblend;
+	DWORD *bg2rgb = dc_destblend;
+
+	uint32_t light = calc_light_multiplier(dc_light);
+
+	do
+	{
+		BYTE pix = source[frac>>bits];
+		if (pix != 0)
+		{
+			DWORD fg = fg2rgb[colormap[pix]];
+			DWORD bg = bg2rgb[*dest];
+			fg = (fg+bg) | 0x1f07c1f;
+			*dest = RGB32k.All[fg & (fg>>15)];
+		}
+		frac += fracstep;
+		dest += pitch;
+	} while (--count);
+
+	return frac;
+}
+
+fixed_t tmvline1_add_RGBA()
 {
 	DWORD fracstep = dc_iscale;
 	DWORD frac = dc_texturefrac;
@@ -2728,7 +3562,6 @@ fixed_t tmvline1_add ()
 
 	do
 	{
-#ifndef PALETTEOUTPUT
 		BYTE pix = source[frac >> bits];
 		if (pix != 0)
 		{
@@ -2747,16 +3580,6 @@ fixed_t tmvline1_add ()
 
 			*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
 		}
-#else
-		BYTE pix = source[frac>>bits];
-		if (pix != 0)
-		{
-			DWORD fg = fg2rgb[colormap[pix]];
-			DWORD bg = bg2rgb[*dest];
-			fg = (fg+bg) | 0x1f07c1f;
-			*dest = RGB32k.All[fg & (fg>>15)];
-		}
-#endif
 		frac += fracstep;
 		dest += pitch;
 	} while (--count);
@@ -2764,7 +3587,40 @@ fixed_t tmvline1_add ()
 	return frac;
 }
 
-void tmvline4_add ()
+void tmvline4_add_C ()
+{
+	canvas_pixel_t *dest = dc_dest;
+	int count = dc_count;
+	int bits = tmvlinebits;
+
+	DWORD *fg2rgb = dc_srcblend;
+	DWORD *bg2rgb = dc_destblend;
+
+	uint32_t light[4];
+	light[0] = calc_light_multiplier(palookuplight[0]);
+	light[1] = calc_light_multiplier(palookuplight[1]);
+	light[2] = calc_light_multiplier(palookuplight[2]);
+	light[3] = calc_light_multiplier(palookuplight[3]);
+
+	do
+	{
+		for (int i = 0; i < 4; ++i)
+		{
+			BYTE pix = bufplce[i][vplce[i] >> bits];
+			if (pix != 0)
+			{
+				DWORD fg = fg2rgb[palookupoffse[i][pix]];
+				DWORD bg = bg2rgb[dest[i]];
+				fg = (fg+bg) | 0x1f07c1f;
+				dest[i] = RGB32k.All[fg & (fg>>15)];
+			}
+			vplce[i] += vince[i];
+		}
+		dest += dc_pitch;
+	} while (--count);
+}
+
+void tmvline4_add_RGBA()
 {
 	canvas_pixel_t *dest = dc_dest;
 	int count = dc_count;
@@ -2786,7 +3642,6 @@ void tmvline4_add ()
 			BYTE pix = bufplce[i][vplce[i] >> bits];
 			if (pix != 0)
 			{
-#ifndef PALETTEOUTPUT
 				uint32_t fg = shade_pal_index(palookupoffse[i][pix], light[i]);
 				uint32_t fg_red = (fg >> 16) & 0xff;
 				uint32_t fg_green = (fg >> 8) & 0xff;
@@ -2801,12 +3656,6 @@ void tmvline4_add ()
 				uint32_t blue = clamp<uint32_t>(fg_blue + bg_blue, 0, 255);
 
 				dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue;
-#else
-				DWORD fg = fg2rgb[palookupoffse[i][pix]];
-				DWORD bg = bg2rgb[dest[i]];
-				fg = (fg+bg) | 0x1f07c1f;
-				dest[i] = RGB32k.All[fg & (fg>>15)];
-#endif
 			}
 			vplce[i] += vince[i];
 		}
@@ -2814,7 +3663,7 @@ void tmvline4_add ()
 	} while (--count);
 }
 
-fixed_t tmvline1_addclamp ()
+fixed_t tmvline1_addclamp_C ()
 {
 	DWORD fracstep = dc_iscale;
 	DWORD frac = dc_texturefrac;
@@ -2835,7 +3684,44 @@ fixed_t tmvline1_addclamp ()
 		BYTE pix = source[frac>>bits];
 		if (pix != 0)
 		{
-#ifndef PALETTEOUTPUT
+			DWORD a = fg2rgb[colormap[pix]] + bg2rgb[*dest];
+			DWORD b = a;
+
+			a |= 0x01f07c1f;
+			b &= 0x40100400;
+			a &= 0x3fffffff;
+			b = b - (b >> 5);
+			a |= b;
+			*dest = RGB32k.All[a & (a>>15)];
+		}
+		frac += fracstep;
+		dest += pitch;
+	} while (--count);
+
+	return frac;
+}
+
+fixed_t tmvline1_addclamp_RGBA()
+{
+	DWORD fracstep = dc_iscale;
+	DWORD frac = dc_texturefrac;
+	BYTE *colormap = dc_colormap;
+	int count = dc_count;
+	const BYTE *source = dc_source;
+	canvas_pixel_t *dest = dc_dest;
+	int bits = tmvlinebits;
+	int pitch = dc_pitch;
+
+	DWORD *fg2rgb = dc_srcblend;
+	DWORD *bg2rgb = dc_destblend;
+
+	uint32_t light = calc_light_multiplier(dc_light);
+
+	do
+	{
+		BYTE pix = source[frac >> bits];
+		if (pix != 0)
+		{
 			uint32_t fg = shade_pal_index(colormap[pix], light);
 			uint32_t fg_red = (fg >> 16) & 0xff;
 			uint32_t fg_green = (fg >> 8) & 0xff;
@@ -2850,17 +3736,6 @@ fixed_t tmvline1_addclamp ()
 			uint32_t blue = clamp<uint32_t>(fg_blue + bg_blue, 0, 255);
 
 			*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
-#else
-			DWORD a = fg2rgb[colormap[pix]] + bg2rgb[*dest];
-			DWORD b = a;
-
-			a |= 0x01f07c1f;
-			b &= 0x40100400;
-			a &= 0x3fffffff;
-			b = b - (b >> 5);
-			a |= b;
-			*dest = RGB32k.All[a & (a>>15)];
-#endif
 		}
 		frac += fracstep;
 		dest += pitch;
@@ -2869,7 +3744,7 @@ fixed_t tmvline1_addclamp ()
 	return frac;
 }
 
-void tmvline4_addclamp ()
+void tmvline4_addclamp_C ()
 {
 	canvas_pixel_t *dest = dc_dest;
 	int count = dc_count;
@@ -2878,6 +3753,35 @@ void tmvline4_addclamp ()
 	DWORD *fg2rgb = dc_srcblend;
 	DWORD *bg2rgb = dc_destblend;
 
+	do
+	{
+		for (int i = 0; i < 4; ++i)
+		{
+			BYTE pix = bufplce[i][vplce[i] >> bits];
+			if (pix != 0)
+			{
+				DWORD a = fg2rgb[palookupoffse[i][pix]] + bg2rgb[dest[i]];
+				DWORD b = a;
+
+				a |= 0x01f07c1f;
+				b &= 0x40100400;
+				a &= 0x3fffffff;
+				b = b - (b >> 5);
+				a |= b;
+				dest[i] = RGB32k.All[a & (a>>15)];
+			}
+			vplce[i] += vince[i];
+		}
+		dest += dc_pitch;
+	} while (--count);
+}
+
+void tmvline4_addclamp_RGBA()
+{
+	canvas_pixel_t *dest = dc_dest;
+	int count = dc_count;
+	int bits = tmvlinebits;
+
 	uint32_t light[4];
 	light[0] = calc_light_multiplier(palookuplight[0]);
 	light[1] = calc_light_multiplier(palookuplight[1]);
@@ -2891,7 +3795,6 @@ void tmvline4_addclamp ()
 			BYTE pix = bufplce[i][vplce[i] >> bits];
 			if (pix != 0)
 			{
-#ifndef PALETTEOUTPUT
 				uint32_t fg = shade_pal_index(palookupoffse[i][pix], light[i]);
 				uint32_t fg_red = (fg >> 16) & 0xff;
 				uint32_t fg_green = (fg >> 8) & 0xff;
@@ -2906,17 +3809,6 @@ void tmvline4_addclamp ()
 				uint32_t blue = clamp<uint32_t>(fg_blue + bg_blue, 0, 255);
 
 				dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue;
-#else
-				DWORD a = fg2rgb[palookupoffse[i][pix]] + bg2rgb[dest[i]];
-				DWORD b = a;
-
-				a |= 0x01f07c1f;
-				b &= 0x40100400;
-				a &= 0x3fffffff;
-				b = b - (b >> 5);
-				a |= b;
-				dest[i] = RGB32k.All[a & (a>>15)];
-#endif
 			}
 			vplce[i] += vince[i];
 		}
@@ -2924,7 +3816,7 @@ void tmvline4_addclamp ()
 	} while (--count);
 }
 
-fixed_t tmvline1_subclamp ()
+fixed_t tmvline1_subclamp_C ()
 {
 	DWORD fracstep = dc_iscale;
 	DWORD frac = dc_texturefrac;
@@ -2938,14 +3830,45 @@ fixed_t tmvline1_subclamp ()
 	DWORD *fg2rgb = dc_srcblend;
 	DWORD *bg2rgb = dc_destblend;
 
-	uint32_t light = calc_light_multiplier(dc_light);
-
 	do
 	{
 		BYTE pix = source[frac>>bits];
 		if (pix != 0)
 		{
-#ifndef PALETTEOUTPUT
+			DWORD a = (fg2rgb[colormap[pix]] | 0x40100400) - bg2rgb[*dest];
+			DWORD b = a;
+
+			b &= 0x40100400;
+			b = b - (b >> 5);
+			a &= b;
+			a |= 0x01f07c1f;
+			*dest = RGB32k.All[a & (a>>15)];
+		}
+		frac += fracstep;
+		dest += pitch;
+	} while (--count);
+
+	return frac;
+}
+
+fixed_t tmvline1_subclamp_RGBA()
+{
+	DWORD fracstep = dc_iscale;
+	DWORD frac = dc_texturefrac;
+	BYTE *colormap = dc_colormap;
+	int count = dc_count;
+	const BYTE *source = dc_source;
+	canvas_pixel_t *dest = dc_dest;
+	int bits = tmvlinebits;
+	int pitch = dc_pitch;
+
+	uint32_t light = calc_light_multiplier(dc_light);
+
+	do
+	{
+		BYTE pix = source[frac >> bits];
+		if (pix != 0)
+		{
 			uint32_t fg = shade_pal_index(colormap[pix], light);
 			uint32_t fg_red = (fg >> 16) & 0xff;
 			uint32_t fg_green = (fg >> 8) & 0xff;
@@ -2960,16 +3883,6 @@ fixed_t tmvline1_subclamp ()
 			uint32_t blue = clamp<uint32_t>(256 - fg_blue + bg_blue, 256, 256 + 255) - 256;
 
 			*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
-#else
-			DWORD a = (fg2rgb[colormap[pix]] | 0x40100400) - bg2rgb[*dest];
-			DWORD b = a;
-
-			b &= 0x40100400;
-			b = b - (b >> 5);
-			a &= b;
-			a |= 0x01f07c1f;
-			*dest = RGB32k.All[a & (a>>15)];
-#endif
 		}
 		frac += fracstep;
 		dest += pitch;
@@ -2978,7 +3891,7 @@ fixed_t tmvline1_subclamp ()
 	return frac;
 }
 
-void tmvline4_subclamp ()
+void tmvline4_subclamp_C ()
 {
 	canvas_pixel_t *dest = dc_dest;
 	int count = dc_count;
@@ -2987,6 +3900,34 @@ void tmvline4_subclamp ()
 	DWORD *fg2rgb = dc_srcblend;
 	DWORD *bg2rgb = dc_destblend;
 
+	do
+	{
+		for (int i = 0; i < 4; ++i)
+		{
+			BYTE pix = bufplce[i][vplce[i] >> bits];
+			if (pix != 0)
+			{
+				DWORD a = (fg2rgb[palookupoffse[i][pix]] | 0x40100400) - bg2rgb[dest[i]];
+				DWORD b = a;
+
+				b &= 0x40100400;
+				b = b - (b >> 5);
+				a &= b;
+				a |= 0x01f07c1f;
+				dest[i] = RGB32k.All[a & (a>>15)];
+			}
+			vplce[i] += vince[i];
+		}
+		dest += dc_pitch;
+	} while (--count);
+}
+
+void tmvline4_subclamp_RGBA()
+{
+	canvas_pixel_t *dest = dc_dest;
+	int count = dc_count;
+	int bits = tmvlinebits;
+
 	uint32_t light[4];
 	light[0] = calc_light_multiplier(palookuplight[0]);
 	light[1] = calc_light_multiplier(palookuplight[1]);
@@ -3000,7 +3941,6 @@ void tmvline4_subclamp ()
 			BYTE pix = bufplce[i][vplce[i] >> bits];
 			if (pix != 0)
 			{
-#ifndef PALETTEOUTPUT
 				uint32_t fg = shade_pal_index(palookupoffse[i][pix], light[i]);
 				uint32_t fg_red = (fg >> 16) & 0xff;
 				uint32_t fg_green = (fg >> 8) & 0xff;
@@ -3015,16 +3955,6 @@ void tmvline4_subclamp ()
 				uint32_t blue = clamp<uint32_t>(256 - fg_blue + bg_blue, 256, 256 + 255) - 256;
 
 				dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue;
-#else
-				DWORD a = (fg2rgb[palookupoffse[i][pix]] | 0x40100400) - bg2rgb[dest[i]];
-				DWORD b = a;
-
-				b &= 0x40100400;
-				b = b - (b >> 5);
-				a &= b;
-				a |= 0x01f07c1f;
-				dest[i] = RGB32k.All[a & (a>>15)];
-#endif
 			}
 			vplce[i] += vince[i];
 		}
@@ -3032,7 +3962,7 @@ void tmvline4_subclamp ()
 	} while (--count);
 }
 
-fixed_t tmvline1_revsubclamp ()
+fixed_t tmvline1_revsubclamp_C ()
 {
 	DWORD fracstep = dc_iscale;
 	DWORD frac = dc_texturefrac;
@@ -3046,14 +3976,45 @@ fixed_t tmvline1_revsubclamp ()
 	DWORD *fg2rgb = dc_srcblend;
 	DWORD *bg2rgb = dc_destblend;
 
-	uint32_t light = calc_light_multiplier(dc_light);
-
 	do
 	{
 		BYTE pix = source[frac>>bits];
 		if (pix != 0)
 		{
-#ifndef PALETTEOUTPUT
+			DWORD a = (bg2rgb[*dest] | 0x40100400) - fg2rgb[colormap[pix]];
+			DWORD b = a;
+
+			b &= 0x40100400;
+			b = b - (b >> 5);
+			a &= b;
+			a |= 0x01f07c1f;
+			*dest = RGB32k.All[a & (a>>15)];
+		}
+		frac += fracstep;
+		dest += pitch;
+	} while (--count);
+
+	return frac;
+}
+
+fixed_t tmvline1_revsubclamp_RGBA()
+{
+	DWORD fracstep = dc_iscale;
+	DWORD frac = dc_texturefrac;
+	BYTE *colormap = dc_colormap;
+	int count = dc_count;
+	const BYTE *source = dc_source;
+	canvas_pixel_t *dest = dc_dest;
+	int bits = tmvlinebits;
+	int pitch = dc_pitch;
+
+	uint32_t light = calc_light_multiplier(dc_light);
+
+	do
+	{
+		BYTE pix = source[frac >> bits];
+		if (pix != 0)
+		{
 			uint32_t fg = shade_pal_index(colormap[pix], light);
 			uint32_t fg_red = (fg >> 16) & 0xff;
 			uint32_t fg_green = (fg >> 8) & 0xff;
@@ -3068,16 +4029,6 @@ fixed_t tmvline1_revsubclamp ()
 			uint32_t blue = clamp<uint32_t>(256 + fg_blue - bg_blue, 256, 256 + 255) - 256;
 
 			*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
-#else
-			DWORD a = (bg2rgb[*dest] | 0x40100400) - fg2rgb[colormap[pix]];
-			DWORD b = a;
-
-			b &= 0x40100400;
-			b = b - (b >> 5);
-			a &= b;
-			a |= 0x01f07c1f;
-			*dest = RGB32k.All[a & (a>>15)];
-#endif
 		}
 		frac += fracstep;
 		dest += pitch;
@@ -3086,7 +4037,38 @@ fixed_t tmvline1_revsubclamp ()
 	return frac;
 }
 
-void tmvline4_revsubclamp ()
+void tmvline4_revsubclamp_C ()
+{
+	canvas_pixel_t *dest = dc_dest;
+	int count = dc_count;
+	int bits = tmvlinebits;
+
+	DWORD *fg2rgb = dc_srcblend;
+	DWORD *bg2rgb = dc_destblend;
+
+	do
+	{
+		for (int i = 0; i < 4; ++i)
+		{
+			BYTE pix = bufplce[i][vplce[i] >> bits];
+			if (pix != 0)
+			{
+				DWORD a = (bg2rgb[dest[i]] | 0x40100400) - fg2rgb[palookupoffse[i][pix]];
+				DWORD b = a;
+
+				b &= 0x40100400;
+				b = b - (b >> 5);
+				a &= b;
+				a |= 0x01f07c1f;
+				dest[i] = RGB32k.All[a & (a>>15)];
+			}
+			vplce[i] += vince[i];
+		}
+		dest += dc_pitch;
+	} while (--count);
+}
+
+void tmvline4_revsubclamp_RGBA()
 {
 	canvas_pixel_t *dest = dc_dest;
 	int count = dc_count;
@@ -3108,7 +4090,6 @@ void tmvline4_revsubclamp ()
 			BYTE pix = bufplce[i][vplce[i] >> bits];
 			if (pix != 0)
 			{
-#ifndef PALETTEOUTPUT
 				uint32_t fg = shade_pal_index(palookupoffse[i][pix], light[i]);
 				uint32_t fg_red = (fg >> 16) & 0xff;
 				uint32_t fg_green = (fg >> 8) & 0xff;
@@ -3123,16 +4104,6 @@ void tmvline4_revsubclamp ()
 				uint32_t blue = clamp<uint32_t>(256 + fg_blue - bg_blue, 256, 256 + 255) - 256;
 
 				dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue;
-#else
-				DWORD a = (bg2rgb[dest[i]] | 0x40100400) - fg2rgb[palookupoffse[i][pix]];
-				DWORD b = a;
-
-				b &= 0x40100400;
-				b = b - (b >> 5);
-				a &= b;
-				a |= 0x01f07c1f;
-				dest[i] = RGB32k.All[a & (a>>15)];
-#endif
 			}
 			vplce[i] += vince[i];
 		}
@@ -3164,6 +4135,85 @@ const BYTE *R_GetColumn (FTexture *tex, int col)
 // [RH] Initialize the column drawer pointers
 void R_InitColumnDrawers ()
 {
+#ifndef PALETTEOUTPUT
+
+	R_DrawColumnHoriz			= R_DrawColumnHorizP_RGBA_C;
+	R_DrawColumn				= R_DrawColumnP_RGBA_C;
+	R_DrawFuzzColumn			= R_DrawFuzzColumnP_RGBA_C;
+	R_DrawTranslatedColumn		= R_DrawTranslatedColumnP_RGBA_C;
+	R_DrawShadedColumn			= R_DrawShadedColumnP_RGBA_C;
+	R_DrawSpan					= R_DrawSpanP_RGBA_C;
+	R_DrawSpanMasked			= R_DrawSpanMaskedP_RGBA_C;
+	rt_map4cols					= rt_map4cols_RGBA_c;
+
+	R_DrawSpanTranslucent		= R_DrawSpanTranslucentP_RGBA_C;
+	R_DrawSpanMaskedTranslucent = R_DrawSpanMaskedTranslucentP_RGBA_C;
+	R_DrawSpanAddClamp			= R_DrawSpanAddClampP_RGBA_C;
+	R_DrawSpanMaskedAddClamp	= R_DrawSpanMaskedAddClampP_RGBA_C;
+	R_FillColumn				= R_FillColumnP_RGBA;
+	R_FillAddColumn				= R_FillAddColumn_RGBA_C;
+	R_FillAddClampColumn		= R_FillAddClampColumn_RGBA;
+	R_FillSubClampColumn		= R_FillSubClampColumn_RGBA;
+	R_FillRevSubClampColumn		= R_FillRevSubClampColumn_RGBA;
+	R_DrawAddColumn				= R_DrawAddColumnP_RGBA_C;
+	R_DrawTlatedAddColumn		= R_DrawTlatedAddColumnP_RGBA_C;
+	R_DrawAddClampColumn		= R_DrawAddClampColumnP_RGBA_C;
+	R_DrawAddClampTranslatedColumn = R_DrawAddClampTranslatedColumnP_RGBA_C;
+	R_DrawSubClampColumn		= R_DrawSubClampColumnP_RGBA_C;
+	R_DrawSubClampTranslatedColumn = R_DrawSubClampTranslatedColumnP_RGBA_C;
+	R_DrawRevSubClampColumn		= R_DrawRevSubClampColumnP_RGBA_C;
+	R_DrawRevSubClampTranslatedColumn = R_DrawRevSubClampTranslatedColumnP_RGBA_C;
+	R_FillSpan					= R_FillSpan_RGBA;
+	R_DrawFogBoundary			= R_DrawFogBoundary_RGBA;
+	R_FillColumnHoriz			= R_FillColumnHorizP_RGBA_C;
+
+	R_DrawFogBoundary			= R_DrawFogBoundary_RGBA;
+	R_MapColoredPlane			= R_MapColoredPlane_RGBA;
+	R_DrawParticle				= R_DrawParticle_RGBA;
+
+	tmvline1_add				= tmvline1_add_RGBA;
+	tmvline4_add				= tmvline4_add_RGBA;
+	tmvline1_addclamp			= tmvline1_addclamp_RGBA;
+	tmvline4_addclamp			= tmvline4_addclamp_RGBA;
+	tmvline1_subclamp			= tmvline1_subclamp_RGBA;
+	tmvline4_subclamp			= tmvline4_subclamp_RGBA;
+	tmvline1_revsubclamp		= tmvline1_revsubclamp_RGBA;
+	tmvline4_revsubclamp		= tmvline4_revsubclamp_RGBA;
+
+	rt_copy1col					= rt_copy1col_RGBA_c;
+	rt_copy4cols				= rt_copy4cols_RGBA_c;
+	rt_map1col					= rt_map1col_RGBA_c;
+	rt_shaded4cols				= rt_shaded4cols_RGBA_c;
+	rt_add4cols					= rt_add4cols_RGBA_c;
+	rt_addclamp4cols			= rt_addclamp4cols_RGBA_c;
+	rt_shaded1col				= rt_shaded1col_RGBA_c;
+	rt_add1col					= rt_add1col_RGBA_c;
+	rt_addclamp1col				= rt_addclamp1col_RGBA_c;
+	rt_subclamp1col				= rt_subclamp1col_RGBA_c;
+	rt_revsubclamp1col			= rt_revsubclamp1col_RGBA_c;
+	rt_tlate1col				= rt_tlate1col_RGBA_c;
+	rt_tlateadd1col				= rt_tlateadd1col_RGBA_c;
+	rt_tlateaddclamp1col		= rt_tlateaddclamp1col_RGBA_c;
+	rt_tlatesubclamp1col		= rt_tlatesubclamp1col_RGBA_c;
+	rt_tlaterevsubclamp1col		= rt_tlaterevsubclamp1col_RGBA_c;
+	rt_map4cols					= rt_map4cols_RGBA_c;
+	rt_subclamp4cols			= rt_subclamp4cols_RGBA_c;
+	rt_revsubclamp4cols			= rt_revsubclamp4cols_RGBA_c;
+	rt_tlate4cols				= rt_tlate4cols_RGBA_c;
+	rt_tlateadd4cols			= rt_tlateadd4cols_RGBA_c;
+	rt_tlateaddclamp4cols		= rt_tlateaddclamp4cols_RGBA_c;
+	rt_tlatesubclamp4cols		= rt_tlatesubclamp4cols_RGBA_c;
+	rt_tlaterevsubclamp4cols	= rt_tlaterevsubclamp4cols_RGBA_c;
+	rt_initcols					= rt_initcols_rgba;
+
+	dovline1					= vlinec1_RGBA;
+	doprevline1					= vlinec1_RGBA;
+	dovline4					= vlinec4_RGBA;
+	domvline1					= mvlinec1_RGBA;
+	domvline4					= mvlinec4_RGBA;
+
+#else
+
 #ifdef X86_ASM
 	R_DrawColumn				= R_DrawColumnP_ASM;
 	R_DrawColumnHoriz			= R_DrawColumnHorizP_ASM;
@@ -3194,6 +4244,72 @@ void R_InitColumnDrawers ()
 	R_DrawSpanMaskedTranslucent = R_DrawSpanMaskedTranslucentP_C;
 	R_DrawSpanAddClamp			= R_DrawSpanAddClampP_C;
 	R_DrawSpanMaskedAddClamp	= R_DrawSpanMaskedAddClampP_C;
+	R_FillColumn				= R_FillColumnP_C;
+	R_FillAddColumn				= R_FillAddColumn_C;
+	R_FillAddClampColumn		= R_FillAddClampColumn_C;
+	R_FillSubClampColumn		= R_FillSubClampColumn_C;
+	R_FillRevSubClampColumn		= R_FillRevSubClampColumn_C;
+	R_DrawAddColumn				= R_DrawAddColumnP_C;
+	R_DrawTlatedAddColumn		= R_DrawTlatedAddColumnP_C;
+	R_DrawAddClampColumn		= R_DrawAddClampColumnP_C;
+	R_DrawAddClampTranslatedColumn = R_DrawAddClampTranslatedColumnP_C;
+	R_DrawSubClampColumn		= R_DrawSubClampColumnP_C;
+	R_DrawSubClampTranslatedColumn = R_DrawSubClampTranslatedColumnP_C;
+	R_DrawRevSubClampColumn		= R_DrawRevSubClampColumnP_C;
+	R_DrawRevSubClampTranslatedColumn = R_DrawRevSubClampTranslatedColumnP_C;
+	R_FillSpan					= R_FillSpan_C;
+	R_DrawFogBoundary			= R_DrawFogBoundary_C;
+	R_FillColumnHoriz			= R_FillColumnHorizP_C;
+
+	R_DrawFogBoundary			= R_DrawFogBoundary_C;
+	R_MapColoredPlane			= R_MapColoredPlane_C;
+	R_DrawParticle				= R_DrawParticle_C;
+
+	tmvline1_add				= tmvline1_add_C;
+	tmvline4_add				= tmvline4_add_C;
+	tmvline1_addclamp			= tmvline1_addclamp_C;
+	tmvline4_addclamp			= tmvline4_addclamp_C;
+	tmvline1_subclamp			= tmvline1_subclamp_C;
+	tmvline4_subclamp			= tmvline4_subclamp_C;
+	tmvline1_revsubclamp		= tmvline1_revsubclamp_C;
+	tmvline4_revsubclamp		= tmvline4_revsubclamp_C;
+
+#ifdef X86_ASM
+	rt_copy1col					= rt_copy1col_asm;
+	rt_copy4cols				= rt_copy4cols_asm;
+	rt_map1col					= rt_map1col_asm;
+	rt_shaded4cols				= rt_shaded4cols_asm;
+	rt_add4cols					= rt_add4cols_asm;
+	rt_addclamp4cols			= rt_addclamp4cols_asm;
+#else
+	rt_copy1col					= rt_copy1col_c;
+	rt_copy4cols				= rt_copy4cols_c;
+	rt_map1col					= rt_map1col_c;
+	rt_shaded4cols				= rt_shaded4cols_c;
+	rt_add4cols					= rt_add4cols_c;
+	rt_addclamp4cols			= rt_addclamp4cols_c;
+#endif
+	rt_shaded1col				= rt_shaded1col_c;
+	rt_add1col					= rt_add1col_c;
+	rt_addclamp1col				= rt_addclamp1col_c;
+	rt_subclamp1col				= rt_subclamp1col_c;
+	rt_revsubclamp1col			= rt_revsubclamp1col_c;
+	rt_tlate1col				= rt_tlate1col_c;
+	rt_tlateadd1col				= rt_tlateadd1col_c;
+	rt_tlateaddclamp1col		= rt_tlateaddclamp1col_c;
+	rt_tlatesubclamp1col		= rt_tlatesubclamp1col_c;
+	rt_tlaterevsubclamp1col		= rt_tlaterevsubclamp1col_c;
+	rt_map4cols					= rt_map4cols_c;
+	rt_subclamp4cols			= rt_subclamp4cols_c;
+	rt_revsubclamp4cols			= rt_revsubclamp4cols_c;
+	rt_tlate4cols				= rt_tlate4cols_c;
+	rt_tlateadd4cols			= rt_tlateadd4cols_c;
+	rt_tlateaddclamp4cols		= rt_tlateaddclamp4cols_c;
+	rt_tlatesubclamp4cols		= rt_tlatesubclamp4cols_c;
+	rt_tlaterevsubclamp4cols	= rt_tlaterevsubclamp4cols_c;
+	rt_initcols					= rt_initcols_pal;
+
+#endif
 }
 
 // [RH] Choose column drawers in a single place
@@ -3211,7 +4327,7 @@ static bool R_SetBlendFunc (int op, fixed_t fglevel, fixed_t bglevel, int flags)
 	{
 		if (flags & STYLEF_ColorIsFixed)
 		{
-			colfunc = R_FillColumnP;
+			colfunc = R_FillColumn;
 			hcolfunc_post1 = rt_copy1col;
 			hcolfunc_post4 = rt_copy4cols;
 		}
@@ -3261,13 +4377,13 @@ static bool R_SetBlendFunc (int op, fixed_t fglevel, fixed_t bglevel, int flags)
 			}
 			else if (dc_translation == NULL)
 			{
-				colfunc = R_DrawAddColumnP_C;
+				colfunc = R_DrawAddColumn;
 				hcolfunc_post1 = rt_add1col;
 				hcolfunc_post4 = rt_add4cols;
 			}
 			else
 			{
-				colfunc = R_DrawTlatedAddColumnP_C;
+				colfunc = R_DrawTlatedAddColumn;
 				hcolfunc_post1 = rt_tlateadd1col;
 				hcolfunc_post4 = rt_tlateadd4cols;
 			}
@@ -3282,13 +4398,13 @@ static bool R_SetBlendFunc (int op, fixed_t fglevel, fixed_t bglevel, int flags)
 			}
 			else if (dc_translation == NULL)
 			{
-				colfunc = R_DrawAddClampColumnP_C;
+				colfunc = R_DrawAddClampColumn;
 				hcolfunc_post1 = rt_addclamp1col;
 				hcolfunc_post4 = rt_addclamp4cols;
 			}
 			else
 			{
-				colfunc = R_DrawAddClampTranslatedColumnP_C;
+				colfunc = R_DrawAddClampTranslatedColumn;
 				hcolfunc_post1 = rt_tlateaddclamp1col;
 				hcolfunc_post4 = rt_tlateaddclamp4cols;
 			}
@@ -3304,13 +4420,13 @@ static bool R_SetBlendFunc (int op, fixed_t fglevel, fixed_t bglevel, int flags)
 		}
 		else if (dc_translation == NULL)
 		{
-			colfunc = R_DrawSubClampColumnP_C;
+			colfunc = R_DrawSubClampColumn;
 			hcolfunc_post1 = rt_subclamp1col;
 			hcolfunc_post4 = rt_subclamp4cols;
 		}
 		else
 		{
-			colfunc = R_DrawSubClampTranslatedColumnP_C;
+			colfunc = R_DrawSubClampTranslatedColumn;
 			hcolfunc_post1 = rt_tlatesubclamp1col;
 			hcolfunc_post4 = rt_tlatesubclamp4cols;
 		}
@@ -3329,13 +4445,13 @@ static bool R_SetBlendFunc (int op, fixed_t fglevel, fixed_t bglevel, int flags)
 		}
 		else if (dc_translation == NULL)
 		{
-			colfunc = R_DrawRevSubClampColumnP_C;
+			colfunc = R_DrawRevSubClampColumn;
 			hcolfunc_post1 = rt_revsubclamp1col;
 			hcolfunc_post4 = rt_revsubclamp4cols;
 		}
 		else
 		{
-			colfunc = R_DrawRevSubClampTranslatedColumnP_C;
+			colfunc = R_DrawRevSubClampTranslatedColumn;
 			hcolfunc_post1 = rt_tlaterevsubclamp1col;
 			hcolfunc_post4 = rt_tlaterevsubclamp4cols;
 		}
@@ -3440,7 +4556,7 @@ ESPSResult R_SetPatchStyle (FRenderStyle style, fixed_t alpha, int translation,
 		// dc_srccolor is used by the R_Fill* routines. It is premultiplied
 		// with the alpha.
 		dc_srccolor = ((((r*x)>>4)<<20) | ((g*x)>>4) | ((((b)*x)>>4)<<10)) & 0x3feffbff;
-		hcolfunc_pre = R_FillColumnHorizP;
+		hcolfunc_pre = R_FillColumnHoriz;
 		dc_colormap = identitymap;
 		dc_light = 0;
 	}
@@ -3459,25 +4575,25 @@ void R_FinishSetPatchStyle ()
 
 bool R_GetTransMaskDrawers (fixed_t (**tmvline1)(), void (**tmvline4)())
 {
-	if (colfunc == R_DrawAddColumnP_C)
+	if (colfunc == R_DrawAddColumn)
 	{
 		*tmvline1 = tmvline1_add;
 		*tmvline4 = tmvline4_add;
 		return true;
 	}
-	if (colfunc == R_DrawAddClampColumnP_C)
+	if (colfunc == R_DrawAddClampColumn)
 	{
 		*tmvline1 = tmvline1_addclamp;
 		*tmvline4 = tmvline4_addclamp;
 		return true;
 	}
-	if (colfunc == R_DrawSubClampColumnP_C)
+	if (colfunc == R_DrawSubClampColumn)
 	{
 		*tmvline1 = tmvline1_subclamp;
 		*tmvline4 = tmvline4_subclamp;
 		return true;
 	}
-	if (colfunc == R_DrawRevSubClampColumnP_C)
+	if (colfunc == R_DrawRevSubClampColumn)
 	{
 		*tmvline1 = tmvline1_revsubclamp;
 		*tmvline4 = tmvline4_revsubclamp;
diff --git a/src/r_draw.h b/src/r_draw.h
index 6f7a91154..17698c360 100644
--- a/src/r_draw.h
+++ b/src/r_draw.h
@@ -127,33 +127,33 @@ extern "C"
 void rt_copy1col_c (int hx, int sx, int yl, int yh);
 void rt_copy4cols_c (int sx, int yl, int yh);
 
-void rt_shaded1col (int hx, int sx, int yl, int yh);
+void rt_shaded1col_c (int hx, int sx, int yl, int yh);
 void rt_shaded4cols_c (int sx, int yl, int yh);
 void rt_shaded4cols_asm (int sx, int yl, int yh);
 
 void rt_map1col_c (int hx, int sx, int yl, int yh);
-void rt_add1col (int hx, int sx, int yl, int yh);
-void rt_addclamp1col (int hx, int sx, int yl, int yh);
-void rt_subclamp1col (int hx, int sx, int yl, int yh);
-void rt_revsubclamp1col (int hx, int sx, int yl, int yh);
+void rt_add1col_c (int hx, int sx, int yl, int yh);
+void rt_addclamp1col_c (int hx, int sx, int yl, int yh);
+void rt_subclamp1col_c (int hx, int sx, int yl, int yh);
+void rt_revsubclamp1col_c (int hx, int sx, int yl, int yh);
 
-void rt_tlate1col (int hx, int sx, int yl, int yh);
-void rt_tlateadd1col (int hx, int sx, int yl, int yh);
-void rt_tlateaddclamp1col (int hx, int sx, int yl, int yh);
-void rt_tlatesubclamp1col (int hx, int sx, int yl, int yh);
-void rt_tlaterevsubclamp1col (int hx, int sx, int yl, int yh);
+void rt_tlate1col_c (int hx, int sx, int yl, int yh);
+void rt_tlateadd1col_c (int hx, int sx, int yl, int yh);
+void rt_tlateaddclamp1col_c (int hx, int sx, int yl, int yh);
+void rt_tlatesubclamp1col_c (int hx, int sx, int yl, int yh);
+void rt_tlaterevsubclamp1col_c (int hx, int sx, int yl, int yh);
 
 void rt_map4cols_c (int sx, int yl, int yh);
 void rt_add4cols_c (int sx, int yl, int yh);
 void rt_addclamp4cols_c (int sx, int yl, int yh);
-void rt_subclamp4cols (int sx, int yl, int yh);
-void rt_revsubclamp4cols (int sx, int yl, int yh);
+void rt_subclamp4cols_c (int sx, int yl, int yh);
+void rt_revsubclamp4cols_c (int sx, int yl, int yh);
 
-void rt_tlate4cols (int sx, int yl, int yh);
-void rt_tlateadd4cols (int sx, int yl, int yh);
-void rt_tlateaddclamp4cols (int sx, int yl, int yh);
-void rt_tlatesubclamp4cols (int sx, int yl, int yh);
-void rt_tlaterevsubclamp4cols (int sx, int yl, int yh);
+void rt_tlate4cols_c (int sx, int yl, int yh);
+void rt_tlateadd4cols_c (int sx, int yl, int yh);
+void rt_tlateaddclamp4cols_c (int sx, int yl, int yh);
+void rt_tlatesubclamp4cols_c (int sx, int yl, int yh);
+void rt_tlaterevsubclamp4cols_c (int sx, int yl, int yh);
 
 void rt_copy1col_asm (int hx, int sx, int yl, int yh);
 void rt_map1col_asm (int hx, int sx, int yl, int yh);
@@ -163,32 +163,83 @@ void rt_map4cols_asm1 (int sx, int yl, int yh);
 void rt_map4cols_asm2 (int sx, int yl, int yh);
 void rt_add4cols_asm (int sx, int yl, int yh);
 void rt_addclamp4cols_asm (int sx, int yl, int yh);
+
+///
+
+void rt_copy1col_RGBA_c (int hx, int sx, int yl, int yh);
+void rt_copy4cols_RGBA_c (int sx, int yl, int yh);
+
+void rt_shaded1col_RGBA_c (int hx, int sx, int yl, int yh);
+void rt_shaded4cols_RGBA_c (int sx, int yl, int yh);
+
+void rt_map1col_RGBA_c (int hx, int sx, int yl, int yh);
+void rt_add1col_RGBA_c (int hx, int sx, int yl, int yh);
+void rt_addclamp1col_RGBA_c (int hx, int sx, int yl, int yh);
+void rt_subclamp1col_RGBA_c (int hx, int sx, int yl, int yh);
+void rt_revsubclamp1col_RGBA_c (int hx, int sx, int yl, int yh);
+
+void rt_tlate1col_RGBA_c (int hx, int sx, int yl, int yh);
+void rt_tlateadd1col_RGBA_c (int hx, int sx, int yl, int yh);
+void rt_tlateaddclamp1col_RGBA_c (int hx, int sx, int yl, int yh);
+void rt_tlatesubclamp1col_RGBA_c (int hx, int sx, int yl, int yh);
+void rt_tlaterevsubclamp1col_RGBA_c (int hx, int sx, int yl, int yh);
+
+void rt_map4cols_RGBA_c (int sx, int yl, int yh);
+void rt_add4cols_RGBA_c (int sx, int yl, int yh);
+void rt_addclamp4cols_RGBA_c (int sx, int yl, int yh);
+void rt_subclamp4cols_RGBA_c (int sx, int yl, int yh);
+void rt_revsubclamp4cols_RGBA_c (int sx, int yl, int yh);
+
+void rt_tlate4cols_RGBA_c (int sx, int yl, int yh);
+void rt_tlateadd4cols_RGBA_c (int sx, int yl, int yh);
+void rt_tlateaddclamp4cols_RGBA_c (int sx, int yl, int yh);
+void rt_tlatesubclamp4cols_RGBA_c (int sx, int yl, int yh);
+void rt_tlaterevsubclamp4cols_RGBA_c (int sx, int yl, int yh);
+
 }
 
-extern void (*rt_map4cols)(int sx, int yl, int yh);
+extern void (*rt_copy1col)(int hx, int sx, int yl, int yh);
+extern void (*rt_copy4cols)(int sx, int yl, int yh);
 
-#ifdef X86_ASM
-#define rt_copy1col			rt_copy1col_asm
-#define rt_copy4cols		rt_copy4cols_asm
-#define rt_map1col			rt_map1col_asm
-#define rt_shaded4cols		rt_shaded4cols_asm
-#define rt_add4cols			rt_add4cols_asm
-#define rt_addclamp4cols	rt_addclamp4cols_asm
-#else
-#define rt_copy1col			rt_copy1col_c
-#define rt_copy4cols		rt_copy4cols_c
-#define rt_map1col			rt_map1col_c
-#define rt_shaded4cols		rt_shaded4cols_c
-#define rt_add4cols			rt_add4cols_c
-#define rt_addclamp4cols	rt_addclamp4cols_c
-#endif
+extern void (*rt_shaded1col)(int hx, int sx, int yl, int yh);
+extern void (*rt_shaded4cols)(int sx, int yl, int yh);
+
+extern void (*rt_map1col)(int hx, int sx, int yl, int yh);
+extern void (*rt_add1col)(int hx, int sx, int yl, int yh);
+extern void (*rt_addclamp1col)(int hx, int sx, int yl, int yh);
+extern void (*rt_subclamp1col)(int hx, int sx, int yl, int yh);
+extern void (*rt_revsubclamp1col)(int hx, int sx, int yl, int yh);
+
+extern void (*rt_tlate1col)(int hx, int sx, int yl, int yh);
+extern void (*rt_tlateadd1col)(int hx, int sx, int yl, int yh);
+extern void (*rt_tlateaddclamp1col)(int hx, int sx, int yl, int yh);
+extern void (*rt_tlatesubclamp1col)(int hx, int sx, int yl, int yh);
+extern void (*rt_tlaterevsubclamp1col)(int hx, int sx, int yl, int yh);
+
+extern void (*rt_map4cols)(int sx, int yl, int yh);
+extern void (*rt_add4cols)(int sx, int yl, int yh);
+extern void (*rt_addclamp4cols)(int sx, int yl, int yh);
+extern void (*rt_subclamp4cols)(int sx, int yl, int yh);
+extern void (*rt_revsubclamp4cols)(int sx, int yl, int yh);
+
+extern void (*rt_tlate4cols)(int sx, int yl, int yh);
+extern void (*rt_tlateadd4cols)(int sx, int yl, int yh);
+extern void (*rt_tlateaddclamp4cols)(int sx, int yl, int yh);
+extern void (*rt_tlatesubclamp4cols)(int sx, int yl, int yh);
+extern void (*rt_tlaterevsubclamp4cols)(int sx, int yl, int yh);
+
+extern void (*rt_initcols)(canvas_pixel_t *buffer);
 
 void rt_draw4cols (int sx);
 
 // [RH] Preps the temporary horizontal buffer.
-void rt_initcols (canvas_pixel_t *buffer=NULL);
+void rt_initcols_pal (canvas_pixel_t *buffer);
+void rt_initcols_rgba (canvas_pixel_t *buffer);
 
-void R_DrawFogBoundary (int x1, int x2, short *uclip, short *dclip);
+
+extern void (*R_DrawFogBoundary)(int x1, int x2, short *uclip, short *dclip);
+
+void R_DrawFogBoundary_C (int x1, int x2, short *uclip, short *dclip);
 
 
 #ifdef X86_ASM
@@ -212,6 +263,14 @@ void	R_DrawShadedColumnP_C (void);
 void	R_DrawSpanP_C (void);
 void	R_DrawSpanMaskedP_C (void);
 
+void	R_DrawColumnHorizP_RGBA_C (void);
+void	R_DrawColumnP_RGBA_C (void);
+void	R_DrawFuzzColumnP_RGBA_C (void);
+void	R_DrawTranslatedColumnP_RGBA_C (void);
+void	R_DrawShadedColumnP_RGBA_C (void);
+void	R_DrawSpanP_RGBA_C (void);
+void	R_DrawSpanMaskedP_RGBA_C (void);
+
 #endif
 
 void	R_DrawSpanTranslucentP_C (void);
@@ -220,9 +279,30 @@ void	R_DrawSpanMaskedTranslucentP_C (void);
 void	R_DrawTlatedLucentColumnP_C (void);
 #define R_DrawTlatedLucentColumn R_DrawTlatedLucentColumnP_C
 
-void	R_FillColumnP (void);
-void	R_FillColumnHorizP (void);
-void	R_FillSpan (void);
+extern void(*R_FillColumn)(void);
+extern void(*R_FillAddColumn)(void);
+extern void(*R_FillAddClampColumn)(void);
+extern void(*R_FillSubClampColumn)(void);
+extern void(*R_FillRevSubClampColumn)(void);
+extern void(*R_DrawAddColumn)(void);
+extern void(*R_DrawTlatedAddColumn)(void);
+extern void(*R_DrawAddClampColumn)(void);
+extern void(*R_DrawAddClampTranslatedColumn)(void);
+extern void(*R_DrawSubClampColumn)(void);
+extern void(*R_DrawSubClampTranslatedColumn)(void);
+extern void(*R_DrawRevSubClampColumn)(void);
+extern void(*R_DrawRevSubClampTranslatedColumn)(void);
+
+extern void(*R_FillSpan)(void);
+extern void(*R_FillColumnHoriz)(void);
+
+void	R_FillColumnP_C (void);
+
+void	R_FillColumnHorizP_C (void);
+void	R_FillSpan_C (void);
+
+void	R_FillColumnHorizP_RGBA_C(void);
+void	R_FillSpan_RGBA_C(void);
 
 #ifdef X86_ASM
 #define R_SetupDrawSlab R_SetupDrawSlabA
@@ -282,6 +362,15 @@ inline ESPSResult R_SetPatchStyle(FRenderStyle style, float alpha, int translati
 // style was STYLE_Shade
 void R_FinishSetPatchStyle ();
 
+extern fixed_t(*tmvline1_add)();
+extern void(*tmvline4_add)();
+extern fixed_t(*tmvline1_addclamp)();
+extern void(*tmvline4_addclamp)();
+extern fixed_t(*tmvline1_subclamp)();
+extern void(*tmvline4_subclamp)();
+extern fixed_t(*tmvline1_revsubclamp)();
+extern void(*tmvline4_revsubclamp)();
+
 // transmaskwallscan calls this to find out what column drawers to use
 bool R_GetTransMaskDrawers (fixed_t (**tmvline1)(), void (**tmvline4)());
 
diff --git a/src/r_drawt.cpp b/src/r_drawt.cpp
index f5fc027b5..9520f59b3 100644
--- a/src/r_drawt.cpp
+++ b/src/r_drawt.cpp
@@ -114,13 +114,6 @@ void rt_copy1col_c (int hx, int sx, int yl, int yh)
 // Copies all four spans to the screen starting at sx.
 void rt_copy4cols_c (int sx, int yl, int yh)
 {
-#ifndef PALETTEOUTPUT
-	// To do: we could do this with SSE using __m128i
-	rt_copy1col_c(0, sx, yl, yh);
-	rt_copy1col_c(1, sx + 1, yl, yh);
-	rt_copy1col_c(2, sx + 2, yl, yh);
-	rt_copy1col_c(3, sx + 3, yl, yh);
-#else
 	int *source;
 	int *dest;
 	int count;
@@ -149,7 +142,6 @@ void rt_copy4cols_c (int sx, int yl, int yh)
 		source += 8/sizeof(int);
 		dest += pitch*2;
 	} while (--count);
-#endif
 }
 
 // Maps one span at hx to the screen at sx.
@@ -166,21 +158,13 @@ void rt_map1col_c (int hx, int sx, int yl, int yh)
 		return;
 	count++;
 
-#ifndef PALETTEOUTPUT
-	uint32_t light = calc_light_multiplier(dc_light);
-#endif
-
 	colormap = dc_colormap;
 	dest = ylookup[yl] + sx + dc_destorg;
 	source = &dc_temp[yl*4 + hx];
 	pitch = dc_pitch;
 
 	if (count & 1) {
-#ifndef PALETTEOUTPUT
-		*dest = shade_pal_index(colormap[*source], light);
-#else
 		*dest = colormap[*source];
-#endif
 		source += 4;
 		dest += pitch;
 	}
@@ -188,13 +172,8 @@ void rt_map1col_c (int hx, int sx, int yl, int yh)
 		return;
 
 	do {
-#ifndef PALETTEOUTPUT
-		dest[0] = shade_pal_index(colormap[source[0]], light);
-		dest[pitch] = shade_pal_index(colormap[source[4]], light);
-#else
 		dest[0] = colormap[source[0]];
 		dest[pitch] = colormap[source[4]];
-#endif
 		source += 8;
 		dest += pitch*2;
 	} while (--count);
@@ -214,27 +193,16 @@ void rt_map4cols_c (int sx, int yl, int yh)
 		return;
 	count++;
 
-#ifndef PALETTEOUTPUT
-	uint32_t light = calc_light_multiplier(dc_light);
-#endif
-
 	colormap = dc_colormap;
 	dest = ylookup[yl] + sx + dc_destorg;
 	source = &dc_temp[yl*4];
 	pitch = dc_pitch;
 	
 	if (count & 1) {
-#ifndef PALETTEOUTPUT
-		dest[0] = shade_pal_index(colormap[source[0]], light);
-		dest[1] = shade_pal_index(colormap[source[1]], light);
-		dest[2] = shade_pal_index(colormap[source[2]], light);
-		dest[3] = shade_pal_index(colormap[source[3]], light);
-#else
 		dest[0] = colormap[source[0]];
 		dest[1] = colormap[source[1]];
 		dest[2] = colormap[source[2]];
 		dest[3] = colormap[source[3]];
-#endif
 		source += 4;
 		dest += pitch;
 	}
@@ -242,16 +210,6 @@ void rt_map4cols_c (int sx, int yl, int yh)
 		return;
 
 	do {
-#ifndef PALETTEOUTPUT
-		dest[0] = shade_pal_index(colormap[source[0]], light);
-		dest[1] = shade_pal_index(colormap[source[1]], light);
-		dest[2] = shade_pal_index(colormap[source[2]], light);
-		dest[3] = shade_pal_index(colormap[source[3]], light);
-		dest[pitch] = shade_pal_index(colormap[source[4]], light);
-		dest[pitch + 1] = shade_pal_index(colormap[source[5]], light);
-		dest[pitch + 2] = shade_pal_index(colormap[source[6]], light);
-		dest[pitch + 3] = shade_pal_index(colormap[source[7]], light);
-#else
 		dest[0] = colormap[source[0]];
 		dest[1] = colormap[source[1]];
 		dest[2] = colormap[source[2]];
@@ -260,7 +218,6 @@ void rt_map4cols_c (int sx, int yl, int yh)
 		dest[pitch+1] = colormap[source[5]];
 		dest[pitch+2] = colormap[source[6]];
 		dest[pitch+3] = colormap[source[7]];
-#endif
 		source += 8;
 		dest += pitch*2;
 	} while (--count);
@@ -356,21 +313,21 @@ void rt_Translate4cols(const BYTE *translation, int yl, int yh)
 }
 
 // Translates one span at hx to the screen at sx.
-void rt_tlate1col (int hx, int sx, int yl, int yh)
+void rt_tlate1col_c (int hx, int sx, int yl, int yh)
 {
 	rt_Translate1col(dc_translation, hx, yl, yh);
 	rt_map1col(hx, sx, yl, yh);
 }
 
 // Translates all four spans to the screen starting at sx.
-void rt_tlate4cols (int sx, int yl, int yh)
+void rt_tlate4cols_c (int sx, int yl, int yh)
 {
 	rt_Translate4cols(dc_translation, yl, yh);
 	rt_map4cols(sx, yl, yh);
 }
 
 // Adds one span at hx to the screen at sx without clamping.
-void rt_add1col (int hx, int sx, int yl, int yh)
+void rt_add1col_c (int hx, int sx, int yl, int yh)
 {
 	BYTE *colormap;
 	canvas_pixel_t *source;
@@ -388,29 +345,6 @@ void rt_add1col (int hx, int sx, int yl, int yh)
 	pitch = dc_pitch;
 	colormap = dc_colormap;
 
-#ifndef PALETTEOUTPUT
-	uint32_t light = calc_light_multiplier(dc_light);
-
-	do {
-		uint32_t fg = shade_pal_index(colormap[*source], light);
-		uint32_t fg_red = (fg >> 16) & 0xff;
-		uint32_t fg_green = (fg >> 8) & 0xff;
-		uint32_t fg_blue = fg & 0xff;
-
-		uint32_t bg_red = (*dest >> 16) & 0xff;
-		uint32_t bg_green = (*dest >> 8) & 0xff;
-		uint32_t bg_blue = (*dest) & 0xff;
-
-		uint32_t red = clamp<uint32_t>(fg_red + bg_red, 0, 255);
-		uint32_t green = clamp<uint32_t>(fg_green + bg_green, 0, 255);
-		uint32_t blue = clamp<uint32_t>(fg_blue + bg_blue, 0, 255);
-
-		*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
-
-		source += 4;
-		dest += pitch;
-	} while (--count);
-#else
 	DWORD *fg2rgb = dc_srcblend;
 	DWORD *bg2rgb = dc_destblend;
 	do {
@@ -424,7 +358,6 @@ void rt_add1col (int hx, int sx, int yl, int yh)
 		source += 4;
 		dest += pitch;
 	} while (--count);
-#endif
 }
 
 // Adds all four spans to the screen starting at sx without clamping.
@@ -446,32 +379,6 @@ void rt_add4cols_c (int sx, int yl, int yh)
 	pitch = dc_pitch;
 	colormap = dc_colormap;
 
-#ifndef PALETTEOUTPUT
-	uint32_t light = calc_light_multiplier(dc_light);
-
-	do {
-		for (int i = 0; i < 4; i++)
-		{
-			uint32_t fg = shade_pal_index(colormap[source[i]], light);
-			uint32_t fg_red = (fg >> 16) & 0xff;
-			uint32_t fg_green = (fg >> 8) & 0xff;
-			uint32_t fg_blue = fg & 0xff;
-
-			uint32_t bg_red = (dest[i] >> 16) & 0xff;
-			uint32_t bg_green = (dest[i] >> 8) & 0xff;
-			uint32_t bg_blue = (dest[i]) & 0xff;
-
-			uint32_t red = clamp<uint32_t>(fg_red + bg_red, 0, 255);
-			uint32_t green = clamp<uint32_t>(fg_green + bg_green, 0, 255);
-			uint32_t blue = clamp<uint32_t>(fg_blue + bg_blue, 0, 255);
-
-			dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue;
-		}
-
-		source += 4;
-		dest += pitch;
-	} while (--count);
-#else
 	DWORD *fg2rgb = dc_srcblend;
 	DWORD *bg2rgb = dc_destblend;
 
@@ -508,25 +415,24 @@ void rt_add4cols_c (int sx, int yl, int yh)
 		source += 4;
 		dest += pitch;
 	} while (--count);
-#endif
 }
 
 // Translates and adds one span at hx to the screen at sx without clamping.
-void rt_tlateadd1col (int hx, int sx, int yl, int yh)
+void rt_tlateadd1col_c (int hx, int sx, int yl, int yh)
 {
 	rt_Translate1col(dc_translation, hx, yl, yh);
 	rt_add1col(hx, sx, yl, yh);
 }
 
 // Translates and adds all four spans to the screen starting at sx without clamping.
-void rt_tlateadd4cols (int sx, int yl, int yh)
+void rt_tlateadd4cols_c (int sx, int yl, int yh)
 {
 	rt_Translate4cols(dc_translation, yl, yh);
 	rt_add4cols(sx, yl, yh);
 }
 
 // Shades one span at hx to the screen at sx.
-void rt_shaded1col (int hx, int sx, int yl, int yh)
+void rt_shaded1col_c (int hx, int sx, int yl, int yh)
 {
 	BYTE *colormap;
 	canvas_pixel_t *source;
@@ -544,29 +450,6 @@ void rt_shaded1col (int hx, int sx, int yl, int yh)
 	source = &dc_temp[yl*4 + hx];
 	pitch = dc_pitch;
 
-#ifndef PALETTEOUTPUT
-	uint32_t fg = shade_pal_index(dc_color, calc_light_multiplier(0));
-	uint32_t fg_red = (fg >> 16) & 0xff;
-	uint32_t fg_green = (fg >> 8) & 0xff;
-	uint32_t fg_blue = fg & 0xff;
-
-	do {
-		uint32_t alpha = colormap[*source];
-		uint32_t inv_alpha = 64 - alpha;
-
-		uint32_t bg_red = (*dest >> 16) & 0xff;
-		uint32_t bg_green = (*dest >> 8) & 0xff;
-		uint32_t bg_blue = (*dest) & 0xff;
-
-		uint32_t red = (fg_red * alpha + bg_red * inv_alpha) / 64;
-		uint32_t green = (fg_green * alpha + bg_green * inv_alpha) / 64;
-		uint32_t blue = (fg_blue * alpha + bg_blue * inv_alpha) / 64;
-
-		*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
-		source += 4;
-		dest += pitch;
-	} while (--count);
-#else
 	DWORD *fgstart;
 	fgstart = &Col2RGB8[0][dc_color];
 
@@ -578,7 +461,6 @@ void rt_shaded1col (int hx, int sx, int yl, int yh)
 		source += 4;
 		dest += pitch;
 	} while (--count);
-#endif
 }
 
 // Shades all four spans to the screen starting at sx.
@@ -600,32 +482,6 @@ void rt_shaded4cols_c (int sx, int yl, int yh)
 	source = &dc_temp[yl*4];
 	pitch = dc_pitch;
 
-#ifndef PALETTEOUTPUT
-	uint32_t fg = shade_pal_index(dc_color, calc_light_multiplier(0));
-	uint32_t fg_red = (fg >> 16) & 0xff;
-	uint32_t fg_green = (fg >> 8) & 0xff;
-	uint32_t fg_blue = fg & 0xff;
-
-	do {
-		for (int i = 0; i < 4; i++)
-		{
-			uint32_t alpha = colormap[source[i]];
-			uint32_t inv_alpha = 64 - alpha;
-
-			uint32_t bg_red = (dest[i] >> 16) & 0xff;
-			uint32_t bg_green = (dest[i] >> 8) & 0xff;
-			uint32_t bg_blue = (dest[i]) & 0xff;
-
-			uint32_t red = (fg_red * alpha + bg_red * inv_alpha) / 64;
-			uint32_t green = (fg_green * alpha + bg_green * inv_alpha) / 64;
-			uint32_t blue = (fg_blue * alpha + bg_blue * inv_alpha) / 64;
-
-			dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue;
-		}
-		source += 4;
-		dest += pitch;
-	} while (--count);
-#else
 	DWORD *fgstart;
 	fgstart = &Col2RGB8[0][dc_color];
 
@@ -651,11 +507,10 @@ void rt_shaded4cols_c (int sx, int yl, int yh)
 		source += 4;
 		dest += pitch;
 	} while (--count);
-#endif
 }
 
 // Adds one span at hx to the screen at sx with clamping.
-void rt_addclamp1col (int hx, int sx, int yl, int yh)
+void rt_addclamp1col_c (int hx, int sx, int yl, int yh)
 {
 	BYTE *colormap;
 	canvas_pixel_t *source;
@@ -673,28 +528,6 @@ void rt_addclamp1col (int hx, int sx, int yl, int yh)
 	pitch = dc_pitch;
 	colormap = dc_colormap;
 
-#ifndef PALETTEOUTPUT
-	uint32_t light = calc_light_multiplier(dc_light);
-
-	do {
-		uint32_t fg = shade_pal_index(colormap[*source], light);
-		uint32_t fg_red = (fg >> 16) & 0xff;
-		uint32_t fg_green = (fg >> 8) & 0xff;
-		uint32_t fg_blue = fg & 0xff;
-
-		uint32_t bg_red = (*dest >> 16) & 0xff;
-		uint32_t bg_green = (*dest >> 8) & 0xff;
-		uint32_t bg_blue = (*dest) & 0xff;
-
-		uint32_t red = clamp<uint32_t>(fg_red + bg_red, 0, 255);
-		uint32_t green = clamp<uint32_t>(fg_green + bg_green, 0, 255);
-		uint32_t blue = clamp<uint32_t>(fg_blue + bg_blue, 0, 255);
-
-		*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
-		source += 4;
-		dest += pitch;
-	} while (--count);
-#else
 	DWORD *fg2rgb = dc_srcblend;
 	DWORD *bg2rgb = dc_destblend;
 
@@ -711,7 +544,6 @@ void rt_addclamp1col (int hx, int sx, int yl, int yh)
 		source += 4;
 		dest += pitch;
 	} while (--count);
-#endif
 }
 
 // Adds all four spans to the screen starting at sx with clamping.
@@ -733,31 +565,6 @@ void rt_addclamp4cols_c (int sx, int yl, int yh)
 	pitch = dc_pitch;
 	colormap = dc_colormap;
 
-#ifndef PALETTEOUTPUT
-	uint32_t light = calc_light_multiplier(dc_light);
-
-	do {
-		for (int i = 0; i < 4; i++)
-		{
-			uint32_t fg = shade_pal_index(colormap[source[i]], light);
-			uint32_t fg_red = (fg >> 16) & 0xff;
-			uint32_t fg_green = (fg >> 8) & 0xff;
-			uint32_t fg_blue = fg & 0xff;
-
-			uint32_t bg_red = (dest[i] >> 16) & 0xff;
-			uint32_t bg_green = (dest[i] >> 8) & 0xff;
-			uint32_t bg_blue = (dest[i]) & 0xff;
-
-			uint32_t red = clamp<uint32_t>(fg_red + bg_red, 0, 255);
-			uint32_t green = clamp<uint32_t>(fg_green + bg_green, 0, 255);
-			uint32_t blue = clamp<uint32_t>(fg_blue + bg_blue, 0, 255);
-
-			dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue;
-		}
-		source += 4;
-		dest += pitch;
-	} while (--count);
-#else
 	DWORD *fg2rgb = dc_srcblend;
 	DWORD *bg2rgb = dc_destblend;
 
@@ -802,25 +609,24 @@ void rt_addclamp4cols_c (int sx, int yl, int yh)
 		source += 4;
 		dest += pitch;
 	} while (--count);
-#endif
 }
 
 // Translates and adds one span at hx to the screen at sx with clamping.
-void rt_tlateaddclamp1col (int hx, int sx, int yl, int yh)
+void rt_tlateaddclamp1col_c (int hx, int sx, int yl, int yh)
 {
 	rt_Translate1col(dc_translation, hx, yl, yh);
 	rt_addclamp1col(hx, sx, yl, yh);
 }
 
 // Translates and adds all four spans to the screen starting at sx with clamping.
-void rt_tlateaddclamp4cols (int sx, int yl, int yh)
+void rt_tlateaddclamp4cols_c (int sx, int yl, int yh)
 {
 	rt_Translate4cols(dc_translation, yl, yh);
 	rt_addclamp4cols(sx, yl, yh);
 }
 
 // Subtracts one span at hx to the screen at sx with clamping.
-void rt_subclamp1col (int hx, int sx, int yl, int yh)
+void rt_subclamp1col_c (int hx, int sx, int yl, int yh)
 {
 	BYTE *colormap;
 	canvas_pixel_t *source;
@@ -838,28 +644,6 @@ void rt_subclamp1col (int hx, int sx, int yl, int yh)
 	pitch = dc_pitch;
 	colormap = dc_colormap;
 
-#ifndef PALETTEOUTPUT
-	uint32_t light = calc_light_multiplier(dc_light);
-
-	do {
-		uint32_t fg = shade_pal_index(colormap[*source], light);
-		uint32_t fg_red = (fg >> 16) & 0xff;
-		uint32_t fg_green = (fg >> 8) & 0xff;
-		uint32_t fg_blue = fg & 0xff;
-
-		uint32_t bg_red = (*dest >> 16) & 0xff;
-		uint32_t bg_green = (*dest >> 8) & 0xff;
-		uint32_t bg_blue = (*dest) & 0xff;
-
-		uint32_t red = clamp<uint32_t>(256 - fg_red + bg_red, 256, 256 + 255) - 256;
-		uint32_t green = clamp<uint32_t>(256 - fg_green + bg_green, 256, 256 + 255) - 256;
-		uint32_t blue = clamp<uint32_t>(256 - fg_blue + bg_blue, 256, 256 + 255) - 256;
-
-		*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
-		source += 4;
-		dest += pitch;
-	} while (--count);
-#else
 	DWORD *fg2rgb = dc_srcblend;
 	DWORD *bg2rgb = dc_destblend;
 	do {
@@ -874,11 +658,10 @@ void rt_subclamp1col (int hx, int sx, int yl, int yh)
 		source += 4;
 		dest += pitch;
 	} while (--count);
-#endif
 }
 
 // Subtracts all four spans to the screen starting at sx with clamping.
-void rt_subclamp4cols (int sx, int yl, int yh)
+void rt_subclamp4cols_c (int sx, int yl, int yh)
 {
 	BYTE *colormap;
 	canvas_pixel_t *source;
@@ -896,32 +679,6 @@ void rt_subclamp4cols (int sx, int yl, int yh)
 	pitch = dc_pitch;
 	colormap = dc_colormap;
 
-#ifndef PALETTEOUTPUT
-	uint32_t light = calc_light_multiplier(dc_light);
-
-	do {
-		for (int i = 0; i < 4; i++)
-		{
-			uint32_t fg = shade_pal_index(colormap[source[i]], light);
-			uint32_t fg_red = (fg >> 16) & 0xff;
-			uint32_t fg_green = (fg >> 8) & 0xff;
-			uint32_t fg_blue = fg & 0xff;
-
-			uint32_t bg_red = (dest[i] >> 16) & 0xff;
-			uint32_t bg_green = (dest[i] >> 8) & 0xff;
-			uint32_t bg_blue = (dest[i]) & 0xff;
-
-			uint32_t red = clamp<uint32_t>(256 - fg_red + bg_red, 256, 256 + 255) - 256;
-			uint32_t green = clamp<uint32_t>(256 - fg_green + bg_green, 256, 256 + 255) - 256;
-			uint32_t blue = clamp<uint32_t>(256 - fg_blue + bg_blue, 256, 256 + 255) - 256;
-
-			dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue;
-		}
-
-		source += 4;
-		dest += pitch;
-	} while (--count);
-#else
 	DWORD *fg2rgb = dc_srcblend;
 	DWORD *bg2rgb = dc_destblend;
 	do {
@@ -961,25 +718,24 @@ void rt_subclamp4cols (int sx, int yl, int yh)
 		source += 4;
 		dest += pitch;
 	} while (--count);
-#endif
 }
 
 // Translates and subtracts one span at hx to the screen at sx with clamping.
-void rt_tlatesubclamp1col (int hx, int sx, int yl, int yh)
+void rt_tlatesubclamp1col_c (int hx, int sx, int yl, int yh)
 {
 	rt_Translate1col(dc_translation, hx, yl, yh);
 	rt_subclamp1col(hx, sx, yl, yh);
 }
 
 // Translates and subtracts all four spans to the screen starting at sx with clamping.
-void rt_tlatesubclamp4cols (int sx, int yl, int yh)
+void rt_tlatesubclamp4cols_c (int sx, int yl, int yh)
 {
 	rt_Translate4cols(dc_translation, yl, yh);
 	rt_subclamp4cols(sx, yl, yh);
 }
 
 // Subtracts one span at hx from the screen at sx with clamping.
-void rt_revsubclamp1col (int hx, int sx, int yl, int yh)
+void rt_revsubclamp1col_c (int hx, int sx, int yl, int yh)
 {
 	BYTE *colormap;
 	canvas_pixel_t *source;
@@ -999,28 +755,6 @@ void rt_revsubclamp1col (int hx, int sx, int yl, int yh)
 	pitch = dc_pitch;
 	colormap = dc_colormap;
 
-#ifndef PALETTEOUTPUT
-	uint32_t light = calc_light_multiplier(dc_light);
-
-	do {
-		uint32_t fg = shade_pal_index(colormap[*source], light);
-		uint32_t fg_red = (fg >> 16) & 0xff;
-		uint32_t fg_green = (fg >> 8) & 0xff;
-		uint32_t fg_blue = fg & 0xff;
-
-		uint32_t bg_red = (*dest >> 16) & 0xff;
-		uint32_t bg_green = (*dest >> 8) & 0xff;
-		uint32_t bg_blue = (*dest) & 0xff;
-
-		uint32_t red = clamp<uint32_t>(256 + fg_red - bg_red, 256, 256 + 255) - 256;
-		uint32_t green = clamp<uint32_t>(256 + fg_green - bg_green, 256, 256 + 255) - 256;
-		uint32_t blue = clamp<uint32_t>(256 + fg_blue - bg_blue, 256, 256 + 255) - 256;
-
-		*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
-		source += 4;
-		dest += pitch;
-	} while (--count);
-#else
 	do {
 		DWORD a = (bg2rgb[*dest] | 0x40100400) - fg2rgb[colormap[*source]];
 		DWORD b = a;
@@ -1033,11 +767,10 @@ void rt_revsubclamp1col (int hx, int sx, int yl, int yh)
 		source += 4;
 		dest += pitch;
 	} while (--count);
-#endif
 }
 
 // Subtracts all four spans from the screen starting at sx with clamping.
-void rt_revsubclamp4cols (int sx, int yl, int yh)
+void rt_revsubclamp4cols_c (int sx, int yl, int yh)
 {
 	BYTE *colormap;
 	canvas_pixel_t *source;
@@ -1057,32 +790,6 @@ void rt_revsubclamp4cols (int sx, int yl, int yh)
 	pitch = dc_pitch;
 	colormap = dc_colormap;
 
-#ifndef PALETTEOUTPUT
-	uint32_t light = calc_light_multiplier(dc_light);
-
-	do {
-		for (int i = 0; i < 4; i++)
-		{
-			uint32_t fg = shade_pal_index(colormap[source[i]], light);
-			uint32_t fg_red = (fg >> 16) & 0xff;
-			uint32_t fg_green = (fg >> 8) & 0xff;
-			uint32_t fg_blue = fg & 0xff;
-
-			uint32_t bg_red = (dest[i] >> 16) & 0xff;
-			uint32_t bg_green = (dest[i] >> 8) & 0xff;
-			uint32_t bg_blue = (dest[i]) & 0xff;
-
-			uint32_t red = clamp<uint32_t>(256 + fg_red - bg_red, 256, 256 + 255) - 256;
-			uint32_t green = clamp<uint32_t>(256 + fg_green - bg_green, 256, 256 + 255) - 256;
-			uint32_t blue = clamp<uint32_t>(256 + fg_blue - bg_blue, 256, 256 + 255) - 256;
-
-			dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue;
-		}
-
-		source += 4;
-		dest += pitch;
-	} while (--count);
-#else
 	do {
 		DWORD a = (bg2rgb[dest[0]] | 0x40100400) - fg2rgb[colormap[source[0]]];
 		DWORD b = a;
@@ -1120,18 +827,17 @@ void rt_revsubclamp4cols (int sx, int yl, int yh)
 		source += 4;
 		dest += pitch;
 	} while (--count);
-#endif
 }
 
 // Translates and subtracts one span at hx from the screen at sx with clamping.
-void rt_tlaterevsubclamp1col (int hx, int sx, int yl, int yh)
+void rt_tlaterevsubclamp1col_c (int hx, int sx, int yl, int yh)
 {
 	rt_Translate1col(dc_translation, hx, yl, yh);
 	rt_revsubclamp1col(hx, sx, yl, yh);
 }
 
 // Translates and subtracts all four spans from the screen starting at sx with clamping.
-void rt_tlaterevsubclamp4cols (int sx, int yl, int yh)
+void rt_tlaterevsubclamp4cols_c (int sx, int yl, int yh)
 {
 	rt_Translate4cols(dc_translation, yl, yh);
 	rt_revsubclamp4cols(sx, yl, yh);
@@ -1301,7 +1007,7 @@ void rt_draw4cols (int sx)
 
 // Before each pass through a rendering loop that uses these routines,
 // call this function to set up the span pointers.
-void rt_initcols (canvas_pixel_t *buff)
+void rt_initcols_pal (canvas_pixel_t *buff)
 {
 	int y;
 
@@ -1372,7 +1078,7 @@ void R_DrawColumnHorizP_C (void)
 }
 
 // [RH] Just fills a column with a given color
-void R_FillColumnHorizP (void)
+void R_FillColumnHorizP_C (void)
 {
 	int count = dc_count;
 	BYTE color = dc_color;
diff --git a/src/r_drawt_rgba.cpp b/src/r_drawt_rgba.cpp
new file mode 100644
index 000000000..e8111be8f
--- /dev/null
+++ b/src/r_drawt_rgba.cpp
@@ -0,0 +1,883 @@
+/*
+** r_drawt_rgba.cpp
+** Faster column drawers for modern processors, true color edition
+**
+**---------------------------------------------------------------------------
+** Copyright 1998-2006 Randy Heit
+** All rights reserved.
+**
+** Redistribution and use in source and binary forms, with or without
+** modification, are permitted provided that the following conditions
+** are met:
+**
+** 1. Redistributions of source code must retain the above copyright
+**    notice, this list of conditions and the following disclaimer.
+** 2. Redistributions in binary form must reproduce the above copyright
+**    notice, this list of conditions and the following disclaimer in the
+**    documentation and/or other materials provided with the distribution.
+** 3. The name of the author may not be used to endorse or promote products
+**    derived from this software without specific prior written permission.
+**
+** THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+** IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+** OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+** IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+** INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+** NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+** DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+** THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+** (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+** THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**---------------------------------------------------------------------------
+**
+** True color versions of the similar functions in r_drawt.cpp
+** Please see r_drawt.cpp for a description of the globals used.
+*/
+
+#include "templates.h"
+#include "doomtype.h"
+#include "doomdef.h"
+#include "r_defs.h"
+#include "r_draw.h"
+#include "r_main.h"
+#include "r_things.h"
+#include "v_video.h"
+
+canvas_pixel_t dc_temp_rgbabuff_rgba[MAXHEIGHT*4];
+canvas_pixel_t *dc_temp_rgba;
+
+// Defined in r_draw_t.cpp:
+extern unsigned int dc_tspans[4][MAXHEIGHT];
+extern unsigned int *dc_ctspan[4];
+extern unsigned int *horizspan[4];
+
+// Copies one span at hx to the screen at sx.
+void rt_copy1col_RGBA_c (int hx, int sx, int yl, int yh)
+{
+	canvas_pixel_t *source;
+	canvas_pixel_t *dest;
+	int count;
+	int pitch;
+
+	count = yh-yl;
+	if (count < 0)
+		return;
+	count++;
+
+	dest = ylookup[yl] + sx + dc_destorg;
+	source = &dc_temp_rgba[yl*4 + hx];
+	pitch = dc_pitch;
+
+	if (count & 1) {
+		*dest = *source;
+		source += 4;
+		dest += pitch;
+	}
+	if (count & 2) {
+		dest[0] = source[0];
+		dest[pitch] = source[4];
+		source += 8;
+		dest += pitch*2;
+	}
+	if (!(count >>= 2))
+		return;
+
+	do {
+		dest[0] = source[0];
+		dest[pitch] = source[4];
+		dest[pitch*2] = source[8];
+		dest[pitch*3] = source[12];
+		source += 16;
+		dest += pitch*4;
+	} while (--count);
+}
+
+// Copies all four spans to the screen starting at sx.
+void rt_copy4cols_RGBA_c (int sx, int yl, int yh)
+{
+	// To do: we could do this with SSE using __m128i
+	rt_copy1col_RGBA_c(0, sx, yl, yh);
+	rt_copy1col_RGBA_c(1, sx + 1, yl, yh);
+	rt_copy1col_RGBA_c(2, sx + 2, yl, yh);
+	rt_copy1col_RGBA_c(3, sx + 3, yl, yh);
+}
+
+// Maps one span at hx to the screen at sx.
+void rt_map1col_RGBA_c (int hx, int sx, int yl, int yh)
+{
+	BYTE *colormap;
+	canvas_pixel_t *source;
+	canvas_pixel_t *dest;
+	int count;
+	int pitch;
+
+	count = yh-yl;
+	if (count < 0)
+		return;
+	count++;
+
+	uint32_t light = calc_light_multiplier(dc_light);
+
+	colormap = dc_colormap;
+	dest = ylookup[yl] + sx + dc_destorg;
+	source = &dc_temp_rgba[yl*4 + hx];
+	pitch = dc_pitch;
+
+	if (count & 1) {
+		*dest = shade_pal_index(colormap[*source], light);
+		source += 4;
+		dest += pitch;
+	}
+	if (!(count >>= 1))
+		return;
+
+	do {
+		dest[0] = shade_pal_index(colormap[source[0]], light);
+		dest[pitch] = shade_pal_index(colormap[source[4]], light);
+		source += 8;
+		dest += pitch*2;
+	} while (--count);
+}
+
+// Maps all four spans to the screen starting at sx.
+void rt_map4cols_RGBA_c (int sx, int yl, int yh)
+{
+	BYTE *colormap;
+	canvas_pixel_t *source;
+	canvas_pixel_t *dest;
+	int count;
+	int pitch;
+
+	count = yh-yl;
+	if (count < 0)
+		return;
+	count++;
+
+	uint32_t light = calc_light_multiplier(dc_light);
+
+	colormap = dc_colormap;
+	dest = ylookup[yl] + sx + dc_destorg;
+	source = &dc_temp_rgba[yl*4];
+	pitch = dc_pitch;
+	
+	if (count & 1) {
+		dest[0] = shade_pal_index(colormap[source[0]], light);
+		dest[1] = shade_pal_index(colormap[source[1]], light);
+		dest[2] = shade_pal_index(colormap[source[2]], light);
+		dest[3] = shade_pal_index(colormap[source[3]], light);
+		source += 4;
+		dest += pitch;
+	}
+	if (!(count >>= 1))
+		return;
+
+	do {
+		dest[0] = shade_pal_index(colormap[source[0]], light);
+		dest[1] = shade_pal_index(colormap[source[1]], light);
+		dest[2] = shade_pal_index(colormap[source[2]], light);
+		dest[3] = shade_pal_index(colormap[source[3]], light);
+		dest[pitch] = shade_pal_index(colormap[source[4]], light);
+		dest[pitch + 1] = shade_pal_index(colormap[source[5]], light);
+		dest[pitch + 2] = shade_pal_index(colormap[source[6]], light);
+		dest[pitch + 3] = shade_pal_index(colormap[source[7]], light);
+		source += 8;
+		dest += pitch*2;
+	} while (--count);
+}
+
+void rt_Translate1col_RGBA_c(const BYTE *translation, int hx, int yl, int yh)
+{
+	int count = yh - yl + 1;
+	canvas_pixel_t *source = &dc_temp_rgba[yl*4 + hx];
+
+	// Things we do to hit the compiler's optimizer with a clue bat:
+	// 1. Parallelism is explicitly spelled out by using a separate
+	//    C instruction for each assembly instruction. GCC lets me
+	//    have four temporaries, but VC++ spills to the stack with
+	//    more than two. Two is probably optimal, anyway.
+	// 2. The results of the translation lookups are explicitly
+	//    stored in byte-sized variables. This causes the VC++ code
+	//    to use byte mov instructions in most cases; for apparently
+	//    random reasons, it will use movzx for some places. GCC
+	//    ignores this and uses movzx always.
+
+	// Do 8 rows at a time.
+	for (int count8 = count >> 3; count8; --count8)
+	{
+		int c0, c1;
+		BYTE b0, b1;
+
+		c0 = source[0];			c1 = source[4];
+		b0 = translation[c0];	b1 = translation[c1];
+		source[0] = b0;			source[4] = b1;
+
+		c0 = source[8];			c1 = source[12];
+		b0 = translation[c0];	b1 = translation[c1];
+		source[8] = b0;			source[12] = b1;
+
+		c0 = source[16];		c1 = source[20];
+		b0 = translation[c0];	b1 = translation[c1];
+		source[16] = b0;		source[20] = b1;
+
+		c0 = source[24];		c1 = source[28];
+		b0 = translation[c0];	b1 = translation[c1];
+		source[24] = b0;		source[28] = b1;
+
+		source += 32;
+	}
+	// Finish by doing 1 row at a time.
+	for (count &= 7; count; --count, source += 4)
+	{
+		source[0] = translation[source[0]];
+	}
+}
+
+void rt_Translate4cols_RGBA_c(const BYTE *translation, int yl, int yh)
+{
+	int count = yh - yl + 1;
+	canvas_pixel_t *source = &dc_temp_rgba[yl*4];
+	int c0, c1;
+	BYTE b0, b1;
+
+	// Do 2 rows at a time.
+	for (int count8 = count >> 1; count8; --count8)
+	{
+		c0 = source[0];			c1 = source[1];
+		b0 = translation[c0];	b1 = translation[c1];
+		source[0] = b0;			source[1] = b1;
+
+		c0 = source[2];			c1 = source[3];
+		b0 = translation[c0];	b1 = translation[c1];
+		source[2] = b0;			source[3] = b1;
+
+		c0 = source[4];			c1 = source[5];
+		b0 = translation[c0];	b1 = translation[c1];
+		source[4] = b0;			source[5] = b1;
+
+		c0 = source[6];			c1 = source[7];
+		b0 = translation[c0];	b1 = translation[c1];
+		source[6] = b0;			source[7] = b1;
+
+		source += 8;
+	}
+	// Do the final row if count was odd.
+	if (count & 1)
+	{
+		c0 = source[0];			c1 = source[1];
+		b0 = translation[c0];	b1 = translation[c1];
+		source[0] = b0;			source[1] = b1;
+
+		c0 = source[2];			c1 = source[3];
+		b0 = translation[c0];	b1 = translation[c1];
+		source[2] = b0;			source[3] = b1;
+	}
+}
+
+// Translates one span at hx to the screen at sx.
+void rt_tlate1col_RGBA_c (int hx, int sx, int yl, int yh)
+{
+	rt_Translate1col_RGBA_c(dc_translation, hx, yl, yh);
+	rt_map1col(hx, sx, yl, yh);
+}
+
+// Translates all four spans to the screen starting at sx.
+void rt_tlate4cols_RGBA_c (int sx, int yl, int yh)
+{
+	rt_Translate4cols_RGBA_c(dc_translation, yl, yh);
+	rt_map4cols(sx, yl, yh);
+}
+
+// Adds one span at hx to the screen at sx without clamping.
+void rt_add1col_RGBA_c (int hx, int sx, int yl, int yh)
+{
+	BYTE *colormap;
+	canvas_pixel_t *source;
+	canvas_pixel_t *dest;
+	int count;
+	int pitch;
+
+	count = yh-yl;
+	if (count < 0)
+		return;
+	count++;
+
+	dest = ylookup[yl] + sx + dc_destorg;
+	source = &dc_temp_rgba[yl*4 + hx];
+	pitch = dc_pitch;
+	colormap = dc_colormap;
+
+	uint32_t light = calc_light_multiplier(dc_light);
+
+	do {
+		uint32_t fg = shade_pal_index(colormap[*source], light);
+		uint32_t fg_red = (fg >> 16) & 0xff;
+		uint32_t fg_green = (fg >> 8) & 0xff;
+		uint32_t fg_blue = fg & 0xff;
+
+		uint32_t bg_red = (*dest >> 16) & 0xff;
+		uint32_t bg_green = (*dest >> 8) & 0xff;
+		uint32_t bg_blue = (*dest) & 0xff;
+
+		uint32_t red = clamp<uint32_t>(fg_red + bg_red, 0, 255);
+		uint32_t green = clamp<uint32_t>(fg_green + bg_green, 0, 255);
+		uint32_t blue = clamp<uint32_t>(fg_blue + bg_blue, 0, 255);
+
+		*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
+
+		source += 4;
+		dest += pitch;
+	} while (--count);
+}
+
+// Adds all four spans to the screen starting at sx without clamping.
+void rt_add4cols_RGBA_c (int sx, int yl, int yh)
+{
+	BYTE *colormap;
+	canvas_pixel_t *source;
+	canvas_pixel_t *dest;
+	int count;
+	int pitch;
+
+	count = yh-yl;
+	if (count < 0)
+		return;
+	count++;
+
+	dest = ylookup[yl] + sx + dc_destorg;
+	source = &dc_temp_rgba[yl*4];
+	pitch = dc_pitch;
+	colormap = dc_colormap;
+
+	uint32_t light = calc_light_multiplier(dc_light);
+
+	do {
+		for (int i = 0; i < 4; i++)
+		{
+			uint32_t fg = shade_pal_index(colormap[source[i]], light);
+			uint32_t fg_red = (fg >> 16) & 0xff;
+			uint32_t fg_green = (fg >> 8) & 0xff;
+			uint32_t fg_blue = fg & 0xff;
+
+			uint32_t bg_red = (dest[i] >> 16) & 0xff;
+			uint32_t bg_green = (dest[i] >> 8) & 0xff;
+			uint32_t bg_blue = (dest[i]) & 0xff;
+
+			uint32_t red = clamp<uint32_t>(fg_red + bg_red, 0, 255);
+			uint32_t green = clamp<uint32_t>(fg_green + bg_green, 0, 255);
+			uint32_t blue = clamp<uint32_t>(fg_blue + bg_blue, 0, 255);
+
+			dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue;
+		}
+
+		source += 4;
+		dest += pitch;
+	} while (--count);
+}
+
+// Translates and adds one span at hx to the screen at sx without clamping.
+void rt_tlateadd1col_RGBA_c (int hx, int sx, int yl, int yh)
+{
+	rt_Translate1col_RGBA_c(dc_translation, hx, yl, yh);
+	rt_add1col(hx, sx, yl, yh);
+}
+
+// Translates and adds all four spans to the screen starting at sx without clamping.
+void rt_tlateadd4cols_RGBA_c(int sx, int yl, int yh)
+{
+	rt_Translate4cols_RGBA_c(dc_translation, yl, yh);
+	rt_add4cols(sx, yl, yh);
+}
+
+// Shades one span at hx to the screen at sx.
+void rt_shaded1col_RGBA_c (int hx, int sx, int yl, int yh)
+{
+	BYTE *colormap;
+	canvas_pixel_t *source;
+	canvas_pixel_t *dest;
+	int count;
+	int pitch;
+
+	count = yh-yl;
+	if (count < 0)
+		return;
+	count++;
+
+	colormap = dc_colormap;
+	dest = ylookup[yl] + sx + dc_destorg;
+	source = &dc_temp_rgba[yl*4 + hx];
+	pitch = dc_pitch;
+
+	uint32_t fg = shade_pal_index(dc_color, calc_light_multiplier(0));
+	uint32_t fg_red = (fg >> 16) & 0xff;
+	uint32_t fg_green = (fg >> 8) & 0xff;
+	uint32_t fg_blue = fg & 0xff;
+
+	do {
+		uint32_t alpha = colormap[*source];
+		uint32_t inv_alpha = 64 - alpha;
+
+		uint32_t bg_red = (*dest >> 16) & 0xff;
+		uint32_t bg_green = (*dest >> 8) & 0xff;
+		uint32_t bg_blue = (*dest) & 0xff;
+
+		uint32_t red = (fg_red * alpha + bg_red * inv_alpha) / 64;
+		uint32_t green = (fg_green * alpha + bg_green * inv_alpha) / 64;
+		uint32_t blue = (fg_blue * alpha + bg_blue * inv_alpha) / 64;
+
+		*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
+		source += 4;
+		dest += pitch;
+	} while (--count);
+}
+
+// Shades all four spans to the screen starting at sx.
+void rt_shaded4cols_RGBA_c (int sx, int yl, int yh)
+{
+	BYTE *colormap;
+	canvas_pixel_t *source;
+	canvas_pixel_t *dest;
+	int count;
+	int pitch;
+
+	count = yh-yl;
+	if (count < 0)
+		return;
+	count++;
+
+	colormap = dc_colormap;
+	dest = ylookup[yl] + sx + dc_destorg;
+	source = &dc_temp_rgba[yl*4];
+	pitch = dc_pitch;
+
+	uint32_t fg = shade_pal_index(dc_color, calc_light_multiplier(0));
+	uint32_t fg_red = (fg >> 16) & 0xff;
+	uint32_t fg_green = (fg >> 8) & 0xff;
+	uint32_t fg_blue = fg & 0xff;
+
+	do {
+		for (int i = 0; i < 4; i++)
+		{
+			uint32_t alpha = colormap[source[i]];
+			uint32_t inv_alpha = 64 - alpha;
+
+			uint32_t bg_red = (dest[i] >> 16) & 0xff;
+			uint32_t bg_green = (dest[i] >> 8) & 0xff;
+			uint32_t bg_blue = (dest[i]) & 0xff;
+
+			uint32_t red = (fg_red * alpha + bg_red * inv_alpha) / 64;
+			uint32_t green = (fg_green * alpha + bg_green * inv_alpha) / 64;
+			uint32_t blue = (fg_blue * alpha + bg_blue * inv_alpha) / 64;
+
+			dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue;
+		}
+		source += 4;
+		dest += pitch;
+	} while (--count);
+}
+
+// Adds one span at hx to the screen at sx with clamping.
+void rt_addclamp1col_RGBA_c (int hx, int sx, int yl, int yh)
+{
+	BYTE *colormap;
+	canvas_pixel_t *source;
+	canvas_pixel_t *dest;
+	int count;
+	int pitch;
+
+	count = yh-yl;
+	if (count < 0)
+		return;
+	count++;
+
+	dest = ylookup[yl] + sx + dc_destorg;
+	source = &dc_temp_rgba[yl*4 + hx];
+	pitch = dc_pitch;
+	colormap = dc_colormap;
+
+	uint32_t light = calc_light_multiplier(dc_light);
+
+	do {
+		uint32_t fg = shade_pal_index(colormap[*source], light);
+		uint32_t fg_red = (fg >> 16) & 0xff;
+		uint32_t fg_green = (fg >> 8) & 0xff;
+		uint32_t fg_blue = fg & 0xff;
+
+		uint32_t bg_red = (*dest >> 16) & 0xff;
+		uint32_t bg_green = (*dest >> 8) & 0xff;
+		uint32_t bg_blue = (*dest) & 0xff;
+
+		uint32_t red = clamp<uint32_t>(fg_red + bg_red, 0, 255);
+		uint32_t green = clamp<uint32_t>(fg_green + bg_green, 0, 255);
+		uint32_t blue = clamp<uint32_t>(fg_blue + bg_blue, 0, 255);
+
+		*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
+		source += 4;
+		dest += pitch;
+	} while (--count);
+}
+
+// Adds all four spans to the screen starting at sx with clamping.
+void rt_addclamp4cols_RGBA_c (int sx, int yl, int yh)
+{
+	BYTE *colormap;
+	canvas_pixel_t *source;
+	canvas_pixel_t *dest;
+	int count;
+	int pitch;
+
+	count = yh-yl;
+	if (count < 0)
+		return;
+	count++;
+
+	dest = ylookup[yl] + sx + dc_destorg;
+	source = &dc_temp_rgba[yl*4];
+	pitch = dc_pitch;
+	colormap = dc_colormap;
+
+	uint32_t light = calc_light_multiplier(dc_light);
+
+	do {
+		for (int i = 0; i < 4; i++)
+		{
+			uint32_t fg = shade_pal_index(colormap[source[i]], light);
+			uint32_t fg_red = (fg >> 16) & 0xff;
+			uint32_t fg_green = (fg >> 8) & 0xff;
+			uint32_t fg_blue = fg & 0xff;
+
+			uint32_t bg_red = (dest[i] >> 16) & 0xff;
+			uint32_t bg_green = (dest[i] >> 8) & 0xff;
+			uint32_t bg_blue = (dest[i]) & 0xff;
+
+			uint32_t red = clamp<uint32_t>(fg_red + bg_red, 0, 255);
+			uint32_t green = clamp<uint32_t>(fg_green + bg_green, 0, 255);
+			uint32_t blue = clamp<uint32_t>(fg_blue + bg_blue, 0, 255);
+
+			dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue;
+		}
+		source += 4;
+		dest += pitch;
+	} while (--count);
+}
+
+// Translates and adds one span at hx to the screen at sx with clamping.
+void rt_tlateaddclamp1col_RGBA_c (int hx, int sx, int yl, int yh)
+{
+	rt_Translate1col_RGBA_c(dc_translation, hx, yl, yh);
+	rt_addclamp1col_RGBA_c(hx, sx, yl, yh);
+}
+
+// Translates and adds all four spans to the screen starting at sx with clamping.
+void rt_tlateaddclamp4cols_RGBA_c (int sx, int yl, int yh)
+{
+	rt_Translate4cols_RGBA_c(dc_translation, yl, yh);
+	rt_addclamp4cols(sx, yl, yh);
+}
+
+// Subtracts one span at hx to the screen at sx with clamping.
+void rt_subclamp1col_RGBA_c (int hx, int sx, int yl, int yh)
+{
+	BYTE *colormap;
+	canvas_pixel_t *source;
+	canvas_pixel_t *dest;
+	int count;
+	int pitch;
+
+	count = yh-yl;
+	if (count < 0)
+		return;
+	count++;
+
+	dest = ylookup[yl] + sx + dc_destorg;
+	source = &dc_temp_rgba[yl*4 + hx];
+	pitch = dc_pitch;
+	colormap = dc_colormap;
+
+	uint32_t light = calc_light_multiplier(dc_light);
+
+	do {
+		uint32_t fg = shade_pal_index(colormap[*source], light);
+		uint32_t fg_red = (fg >> 16) & 0xff;
+		uint32_t fg_green = (fg >> 8) & 0xff;
+		uint32_t fg_blue = fg & 0xff;
+
+		uint32_t bg_red = (*dest >> 16) & 0xff;
+		uint32_t bg_green = (*dest >> 8) & 0xff;
+		uint32_t bg_blue = (*dest) & 0xff;
+
+		uint32_t red = clamp<uint32_t>(256 - fg_red + bg_red, 256, 256 + 255) - 256;
+		uint32_t green = clamp<uint32_t>(256 - fg_green + bg_green, 256, 256 + 255) - 256;
+		uint32_t blue = clamp<uint32_t>(256 - fg_blue + bg_blue, 256, 256 + 255) - 256;
+
+		*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
+		source += 4;
+		dest += pitch;
+	} while (--count);
+}
+
+// Subtracts all four spans to the screen starting at sx with clamping.
+void rt_subclamp4cols_RGBA_c (int sx, int yl, int yh)
+{
+	BYTE *colormap;
+	canvas_pixel_t *source;
+	canvas_pixel_t *dest;
+	int count;
+	int pitch;
+
+	count = yh-yl;
+	if (count < 0)
+		return;
+	count++;
+
+	dest = ylookup[yl] + sx + dc_destorg;
+	source = &dc_temp_rgba[yl*4];
+	pitch = dc_pitch;
+	colormap = dc_colormap;
+
+	uint32_t light = calc_light_multiplier(dc_light);
+
+	do {
+		for (int i = 0; i < 4; i++)
+		{
+			uint32_t fg = shade_pal_index(colormap[source[i]], light);
+			uint32_t fg_red = (fg >> 16) & 0xff;
+			uint32_t fg_green = (fg >> 8) & 0xff;
+			uint32_t fg_blue = fg & 0xff;
+
+			uint32_t bg_red = (dest[i] >> 16) & 0xff;
+			uint32_t bg_green = (dest[i] >> 8) & 0xff;
+			uint32_t bg_blue = (dest[i]) & 0xff;
+
+			uint32_t red = clamp<uint32_t>(256 - fg_red + bg_red, 256, 256 + 255) - 256;
+			uint32_t green = clamp<uint32_t>(256 - fg_green + bg_green, 256, 256 + 255) - 256;
+			uint32_t blue = clamp<uint32_t>(256 - fg_blue + bg_blue, 256, 256 + 255) - 256;
+
+			dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue;
+		}
+
+		source += 4;
+		dest += pitch;
+	} while (--count);
+}
+
+// Translates and subtracts one span at hx to the screen at sx with clamping.
+void rt_tlatesubclamp1col_RGBA_c (int hx, int sx, int yl, int yh)
+{
+	rt_Translate1col_RGBA_c(dc_translation, hx, yl, yh);
+	rt_subclamp1col_RGBA_c(hx, sx, yl, yh);
+}
+
+// Translates and subtracts all four spans to the screen starting at sx with clamping.
+void rt_tlatesubclamp4cols_RGBA_c (int sx, int yl, int yh)
+{
+	rt_Translate4cols_RGBA_c(dc_translation, yl, yh);
+	rt_subclamp4cols_RGBA_c(sx, yl, yh);
+}
+
+// Subtracts one span at hx from the screen at sx with clamping.
+void rt_revsubclamp1col_RGBA_c (int hx, int sx, int yl, int yh)
+{
+	BYTE *colormap;
+	canvas_pixel_t *source;
+	canvas_pixel_t *dest;
+	int count;
+	int pitch;
+
+	count = yh-yl;
+	if (count < 0)
+		return;
+	count++;
+
+	DWORD *fg2rgb = dc_srcblend;
+	DWORD *bg2rgb = dc_destblend;
+	dest = ylookup[yl] + sx + dc_destorg;
+	source = &dc_temp_rgba[yl*4 + hx];
+	pitch = dc_pitch;
+	colormap = dc_colormap;
+
+	uint32_t light = calc_light_multiplier(dc_light);
+
+	do {
+		uint32_t fg = shade_pal_index(colormap[*source], light);
+		uint32_t fg_red = (fg >> 16) & 0xff;
+		uint32_t fg_green = (fg >> 8) & 0xff;
+		uint32_t fg_blue = fg & 0xff;
+
+		uint32_t bg_red = (*dest >> 16) & 0xff;
+		uint32_t bg_green = (*dest >> 8) & 0xff;
+		uint32_t bg_blue = (*dest) & 0xff;
+
+		uint32_t red = clamp<uint32_t>(256 + fg_red - bg_red, 256, 256 + 255) - 256;
+		uint32_t green = clamp<uint32_t>(256 + fg_green - bg_green, 256, 256 + 255) - 256;
+		uint32_t blue = clamp<uint32_t>(256 + fg_blue - bg_blue, 256, 256 + 255) - 256;
+
+		*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
+		source += 4;
+		dest += pitch;
+	} while (--count);
+}
+
+// Subtracts all four spans from the screen starting at sx with clamping.
+void rt_revsubclamp4cols_RGBA_c (int sx, int yl, int yh)
+{
+	BYTE *colormap;
+	canvas_pixel_t *source;
+	canvas_pixel_t *dest;
+	int count;
+	int pitch;
+
+	count = yh-yl;
+	if (count < 0)
+		return;
+	count++;
+
+	DWORD *fg2rgb = dc_srcblend;
+	DWORD *bg2rgb = dc_destblend;
+	dest = ylookup[yl] + sx + dc_destorg;
+	source = &dc_temp_rgba[yl*4];
+	pitch = dc_pitch;
+	colormap = dc_colormap;
+
+	uint32_t light = calc_light_multiplier(dc_light);
+
+	do {
+		for (int i = 0; i < 4; i++)
+		{
+			uint32_t fg = shade_pal_index(colormap[source[i]], light);
+			uint32_t fg_red = (fg >> 16) & 0xff;
+			uint32_t fg_green = (fg >> 8) & 0xff;
+			uint32_t fg_blue = fg & 0xff;
+
+			uint32_t bg_red = (dest[i] >> 16) & 0xff;
+			uint32_t bg_green = (dest[i] >> 8) & 0xff;
+			uint32_t bg_blue = (dest[i]) & 0xff;
+
+			uint32_t red = clamp<uint32_t>(256 + fg_red - bg_red, 256, 256 + 255) - 256;
+			uint32_t green = clamp<uint32_t>(256 + fg_green - bg_green, 256, 256 + 255) - 256;
+			uint32_t blue = clamp<uint32_t>(256 + fg_blue - bg_blue, 256, 256 + 255) - 256;
+
+			dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue;
+		}
+
+		source += 4;
+		dest += pitch;
+	} while (--count);
+}
+
+// Translates and subtracts one span at hx from the screen at sx with clamping.
+void rt_tlaterevsubclamp1col_RGBA_c (int hx, int sx, int yl, int yh)
+{
+	rt_Translate1col_RGBA_c(dc_translation, hx, yl, yh);
+	rt_revsubclamp1col_RGBA_c(hx, sx, yl, yh);
+}
+
+// Translates and subtracts all four spans from the screen starting at sx with clamping.
+void rt_tlaterevsubclamp4cols_RGBA_c (int sx, int yl, int yh)
+{
+	rt_Translate4cols_RGBA_c(dc_translation, yl, yh);
+	rt_revsubclamp4cols_RGBA_c(sx, yl, yh);
+}
+
+// Before each pass through a rendering loop that uses these routines,
+// call this function to set up the span pointers.
+void rt_initcols_rgba (canvas_pixel_t *buff)
+{
+	int y;
+
+	dc_temp_rgba = buff == NULL ? dc_temp_rgbabuff_rgba : buff;
+	for (y = 3; y >= 0; y--)
+		horizspan[y] = dc_ctspan[y] = &dc_tspans[y][0];
+}
+
+// Stretches a column into a temporary buffer which is later
+// drawn to the screen along with up to three other columns.
+void R_DrawColumnHorizP_RGBA_C (void)
+{
+	int count = dc_count;
+	canvas_pixel_t *dest;
+	fixed_t fracstep;
+	fixed_t frac;
+
+	if (count <= 0)
+		return;
+
+	{
+		int x = dc_x & 3;
+		unsigned int **span;
+		
+		span = &dc_ctspan[x];
+		(*span)[0] = dc_yl;
+		(*span)[1] = dc_yh;
+		*span += 2;
+		dest = &dc_temp_rgba[x + 4*dc_yl];
+	}
+	fracstep = dc_iscale;
+	frac = dc_texturefrac;
+
+	{
+		const BYTE *source = dc_source;
+
+		if (count & 1) {
+			*dest = source[frac>>FRACBITS]; dest += 4; frac += fracstep;
+		}
+		if (count & 2) {
+			dest[0] = source[frac>>FRACBITS]; frac += fracstep;
+			dest[4] = source[frac>>FRACBITS]; frac += fracstep;
+			dest += 8;
+		}
+		if (count & 4) {
+			dest[0] = source[frac>>FRACBITS]; frac += fracstep;
+			dest[4] = source[frac>>FRACBITS]; frac += fracstep;
+			dest[8] = source[frac>>FRACBITS]; frac += fracstep;
+			dest[12]= source[frac>>FRACBITS]; frac += fracstep;
+			dest += 16;
+		}
+		count >>= 3;
+		if (!count) return;
+
+		do
+		{
+			dest[0] = source[frac>>FRACBITS]; frac += fracstep;
+			dest[4] = source[frac>>FRACBITS]; frac += fracstep;
+			dest[8] = source[frac>>FRACBITS]; frac += fracstep;
+			dest[12]= source[frac>>FRACBITS]; frac += fracstep;
+			dest[16]= source[frac>>FRACBITS]; frac += fracstep;
+			dest[20]= source[frac>>FRACBITS]; frac += fracstep;
+			dest[24]= source[frac>>FRACBITS]; frac += fracstep;
+			dest[28]= source[frac>>FRACBITS]; frac += fracstep;
+			dest += 32;
+		} while (--count);
+	}
+}
+
+// [RH] Just fills a column with a given color
+void R_FillColumnHorizP_RGBA_C (void)
+{
+	int count = dc_count;
+	BYTE color = dc_color;
+	canvas_pixel_t *dest;
+
+	if (count <= 0)
+		return;
+
+	{
+		int x = dc_x & 3;
+		unsigned int **span = &dc_ctspan[x];
+
+		(*span)[0] = dc_yl;
+		(*span)[1] = dc_yh;
+		*span += 2;
+		dest = &dc_temp_rgba[x + 4*dc_yl];
+	}
+
+	if (count & 1) {
+		*dest = color;
+		dest += 4;
+	}
+	if (!(count >>= 1))
+		return;
+	do {
+		dest[0] = color; dest[4] = color;
+		dest += 8;
+	} while (--count);
+}
diff --git a/src/r_main.cpp b/src/r_main.cpp
index 04e798981..b7723d07d 100644
--- a/src/r_main.cpp
+++ b/src/r_main.cpp
@@ -847,10 +847,10 @@ void R_RenderActorView (AActor *actor, bool dontmaplines)
 	// [RH] Show off segs if r_drawflat is 1
 	if (r_drawflat)
 	{
-		hcolfunc_pre = R_FillColumnHorizP;
+		hcolfunc_pre = R_FillColumnHoriz;
 		hcolfunc_post1 = rt_copy1col;
 		hcolfunc_post4 = rt_copy4cols;
-		colfunc = R_FillColumnP;
+		colfunc = R_FillColumn;
 		spanfunc = R_FillSpan;
 	}
 	else
diff --git a/src/r_plane.cpp b/src/r_plane.cpp
index b385302e5..c8258a1ba 100644
--- a/src/r_plane.cpp
+++ b/src/r_plane.cpp
@@ -491,18 +491,19 @@ void R_MapTiltedPlane (int y, int x1)
 //
 //==========================================================================
 
-void R_MapColoredPlane (int y, int x1)
+void R_MapColoredPlane_C (int y, int x1)
+{
+	memset (ylookup[y] + x1 + dc_destorg, ds_color, (spanend[y] - x1 + 1));
+}
+
+void R_MapColoredPlane_RGBA(int y, int x1)
 {
-#ifndef PALETTEOUTPUT
 	canvas_pixel_t *dest = ylookup[y] + x1 + dc_destorg;
 	int count = (spanend[y] - x1 + 1);
 	uint32_t light = calc_light_multiplier(ds_light);
 	uint32_t color = shade_pal_index(ds_color, light);
 	for (int i = 0; i < count; i++)
 		dest[i] = color;
-#else
-	memset (ylookup[y] + x1 + dc_destorg, ds_color, (spanend[y] - x1 + 1) * sizeof(canvas_pixel_t));
-#endif
 }
 
 //==========================================================================
diff --git a/src/r_plane.h b/src/r_plane.h
index d4db3dc09..ac63501e3 100644
--- a/src/r_plane.h
+++ b/src/r_plane.h
@@ -93,6 +93,10 @@ void R_DrawNormalPlane (visplane_t *pl, double xscale, double yscale, fixed_t al
 void R_DrawTiltedPlane (visplane_t *pl, double xscale, double yscale, fixed_t alpha, bool additive, bool masked);
 void R_MapVisPlane (visplane_t *pl, void (*mapfunc)(int y, int x1));
 
+extern void(*R_MapColoredPlane)(int y, int x1);
+void R_MapColoredPlane_C(int y, int x1);
+void R_MapColoredPlane_RGBA(int y, int x1);
+
 visplane_t *R_FindPlane
 ( const secplane_t &height,
   FTextureID	picnum,
diff --git a/src/r_segs.cpp b/src/r_segs.cpp
index 1cdb78555..fb27a99de 100644
--- a/src/r_segs.cpp
+++ b/src/r_segs.cpp
@@ -463,7 +463,7 @@ void R_RenderMaskedSegRange (drawseg_t *ds, int x1, int x2)
 
 			while (dc_x < stop)
 			{
-				rt_initcols();
+				rt_initcols(nullptr);
 				BlastMaskedColumn (R_DrawMaskedColumnHoriz, tex); dc_x++;
 				BlastMaskedColumn (R_DrawMaskedColumnHoriz, tex); dc_x++;
 				BlastMaskedColumn (R_DrawMaskedColumnHoriz, tex); dc_x++;
@@ -3319,7 +3319,7 @@ static void R_RenderDecal (side_t *wall, DBaseDecal *decal, drawseg_t *clipper,
 					dc_light = 0;
 #endif
 				}
-				rt_initcols();
+				rt_initcols(nullptr);
 				for (int zz = 4; zz; --zz)
 				{
 					R_WallSpriteColumn (R_DrawMaskedColumnHoriz);
diff --git a/src/r_things.cpp b/src/r_things.cpp
index 0e55b45f9..a6f6aea28 100644
--- a/src/r_things.cpp
+++ b/src/r_things.cpp
@@ -470,7 +470,7 @@ void R_DrawVisSprite (vissprite_t *vis)
 
 			while (dc_x < stop4)
 			{
-				rt_initcols();
+				rt_initcols(nullptr);
 				for (int zz = 4; zz; --zz)
 				{
 					pixels = tex->GetColumn (frac >> FRACBITS, &spans);
@@ -619,7 +619,7 @@ void R_DrawWallSprite(vissprite_t *spr)
 				dc_light = FLOAT2FIXED(MAXLIGHTVIS);
 #endif
 			}
-			rt_initcols();
+			rt_initcols(nullptr);
 			for (int zz = 4; zz; --zz)
 			{
 				if (!R_ClipSpriteColumnWithPortals(spr))
@@ -681,7 +681,7 @@ void R_DrawVisVoxel(vissprite_t *spr, int minslabz, int maxslabz, short *cliptop
 	{
 		return;
 	}
-	if (colfunc == fuzzcolfunc || colfunc == R_FillColumnP)
+	if (colfunc == fuzzcolfunc || colfunc == R_FillColumn)
 	{
 		flags = DVF_OFFSCREEN | DVF_SPANSONLY;
 	}
@@ -2617,7 +2617,7 @@ static void R_DrawMaskedSegsBehindParticle (const vissprite_t *vis)
 	}
 }
 
-void R_DrawParticle (vissprite_t *vis)
+void R_DrawParticle_C (vissprite_t *vis)
 {
 	int spacing;
 	canvas_pixel_t *dest;
@@ -2629,44 +2629,6 @@ void R_DrawParticle (vissprite_t *vis)
 
 	R_DrawMaskedSegsBehindParticle (vis);
 
-#ifndef PALETTEOUTPUT
-	uint32_t fg = shade_pal_index(color, calc_light_multiplier(0));
-	uint32_t fg_red = (fg >> 16) & 0xff;
-	uint32_t fg_green = (fg >> 8) & 0xff;
-	uint32_t fg_blue = fg & 0xff;
-
-	// vis->renderflags holds translucency level (0-255)
-	fixed_t fglevel = ((vis->renderflags + 1) << 8) & ~0x3ff;
-	uint32_t alpha = fglevel * 256 / FRACUNIT;
-	uint32_t inv_alpha = 256 - alpha;
-
-	fg_red *= alpha;
-	fg_green *= alpha;
-	fg_blue *= alpha;
-
-	spacing = RenderTarget->GetPitch();
-
-	for (int x = x1; x < (x1 + countbase); x++)
-	{
-		dc_x = x;
-		if (R_ClipSpriteColumnWithPortals(vis))
-			continue;
-		dest = ylookup[yl] + x + dc_destorg;
-		for (int y = 0; y < ycount; y++)
-		{
-			uint32_t bg_red = (*dest >> 16) & 0xff;
-			uint32_t bg_green = (*dest >> 8) & 0xff;
-			uint32_t bg_blue = (*dest) & 0xff;
-
-			uint32_t red = (fg_red + bg_red * alpha) / 256;
-			uint32_t green = (fg_green + bg_green * alpha) / 256;
-			uint32_t blue = (fg_blue + bg_blue * alpha) / 256;
-
-			*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
-			dest += spacing;
-		}
-	}
-#else
 	DWORD *bg2rgb;
 	DWORD fg;
 
@@ -2719,7 +2681,56 @@ void R_DrawParticle (vissprite_t *vis)
 			dest += spacing;
 		}
 	}
-#endif
+}
+
+void R_DrawParticle_RGBA(vissprite_t *vis)
+{
+	int spacing;
+	canvas_pixel_t *dest;
+	BYTE color = vis->Style.colormap[vis->startfrac];
+	int yl = vis->y1;
+	int ycount = vis->y2 - yl + 1;
+	int x1 = vis->x1;
+	int countbase = vis->x2 - x1;
+
+	R_DrawMaskedSegsBehindParticle(vis);
+
+	uint32_t fg = shade_pal_index(color, calc_light_multiplier(0));
+	uint32_t fg_red = (fg >> 16) & 0xff;
+	uint32_t fg_green = (fg >> 8) & 0xff;
+	uint32_t fg_blue = fg & 0xff;
+
+	// vis->renderflags holds translucency level (0-255)
+	fixed_t fglevel = ((vis->renderflags + 1) << 8) & ~0x3ff;
+	uint32_t alpha = fglevel * 256 / FRACUNIT;
+	uint32_t inv_alpha = 256 - alpha;
+
+	fg_red *= alpha;
+	fg_green *= alpha;
+	fg_blue *= alpha;
+
+	spacing = RenderTarget->GetPitch();
+
+	for (int x = x1; x < (x1 + countbase); x++)
+	{
+		dc_x = x;
+		if (R_ClipSpriteColumnWithPortals(vis))
+			continue;
+		dest = ylookup[yl] + x + dc_destorg;
+		for (int y = 0; y < ycount; y++)
+		{
+			uint32_t bg_red = (*dest >> 16) & 0xff;
+			uint32_t bg_green = (*dest >> 8) & 0xff;
+			uint32_t bg_blue = (*dest) & 0xff;
+
+			uint32_t red = (fg_red + bg_red * alpha) / 256;
+			uint32_t green = (fg_green + bg_green * alpha) / 256;
+			uint32_t blue = (fg_blue + bg_blue * alpha) / 256;
+
+			*dest = 0xff000000 | (red << 16) | (green << 8) | blue;
+			dest += spacing;
+		}
+	}
 }
 
 extern double BaseYaspectMul;;
diff --git a/src/r_things.h b/src/r_things.h
index 1cf9b0200..057b7cfe2 100644
--- a/src/r_things.h
+++ b/src/r_things.h
@@ -97,7 +97,10 @@ struct vissprite_t
 
 struct particle_t;
 
-void R_DrawParticle (vissprite_t *);
+extern void(*R_DrawParticle)(vissprite_t *);
+void R_DrawParticle_C (vissprite_t *);
+void R_DrawParticle_RGBA (vissprite_t *);
+
 void R_ProjectParticle (particle_t *, const sector_t *sector, int shade, int fakeside);
 
 extern int MaxVisSprites;
diff --git a/src/v_draw.cpp b/src/v_draw.cpp
index fd14b5e0a..8853fc947 100644
--- a/src/v_draw.cpp
+++ b/src/v_draw.cpp
@@ -300,7 +300,7 @@ void DCanvas::DrawTextureParms(FTexture *img, DrawParms &parms)
 
 			while (dc_x < stop4)
 			{
-				rt_initcols();
+				rt_initcols(nullptr);
 				for (int zz = 4; zz; --zz)
 				{
 					pixels = img->GetColumn(frac >> FRACBITS, spanptr);