diff --git a/src/hardware/hw_cache.c b/src/hardware/hw_cache.c
index 84ad4c55b..504ded35b 100644
--- a/src/hardware/hw_cache.c
+++ b/src/hardware/hw_cache.c
@@ -679,21 +679,6 @@ static void HWR_LoadPatchFlat(GLMipmap_t *grMipmap, lumpnum_t flatlumpnum)
 	R_FlatPatch(patch, Z_Malloc(grMipmap->width * grMipmap->height, PU_HWRCACHE, &grMipmap->grInfo.data));
 }
 
-static void HWR_LoadTextureFlat(GLMipmap_t *grMipmap, INT32 texturenum)
-{
-	// setup the texture info
-	grMipmap->grInfo.smallLodLog2 = GR_LOD_LOG2_64;
-	grMipmap->grInfo.largeLodLog2 = GR_LOD_LOG2_64;
-	grMipmap->grInfo.aspectRatioLog2 = GR_ASPECT_LOG2_1x1;
-	grMipmap->grInfo.format = GR_TEXFMT_P_8;
-	grMipmap->flags = TF_WRAPXY|TF_CHROMAKEYED;
-
-	grMipmap->width  = (UINT16)textures[texturenum]->width;
-	grMipmap->height = (UINT16)textures[texturenum]->height;
-
-	R_FlatTexture(texturenum, Z_Malloc(grMipmap->width * grMipmap->height, PU_HWRCACHE, &grMipmap->grInfo.data));
-}
-
 static void HWR_CacheFlat(GLMipmap_t *grMipmap, lumpnum_t flatlumpnum)
 {
 	size_t size, pflatsize;
@@ -765,6 +750,21 @@ void HWR_GetFlat(lumpnum_t flatlumpnum)
 		gr_patchflat = flatlumpnum;
 }
 
+static void HWR_LoadTextureFlat(GLMipmap_t *grMipmap, INT32 texturenum)
+{
+	// Fill in grMipmap so textures[texturenum] can be used as a flat; the LOD/aspect/format fields are fixed constants, independent of the texture's size.
+	grMipmap->grInfo.smallLodLog2 = GR_LOD_LOG2_64;
+	grMipmap->grInfo.largeLodLog2 = GR_LOD_LOG2_64;
+	grMipmap->grInfo.aspectRatioLog2 = GR_ASPECT_LOG2_1x1;
+	grMipmap->grInfo.format = GR_TEXFMT_P_8;
+	grMipmap->flags = TF_WRAPXY|TF_CHROMAKEYED;
+
+	grMipmap->width  = (UINT16)textures[texturenum]->width; // dimensions are taken from the texture entry itself
+	grMipmap->height = (UINT16)textures[texturenum]->height;
+
+	R_FlatTexture(texturenum, Z_Malloc(grMipmap->width * grMipmap->height, PU_HWRCACHE, &grMipmap->grInfo.data)); // width*height byte buffer, tagged PU_HWRCACHE, handed to R_FlatTexture as its output
+}
+
 void HWR_GetTextureFlat(INT32 texturenum)
 {
 	GLTexture_t *grtex;
diff --git a/src/hardware/hw_main.c b/src/hardware/hw_main.c
index 793050aa2..02e731164 100644
--- a/src/hardware/hw_main.c
+++ b/src/hardware/hw_main.c
@@ -531,6 +531,8 @@ static void HWR_RenderPlane(sector_t *sector, extrasubsector_t *xsub, boolean is
 	INT32             i;
 	float           flatxref,flatyref;
 	float fflatwidth, fflatheight;
+	INT32 flatflag;
+	boolean texflat = true;
 	size_t len;
 	float scrollx = 0.0f, scrolly = 0.0f;
 	angle_t angle = 0;
@@ -622,22 +624,25 @@ static void HWR_RenderPlane(sector_t *sector, extrasubsector_t *xsub, boolean is
 			break;
 	}
 
-	if (gr_patchflat && R_CheckIfPatch(gr_patchflat))		// Just in case?
-	{
-		patch = (patch_t *)W_CacheLumpNum(gr_patchflat, PU_STATIC);
-		fflatwidth = patch->width;
-		fflatheight = patch->height;
-	}
+	flatflag = ((INT32)fflatwidth)-1;
 
 	if (texturenum != 0 && texturenum != -1)
 	{
 		fflatwidth = textures[texturenum]->width;
 		fflatheight = textures[texturenum]->height;
 	}
+	else if (gr_patchflat && R_CheckIfPatch(gr_patchflat))		// Just in case?
+	{
+		patch = (patch_t *)W_CacheLumpNum(gr_patchflat, PU_STATIC);
+		fflatwidth = SHORT(patch->width);
+		fflatheight = SHORT(patch->height);
+	}
+	else
+		texflat = false;
 
 	// reference point for flat texture coord for each vertex around the polygon
-	flatxref = (float)((FLOAT_TO_FIXED(pv->x) % llrint(fflatwidth)) / fflatwidth);
-	flatyref = (float)((FLOAT_TO_FIXED(pv->y) % llrint(fflatheight)) / fflatheight);
+	flatxref = (float)(((fixed_t)pv->x & (~flatflag)) / fflatwidth);
+	flatyref = (float)(((fixed_t)pv->y & (~flatflag)) / fflatheight);
 
 	// transform
 	v3d = planeVerts;
@@ -693,17 +698,24 @@ static void HWR_RenderPlane(sector_t *sector, extrasubsector_t *xsub, boolean is
 	for (i = 0; i < nrPlaneVerts; i++,v3d++,pv++)
 	{
 		// Hurdler: add scrolling texture on floor/ceiling
-		v3d->sow = (float)((pv->x / fflatwidth) - flatxref + scrollx);
-		v3d->tow = (float)(flatyref - (pv->y / fflatheight) + scrolly);
-
-		//v3d->sow = (float)(pv->x / fflatsize);
-		//v3d->tow = (float)(pv->y / fflatsize);
+		if (texflat)
+		{
+			v3d->sow = (float)(pv->x / fflatwidth) + scrollx;
+			v3d->tow = -(float)(pv->y / fflatheight) + scrolly;
+		}
+		else
+		{
+			v3d->sow = (float)((pv->x / fflatwidth) - flatxref + scrollx);
+			v3d->tow = (float)(flatyref - (pv->y / fflatheight) + scrolly);
+		}
 
 		// Need to rotate before translate
 		if (angle) // Only needs to be done if there's an altered angle
 		{
 			tempxsow = FLOAT_TO_FIXED(v3d->sow);
 			tempytow = FLOAT_TO_FIXED(v3d->tow);
+			if (texflat)
+				tempytow = -tempytow;
 			v3d->sow = (FIXED_TO_FLOAT(FixedMul(tempxsow, FINECOSINE(angle)) - FixedMul(tempytow, FINESINE(angle))));
 			v3d->tow = (FIXED_TO_FLOAT(-FixedMul(tempxsow, FINESINE(angle)) - FixedMul(tempytow, FINECOSINE(angle))));
 		}
diff --git a/src/r_data.c b/src/r_data.c
index 4157a8850..092dc069a 100644
--- a/src/r_data.c
+++ b/src/r_data.c
@@ -1694,7 +1694,7 @@ void R_FlatPatch(patch_t *patch, UINT8 *flat)
 	UINT8 *source;
 
 	desttop = flat;
-	deststop = desttop + (patch->width * patch->height);
+	deststop = desttop + (SHORT(patch->width) * SHORT(patch->height));
 
 	for (col = 0; col < SHORT(patch->width); col++, desttop++)
 	{
@@ -1708,13 +1708,13 @@ void R_FlatPatch(patch_t *patch, UINT8 *flat)
 				topdelta += prevdelta;
 			prevdelta = topdelta;
 
-			dest = desttop + (topdelta * patch->width);
+			dest = desttop + (topdelta * SHORT(patch->width));
 			source = (UINT8 *)(column) + 3;
 			for (ofs = 0; dest < deststop && ofs < column->length; ofs++)
 			{
 				if (source[ofs] != TRANSPARENTPIXEL)
 					*dest = source[ofs];
-				dest += patch->width;
+				dest += SHORT(patch->width);
 			}
 			column = (column_t *)((UINT8 *)column + column->length + 4);
 		}
@@ -1733,9 +1733,8 @@ void R_FlatTexture(size_t tex, UINT8 *flat)
 	desttop = flat;
 	deststop = desttop + (texture->width * texture->height);
 
-	for (col = 0; col < SHORT(texture->width); col++, desttop++)
+	for (col = 0; col < texture->width; col++, desttop++)
 	{
-		INT32 topdelta, prevdelta = -1;
 		column = (column_t *)R_GetColumn(tex, col);
 		if (!texture->holes)
 		{
@@ -1750,6 +1749,7 @@ void R_FlatTexture(size_t tex, UINT8 *flat)
 		}
 		else
 		{
+			INT32 topdelta, prevdelta = -1;
 			while (column->topdelta != 0xff)
 			{
 				topdelta = column->topdelta;
diff --git a/src/r_draw8.c b/src/r_draw8.c
index f829707d3..542572707 100644
--- a/src/r_draw8.c
+++ b/src/r_draw8.c
@@ -550,12 +550,18 @@ void R_DrawSpan_8 (void)
 	UINT8 *dest;
 	const UINT8 *deststop = screens[0] + vid.rowbytes * vid.height;
 
-	UINT32 flatsize = ds_flatwidth * ds_flatheight;
 	size_t count = (ds_x2 - ds_x1 + 1);
 
 	xposition = ds_xfrac; yposition = ds_yfrac;
 	xstep = ds_xstep; ystep = ds_ystep;
 
+	// SoM: For power-of-two flats we only need 6 bits for the integer part (0 through 63), so
+	// the rest can be used for the fraction part. This allows the memory address in the
+	// texture to be calculated with two shifts, an OR and one AND (see below).
+	// For texture sizes > 64 the amount of precision we can allow decreases, but only by one
+	// bit per power of two.
+	// Because the intermediate "spot" variable could be eliminated below, this function is
+	// now faster than the original span renderer.
 	if (ds_powersoftwo)
 	{
 		xposition <<= nflatshiftup; yposition <<= nflatshiftup;
@@ -566,7 +572,7 @@ void R_DrawSpan_8 (void)
 	colormap = ds_colormap;
 	dest = ylookup[ds_y] + columnofs[ds_x1];
 
-	if (dest > deststop)
+	if (dest+8 > deststop)
 		return;
 
 	if (!ds_powersoftwo)
@@ -582,13 +588,56 @@ void R_DrawSpan_8 (void)
 			if (y < 0)
 				y = ds_flatheight - ((UINT32)(ds_flatheight - y) % ds_flatheight);
 
-			*dest++ = colormap[source[((y * ds_flatwidth) + x) % flatsize]];
+			x %= ds_flatwidth;
+			y %= ds_flatheight;
+
+			*dest++ = colormap[source[((y * ds_flatwidth) + x)]];
 			xposition += xstep;
 			yposition += ystep;
 		}
 	}
 	else
 	{
+		while (count >= 8)
+		{
+			// SoM: Why didn't I see this earlier? the spot variable is a waste now because we don't
+			// have the uber complicated math to calculate it now, so that was a memory write we didn't
+			// need!
+			dest[0] = colormap[source[(((UINT32)yposition >> nflatyshift) & nflatmask) | ((UINT32)xposition >> nflatxshift)]];
+			xposition += xstep;
+			yposition += ystep;
+
+			dest[1] = colormap[source[(((UINT32)yposition >> nflatyshift) & nflatmask) | ((UINT32)xposition >> nflatxshift)]];
+			xposition += xstep;
+			yposition += ystep;
+
+			dest[2] = colormap[source[(((UINT32)yposition >> nflatyshift) & nflatmask) | ((UINT32)xposition >> nflatxshift)]];
+			xposition += xstep;
+			yposition += ystep;
+
+			dest[3] = colormap[source[(((UINT32)yposition >> nflatyshift) & nflatmask) | ((UINT32)xposition >> nflatxshift)]];
+			xposition += xstep;
+			yposition += ystep;
+
+			dest[4] = colormap[source[(((UINT32)yposition >> nflatyshift) & nflatmask) | ((UINT32)xposition >> nflatxshift)]];
+			xposition += xstep;
+			yposition += ystep;
+
+			dest[5] = colormap[source[(((UINT32)yposition >> nflatyshift) & nflatmask) | ((UINT32)xposition >> nflatxshift)]];
+			xposition += xstep;
+			yposition += ystep;
+
+			dest[6] = colormap[source[(((UINT32)yposition >> nflatyshift) & nflatmask) | ((UINT32)xposition >> nflatxshift)]];
+			xposition += xstep;
+			yposition += ystep;
+
+			dest[7] = colormap[source[(((UINT32)yposition >> nflatyshift) & nflatmask) | ((UINT32)xposition >> nflatxshift)]];
+			xposition += xstep;
+			yposition += ystep;
+
+			dest += 8;
+			count -= 8;
+		}
 		while (count-- && dest <= deststop)
 		{
 			*dest++ = colormap[source[(((UINT32)yposition >> nflatyshift) & nflatmask) | ((UINT32)xposition >> nflatxshift)]];
@@ -1052,13 +1101,19 @@ void R_DrawSplat_8 (void)
 	UINT8 *dest;
 	const UINT8 *deststop = screens[0] + vid.rowbytes * vid.height;
 
-	UINT32 flatsize = ds_flatwidth * ds_flatheight;
 	size_t count = (ds_x2 - ds_x1 + 1);
 	UINT32 val;
 
 	xposition = ds_xfrac; yposition = ds_yfrac;
 	xstep = ds_xstep; ystep = ds_ystep;
 
+	// SoM: For power-of-two flats we only need 6 bits for the integer part (0 through 63), so
+	// the rest can be used for the fraction part. This allows the memory address in the
+	// texture to be calculated with two shifts, an OR and one AND (see below).
+	// For texture sizes > 64 the amount of precision we can allow decreases, but only by one
+	// bit per power of two.
+	// Because the intermediate "spot" variable could be eliminated below, this function is
+	// now faster than the original span renderer.
 	if (ds_powersoftwo)
 	{
 		xposition <<= nflatshiftup; yposition <<= nflatshiftup;
@@ -1082,7 +1137,10 @@ void R_DrawSplat_8 (void)
 			if (y < 0)
 				y = ds_flatheight - ((UINT32)(ds_flatheight - y) % ds_flatheight);
 
-			val = source[((y * ds_flatwidth) + x) % flatsize];
+			x %= ds_flatwidth;
+			y %= ds_flatheight;
+
+			val = source[((y * ds_flatwidth) + x)];
 			if (val != TRANSPARENTPIXEL)
 				*dest = colormap[val];
 			dest++;
@@ -1092,6 +1150,80 @@ void R_DrawSplat_8 (void)
 	}
 	else
 	{
+		while (count >= 8)
+		{
+			// SoM: Why didn't I see this earlier? the spot variable is a waste now because we don't
+			// have the uber complicated math to calculate it now, so that was a memory write we didn't
+			// need!
+			//
+			// <Callum> 4194303 = (2048x2048)-1 (2048x2048 is maximum flat size)
+			val = (((UINT32)yposition >> nflatyshift) & nflatmask) | ((UINT32)xposition >> nflatxshift);
+			val &= 4194303;
+			val = source[val];
+			if (val != TRANSPARENTPIXEL)
+				dest[0] = colormap[val];
+			xposition += xstep;
+			yposition += ystep;
+
+			val = (((UINT32)yposition >> nflatyshift) & nflatmask) | ((UINT32)xposition >> nflatxshift);
+			val &= 4194303;
+			val = source[val];
+			if (val != TRANSPARENTPIXEL)
+				dest[1] = colormap[val];
+			xposition += xstep;
+			yposition += ystep;
+
+			val = (((UINT32)yposition >> nflatyshift) & nflatmask) | ((UINT32)xposition >> nflatxshift);
+			val &= 4194303;
+			val = source[val];
+			if (val != TRANSPARENTPIXEL)
+				dest[2] = colormap[val];
+			xposition += xstep;
+			yposition += ystep;
+
+			val = (((UINT32)yposition >> nflatyshift) & nflatmask) | ((UINT32)xposition >> nflatxshift);
+			val &= 4194303;
+			val = source[val];
+			if (val != TRANSPARENTPIXEL)
+				dest[3] = colormap[val];
+			xposition += xstep;
+			yposition += ystep;
+
+			val = (((UINT32)yposition >> nflatyshift) & nflatmask) | ((UINT32)xposition >> nflatxshift);
+			val &= 4194303;
+			val = source[val];
+			if (val != TRANSPARENTPIXEL)
+				dest[4] = colormap[val];
+			xposition += xstep;
+			yposition += ystep;
+
+			val = (((UINT32)yposition >> nflatyshift) & nflatmask) | ((UINT32)xposition >> nflatxshift);
+			val &= 4194303;
+			val = source[val];
+			if (val != TRANSPARENTPIXEL)
+				dest[5] = colormap[val];
+			xposition += xstep;
+			yposition += ystep;
+
+			val = (((UINT32)yposition >> nflatyshift) & nflatmask) | ((UINT32)xposition >> nflatxshift);
+			val &= 4194303;
+			val = source[val];
+			if (val != TRANSPARENTPIXEL)
+				dest[6] = colormap[val];
+			xposition += xstep;
+			yposition += ystep;
+
+			val = (((UINT32)yposition >> nflatyshift) & nflatmask) | ((UINT32)xposition >> nflatxshift);
+			val &= 4194303;
+			val = source[val];
+			if (val != TRANSPARENTPIXEL)
+				dest[7] = colormap[val];
+			xposition += xstep;
+			yposition += ystep;
+
+			dest += 8;
+			count -= 8;
+		}
 		while (count-- && dest <= deststop)
 		{
 			val = source[(((UINT32)yposition >> nflatyshift) & nflatmask) | ((UINT32)xposition >> nflatxshift)];
@@ -1118,13 +1250,19 @@ void R_DrawTranslucentSplat_8 (void)
 	UINT8 *dest;
 	const UINT8 *deststop = screens[0] + vid.rowbytes * vid.height;
 
-	UINT32 flatsize = ds_flatwidth * ds_flatheight;
 	size_t count = (ds_x2 - ds_x1 + 1);
 	UINT32 val;
 
 	xposition = ds_xfrac; yposition = ds_yfrac;
 	xstep = ds_xstep; ystep = ds_ystep;
 
+	// SoM: For power-of-two flats we only need 6 bits for the integer part (0 through 63), so
+	// the rest can be used for the fraction part. This allows the memory address in the
+	// texture to be calculated with two shifts, an OR and one AND (see below).
+	// For texture sizes > 64 the amount of precision we can allow decreases, but only by one
+	// bit per power of two.
+	// Because the intermediate "spot" variable could be eliminated below, this function is
+	// now faster than the original span renderer.
 	if (ds_powersoftwo)
 	{
 		xposition <<= nflatshiftup; yposition <<= nflatshiftup;
@@ -1148,7 +1286,10 @@ void R_DrawTranslucentSplat_8 (void)
 			if (y < 0)
 				y = ds_flatheight - ((UINT32)(ds_flatheight - y) % ds_flatheight);
 
-			val = source[((y * ds_flatwidth) + x) % flatsize];
+			x %= ds_flatwidth;
+			y %= ds_flatheight;
+
+			val = source[((y * ds_flatwidth) + x)];
 			if (val != TRANSPARENTPIXEL)
 				*dest = *(ds_transmap + (colormap[val] << 8) + *dest);
 			dest++;
@@ -1158,6 +1299,62 @@ void R_DrawTranslucentSplat_8 (void)
 	}
 	else
 	{
+		while (count >= 8)
+		{
+			// SoM: Why didn't I see this earlier? the spot variable is a waste now because we don't
+			// have the uber complicated math to calculate it now, so that was a memory write we didn't
+			// need!
+			val = source[(((UINT32)yposition >> nflatyshift) & nflatmask) | ((UINT32)xposition >> nflatxshift)];
+			if (val != TRANSPARENTPIXEL)
+				dest[0] = *(ds_transmap + (colormap[val] << 8) + dest[0]);
+			xposition += xstep;
+			yposition += ystep;
+
+			val = source[(((UINT32)yposition >> nflatyshift) & nflatmask) | ((UINT32)xposition >> nflatxshift)];
+			if (val != TRANSPARENTPIXEL)
+				dest[1] = *(ds_transmap + (colormap[val] << 8) + dest[1]);
+			xposition += xstep;
+			yposition += ystep;
+
+			val = source[(((UINT32)yposition >> nflatyshift) & nflatmask) | ((UINT32)xposition >> nflatxshift)];
+			if (val != TRANSPARENTPIXEL)
+				dest[2] = *(ds_transmap + (colormap[val] << 8) + dest[2]);
+			xposition += xstep;
+			yposition += ystep;
+
+			val = source[(((UINT32)yposition >> nflatyshift) & nflatmask) | ((UINT32)xposition >> nflatxshift)];
+			if (val != TRANSPARENTPIXEL)
+				dest[3] = *(ds_transmap + (colormap[val] << 8) + dest[3]);
+			xposition += xstep;
+			yposition += ystep;
+
+			val = source[(((UINT32)yposition >> nflatyshift) & nflatmask) | ((UINT32)xposition >> nflatxshift)];
+			if (val != TRANSPARENTPIXEL)
+				dest[4] = *(ds_transmap + (colormap[val] << 8) + dest[4]);
+			xposition += xstep;
+			yposition += ystep;
+
+			val = source[(((UINT32)yposition >> nflatyshift) & nflatmask) | ((UINT32)xposition >> nflatxshift)];
+			if (val != TRANSPARENTPIXEL)
+				dest[5] = *(ds_transmap + (colormap[val] << 8) + dest[5]);
+			xposition += xstep;
+			yposition += ystep;
+
+			val = source[(((UINT32)yposition >> nflatyshift) & nflatmask) | ((UINT32)xposition >> nflatxshift)];
+			if (val != TRANSPARENTPIXEL)
+				dest[6] = *(ds_transmap + (colormap[val] << 8) + dest[6]);
+			xposition += xstep;
+			yposition += ystep;
+
+			val = source[(((UINT32)yposition >> nflatyshift) & nflatmask) | ((UINT32)xposition >> nflatxshift)];
+			if (val != TRANSPARENTPIXEL)
+				dest[7] = *(ds_transmap + (colormap[val] << 8) + dest[7]);
+			xposition += xstep;
+			yposition += ystep;
+
+			dest += 8;
+			count -= 8;
+		}
 		while (count-- && dest <= deststop)
 		{
 			val = source[(((UINT32)yposition >> nflatyshift) & nflatmask) | ((UINT32)xposition >> nflatxshift)];
@@ -1184,13 +1381,19 @@ void R_DrawTranslucentSpan_8 (void)
 	UINT8 *dest;
 	const UINT8 *deststop = screens[0] + vid.rowbytes * vid.height;
 
-	UINT32 flatsize = ds_flatwidth * ds_flatheight;
 	size_t count = (ds_x2 - ds_x1 + 1);
 	UINT32 val;
 
 	xposition = ds_xfrac; yposition = ds_yfrac;
 	xstep = ds_xstep; ystep = ds_ystep;
 
+	// SoM: For power-of-two flats we only need 6 bits for the integer part (0 through 63), so
+	// the rest can be used for the fraction part. This allows the memory address in the
+	// texture to be calculated with two shifts, an OR and one AND (see below).
+	// For texture sizes > 64 the amount of precision we can allow decreases, but only by one
+	// bit per power of two.
+	// Because the intermediate "spot" variable could be eliminated below, this function is
+	// now faster than the original span renderer.
 	if (ds_powersoftwo)
 	{
 		xposition <<= nflatshiftup; yposition <<= nflatshiftup;
@@ -1214,7 +1417,10 @@ void R_DrawTranslucentSpan_8 (void)
 			if (y < 0)
 				y = ds_flatheight - ((UINT32)(ds_flatheight - y) % ds_flatheight);
 
-			val = ((y * ds_flatwidth) + x) % flatsize;
+			x %= ds_flatwidth;
+			y %= ds_flatheight;
+
+			val = ((y * ds_flatwidth) + x);
 			*dest = *(ds_transmap + (colormap[source[val]] << 8) + *dest);
 			dest++;
 			xposition += xstep;
@@ -1223,6 +1429,46 @@ void R_DrawTranslucentSpan_8 (void)
 	}
 	else
 	{
+		while (count >= 8)
+		{
+			// SoM: Why didn't I see this earlier? the spot variable is a waste now because we don't
+			// have the uber complicated math to calculate it now, so that was a memory write we didn't
+			// need!
+			dest[0] = *(ds_transmap + (colormap[source[(((UINT32)yposition >> nflatyshift) & nflatmask) | ((UINT32)xposition >> nflatxshift)]] << 8) + dest[0]);
+			xposition += xstep;
+			yposition += ystep;
+
+			dest[1] = *(ds_transmap + (colormap[source[(((UINT32)yposition >> nflatyshift) & nflatmask) | ((UINT32)xposition >> nflatxshift)]] << 8) + dest[1]);
+			xposition += xstep;
+			yposition += ystep;
+
+			dest[2] = *(ds_transmap + (colormap[source[(((UINT32)yposition >> nflatyshift) & nflatmask) | ((UINT32)xposition >> nflatxshift)]] << 8) + dest[2]);
+			xposition += xstep;
+			yposition += ystep;
+
+			dest[3] = *(ds_transmap + (colormap[source[(((UINT32)yposition >> nflatyshift) & nflatmask) | ((UINT32)xposition >> nflatxshift)]] << 8) + dest[3]);
+			xposition += xstep;
+			yposition += ystep;
+
+			dest[4] = *(ds_transmap + (colormap[source[(((UINT32)yposition >> nflatyshift) & nflatmask) | ((UINT32)xposition >> nflatxshift)]] << 8) + dest[4]);
+			xposition += xstep;
+			yposition += ystep;
+
+			dest[5] = *(ds_transmap + (colormap[source[(((UINT32)yposition >> nflatyshift) & nflatmask) | ((UINT32)xposition >> nflatxshift)]] << 8) + dest[5]);
+			xposition += xstep;
+			yposition += ystep;
+
+			dest[6] = *(ds_transmap + (colormap[source[(((UINT32)yposition >> nflatyshift) & nflatmask) | ((UINT32)xposition >> nflatxshift)]] << 8) + dest[6]);
+			xposition += xstep;
+			yposition += ystep;
+
+			dest[7] = *(ds_transmap + (colormap[source[(((UINT32)yposition >> nflatyshift) & nflatmask) | ((UINT32)xposition >> nflatxshift)]] << 8) + dest[7]);
+			xposition += xstep;
+			yposition += ystep;
+
+			dest += 8;
+			count -= 8;
+		}
 		while (count-- && dest <= deststop)
 		{
 			val = (((UINT32)yposition >> nflatyshift) & nflatmask) | ((UINT32)xposition >> nflatxshift);
@@ -1247,12 +1493,18 @@ void R_DrawTranslucentWaterSpan_8(void)
 	UINT8 *dsrc;
 	const UINT8 *deststop = screens[0] + vid.rowbytes * vid.height;
 
-	UINT32 flatsize = ds_flatwidth * ds_flatheight;
 	size_t count = (ds_x2 - ds_x1 + 1);
 
 	xposition = ds_xfrac; yposition = (ds_yfrac + ds_waterofs);
 	xstep = ds_xstep; ystep = ds_ystep;
 
+	// SoM: For power-of-two flats we only need 6 bits for the integer part (0 through 63), so
+	// the rest can be used for the fraction part. This allows the memory address in the
+	// texture to be calculated with two shifts, an OR and one AND (see below).
+	// For texture sizes > 64 the amount of precision we can allow decreases, but only by one
+	// bit per power of two.
+	// Because the intermediate "spot" variable could be eliminated below, this function is
+	// now faster than the original span renderer.
 	if (ds_powersoftwo)
 	{
 		xposition <<= nflatshiftup; yposition <<= nflatshiftup;
@@ -1277,7 +1529,10 @@ void R_DrawTranslucentWaterSpan_8(void)
 			if (y < 0)
 				y = ds_flatheight - ((UINT32)(ds_flatheight - y) % ds_flatheight);
 
-			*dest++ = colormap[*(ds_transmap + (source[((y * ds_flatwidth) + x) % flatsize] << 8) + *dsrc++)];
+			x %= ds_flatwidth;
+			y %= ds_flatheight;
+
+			*dest++ = colormap[*(ds_transmap + (source[((y * ds_flatwidth) + x)] << 8) + *dsrc++)];
 			xposition += xstep;
 			yposition += ystep;
 		}
diff --git a/src/r_plane.c b/src/r_plane.c
index 01d0fdd37..37a76e2cd 100644
--- a/src/r_plane.c
+++ b/src/r_plane.c
@@ -758,8 +758,8 @@ static void R_GetPatchFlat(levelflat_t *levelflat, boolean leveltexture)
 		else
 		{
 			patch = (patch_t *)ds_source;
-			levelflat->width = ds_flatwidth = patch->width;
-			levelflat->height = ds_flatheight = patch->height;
+			levelflat->width = ds_flatwidth = SHORT(patch->width);
+			levelflat->height = ds_flatheight = SHORT(patch->height);
 
 			levelflat->topoffset = patch->topoffset * FRACUNIT;
 			levelflat->leftoffset = patch->leftoffset * FRACUNIT;
@@ -1031,13 +1031,13 @@ void R_DrawSinglePlane(visplane_t *pl)
 	levelflat = &levelflats[pl->picnum];
 	size = W_LumpLength(levelflat->lumpnum);
 
-	// Check if the flat is actually a texture.
+	// Check if the flat is actually a wall texture.
 	if (levelflat->texturenum != 0 && levelflat->texturenum != -1)
 		R_GetPatchFlat(levelflat, true);
-	// Check if the flat is actually a patch.
+	// Otherwise, check if the flat lump is in patch format.
 	else if (R_CheckIfPatch(levelflat->lumpnum))
 		R_GetPatchFlat(levelflat, false);
-	// Raw flat.
+	// It's a raw flat.
 	else
 	{
 		ds_source = (UINT8 *)W_CacheLumpNum(levelflat->lumpnum, PU_STATIC); // Stay here until Z_ChangeTag