Added SSE versions of bilinear filtering

2025-02-20 02:22:11 +00:00 · 2016-06-21 06:22:43 +02:00 · 2016-06-21 06:22:43 +02:00 · c1b5ba5b90
commit c1b5ba5b90
parent c70aa1fe99
3 changed files with 611 additions and 291 deletions
--- a/src/r_draw_rgba.cpp
+++ b/src/r_draw_rgba.cpp
@ -58,7 +58,7 @@ extern float rw_lightstep;
 extern int wallshade;

 CVAR(Bool, r_multithreaded, true, 0)
-CVAR(Bool, r_bilinear, false, 0)
+CVAR(Bool, r_bilinear, true, 0)

 #ifndef NO_SSE

@ -1680,43 +1680,70 @@ public:
 		xstep = _xstep;
 		ystep = _ystep;

-		if (_xbits == 6 && _ybits == 6)
-		{
-			// 64x64 is the most common case by far, so special case it.
-			do
-			{
-				uint32_t texdata;
+		fixed_t xmagnitude = abs((fixed_t)xstep) >> (32 - _xbits - FRACBITS);
+		fixed_t ymagnitude = abs((fixed_t)ystep) >> (32 - _ybits - FRACBITS);
+		fixed_t magnitude = xmagnitude + ymagnitude;

-				spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
-				texdata = source[spot];
-				if (texdata != 0)
+		bool magnifying = !r_bilinear || magnitude >> (FRACBITS - 1) == 0;
+		if (magnifying)
+		{
+			if (_xbits == 6 && _ybits == 6)
+			{
+				// 64x64 is the most common case by far, so special case it.
+				do
 				{
-					*dest = shade_bgra(texdata, light, shade_constants);
-				}
-				dest++;
-				xfrac += xstep;
-				yfrac += ystep;
-			} while (--count);
+					uint32_t texdata;
+
+					spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
+					texdata = source[spot];
+					*dest = alpha_blend(shade_bgra(texdata, light, shade_constants), *dest);
+					dest++;
+					xfrac += xstep;
+					yfrac += ystep;
+				} while (--count);
+			}
+			else
+			{
+				BYTE yshift = 32 - _ybits;
+				BYTE xshift = yshift - _xbits;
+				int xmask = ((1 << _xbits) - 1) << _ybits;
+				do
+				{
+					uint32_t texdata;
+
+					spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
+					texdata = source[spot];
+					*dest = alpha_blend(shade_bgra(texdata, light, shade_constants), *dest);
+					dest++;
+					xfrac += xstep;
+					yfrac += ystep;
+				} while (--count);
+			}
 		}
 		else
 		{
-			BYTE yshift = 32 - _ybits;
-			BYTE xshift = yshift - _xbits;
-			int xmask = ((1 << _xbits) - 1) << _ybits;
-			do
+			if (_xbits == 6 && _ybits == 6)
 			{
-				uint32_t texdata;
-
-				spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
-				texdata = source[spot];
-				if (texdata != 0)
+				// 64x64 is the most common case by far, so special case it.
+				do
 				{
-					*dest = shade_bgra(texdata, light, shade_constants);
-				}
-				dest++;
-				xfrac += xstep;
-				yfrac += ystep;
-			} while (--count);
+					*dest++ = alpha_blend(shade_bgra(sample_bilinear(source, xfrac, yfrac, 26, 26), light, shade_constants), *dest);
+					xfrac += xstep;
+					yfrac += ystep;
+				} while (--count);
+			}
+			else
+			{
+				BYTE yshift = 32 - _ybits;
+				BYTE xshift = yshift - _xbits;
+				int xmask = ((1 << _xbits) - 1) << _ybits;
+				do
+				{
+					*dest++ = alpha_blend(shade_bgra(sample_bilinear(source, xfrac, yfrac, 32 - _xbits, 32 - _ybits), light, shade_constants), *dest);
+					xfrac += xstep;
+					yfrac += ystep;
+				} while (--count);
+			}
 		}
 	}
 };
@ -2439,6 +2466,8 @@ class Mvlinec1RGBACommand : public DrawerCommand
 	DWORD _texturefrac;
 	int _count;
 	const BYTE * RESTRICT _source;
+	const BYTE * RESTRICT _source2;
+	uint32_t _texturefracx;
 	BYTE * RESTRICT _dest;
 	int mvlinebits;
 	int _pitch;
@ -2452,6 +2481,8 @@ public:
 		_texturefrac = dc_texturefrac;
 		_count = dc_count;
 		_source = dc_source;
+		_source2 = dc_source2;
+		_texturefracx = dc_texturefracx;
 		_dest = dc_dest;
 		mvlinebits = ::mvlinebits;
 		_pitch = dc_pitch;
@ -2468,6 +2499,8 @@ public:
 		DWORD fracstep = _iscale * thread->num_cores;
 		DWORD frac = _texturefrac + _iscale * thread->skipped_by_thread(_dest_y);
 		const uint32 *source = (const uint32 *)_source;
+		const uint32 *source2 = (const uint32 *)_source2;
+		uint32_t texturefracx = _texturefracx;
 		uint32_t *dest = thread->dest_for_thread(_dest_y, _pitch, (uint32_t*)_dest);
 		int bits = mvlinebits;
 		int pitch = _pitch * thread->num_cores;
@ -2475,13 +2508,25 @@ public:
 		uint32_t light = calc_light_multiplier(_light);
 		ShadeConstants shade_constants = _shade_constants;

-		do
+		if (_source2 == nullptr)
 		{
-			uint32_t pix = source[frac >> bits];
-			*dest = alpha_blend(shade_bgra(pix, light, shade_constants), *dest);
-			frac += fracstep;
-			dest += pitch;
-		} while (--count);
+			do
+			{
+				uint32_t pix = source[frac >> bits];
+				*dest = alpha_blend(shade_bgra(pix, light, shade_constants), *dest);
+				frac += fracstep;
+				dest += pitch;
+			} while (--count);
+		}
+		else
+		{
+			do
+			{
+				*dest = alpha_blend(shade_bgra(sample_bilinear(source, source2, texturefracx, frac, bits), light, shade_constants), *dest);
+				frac += fracstep;
+				dest += pitch;
+			} while (--count);
+		}
 	}
 };

@ -2496,6 +2541,8 @@ class Mvlinec4RGBACommand : public DrawerCommand
 	DWORD vplce[4];
 	DWORD vince[4];
 	const uint32 * RESTRICT bufplce[4];
+	const uint32 * RESTRICT bufplce2[4];
+	uint32_t buftexturefracx[4];

 public:
 	Mvlinec4RGBACommand()
@ -2511,6 +2558,8 @@ public:
 			vplce[i] = ::vplce[i];
 			vince[i] = ::vince[i];
 			bufplce[i] = (const uint32 *)::bufplce[i];
+			bufplce2[i] = (const uint32_t *)::bufplce2[i];
+			buftexturefracx[i] = ::buftexturefracx[i];
 		}
 	}

@ -2541,15 +2590,29 @@ public:
 			local_vince[i] *= thread->num_cores;
 		}

-		do
+		if (bufplce2[0] == nullptr)
 		{
-			uint32_t pix;
-			pix = bufplce[0][(place = local_vplce[0]) >> bits]; dest[0] = alpha_blend(shade_bgra(pix, light0, shade_constants), dest[0]); local_vplce[0] = place + local_vince[0];
-			pix = bufplce[1][(place = local_vplce[1]) >> bits]; dest[1] = alpha_blend(shade_bgra(pix, light1, shade_constants), dest[1]); local_vplce[1] = place + local_vince[1];
-			pix = bufplce[2][(place = local_vplce[2]) >> bits]; dest[2] = alpha_blend(shade_bgra(pix, light2, shade_constants), dest[2]); local_vplce[2] = place + local_vince[2];
-			pix = bufplce[3][(place = local_vplce[3]) >> bits]; dest[3] = alpha_blend(shade_bgra(pix, light3, shade_constants), dest[3]); local_vplce[3] = place + local_vince[3];
-			dest += pitch;
-		} while (--count);
+			do
+			{
+				uint32_t pix;
+				pix = bufplce[0][(place = local_vplce[0]) >> bits]; dest[0] = alpha_blend(shade_bgra(pix, light0, shade_constants), dest[0]); local_vplce[0] = place + local_vince[0];
+				pix = bufplce[1][(place = local_vplce[1]) >> bits]; dest[1] = alpha_blend(shade_bgra(pix, light1, shade_constants), dest[1]); local_vplce[1] = place + local_vince[1];
+				pix = bufplce[2][(place = local_vplce[2]) >> bits]; dest[2] = alpha_blend(shade_bgra(pix, light2, shade_constants), dest[2]); local_vplce[2] = place + local_vince[2];
+				pix = bufplce[3][(place = local_vplce[3]) >> bits]; dest[3] = alpha_blend(shade_bgra(pix, light3, shade_constants), dest[3]); local_vplce[3] = place + local_vince[3];
+				dest += pitch;
+			} while (--count);
+		}
+		else
+		{
+			do
+			{
+				dest[0] = alpha_blend(shade_bgra(sample_bilinear(bufplce[0], bufplce2[0], buftexturefracx[0], place = local_vplce[0], bits), light0, shade_constants), dest[0]); local_vplce[0] = place + local_vince[0];
+				dest[1] = alpha_blend(shade_bgra(sample_bilinear(bufplce[1], bufplce2[1], buftexturefracx[1], place = local_vplce[1], bits), light1, shade_constants), dest[1]); local_vplce[1] = place + local_vince[1];
+				dest[2] = alpha_blend(shade_bgra(sample_bilinear(bufplce[2], bufplce2[2], buftexturefracx[2], place = local_vplce[2], bits), light2, shade_constants), dest[2]); local_vplce[2] = place + local_vince[2];
+				dest[3] = alpha_blend(shade_bgra(sample_bilinear(bufplce[3], bufplce2[3], buftexturefracx[3], place = local_vplce[3], bits), light3, shade_constants), dest[3]); local_vplce[3] = place + local_vince[3];
+				dest += pitch;
+			} while (--count);
+		}
 	}
 };

@ -3719,10 +3782,7 @@ void R_DrawSpan_rgba()
 #ifdef NO_SSE
 	DrawerCommandQueue::QueueCommand<DrawSpanRGBACommand>();
 #else
-	if (!r_bilinear)
-		DrawerCommandQueue::QueueCommand<DrawSpanRGBA_SSE_Command>();
-	else
-		DrawerCommandQueue::QueueCommand<DrawSpanRGBACommand>();
+	DrawerCommandQueue::QueueCommand<DrawSpanRGBA_SSE_Command>();
 #endif
 }

@ -3776,10 +3836,7 @@ void vlinec4_rgba()
 #ifdef NO_SSE
 	DrawerCommandQueue::QueueCommand<Vlinec4RGBACommand>();
 #else
-	if (!r_bilinear)
-		DrawerCommandQueue::QueueCommand<Vlinec4RGBA_SSE_Command>();
-	else
-		DrawerCommandQueue::QueueCommand<Vlinec4RGBACommand>();
+	DrawerCommandQueue::QueueCommand<Vlinec4RGBA_SSE_Command>();
 #endif
 	for (int i = 0; i < 4; i++)
 		vplce[i] += vince[i] * dc_count;
--- a/src/r_draw_rgba.h
+++ b/src/r_draw_rgba.h
@ -478,6 +478,88 @@ FORCEINLINE uint32_t sample_bilinear(const uint32_t *texture, dsfixed_t xfrac, d
 	return (alpha << 24) | (red << 16) | (green << 8) | blue;
 }

+#ifndef NO_SSE
+FORCEINLINE __m128i sample_bilinear4_sse(const uint32_t **col0, const uint32_t **col1, uint32_t texturefracx[4], uint32_t texturefracy[4], int ybits)
+{
+	uint32_t half = 1 << (ybits - 1);
+
+	__m128i m127 = _mm_set1_epi16(127);
+	__m128i fg = _mm_setzero_si128();
+	for (int i = 0; i < 4; i++)
+	{
+		uint32_t y = (texturefracy[i] - half) >> ybits;
+
+		uint32_t inv_b = texturefracx[i];
+		uint32_t inv_a = ((texturefracy[i] + half) >> (ybits - 4)) & 15;
+		uint32_t a = 16 - inv_a;
+		uint32_t b = 16 - inv_b;
+
+		uint32_t ab = a * b;
+		uint32_t invab = inv_a * b;
+		uint32_t ainvb = a * inv_b;
+		uint32_t invainvb = inv_a * inv_b;
+		__m128i ab_invab = _mm_set_epi16(invab, invab, invab, invab, ab, ab, ab, ab);
+		__m128i ainvb_invainvb = _mm_set_epi16(invainvb, invainvb, invainvb, invainvb, ainvb, ainvb, ainvb, ainvb);
+
+		__m128i p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(col0[i] + y)), _mm_setzero_si128());
+		__m128i p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(col1[i] + y)), _mm_setzero_si128());
+
+		__m128i tmp = _mm_adds_epu16(_mm_mullo_epi16(p0, ab_invab), _mm_mullo_epi16(p1, ainvb_invainvb));
+		__m128i color = _mm_srli_epi16(_mm_adds_epu16(_mm_adds_epu16(_mm_srli_si128(tmp, 8), tmp), m127), 8);
+
+		fg = _mm_or_si128(_mm_srli_si128(fg, 4), _mm_slli_si128(_mm_packus_epi16(color, _mm_setzero_si128()), 12));
+	}
+	return fg;
+}
+
+FORCEINLINE __m128i sample_bilinear4_sse(const uint32_t *texture, dsfixed_t &xfrac, dsfixed_t &yfrac, dsfixed_t xstep, dsfixed_t ystep, int xbits, int ybits)
+{
+	int xshift = (32 - xbits);
+	int yshift = (32 - ybits);
+	int xmask = (1 << xshift) - 1;
+	int ymask = (1 << yshift) - 1;
+	uint32_t xhalf = 1 << (xbits - 1);
+	uint32_t yhalf = 1 << (ybits - 1);
+
+	__m128i m127 = _mm_set1_epi16(127);
+	__m128i fg = _mm_setzero_si128();
+	for (int i = 0; i < 4; i++)
+	{
+		uint32_t x = (xfrac - xhalf) >> xbits;
+		uint32_t y = (yfrac - yhalf) >> ybits;
+
+		uint32_t p00 = texture[(y & ymask) + ((x & xmask) << yshift)];
+		uint32_t p01 = texture[(y + 1 & ymask) + ((x & xmask) << yshift)];
+		uint32_t p10 = texture[(y & ymask) + (((x + 1) & xmask) << yshift)];
+		uint32_t p11 = texture[(y + 1 & ymask) + (((x + 1) & xmask) << yshift)];
+
+		uint32_t inv_b = ((xfrac + xhalf) >> (xbits - 4)) & 15;
+		uint32_t inv_a = ((yfrac + yhalf) >> (ybits - 4)) & 15;
+		uint32_t a = 16 - inv_a;
+		uint32_t b = 16 - inv_b;
+
+		uint32_t ab = a * b;
+		uint32_t invab = inv_a * b;
+		uint32_t ainvb = a * inv_b;
+		uint32_t invainvb = inv_a * inv_b;
+		__m128i ab_invab = _mm_set_epi16(invab, invab, invab, invab, ab, ab, ab, ab);
+		__m128i ainvb_invainvb = _mm_set_epi16(invainvb, invainvb, invainvb, invainvb, ainvb, ainvb, ainvb, ainvb);
+
+		__m128i p0 = _mm_unpacklo_epi8(_mm_set_epi32(0, 0, p01, p00), _mm_setzero_si128());
+		__m128i p1 = _mm_unpacklo_epi8(_mm_set_epi32(0, 0, p11, p10), _mm_setzero_si128());
+
+		__m128i tmp = _mm_adds_epu16(_mm_mullo_epi16(p0, ab_invab), _mm_mullo_epi16(p1, ainvb_invainvb));
+		__m128i color = _mm_srli_epi16(_mm_adds_epu16(_mm_adds_epu16(_mm_srli_si128(tmp, 8), tmp), m127), 8);
+
+		fg = _mm_or_si128(_mm_srli_si128(fg, 4), _mm_slli_si128(_mm_packus_epi16(color, _mm_setzero_si128()), 12));
+
+		xfrac += xstep;
+		yfrac += ystep;
+	}
+	return fg;
+}
+#endif
+
 // Calculate constants for a simple shade with gamma correction
 #define AVX_LINEAR_SHADE_SIMPLE_INIT(light) \
 	__m256 mlight_hi = _mm256_set_ps(1.0f, light * (1.0f/256.0f), light * (1.0f/256.0f), light * (1.0f/256.0f), 1.0f, light * (1.0f/256.0f), light * (1.0f/256.0f), light * (1.0f/256.0f)); \
--- a/src/r_draw_rgba_sse.h
+++ b/src/r_draw_rgba_sse.h
@ -71,195 +71,284 @@ public:
 		uint32_t light = calc_light_multiplier(_light);
 		ShadeConstants shade_constants = _shade_constants;

-		if (_xbits == 6 && _ybits == 6)
+		fixed_t xmagnitude = abs((fixed_t)xstep) >> (32 - _xbits - FRACBITS);
+		fixed_t ymagnitude = abs((fixed_t)ystep) >> (32 - _ybits - FRACBITS);
+		fixed_t magnitude = xmagnitude + ymagnitude;
+
+		bool magnifying = !r_bilinear || magnitude >> (FRACBITS - 1) == 0;
+		if (magnifying)
 		{
-			// 64x64 is the most common case by far, so special case it.
-
-			int sse_count = count / 4;
-			count -= sse_count * 4;
-
-			if (shade_constants.simple_shade)
+			if (_xbits == 6 && _ybits == 6)
 			{
-				VEC_SHADE_SIMPLE_INIT(light);
+				// 64x64 is the most common case by far, so special case it.

-				while (sse_count--)
+				int sse_count = count / 4;
+				count -= sse_count * 4;
+
+				if (shade_constants.simple_shade)
+				{
+					VEC_SHADE_SIMPLE_INIT(light);
+
+					while (sse_count--)
+					{
+						// Current texture index in u,v.
+						spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
+						uint32_t p0 = source[spot];
+						xfrac += xstep;
+						yfrac += ystep;
+
+						spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
+						uint32_t p1 = source[spot];
+						xfrac += xstep;
+						yfrac += ystep;
+
+						spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
+						uint32_t p2 = source[spot];
+						xfrac += xstep;
+						yfrac += ystep;
+
+						spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
+						uint32_t p3 = source[spot];
+						xfrac += xstep;
+						yfrac += ystep;
+
+						// Lookup pixel from flat texture tile,
+						//  re-index using light/colormap.
+						__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
+						VEC_SHADE_SIMPLE(fg);
+						_mm_storeu_si128((__m128i*)dest, fg);
+
+						// Next step in u,v.
+						dest += 4;
+					}
+				}
+				else
+				{
+					VEC_SHADE_INIT(light, shade_constants);
+
+					while (sse_count--)
+					{
+						// Current texture index in u,v.
+						spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
+						uint32_t p0 = source[spot];
+						xfrac += xstep;
+						yfrac += ystep;
+
+						spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
+						uint32_t p1 = source[spot];
+						xfrac += xstep;
+						yfrac += ystep;
+
+						spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
+						uint32_t p2 = source[spot];
+						xfrac += xstep;
+						yfrac += ystep;
+
+						spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
+						uint32_t p3 = source[spot];
+						xfrac += xstep;
+						yfrac += ystep;
+
+						// Lookup pixel from flat texture tile,
+						//  re-index using light/colormap.
+						__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
+						VEC_SHADE(fg, shade_constants);
+						_mm_storeu_si128((__m128i*)dest, fg);
+
+						// Next step in u,v.
+						dest += 4;
+					}
+				}
+
+				if (count == 0)
+					return;
+
+				do
 				{
 					// Current texture index in u,v.
 					spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
-					uint32_t p0 = source[spot];
-					xfrac += xstep;
-					yfrac += ystep;

-					spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
-					uint32_t p1 = source[spot];
-					xfrac += xstep;
-					yfrac += ystep;
-
-					spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
-					uint32_t p2 = source[spot];
-					xfrac += xstep;
-					yfrac += ystep;
-
-					spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
-					uint32_t p3 = source[spot];
-					xfrac += xstep;
-					yfrac += ystep;
-
-					// Lookup pixel from flat texture tile,
-					//  re-index using light/colormap.
-					__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
-					VEC_SHADE_SIMPLE(fg);
-					_mm_storeu_si128((__m128i*)dest, fg);
+					// Lookup pixel from flat texture tile
+					*dest++ = shade_bgra(source[spot], light, shade_constants);

 					// Next step in u,v.
-					dest += 4;
-				}
+					xfrac += xstep;
+					yfrac += ystep;
+				} while (--count);
 			}
 			else
 			{
-				VEC_SHADE_INIT(light, shade_constants);
+				BYTE yshift = 32 - _ybits;
+				BYTE xshift = yshift - _xbits;
+				int xmask = ((1 << _xbits) - 1) << _ybits;

-				while (sse_count--)
+				int sse_count = count / 4;
+				count -= sse_count * 4;
+
+				if (shade_constants.simple_shade)
+				{
+					VEC_SHADE_SIMPLE_INIT(light);
+
+					while (sse_count--)
+					{
+						spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
+						uint32_t p0 = source[spot];
+						xfrac += xstep;
+						yfrac += ystep;
+
+						spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
+						uint32_t p1 = source[spot];
+						xfrac += xstep;
+						yfrac += ystep;
+
+						spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
+						uint32_t p2 = source[spot];
+						xfrac += xstep;
+						yfrac += ystep;
+
+						spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
+						uint32_t p3 = source[spot];
+						xfrac += xstep;
+						yfrac += ystep;
+
+						// Lookup pixel from flat texture tile
+						__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
+						VEC_SHADE_SIMPLE(fg);
+						_mm_storeu_si128((__m128i*)dest, fg);
+						dest += 4;
+					}
+				}
+				else
+				{
+					VEC_SHADE_INIT(light, shade_constants);
+
+					while (sse_count--)
+					{
+						spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
+						uint32_t p0 = source[spot];
+						xfrac += xstep;
+						yfrac += ystep;
+
+						spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
+						uint32_t p1 = source[spot];
+						xfrac += xstep;
+						yfrac += ystep;
+
+						spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
+						uint32_t p2 = source[spot];
+						xfrac += xstep;
+						yfrac += ystep;
+
+						spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
+						uint32_t p3 = source[spot];
+						xfrac += xstep;
+						yfrac += ystep;
+
+						// Lookup pixel from flat texture tile
+						__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
+						VEC_SHADE(fg, shade_constants);
+						_mm_storeu_si128((__m128i*)dest, fg);
+						dest += 4;
+					}
+				}
+
+				if (count == 0)
+					return;
+
+				do
 				{
 					// Current texture index in u,v.
-					spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
-					uint32_t p0 = source[spot];
-					xfrac += xstep;
-					yfrac += ystep;
+					spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);

-					spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
-					uint32_t p1 = source[spot];
-					xfrac += xstep;
-					yfrac += ystep;
-
-					spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
-					uint32_t p2 = source[spot];
-					xfrac += xstep;
-					yfrac += ystep;
-
-					spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
-					uint32_t p3 = source[spot];
-					xfrac += xstep;
-					yfrac += ystep;
-
-					// Lookup pixel from flat texture tile,
-					//  re-index using light/colormap.
-					__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
-					VEC_SHADE(fg, shade_constants);
-					_mm_storeu_si128((__m128i*)dest, fg);
+					// Lookup pixel from flat texture tile
+					*dest++ = shade_bgra(source[spot], light, shade_constants);

 					// Next step in u,v.
-					dest += 4;
-				}
+					xfrac += xstep;
+					yfrac += ystep;
+				} while (--count);
 			}
-
-			if (count == 0)
-				return;
-
-			do
-			{
-				// Current texture index in u,v.
-				spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
-
-				// Lookup pixel from flat texture tile
-				*dest++ = shade_bgra(source[spot], light, shade_constants);
-
-				// Next step in u,v.
-				xfrac += xstep;
-				yfrac += ystep;
-			} while (--count);
 		}
 		else
 		{
-			BYTE yshift = 32 - _ybits;
-			BYTE xshift = yshift - _xbits;
-			int xmask = ((1 << _xbits) - 1) << _ybits;
-
-			int sse_count = count / 4;
-			count -= sse_count * 4;
-
-			if (shade_constants.simple_shade)
+			if (_xbits == 6 && _ybits == 6)
 			{
-				VEC_SHADE_SIMPLE_INIT(light);
+				// 64x64 is the most common case by far, so special case it.

-				while (sse_count--)
+				int sse_count = count / 4;
+				count -= sse_count * 4;
+
+				if (shade_constants.simple_shade)
 				{
-					spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
-					uint32_t p0 = source[spot];
-					xfrac += xstep;
-					yfrac += ystep;
-
-					spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
-					uint32_t p1 = source[spot];
-					xfrac += xstep;
-					yfrac += ystep;
-
-					spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
-					uint32_t p2 = source[spot];
-					xfrac += xstep;
-					yfrac += ystep;
-
-					spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
-					uint32_t p3 = source[spot];
-					xfrac += xstep;
-					yfrac += ystep;
-
-					// Lookup pixel from flat texture tile
-					__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
-					VEC_SHADE_SIMPLE(fg);
-					_mm_storeu_si128((__m128i*)dest, fg);
-					dest += 4;
+					VEC_SHADE_SIMPLE_INIT(light);
+					while (sse_count--)
+					{
+						__m128i fg = sample_bilinear4_sse(source, xfrac, yfrac, xstep, ystep, 26, 26);
+						VEC_SHADE_SIMPLE(fg);
+						_mm_storeu_si128((__m128i*)dest, fg);
+						dest += 4;
+					}
 				}
+				else
+				{
+					VEC_SHADE_INIT(light, shade_constants);
+					while (sse_count--)
+					{
+						__m128i fg = sample_bilinear4_sse(source, xfrac, yfrac, xstep, ystep, 26, 26);
+						VEC_SHADE(fg, shade_constants);
+						_mm_storeu_si128((__m128i*)dest, fg);
+						dest += 4;
+					}
+				}
+
+				if (count == 0)
+					return;
+
+				do
+				{
+					*dest++ = shade_bgra(sample_bilinear(source, xfrac, yfrac, 26, 26), light, shade_constants);
+					xfrac += xstep;
+					yfrac += ystep;
+				} while (--count);
 			}
 			else
 			{
-				VEC_SHADE_INIT(light, shade_constants);
+				int sse_count = count / 4;
+				count -= sse_count * 4;

-				while (sse_count--)
+				if (shade_constants.simple_shade)
 				{
-					spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
-					uint32_t p0 = source[spot];
-					xfrac += xstep;
-					yfrac += ystep;
-
-					spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
-					uint32_t p1 = source[spot];
-					xfrac += xstep;
-					yfrac += ystep;
-
-					spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
-					uint32_t p2 = source[spot];
-					xfrac += xstep;
-					yfrac += ystep;
-
-					spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
-					uint32_t p3 = source[spot];
-					xfrac += xstep;
-					yfrac += ystep;
-
-					// Lookup pixel from flat texture tile
-					__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
-					VEC_SHADE(fg, shade_constants);
-					_mm_storeu_si128((__m128i*)dest, fg);
-					dest += 4;
+					VEC_SHADE_SIMPLE_INIT(light);
+					while (sse_count--)
+					{
+						__m128i fg = sample_bilinear4_sse(source, xfrac, yfrac, xstep, ystep, 32 -_xbits, 32 - _ybits);
+						VEC_SHADE_SIMPLE(fg);
+						_mm_storeu_si128((__m128i*)dest, fg);
+						dest += 4;
+					}
 				}
+				else
+				{
+					VEC_SHADE_INIT(light, shade_constants);
+					while (sse_count--)
+					{
+						__m128i fg = sample_bilinear4_sse(source, xfrac, yfrac, xstep, ystep, 32 - _xbits, 32 - _ybits);
+						VEC_SHADE(fg, shade_constants);
+						_mm_storeu_si128((__m128i*)dest, fg);
+						dest += 4;
+					}
+				}
+
+				if (count == 0)
+					return;
+
+				do
+				{
+					*dest++ = shade_bgra(sample_bilinear(source, xfrac, yfrac, 32 - _xbits, 32 - _ybits), light, shade_constants);
+					xfrac += xstep;
+					yfrac += ystep;
+				} while (--count);
 			}
-
-			if (count == 0)
-				return;
-
-			do
-			{
-				// Current texture index in u,v.
-				spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
-
-				// Lookup pixel from flat texture tile
-				*dest++ = shade_bgra(source[spot], light, shade_constants);
-
-				// Next step in u,v.
-				xfrac += xstep;
-				yfrac += ystep;
-			} while (--count);
 		}
 	}
 };
@ -275,6 +364,8 @@ class VecCommand(Vlinec4RGBA) : public DrawerCommand
 	DWORD vplce[4];
 	DWORD vince[4];
 	const uint32 * RESTRICT bufplce[4];
+	const uint32_t * RESTRICT bufplce2[4];
+	uint32_t buftexturefracx[4];

 public:
 	VecCommand(Vlinec4RGBA)()
@ -290,6 +381,8 @@ public:
 			vplce[i] = ::vplce[i];
 			vince[i] = ::vince[i];
 			bufplce[i] = (const uint32 *)::bufplce[i];
+			bufplce2[i] = (const uint32_t *)::bufplce2[i];
+			buftexturefracx[i] = ::buftexturefracx[i];
 		}
 	}

@ -319,57 +412,97 @@ public:
 			local_vince[i] *= thread->num_cores;
 		}

-		if (shade_constants.simple_shade)
+		if (bufplce2[0] == nullptr)
 		{
-			VEC_SHADE_SIMPLE_INIT4(light3, light2, light1, light0);
-			do
+			if (shade_constants.simple_shade)
 			{
-				DWORD place0 = local_vplce[0];
-				DWORD place1 = local_vplce[1];
-				DWORD place2 = local_vplce[2];
-				DWORD place3 = local_vplce[3];
+				VEC_SHADE_SIMPLE_INIT4(light3, light2, light1, light0);
+				do
+				{
+					DWORD place0 = local_vplce[0];
+					DWORD place1 = local_vplce[1];
+					DWORD place2 = local_vplce[2];
+					DWORD place3 = local_vplce[3];

-				uint32_t p0 = bufplce[0][place0 >> bits];
-				uint32_t p1 = bufplce[1][place1 >> bits];
-				uint32_t p2 = bufplce[2][place2 >> bits];
-				uint32_t p3 = bufplce[3][place3 >> bits];
+					uint32_t p0 = bufplce[0][place0 >> bits];
+					uint32_t p1 = bufplce[1][place1 >> bits];
+					uint32_t p2 = bufplce[2][place2 >> bits];
+					uint32_t p3 = bufplce[3][place3 >> bits];

-				local_vplce[0] = place0 + local_vince[0];
-				local_vplce[1] = place1 + local_vince[1];
-				local_vplce[2] = place2 + local_vince[2];
-				local_vplce[3] = place3 + local_vince[3];
+					local_vplce[0] = place0 + local_vince[0];
+					local_vplce[1] = place1 + local_vince[1];
+					local_vplce[2] = place2 + local_vince[2];
+					local_vplce[3] = place3 + local_vince[3];

-				__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
-				VEC_SHADE_SIMPLE(fg);
-				_mm_storeu_si128((__m128i*)dest, fg);
-				dest += pitch;
-			} while (--count);
+					__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
+					VEC_SHADE_SIMPLE(fg);
+					_mm_storeu_si128((__m128i*)dest, fg);
+					dest += pitch;
+				} while (--count);
+			}
+			else
+			{
+				VEC_SHADE_INIT4(light3, light2, light1, light0, shade_constants);
+				do
+				{
+					DWORD place0 = local_vplce[0];
+					DWORD place1 = local_vplce[1];
+					DWORD place2 = local_vplce[2];
+					DWORD place3 = local_vplce[3];
+
+					uint32_t p0 = bufplce[0][place0 >> bits];
+					uint32_t p1 = bufplce[1][place1 >> bits];
+					uint32_t p2 = bufplce[2][place2 >> bits];
+					uint32_t p3 = bufplce[3][place3 >> bits];
+
+					local_vplce[0] = place0 + local_vince[0];
+					local_vplce[1] = place1 + local_vince[1];
+					local_vplce[2] = place2 + local_vince[2];
+					local_vplce[3] = place3 + local_vince[3];
+
+					__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
+					VEC_SHADE(fg, shade_constants);
+					_mm_storeu_si128((__m128i*)dest, fg);
+					dest += pitch;
+				} while (--count);
+			}
 		}
 		else
 		{
-			VEC_SHADE_INIT4(light3, light2, light1, light0, shade_constants);
-			do
+			if (shade_constants.simple_shade)
 			{
-				DWORD place0 = local_vplce[0];
-				DWORD place1 = local_vplce[1];
-				DWORD place2 = local_vplce[2];
-				DWORD place3 = local_vplce[3];
+				VEC_SHADE_SIMPLE_INIT4(light3, light2, light1, light0);
+				do
+				{
+					__m128i fg = sample_bilinear4_sse(bufplce, bufplce2, buftexturefracx, local_vplce, bits);

-				uint32_t p0 = bufplce[0][place0 >> bits];
-				uint32_t p1 = bufplce[1][place1 >> bits];
-				uint32_t p2 = bufplce[2][place2 >> bits];
-				uint32_t p3 = bufplce[3][place3 >> bits];
+					local_vplce[0] = local_vplce[0] + local_vince[0];
+					local_vplce[1] = local_vplce[1] + local_vince[1];
+					local_vplce[2] = local_vplce[2] + local_vince[2];
+					local_vplce[3] = local_vplce[3] + local_vince[3];

-				local_vplce[0] = place0 + local_vince[0];
-				local_vplce[1] = place1 + local_vince[1];
-				local_vplce[2] = place2 + local_vince[2];
-				local_vplce[3] = place3 + local_vince[3];
+					VEC_SHADE_SIMPLE(fg);
+					_mm_storeu_si128((__m128i*)dest, fg);
+					dest += pitch;
+				} while (--count);
+			}
+			else
+			{
+				VEC_SHADE_INIT4(light3, light2, light1, light0, shade_constants);
+				do
+				{
+					__m128i fg = sample_bilinear4_sse(bufplce, bufplce2, buftexturefracx, local_vplce, bits);

-				__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
-				VEC_SHADE(fg, shade_constants);
-				_mm_storeu_si128((__m128i*)dest, fg);
-				dest += pitch;
-			} while (--count);
+					local_vplce[0] = local_vplce[0] + local_vince[0];
+					local_vplce[1] = local_vplce[1] + local_vince[1];
+					local_vplce[2] = local_vplce[2] + local_vince[2];
+					local_vplce[3] = local_vplce[3] + local_vince[3];
+
+					VEC_SHADE(fg, shade_constants);
+					_mm_storeu_si128((__m128i*)dest, fg);
+					dest += pitch;
+				} while (--count);
+			}
 		}
 	}
 };
@ -385,6 +518,8 @@ class VecCommand(Mvlinec4RGBA) : public DrawerCommand
 	DWORD vplce[4];
 	DWORD vince[4];
 	const uint32 * RESTRICT bufplce[4];
+	const uint32 * RESTRICT bufplce2[4];
+	uint32_t buftexturefracx[4];

 public:
 	VecCommand(Mvlinec4RGBA)()
@ -400,6 +535,8 @@ public:
 			vplce[i] = ::vplce[i];
 			vince[i] = ::vince[i];
 			bufplce[i] = (const uint32 *)::bufplce[i];
+			bufplce2[i] = (const uint32_t *)::bufplce2[i];
+			buftexturefracx[i] = ::buftexturefracx[i];
 		}
 	}

@ -429,61 +566,105 @@ public:
 			local_vince[i] *= thread->num_cores;
 		}

-		if (shade_constants.simple_shade)
+		if (bufplce2[0] == nullptr)
 		{
-			VEC_SHADE_SIMPLE_INIT4(light3, light2, light1, light0);
-			do
+			if (shade_constants.simple_shade)
 			{
-				DWORD place0 = local_vplce[0];
-				DWORD place1 = local_vplce[1];
-				DWORD place2 = local_vplce[2];
-				DWORD place3 = local_vplce[3];
+				VEC_SHADE_SIMPLE_INIT4(light3, light2, light1, light0);
+				do
+				{
+					DWORD place0 = local_vplce[0];
+					DWORD place1 = local_vplce[1];
+					DWORD place2 = local_vplce[2];
+					DWORD place3 = local_vplce[3];

-				uint32_t pix0 = bufplce[0][place0 >> bits];
-				uint32_t pix1 = bufplce[1][place1 >> bits];
-				uint32_t pix2 = bufplce[2][place2 >> bits];
-				uint32_t pix3 = bufplce[3][place3 >> bits];
+					uint32_t pix0 = bufplce[0][place0 >> bits];
+					uint32_t pix1 = bufplce[1][place1 >> bits];
+					uint32_t pix2 = bufplce[2][place2 >> bits];
+					uint32_t pix3 = bufplce[3][place3 >> bits];

-				local_vplce[0] = place0 + local_vince[0];
-				local_vplce[1] = place1 + local_vince[1];
-				local_vplce[2] = place2 + local_vince[2];
-				local_vplce[3] = place3 + local_vince[3];
+					local_vplce[0] = place0 + local_vince[0];
+					local_vplce[1] = place1 + local_vince[1];
+					local_vplce[2] = place2 + local_vince[2];
+					local_vplce[3] = place3 + local_vince[3];

-				__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
-				__m128i bg = _mm_loadu_si128((const __m128i*)dest);
-				VEC_SHADE_SIMPLE(fg);
-				VEC_ALPHA_BLEND(fg, bg);
-				_mm_storeu_si128((__m128i*)dest, fg);
-				dest += pitch;
-			} while (--count);
+					__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
+					__m128i bg = _mm_loadu_si128((const __m128i*)dest);
+					VEC_SHADE_SIMPLE(fg);
+					VEC_ALPHA_BLEND(fg, bg);
+					_mm_storeu_si128((__m128i*)dest, fg);
+					dest += pitch;
+				} while (--count);
+			}
+			else
+			{
+				VEC_SHADE_INIT4(light3, light2, light1, light0, shade_constants);
+				do
+				{
+					DWORD place0 = local_vplce[0];
+					DWORD place1 = local_vplce[1];
+					DWORD place2 = local_vplce[2];
+					DWORD place3 = local_vplce[3];
+
+					uint32_t pix0 = bufplce[0][place0 >> bits];
+					uint32_t pix1 = bufplce[1][place1 >> bits];
+					uint32_t pix2 = bufplce[2][place2 >> bits];
+					uint32_t pix3 = bufplce[3][place3 >> bits];
+
+					local_vplce[0] = place0 + local_vince[0];
+					local_vplce[1] = place1 + local_vince[1];
+					local_vplce[2] = place2 + local_vince[2];
+					local_vplce[3] = place3 + local_vince[3];
+
+					__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
+					__m128i bg = _mm_loadu_si128((const __m128i*)dest);
+					VEC_SHADE(fg, shade_constants);
+					VEC_ALPHA_BLEND(fg, bg);
+					_mm_storeu_si128((__m128i*)dest, fg);
+					dest += pitch;
+				} while (--count);
+			}
 		}
 		else
 		{
-			VEC_SHADE_INIT4(light3, light2, light1, light0, shade_constants);
-			do
+			if (shade_constants.simple_shade)
 			{
-				DWORD place0 = local_vplce[0];
-				DWORD place1 = local_vplce[1];
-				DWORD place2 = local_vplce[2];
-				DWORD place3 = local_vplce[3];
+				VEC_SHADE_SIMPLE_INIT4(light3, light2, light1, light0);
+				do
+				{
+					__m128i fg = sample_bilinear4_sse(bufplce, bufplce2, buftexturefracx, local_vplce, bits);

-				uint32_t pix0 = bufplce[0][place0 >> bits];
-				uint32_t pix1 = bufplce[1][place1 >> bits];
-				uint32_t pix2 = bufplce[2][place2 >> bits];
-				uint32_t pix3 = bufplce[3][place3 >> bits];
+					local_vplce[0] = local_vplce[0] + local_vince[0];
+					local_vplce[1] = local_vplce[1] + local_vince[1];
+					local_vplce[2] = local_vplce[2] + local_vince[2];
+					local_vplce[3] = local_vplce[3] + local_vince[3];

-				local_vplce[0] = place0 + local_vince[0];
-				local_vplce[1] = place1 + local_vince[1];
-				local_vplce[2] = place2 + local_vince[2];
-				local_vplce[3] = place3 + local_vince[3];
+					__m128i bg = _mm_loadu_si128((const __m128i*)dest);
+					VEC_SHADE_SIMPLE(fg);
+					VEC_ALPHA_BLEND(fg, bg);
+					_mm_storeu_si128((__m128i*)dest, fg);
+					dest += pitch;
+				} while (--count);
+			}
+			else
+			{
+				VEC_SHADE_INIT4(light3, light2, light1, light0, shade_constants);
+				do
+				{
+					__m128i fg = sample_bilinear4_sse(bufplce, bufplce2, buftexturefracx, local_vplce, bits);

-				__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
-				__m128i bg = _mm_loadu_si128((const __m128i*)dest);
-				VEC_SHADE(fg, shade_constants);
-				VEC_ALPHA_BLEND(fg, bg);
-				_mm_storeu_si128((__m128i*)dest, fg);
-				dest += pitch;
-			} while (--count);
+					local_vplce[0] = local_vplce[0] + local_vince[0];
+					local_vplce[1] = local_vplce[1] + local_vince[1];
+					local_vplce[2] = local_vplce[2] + local_vince[2];
+					local_vplce[3] = local_vplce[3] + local_vince[3];
+
+					__m128i bg = _mm_loadu_si128((const __m128i*)dest);
+					VEC_SHADE(fg, shade_constants);
+					VEC_ALPHA_BLEND(fg, bg);
+					_mm_storeu_si128((__m128i*)dest, fg);
+					dest += pitch;
+				} while (--count);
+			}
 		}
 	}
 };