From c1b5ba5b9064997cbe9802f1b5df59a88231d4e3 Mon Sep 17 00:00:00 2001 From: Magnus Norddahl Date: Tue, 21 Jun 2016 06:22:43 +0200 Subject: [PATCH] Added SSE versions of bilinear filtering --- src/r_draw_rgba.cpp | 163 +++++++---- src/r_draw_rgba.h | 82 ++++++ src/r_draw_rgba_sse.h | 657 +++++++++++++++++++++++++++--------------- 3 files changed, 611 insertions(+), 291 deletions(-) diff --git a/src/r_draw_rgba.cpp b/src/r_draw_rgba.cpp index d85d9994b..869edaba1 100644 --- a/src/r_draw_rgba.cpp +++ b/src/r_draw_rgba.cpp @@ -58,7 +58,7 @@ extern float rw_lightstep; extern int wallshade; CVAR(Bool, r_multithreaded, true, 0) -CVAR(Bool, r_bilinear, false, 0) +CVAR(Bool, r_bilinear, true, 0) #ifndef NO_SSE @@ -1680,43 +1680,70 @@ public: xstep = _xstep; ystep = _ystep; - if (_xbits == 6 && _ybits == 6) - { - // 64x64 is the most common case by far, so special case it. - do - { - uint32_t texdata; + fixed_t xmagnitude = abs((fixed_t)xstep) >> (32 - _xbits - FRACBITS); + fixed_t ymagnitude = abs((fixed_t)ystep) >> (32 - _ybits - FRACBITS); + fixed_t magnitude = xmagnitude + ymagnitude; - spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6)); - texdata = source[spot]; - if (texdata != 0) + bool magnifying = !r_bilinear || magnitude >> (FRACBITS - 1) == 0; + if (magnifying) + { + if (_xbits == 6 && _ybits == 6) + { + // 64x64 is the most common case by far, so special case it. + do { - *dest = shade_bgra(texdata, light, shade_constants); - } - dest++; - xfrac += xstep; - yfrac += ystep; - } while (--count); + uint32_t texdata; + + spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6)); + texdata = source[spot]; + *dest = alpha_blend(shade_bgra(texdata, light, shade_constants), *dest); + dest++; + xfrac += xstep; + yfrac += ystep; + } while (--count); + } + else + { + BYTE yshift = 32 - _ybits; + BYTE xshift = yshift - _xbits; + int xmask = ((1 << _xbits) - 1) << _ybits; + do + { + uint32_t texdata; + + spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + texdata = source[spot]; + *dest = alpha_blend(shade_bgra(texdata, light, shade_constants), *dest); + dest++; + xfrac += xstep; + yfrac += ystep; + } while (--count); + } } else { - BYTE yshift = 32 - _ybits; - BYTE xshift = yshift - _xbits; - int xmask = ((1 << _xbits) - 1) << _ybits; - do + if (_xbits == 6 && _ybits == 6) { - uint32_t texdata; - - spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); - texdata = source[spot]; - if (texdata != 0) + // 64x64 is the most common case by far, so special case it. + do { - *dest = shade_bgra(texdata, light, shade_constants); - } - dest++; - xfrac += xstep; - yfrac += ystep; - } while (--count); + *dest++ = alpha_blend(shade_bgra(sample_bilinear(source, xfrac, yfrac, 26, 26), light, shade_constants), *dest); + xfrac += xstep; + yfrac += ystep; + } while (--count); + } + else + { + BYTE yshift = 32 - _ybits; + BYTE xshift = yshift - _xbits; + int xmask = ((1 << _xbits) - 1) << _ybits; + do + { + *dest++ = alpha_blend(shade_bgra(sample_bilinear(source, xfrac, yfrac, 32 - _xbits, 32 - _ybits), light, shade_constants), *dest); + xfrac += xstep; + yfrac += ystep; + } while (--count); + } } } }; @@ -2439,6 +2466,8 @@ class Mvlinec1RGBACommand : public DrawerCommand DWORD _texturefrac; int _count; const BYTE * RESTRICT _source; + const BYTE * RESTRICT _source2; + uint32_t _texturefracx; BYTE * RESTRICT _dest; int mvlinebits; int _pitch; @@ -2452,6 +2481,8 @@ public: _texturefrac = dc_texturefrac; _count = dc_count; _source = dc_source; + _source2 = dc_source2; + _texturefracx = dc_texturefracx; _dest = dc_dest; mvlinebits = ::mvlinebits; _pitch = dc_pitch; @@ -2468,6 +2499,8 @@ public: DWORD fracstep = _iscale * thread->num_cores; DWORD frac = _texturefrac + _iscale * thread->skipped_by_thread(_dest_y); const uint32 *source = (const uint32 *)_source; + const uint32 *source2 = (const uint32 *)_source2; + uint32_t texturefracx = _texturefracx; uint32_t *dest = thread->dest_for_thread(_dest_y, _pitch, (uint32_t*)_dest); int bits = mvlinebits; int pitch = _pitch * thread->num_cores; @@ -2475,13 +2508,25 @@ public: uint32_t light = calc_light_multiplier(_light); ShadeConstants shade_constants = _shade_constants; - do + if (_source2 == nullptr) { - uint32_t pix = source[frac >> bits]; - *dest = alpha_blend(shade_bgra(pix, light, shade_constants), *dest); - frac += fracstep; - dest += pitch; - } while (--count); + do + { + uint32_t pix = source[frac >> bits]; + *dest = alpha_blend(shade_bgra(pix, light, shade_constants), *dest); + frac += fracstep; + dest += pitch; + } while (--count); + } + else + { + do + { + *dest = alpha_blend(shade_bgra(sample_bilinear(source, source2, texturefracx, frac, bits), light, shade_constants), *dest); + frac += fracstep; + dest += pitch; + } while (--count); + } } }; @@ -2496,6 +2541,8 @@ class Mvlinec4RGBACommand : public DrawerCommand DWORD vplce[4]; DWORD vince[4]; const uint32 * RESTRICT bufplce[4]; + const uint32 * RESTRICT bufplce2[4]; + uint32_t buftexturefracx[4]; public: Mvlinec4RGBACommand() @@ -2511,6 +2558,8 @@ public: vplce[i] = ::vplce[i]; vince[i] = ::vince[i]; bufplce[i] = (const uint32 *)::bufplce[i]; + bufplce2[i] = (const uint32_t *)::bufplce2[i]; + buftexturefracx[i] = ::buftexturefracx[i]; } } @@ -2541,15 +2590,29 @@ public: local_vince[i] *= thread->num_cores; } - do + if (bufplce2[0] == nullptr) { - uint32_t pix; - pix = bufplce[0][(place = local_vplce[0]) >> bits]; dest[0] = alpha_blend(shade_bgra(pix, light0, shade_constants), dest[0]); local_vplce[0] = place + local_vince[0]; - pix = bufplce[1][(place = local_vplce[1]) >> bits]; dest[1] = alpha_blend(shade_bgra(pix, light1, shade_constants), dest[1]); local_vplce[1] = place + local_vince[1]; - pix = bufplce[2][(place = local_vplce[2]) >> bits]; dest[2] = alpha_blend(shade_bgra(pix, light2, shade_constants), dest[2]); local_vplce[2] = place + local_vince[2]; - pix = bufplce[3][(place = local_vplce[3]) >> bits]; dest[3] = alpha_blend(shade_bgra(pix, light3, shade_constants), dest[3]); local_vplce[3] = place + local_vince[3]; - dest += pitch; - } while (--count); + do + { + uint32_t pix; + pix = bufplce[0][(place = local_vplce[0]) >> bits]; dest[0] = alpha_blend(shade_bgra(pix, light0, shade_constants), dest[0]); local_vplce[0] = place + local_vince[0]; + pix = bufplce[1][(place = local_vplce[1]) >> bits]; dest[1] = alpha_blend(shade_bgra(pix, light1, shade_constants), dest[1]); local_vplce[1] = place + local_vince[1]; + pix = bufplce[2][(place = local_vplce[2]) >> bits]; dest[2] = alpha_blend(shade_bgra(pix, light2, shade_constants), dest[2]); local_vplce[2] = place + local_vince[2]; + pix = bufplce[3][(place = local_vplce[3]) >> bits]; dest[3] = alpha_blend(shade_bgra(pix, light3, shade_constants), dest[3]); local_vplce[3] = place + local_vince[3]; + dest += pitch; + } while (--count); + } + else + { + do + { + dest[0] = alpha_blend(shade_bgra(sample_bilinear(bufplce[0], bufplce2[0], buftexturefracx[0], place = local_vplce[0], bits), light0, shade_constants), dest[0]); local_vplce[0] = place + local_vince[0]; + dest[1] = alpha_blend(shade_bgra(sample_bilinear(bufplce[1], bufplce2[1], buftexturefracx[1], place = local_vplce[1], bits), light1, shade_constants), dest[1]); local_vplce[1] = place + local_vince[1]; + dest[2] = alpha_blend(shade_bgra(sample_bilinear(bufplce[2], bufplce2[2], buftexturefracx[2], place = local_vplce[2], bits), light2, shade_constants), dest[2]); local_vplce[2] = place + local_vince[2]; + dest[3] = alpha_blend(shade_bgra(sample_bilinear(bufplce[3], bufplce2[3], buftexturefracx[3], place = local_vplce[3], bits), light3, shade_constants), dest[3]); local_vplce[3] = place + local_vince[3]; + dest += pitch; + } while (--count); + } } }; @@ -3719,10 +3782,7 @@ void R_DrawSpan_rgba() #ifdef NO_SSE DrawerCommandQueue::QueueCommand(); #else - if (!r_bilinear) - DrawerCommandQueue::QueueCommand(); - else - DrawerCommandQueue::QueueCommand(); + DrawerCommandQueue::QueueCommand(); #endif } @@ -3776,10 +3836,7 @@ void vlinec4_rgba() #ifdef NO_SSE DrawerCommandQueue::QueueCommand(); #else - if (!r_bilinear) - DrawerCommandQueue::QueueCommand(); - else - DrawerCommandQueue::QueueCommand(); + DrawerCommandQueue::QueueCommand(); #endif for (int i = 0; i < 4; i++) vplce[i] += vince[i] * dc_count; diff --git a/src/r_draw_rgba.h b/src/r_draw_rgba.h index a266ce878..0900e8997 100644 --- a/src/r_draw_rgba.h +++ b/src/r_draw_rgba.h @@ -478,6 +478,88 @@ FORCEINLINE uint32_t sample_bilinear(const uint32_t *texture, dsfixed_t xfrac, d return (alpha << 24) | (red << 16) | (green << 8) | blue; } +#ifndef NO_SSE +FORCEINLINE __m128i sample_bilinear4_sse(const uint32_t **col0, const uint32_t **col1, uint32_t texturefracx[4], uint32_t texturefracy[4], int ybits) +{ + uint32_t half = 1 << (ybits - 1); + + __m128i m127 = _mm_set1_epi16(127); + __m128i fg = _mm_setzero_si128(); + for (int i = 0; i < 4; i++) + { + uint32_t y = (texturefracy[i] - half) >> ybits; + + uint32_t inv_b = texturefracx[i]; + uint32_t inv_a = ((texturefracy[i] + half) >> (ybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t ab = a * b; + uint32_t invab = inv_a * b; + uint32_t ainvb = a * inv_b; + uint32_t invainvb = inv_a * inv_b; + __m128i ab_invab = _mm_set_epi16(invab, invab, invab, invab, ab, ab, ab, ab); + __m128i ainvb_invainvb = _mm_set_epi16(invainvb, invainvb, invainvb, invainvb, ainvb, ainvb, ainvb, ainvb); + + __m128i p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(col0[i] + y)), _mm_setzero_si128()); + __m128i p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(col1[i] + y)), _mm_setzero_si128()); + + __m128i tmp = _mm_adds_epu16(_mm_mullo_epi16(p0, ab_invab), _mm_mullo_epi16(p1, ainvb_invainvb)); + __m128i color = _mm_srli_epi16(_mm_adds_epu16(_mm_adds_epu16(_mm_srli_si128(tmp, 8), tmp), m127), 8); + + fg = _mm_or_si128(_mm_srli_si128(fg, 4), _mm_slli_si128(_mm_packus_epi16(color, _mm_setzero_si128()), 12)); + } + return fg; +} + +FORCEINLINE __m128i sample_bilinear4_sse(const uint32_t *texture, dsfixed_t &xfrac, dsfixed_t &yfrac, dsfixed_t xstep, dsfixed_t ystep, int xbits, int ybits) +{ + int xshift = (32 - xbits); + int yshift = (32 - ybits); + int xmask = (1 << xshift) - 1; + int ymask = (1 << yshift) - 1; + uint32_t xhalf = 1 << (xbits - 1); + uint32_t yhalf = 1 << (ybits - 1); + + __m128i m127 = _mm_set1_epi16(127); + __m128i fg = _mm_setzero_si128(); + for (int i = 0; i < 4; i++) + { + uint32_t x = (xfrac - xhalf) >> xbits; + uint32_t y = (yfrac - yhalf) >> ybits; + + uint32_t p00 = texture[(y & ymask) + ((x & xmask) << yshift)]; + uint32_t p01 = texture[(y + 1 & ymask) + ((x & xmask) << yshift)]; + uint32_t p10 = texture[(y & ymask) + (((x + 1) & xmask) << yshift)]; + uint32_t p11 = texture[(y + 1 & ymask) + (((x + 1) & xmask) << yshift)]; + + uint32_t inv_b = ((xfrac + xhalf) >> (xbits - 4)) & 15; + uint32_t inv_a = ((yfrac + yhalf) >> (ybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t ab = a * b; + uint32_t invab = inv_a * b; + uint32_t ainvb = a * inv_b; + uint32_t invainvb = inv_a * inv_b; + __m128i ab_invab = _mm_set_epi16(invab, invab, invab, invab, ab, ab, ab, ab); + __m128i ainvb_invainvb = _mm_set_epi16(invainvb, invainvb, invainvb, invainvb, ainvb, ainvb, ainvb, ainvb); + + __m128i p0 = _mm_unpacklo_epi8(_mm_set_epi32(0, 0, p01, p00), _mm_setzero_si128()); + __m128i p1 = _mm_unpacklo_epi8(_mm_set_epi32(0, 0, p11, p10), _mm_setzero_si128()); + + __m128i tmp = _mm_adds_epu16(_mm_mullo_epi16(p0, ab_invab), _mm_mullo_epi16(p1, ainvb_invainvb)); + __m128i color = _mm_srli_epi16(_mm_adds_epu16(_mm_adds_epu16(_mm_srli_si128(tmp, 8), tmp), m127), 8); + + fg = _mm_or_si128(_mm_srli_si128(fg, 4), _mm_slli_si128(_mm_packus_epi16(color, _mm_setzero_si128()), 12)); + + xfrac += xstep; + yfrac += ystep; + } + return fg; +} +#endif + // Calculate constants for a simple shade with gamma correction #define AVX_LINEAR_SHADE_SIMPLE_INIT(light) \ __m256 mlight_hi = _mm256_set_ps(1.0f, light * (1.0f/256.0f), light * (1.0f/256.0f), light * (1.0f/256.0f), 1.0f, light * (1.0f/256.0f), light * (1.0f/256.0f), light * (1.0f/256.0f)); \ diff --git a/src/r_draw_rgba_sse.h b/src/r_draw_rgba_sse.h index 220638c75..721471724 100644 --- a/src/r_draw_rgba_sse.h +++ b/src/r_draw_rgba_sse.h @@ -71,195 +71,284 @@ public: uint32_t light = calc_light_multiplier(_light); ShadeConstants shade_constants = _shade_constants; - if (_xbits == 6 && _ybits == 6) + fixed_t xmagnitude = abs((fixed_t)xstep) >> (32 - _xbits - FRACBITS); + fixed_t ymagnitude = abs((fixed_t)ystep) >> (32 - _ybits - FRACBITS); + fixed_t magnitude = xmagnitude + ymagnitude; + + bool magnifying = !r_bilinear || magnitude >> (FRACBITS - 1) == 0; + if (magnifying) { - // 64x64 is the most common case by far, so special case it. - - int sse_count = count / 4; - count -= sse_count * 4; - - if (shade_constants.simple_shade) + if (_xbits == 6 && _ybits == 6) { - VEC_SHADE_SIMPLE_INIT(light); + // 64x64 is the most common case by far, so special case it. - while (sse_count--) + int sse_count = count / 4; + count -= sse_count * 4; + + if (shade_constants.simple_shade) + { + VEC_SHADE_SIMPLE_INIT(light); + + while (sse_count--) + { + // Current texture index in u,v. + spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6)); + uint32_t p0 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6)); + uint32_t p1 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6)); + uint32_t p2 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6)); + uint32_t p3 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + // Lookup pixel from flat texture tile, + // re-index using light/colormap. + __m128i fg = _mm_set_epi32(p3, p2, p1, p0); + VEC_SHADE_SIMPLE(fg); + _mm_storeu_si128((__m128i*)dest, fg); + + // Next step in u,v. + dest += 4; + } + } + else + { + VEC_SHADE_INIT(light, shade_constants); + + while (sse_count--) + { + // Current texture index in u,v. + spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6)); + uint32_t p0 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6)); + uint32_t p1 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6)); + uint32_t p2 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6)); + uint32_t p3 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + // Lookup pixel from flat texture tile, + // re-index using light/colormap. + __m128i fg = _mm_set_epi32(p3, p2, p1, p0); + VEC_SHADE(fg, shade_constants); + _mm_storeu_si128((__m128i*)dest, fg); + + // Next step in u,v. + dest += 4; + } + } + + if (count == 0) + return; + + do { // Current texture index in u,v. spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6)); - uint32_t p0 = source[spot]; - xfrac += xstep; - yfrac += ystep; - spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6)); - uint32_t p1 = source[spot]; - xfrac += xstep; - yfrac += ystep; - - spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6)); - uint32_t p2 = source[spot]; - xfrac += xstep; - yfrac += ystep; - - spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6)); - uint32_t p3 = source[spot]; - xfrac += xstep; - yfrac += ystep; - - // Lookup pixel from flat texture tile, - // re-index using light/colormap. - __m128i fg = _mm_set_epi32(p3, p2, p1, p0); - VEC_SHADE_SIMPLE(fg); - _mm_storeu_si128((__m128i*)dest, fg); + // Lookup pixel from flat texture tile + *dest++ = shade_bgra(source[spot], light, shade_constants); // Next step in u,v. - dest += 4; - } + xfrac += xstep; + yfrac += ystep; + } while (--count); } else { - VEC_SHADE_INIT(light, shade_constants); + BYTE yshift = 32 - _ybits; + BYTE xshift = yshift - _xbits; + int xmask = ((1 << _xbits) - 1) << _ybits; - while (sse_count--) + int sse_count = count / 4; + count -= sse_count * 4; + + if (shade_constants.simple_shade) + { + VEC_SHADE_SIMPLE_INIT(light); + + while (sse_count--) + { + spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + uint32_t p0 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + uint32_t p1 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + uint32_t p2 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + uint32_t p3 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + // Lookup pixel from flat texture tile + __m128i fg = _mm_set_epi32(p3, p2, p1, p0); + VEC_SHADE_SIMPLE(fg); + _mm_storeu_si128((__m128i*)dest, fg); + dest += 4; + } + } + else + { + VEC_SHADE_INIT(light, shade_constants); + + while (sse_count--) + { + spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + uint32_t p0 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + uint32_t p1 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + uint32_t p2 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + uint32_t p3 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + // Lookup pixel from flat texture tile + __m128i fg = _mm_set_epi32(p3, p2, p1, p0); + VEC_SHADE(fg, shade_constants); + _mm_storeu_si128((__m128i*)dest, fg); + dest += 4; + } + } + + if (count == 0) + return; + + do { // Current texture index in u,v. - spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6)); - uint32_t p0 = source[spot]; - xfrac += xstep; - yfrac += ystep; + spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); - spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6)); - uint32_t p1 = source[spot]; - xfrac += xstep; - yfrac += ystep; - - spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6)); - uint32_t p2 = source[spot]; - xfrac += xstep; - yfrac += ystep; - - spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6)); - uint32_t p3 = source[spot]; - xfrac += xstep; - yfrac += ystep; - - // Lookup pixel from flat texture tile, - // re-index using light/colormap. - __m128i fg = _mm_set_epi32(p3, p2, p1, p0); - VEC_SHADE(fg, shade_constants); - _mm_storeu_si128((__m128i*)dest, fg); + // Lookup pixel from flat texture tile + *dest++ = shade_bgra(source[spot], light, shade_constants); // Next step in u,v. - dest += 4; - } + xfrac += xstep; + yfrac += ystep; + } while (--count); } - - if (count == 0) - return; - - do - { - // Current texture index in u,v. - spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6)); - - // Lookup pixel from flat texture tile - *dest++ = shade_bgra(source[spot], light, shade_constants); - - // Next step in u,v. - xfrac += xstep; - yfrac += ystep; - } while (--count); } else { - BYTE yshift = 32 - _ybits; - BYTE xshift = yshift - _xbits; - int xmask = ((1 << _xbits) - 1) << _ybits; - - int sse_count = count / 4; - count -= sse_count * 4; - - if (shade_constants.simple_shade) + if (_xbits == 6 && _ybits == 6) { - VEC_SHADE_SIMPLE_INIT(light); + // 64x64 is the most common case by far, so special case it. - while (sse_count--) + int sse_count = count / 4; + count -= sse_count * 4; + + if (shade_constants.simple_shade) { - spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); - uint32_t p0 = source[spot]; - xfrac += xstep; - yfrac += ystep; - - spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); - uint32_t p1 = source[spot]; - xfrac += xstep; - yfrac += ystep; - - spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); - uint32_t p2 = source[spot]; - xfrac += xstep; - yfrac += ystep; - - spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); - uint32_t p3 = source[spot]; - xfrac += xstep; - yfrac += ystep; - - // Lookup pixel from flat texture tile - __m128i fg = _mm_set_epi32(p3, p2, p1, p0); - VEC_SHADE_SIMPLE(fg); - _mm_storeu_si128((__m128i*)dest, fg); - dest += 4; + VEC_SHADE_SIMPLE_INIT(light); + while (sse_count--) + { + __m128i fg = sample_bilinear4_sse(source, xfrac, yfrac, xstep, ystep, 26, 26); + VEC_SHADE_SIMPLE(fg); + _mm_storeu_si128((__m128i*)dest, fg); + dest += 4; + } } + else + { + VEC_SHADE_INIT(light, shade_constants); + while (sse_count--) + { + __m128i fg = sample_bilinear4_sse(source, xfrac, yfrac, xstep, ystep, 26, 26); + VEC_SHADE(fg, shade_constants); + _mm_storeu_si128((__m128i*)dest, fg); + dest += 4; + } + } + + if (count == 0) + return; + + do + { + *dest++ = shade_bgra(sample_bilinear(source, xfrac, yfrac, 26, 26), light, shade_constants); + xfrac += xstep; + yfrac += ystep; + } while (--count); } else { - VEC_SHADE_INIT(light, shade_constants); + int sse_count = count / 4; + count -= sse_count * 4; - while (sse_count--) + if (shade_constants.simple_shade) { - spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); - uint32_t p0 = source[spot]; - xfrac += xstep; - yfrac += ystep; - - spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); - uint32_t p1 = source[spot]; - xfrac += xstep; - yfrac += ystep; - - spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); - uint32_t p2 = source[spot]; - xfrac += xstep; - yfrac += ystep; - - spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); - uint32_t p3 = source[spot]; - xfrac += xstep; - yfrac += ystep; - - // Lookup pixel from flat texture tile - __m128i fg = _mm_set_epi32(p3, p2, p1, p0); - VEC_SHADE(fg, shade_constants); - _mm_storeu_si128((__m128i*)dest, fg); - dest += 4; + VEC_SHADE_SIMPLE_INIT(light); + while (sse_count--) + { + __m128i fg = sample_bilinear4_sse(source, xfrac, yfrac, xstep, ystep, 32 -_xbits, 32 - _ybits); + VEC_SHADE_SIMPLE(fg); + _mm_storeu_si128((__m128i*)dest, fg); + dest += 4; + } } + else + { + VEC_SHADE_INIT(light, shade_constants); + while (sse_count--) + { + __m128i fg = sample_bilinear4_sse(source, xfrac, yfrac, xstep, ystep, 32 - _xbits, 32 - _ybits); + VEC_SHADE(fg, shade_constants); + _mm_storeu_si128((__m128i*)dest, fg); + dest += 4; + } + } + + if (count == 0) + return; + + do + { + *dest++ = shade_bgra(sample_bilinear(source, xfrac, yfrac, 32 - _xbits, 32 - _ybits), light, shade_constants); + xfrac += xstep; + yfrac += ystep; + } while (--count); } - - if (count == 0) - return; - - do - { - // Current texture index in u,v. - spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); - - // Lookup pixel from flat texture tile - *dest++ = shade_bgra(source[spot], light, shade_constants); - - // Next step in u,v. - xfrac += xstep; - yfrac += ystep; - } while (--count); } } }; @@ -275,6 +364,8 @@ class VecCommand(Vlinec4RGBA) : public DrawerCommand DWORD vplce[4]; DWORD vince[4]; const uint32 * RESTRICT bufplce[4]; + const uint32_t * RESTRICT bufplce2[4]; + uint32_t buftexturefracx[4]; public: VecCommand(Vlinec4RGBA)() @@ -290,6 +381,8 @@ public: vplce[i] = ::vplce[i]; vince[i] = ::vince[i]; bufplce[i] = (const uint32 *)::bufplce[i]; + bufplce2[i] = (const uint32_t *)::bufplce2[i]; + buftexturefracx[i] = ::buftexturefracx[i]; } } @@ -319,57 +412,97 @@ public: local_vince[i] *= thread->num_cores; } - if (shade_constants.simple_shade) + if (bufplce2[0] == nullptr) { - VEC_SHADE_SIMPLE_INIT4(light3, light2, light1, light0); - do + if (shade_constants.simple_shade) { - DWORD place0 = local_vplce[0]; - DWORD place1 = local_vplce[1]; - DWORD place2 = local_vplce[2]; - DWORD place3 = local_vplce[3]; + VEC_SHADE_SIMPLE_INIT4(light3, light2, light1, light0); + do + { + DWORD place0 = local_vplce[0]; + DWORD place1 = local_vplce[1]; + DWORD place2 = local_vplce[2]; + DWORD place3 = local_vplce[3]; - uint32_t p0 = bufplce[0][place0 >> bits]; - uint32_t p1 = bufplce[1][place1 >> bits]; - uint32_t p2 = bufplce[2][place2 >> bits]; - uint32_t p3 = bufplce[3][place3 >> bits]; + uint32_t p0 = bufplce[0][place0 >> bits]; + uint32_t p1 = bufplce[1][place1 >> bits]; + uint32_t p2 = bufplce[2][place2 >> bits]; + uint32_t p3 = bufplce[3][place3 >> bits]; - local_vplce[0] = place0 + local_vince[0]; - local_vplce[1] = place1 + local_vince[1]; - local_vplce[2] = place2 + local_vince[2]; - local_vplce[3] = place3 + local_vince[3]; + local_vplce[0] = place0 + local_vince[0]; + local_vplce[1] = place1 + local_vince[1]; + local_vplce[2] = place2 + local_vince[2]; + local_vplce[3] = place3 + local_vince[3]; - __m128i fg = _mm_set_epi32(p3, p2, p1, p0); - VEC_SHADE_SIMPLE(fg); - _mm_storeu_si128((__m128i*)dest, fg); - dest += pitch; - } while (--count); + __m128i fg = _mm_set_epi32(p3, p2, p1, p0); + VEC_SHADE_SIMPLE(fg); + _mm_storeu_si128((__m128i*)dest, fg); + dest += pitch; + } while (--count); + } + else + { + VEC_SHADE_INIT4(light3, light2, light1, light0, shade_constants); + do + { + DWORD place0 = local_vplce[0]; + DWORD place1 = local_vplce[1]; + DWORD place2 = local_vplce[2]; + DWORD place3 = local_vplce[3]; + + uint32_t p0 = bufplce[0][place0 >> bits]; + uint32_t p1 = bufplce[1][place1 >> bits]; + uint32_t p2 = bufplce[2][place2 >> bits]; + uint32_t p3 = bufplce[3][place3 >> bits]; + + local_vplce[0] = place0 + local_vince[0]; + local_vplce[1] = place1 + local_vince[1]; + local_vplce[2] = place2 + local_vince[2]; + local_vplce[3] = place3 + local_vince[3]; + + __m128i fg = _mm_set_epi32(p3, p2, p1, p0); + VEC_SHADE(fg, shade_constants); + _mm_storeu_si128((__m128i*)dest, fg); + dest += pitch; + } while (--count); + } } else { - VEC_SHADE_INIT4(light3, light2, light1, light0, shade_constants); - do + if (shade_constants.simple_shade) { - DWORD place0 = local_vplce[0]; - DWORD place1 = local_vplce[1]; - DWORD place2 = local_vplce[2]; - DWORD place3 = local_vplce[3]; + VEC_SHADE_SIMPLE_INIT4(light3, light2, light1, light0); + do + { + __m128i fg = sample_bilinear4_sse(bufplce, bufplce2, buftexturefracx, local_vplce, bits); - uint32_t p0 = bufplce[0][place0 >> bits]; - uint32_t p1 = bufplce[1][place1 >> bits]; - uint32_t p2 = bufplce[2][place2 >> bits]; - uint32_t p3 = bufplce[3][place3 >> bits]; + local_vplce[0] = local_vplce[0] + local_vince[0]; + local_vplce[1] = local_vplce[1] + local_vince[1]; + local_vplce[2] = local_vplce[2] + local_vince[2]; + local_vplce[3] = local_vplce[3] + local_vince[3]; - local_vplce[0] = place0 + local_vince[0]; - local_vplce[1] = place1 + local_vince[1]; - local_vplce[2] = place2 + local_vince[2]; - local_vplce[3] = place3 + local_vince[3]; + VEC_SHADE_SIMPLE(fg); + _mm_storeu_si128((__m128i*)dest, fg); + dest += pitch; + } while (--count); + } + else + { + VEC_SHADE_INIT4(light3, light2, light1, light0, shade_constants); + do + { + __m128i fg = sample_bilinear4_sse(bufplce, bufplce2, buftexturefracx, local_vplce, bits); - __m128i fg = _mm_set_epi32(p3, p2, p1, p0); - VEC_SHADE(fg, shade_constants); - _mm_storeu_si128((__m128i*)dest, fg); - dest += pitch; - } while (--count); + local_vplce[0] = local_vplce[0] + local_vince[0]; + local_vplce[1] = local_vplce[1] + local_vince[1]; + local_vplce[2] = local_vplce[2] + local_vince[2]; + local_vplce[3] = local_vplce[3] + local_vince[3]; + + VEC_SHADE(fg, shade_constants); + _mm_storeu_si128((__m128i*)dest, fg); + dest += pitch; + } while (--count); + } } } }; @@ -385,6 +518,8 @@ class VecCommand(Mvlinec4RGBA) : public DrawerCommand DWORD vplce[4]; DWORD vince[4]; const uint32 * RESTRICT bufplce[4]; + const uint32 * RESTRICT bufplce2[4]; + uint32_t buftexturefracx[4]; public: VecCommand(Mvlinec4RGBA)() @@ -400,6 +535,8 @@ public: vplce[i] = ::vplce[i]; vince[i] = ::vince[i]; bufplce[i] = (const uint32 *)::bufplce[i]; + bufplce2[i] = (const uint32_t *)::bufplce2[i]; + buftexturefracx[i] = ::buftexturefracx[i]; } } @@ -429,61 +566,105 @@ public: local_vince[i] *= thread->num_cores; } - if (shade_constants.simple_shade) + if (bufplce2[0] == nullptr) { - VEC_SHADE_SIMPLE_INIT4(light3, light2, light1, light0); - do + if (shade_constants.simple_shade) { - DWORD place0 = local_vplce[0]; - DWORD place1 = local_vplce[1]; - DWORD place2 = local_vplce[2]; - DWORD place3 = local_vplce[3]; + VEC_SHADE_SIMPLE_INIT4(light3, light2, light1, light0); + do + { + DWORD place0 = local_vplce[0]; + DWORD place1 = local_vplce[1]; + DWORD place2 = local_vplce[2]; + DWORD place3 = local_vplce[3]; - uint32_t pix0 = bufplce[0][place0 >> bits]; - uint32_t pix1 = bufplce[1][place1 >> bits]; - uint32_t pix2 = bufplce[2][place2 >> bits]; - uint32_t pix3 = bufplce[3][place3 >> bits]; + uint32_t pix0 = bufplce[0][place0 >> bits]; + uint32_t pix1 = bufplce[1][place1 >> bits]; + uint32_t pix2 = bufplce[2][place2 >> bits]; + uint32_t pix3 = bufplce[3][place3 >> bits]; - local_vplce[0] = place0 + local_vince[0]; - local_vplce[1] = place1 + local_vince[1]; - local_vplce[2] = place2 + local_vince[2]; - local_vplce[3] = place3 + local_vince[3]; + local_vplce[0] = place0 + local_vince[0]; + local_vplce[1] = place1 + local_vince[1]; + local_vplce[2] = place2 + local_vince[2]; + local_vplce[3] = place3 + local_vince[3]; - __m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0); - __m128i bg = _mm_loadu_si128((const __m128i*)dest); - VEC_SHADE_SIMPLE(fg); - VEC_ALPHA_BLEND(fg, bg); - _mm_storeu_si128((__m128i*)dest, fg); - dest += pitch; - } while (--count); + __m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0); + __m128i bg = _mm_loadu_si128((const __m128i*)dest); + VEC_SHADE_SIMPLE(fg); + VEC_ALPHA_BLEND(fg, bg); + _mm_storeu_si128((__m128i*)dest, fg); + dest += pitch; + } while (--count); + } + else + { + VEC_SHADE_INIT4(light3, light2, light1, light0, shade_constants); + do + { + DWORD place0 = local_vplce[0]; + DWORD place1 = local_vplce[1]; + DWORD place2 = local_vplce[2]; + DWORD place3 = local_vplce[3]; + + uint32_t pix0 = bufplce[0][place0 >> bits]; + uint32_t pix1 = bufplce[1][place1 >> bits]; + uint32_t pix2 = bufplce[2][place2 >> bits]; + uint32_t pix3 = bufplce[3][place3 >> bits]; + + local_vplce[0] = place0 + local_vince[0]; + local_vplce[1] = place1 + local_vince[1]; + local_vplce[2] = place2 + local_vince[2]; + local_vplce[3] = place3 + local_vince[3]; + + __m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0); + __m128i bg = _mm_loadu_si128((const __m128i*)dest); + VEC_SHADE(fg, shade_constants); + VEC_ALPHA_BLEND(fg, bg); + _mm_storeu_si128((__m128i*)dest, fg); + dest += pitch; + } while (--count); + } } else { - VEC_SHADE_INIT4(light3, light2, light1, light0, shade_constants); - do + if (shade_constants.simple_shade) { - DWORD place0 = local_vplce[0]; - DWORD place1 = local_vplce[1]; - DWORD place2 = local_vplce[2]; - DWORD place3 = local_vplce[3]; + VEC_SHADE_SIMPLE_INIT4(light3, light2, light1, light0); + do + { + __m128i fg = sample_bilinear4_sse(bufplce, bufplce2, buftexturefracx, local_vplce, bits); - uint32_t pix0 = bufplce[0][place0 >> bits]; - uint32_t pix1 = bufplce[1][place1 >> bits]; - uint32_t pix2 = bufplce[2][place2 >> bits]; - uint32_t pix3 = bufplce[3][place3 >> bits]; + local_vplce[0] = local_vplce[0] + local_vince[0]; + local_vplce[1] = local_vplce[1] + local_vince[1]; + local_vplce[2] = local_vplce[2] + local_vince[2]; + local_vplce[3] = local_vplce[3] + local_vince[3]; - local_vplce[0] = place0 + local_vince[0]; - local_vplce[1] = place1 + local_vince[1]; - local_vplce[2] = place2 + local_vince[2]; - local_vplce[3] = place3 + local_vince[3]; + __m128i bg = _mm_loadu_si128((const __m128i*)dest); + VEC_SHADE_SIMPLE(fg); + VEC_ALPHA_BLEND(fg, bg); + _mm_storeu_si128((__m128i*)dest, fg); + dest += pitch; + } while (--count); + } + else + { + VEC_SHADE_INIT4(light3, light2, light1, light0, shade_constants); + do + { + __m128i fg = sample_bilinear4_sse(bufplce, bufplce2, buftexturefracx, local_vplce, bits); - __m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0); - __m128i bg = _mm_loadu_si128((const __m128i*)dest); - VEC_SHADE(fg, shade_constants); - VEC_ALPHA_BLEND(fg, bg); - _mm_storeu_si128((__m128i*)dest, fg); - dest += pitch; - } while (--count); + local_vplce[0] = local_vplce[0] + local_vince[0]; + local_vplce[1] = local_vplce[1] + local_vince[1]; + local_vplce[2] = local_vplce[2] + local_vince[2]; + local_vplce[3] = local_vplce[3] + local_vince[3]; + + __m128i bg = _mm_loadu_si128((const __m128i*)dest); + VEC_SHADE(fg, shade_constants); + VEC_ALPHA_BLEND(fg, bg); + _mm_storeu_si128((__m128i*)dest, fg); + dest += pitch; + } while (--count); + } } } };