Added SSE versions of bilinear filtering

This commit is contained in:
Magnus Norddahl 2016-06-21 06:22:43 +02:00
parent c70aa1fe99
commit c1b5ba5b90
3 changed files with 611 additions and 291 deletions

View File

@ -58,7 +58,7 @@ extern float rw_lightstep;
extern int wallshade;
CVAR(Bool, r_multithreaded, true, 0)
CVAR(Bool, r_bilinear, false, 0)
CVAR(Bool, r_bilinear, true, 0)
#ifndef NO_SSE
@ -1680,43 +1680,70 @@ public:
xstep = _xstep;
ystep = _ystep;
if (_xbits == 6 && _ybits == 6)
{
// 64x64 is the most common case by far, so special case it.
do
{
uint32_t texdata;
fixed_t xmagnitude = abs((fixed_t)xstep) >> (32 - _xbits - FRACBITS);
fixed_t ymagnitude = abs((fixed_t)ystep) >> (32 - _ybits - FRACBITS);
fixed_t magnitude = xmagnitude + ymagnitude;
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
texdata = source[spot];
if (texdata != 0)
bool magnifying = !r_bilinear || magnitude >> (FRACBITS - 1) == 0;
if (magnifying)
{
if (_xbits == 6 && _ybits == 6)
{
// 64x64 is the most common case by far, so special case it.
do
{
*dest = shade_bgra(texdata, light, shade_constants);
}
dest++;
xfrac += xstep;
yfrac += ystep;
} while (--count);
uint32_t texdata;
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
texdata = source[spot];
*dest = alpha_blend(shade_bgra(texdata, light, shade_constants), *dest);
dest++;
xfrac += xstep;
yfrac += ystep;
} while (--count);
}
else
{
BYTE yshift = 32 - _ybits;
BYTE xshift = yshift - _xbits;
int xmask = ((1 << _xbits) - 1) << _ybits;
do
{
uint32_t texdata;
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
texdata = source[spot];
*dest = alpha_blend(shade_bgra(texdata, light, shade_constants), *dest);
dest++;
xfrac += xstep;
yfrac += ystep;
} while (--count);
}
}
else
{
BYTE yshift = 32 - _ybits;
BYTE xshift = yshift - _xbits;
int xmask = ((1 << _xbits) - 1) << _ybits;
do
if (_xbits == 6 && _ybits == 6)
{
uint32_t texdata;
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
texdata = source[spot];
if (texdata != 0)
// 64x64 is the most common case by far, so special case it.
do
{
*dest = shade_bgra(texdata, light, shade_constants);
}
dest++;
xfrac += xstep;
yfrac += ystep;
} while (--count);
*dest++ = alpha_blend(shade_bgra(sample_bilinear(source, xfrac, yfrac, 26, 26), light, shade_constants), *dest);
xfrac += xstep;
yfrac += ystep;
} while (--count);
}
else
{
BYTE yshift = 32 - _ybits;
BYTE xshift = yshift - _xbits;
int xmask = ((1 << _xbits) - 1) << _ybits;
do
{
*dest++ = alpha_blend(shade_bgra(sample_bilinear(source, xfrac, yfrac, 32 - _xbits, 32 - _ybits), light, shade_constants), *dest);
xfrac += xstep;
yfrac += ystep;
} while (--count);
}
}
}
};
@ -2439,6 +2466,8 @@ class Mvlinec1RGBACommand : public DrawerCommand
DWORD _texturefrac;
int _count;
const BYTE * RESTRICT _source;
const BYTE * RESTRICT _source2;
uint32_t _texturefracx;
BYTE * RESTRICT _dest;
int mvlinebits;
int _pitch;
@ -2452,6 +2481,8 @@ public:
_texturefrac = dc_texturefrac;
_count = dc_count;
_source = dc_source;
_source2 = dc_source2;
_texturefracx = dc_texturefracx;
_dest = dc_dest;
mvlinebits = ::mvlinebits;
_pitch = dc_pitch;
@ -2468,6 +2499,8 @@ public:
DWORD fracstep = _iscale * thread->num_cores;
DWORD frac = _texturefrac + _iscale * thread->skipped_by_thread(_dest_y);
const uint32 *source = (const uint32 *)_source;
const uint32 *source2 = (const uint32 *)_source2;
uint32_t texturefracx = _texturefracx;
uint32_t *dest = thread->dest_for_thread(_dest_y, _pitch, (uint32_t*)_dest);
int bits = mvlinebits;
int pitch = _pitch * thread->num_cores;
@ -2475,13 +2508,25 @@ public:
uint32_t light = calc_light_multiplier(_light);
ShadeConstants shade_constants = _shade_constants;
do
if (_source2 == nullptr)
{
uint32_t pix = source[frac >> bits];
*dest = alpha_blend(shade_bgra(pix, light, shade_constants), *dest);
frac += fracstep;
dest += pitch;
} while (--count);
do
{
uint32_t pix = source[frac >> bits];
*dest = alpha_blend(shade_bgra(pix, light, shade_constants), *dest);
frac += fracstep;
dest += pitch;
} while (--count);
}
else
{
do
{
*dest = alpha_blend(shade_bgra(sample_bilinear(source, source2, texturefracx, frac, bits), light, shade_constants), *dest);
frac += fracstep;
dest += pitch;
} while (--count);
}
}
};
@ -2496,6 +2541,8 @@ class Mvlinec4RGBACommand : public DrawerCommand
DWORD vplce[4];
DWORD vince[4];
const uint32 * RESTRICT bufplce[4];
const uint32 * RESTRICT bufplce2[4];
uint32_t buftexturefracx[4];
public:
Mvlinec4RGBACommand()
@ -2511,6 +2558,8 @@ public:
vplce[i] = ::vplce[i];
vince[i] = ::vince[i];
bufplce[i] = (const uint32 *)::bufplce[i];
bufplce2[i] = (const uint32_t *)::bufplce2[i];
buftexturefracx[i] = ::buftexturefracx[i];
}
}
@ -2541,15 +2590,29 @@ public:
local_vince[i] *= thread->num_cores;
}
do
if (bufplce2[0] == nullptr)
{
uint32_t pix;
pix = bufplce[0][(place = local_vplce[0]) >> bits]; dest[0] = alpha_blend(shade_bgra(pix, light0, shade_constants), dest[0]); local_vplce[0] = place + local_vince[0];
pix = bufplce[1][(place = local_vplce[1]) >> bits]; dest[1] = alpha_blend(shade_bgra(pix, light1, shade_constants), dest[1]); local_vplce[1] = place + local_vince[1];
pix = bufplce[2][(place = local_vplce[2]) >> bits]; dest[2] = alpha_blend(shade_bgra(pix, light2, shade_constants), dest[2]); local_vplce[2] = place + local_vince[2];
pix = bufplce[3][(place = local_vplce[3]) >> bits]; dest[3] = alpha_blend(shade_bgra(pix, light3, shade_constants), dest[3]); local_vplce[3] = place + local_vince[3];
dest += pitch;
} while (--count);
do
{
uint32_t pix;
pix = bufplce[0][(place = local_vplce[0]) >> bits]; dest[0] = alpha_blend(shade_bgra(pix, light0, shade_constants), dest[0]); local_vplce[0] = place + local_vince[0];
pix = bufplce[1][(place = local_vplce[1]) >> bits]; dest[1] = alpha_blend(shade_bgra(pix, light1, shade_constants), dest[1]); local_vplce[1] = place + local_vince[1];
pix = bufplce[2][(place = local_vplce[2]) >> bits]; dest[2] = alpha_blend(shade_bgra(pix, light2, shade_constants), dest[2]); local_vplce[2] = place + local_vince[2];
pix = bufplce[3][(place = local_vplce[3]) >> bits]; dest[3] = alpha_blend(shade_bgra(pix, light3, shade_constants), dest[3]); local_vplce[3] = place + local_vince[3];
dest += pitch;
} while (--count);
}
else
{
do
{
dest[0] = alpha_blend(shade_bgra(sample_bilinear(bufplce[0], bufplce2[0], buftexturefracx[0], place = local_vplce[0], bits), light0, shade_constants), dest[0]); local_vplce[0] = place + local_vince[0];
dest[1] = alpha_blend(shade_bgra(sample_bilinear(bufplce[1], bufplce2[1], buftexturefracx[1], place = local_vplce[1], bits), light1, shade_constants), dest[1]); local_vplce[1] = place + local_vince[1];
dest[2] = alpha_blend(shade_bgra(sample_bilinear(bufplce[2], bufplce2[2], buftexturefracx[2], place = local_vplce[2], bits), light2, shade_constants), dest[2]); local_vplce[2] = place + local_vince[2];
dest[3] = alpha_blend(shade_bgra(sample_bilinear(bufplce[3], bufplce2[3], buftexturefracx[3], place = local_vplce[3], bits), light3, shade_constants), dest[3]); local_vplce[3] = place + local_vince[3];
dest += pitch;
} while (--count);
}
}
};
@ -3719,10 +3782,7 @@ void R_DrawSpan_rgba()
#ifdef NO_SSE
DrawerCommandQueue::QueueCommand<DrawSpanRGBACommand>();
#else
if (!r_bilinear)
DrawerCommandQueue::QueueCommand<DrawSpanRGBA_SSE_Command>();
else
DrawerCommandQueue::QueueCommand<DrawSpanRGBACommand>();
DrawerCommandQueue::QueueCommand<DrawSpanRGBA_SSE_Command>();
#endif
}
@ -3776,10 +3836,7 @@ void vlinec4_rgba()
#ifdef NO_SSE
DrawerCommandQueue::QueueCommand<Vlinec4RGBACommand>();
#else
if (!r_bilinear)
DrawerCommandQueue::QueueCommand<Vlinec4RGBA_SSE_Command>();
else
DrawerCommandQueue::QueueCommand<Vlinec4RGBACommand>();
DrawerCommandQueue::QueueCommand<Vlinec4RGBA_SSE_Command>();
#endif
for (int i = 0; i < 4; i++)
vplce[i] += vince[i] * dc_count;

View File

@ -478,6 +478,88 @@ FORCEINLINE uint32_t sample_bilinear(const uint32_t *texture, dsfixed_t xfrac, d
return (alpha << 24) | (red << 16) | (green << 8) | blue;
}
#ifndef NO_SSE
FORCEINLINE __m128i sample_bilinear4_sse(const uint32_t **col0, const uint32_t **col1, uint32_t texturefracx[4], uint32_t texturefracy[4], int ybits)
{
uint32_t half = 1 << (ybits - 1);
__m128i m127 = _mm_set1_epi16(127);
__m128i fg = _mm_setzero_si128();
for (int i = 0; i < 4; i++)
{
uint32_t y = (texturefracy[i] - half) >> ybits;
uint32_t inv_b = texturefracx[i];
uint32_t inv_a = ((texturefracy[i] + half) >> (ybits - 4)) & 15;
uint32_t a = 16 - inv_a;
uint32_t b = 16 - inv_b;
uint32_t ab = a * b;
uint32_t invab = inv_a * b;
uint32_t ainvb = a * inv_b;
uint32_t invainvb = inv_a * inv_b;
__m128i ab_invab = _mm_set_epi16(invab, invab, invab, invab, ab, ab, ab, ab);
__m128i ainvb_invainvb = _mm_set_epi16(invainvb, invainvb, invainvb, invainvb, ainvb, ainvb, ainvb, ainvb);
__m128i p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(col0[i] + y)), _mm_setzero_si128());
__m128i p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(col1[i] + y)), _mm_setzero_si128());
__m128i tmp = _mm_adds_epu16(_mm_mullo_epi16(p0, ab_invab), _mm_mullo_epi16(p1, ainvb_invainvb));
__m128i color = _mm_srli_epi16(_mm_adds_epu16(_mm_adds_epu16(_mm_srli_si128(tmp, 8), tmp), m127), 8);
fg = _mm_or_si128(_mm_srli_si128(fg, 4), _mm_slli_si128(_mm_packus_epi16(color, _mm_setzero_si128()), 12));
}
return fg;
}
FORCEINLINE __m128i sample_bilinear4_sse(const uint32_t *texture, dsfixed_t &xfrac, dsfixed_t &yfrac, dsfixed_t xstep, dsfixed_t ystep, int xbits, int ybits)
{
int xshift = (32 - xbits);
int yshift = (32 - ybits);
int xmask = (1 << xshift) - 1;
int ymask = (1 << yshift) - 1;
uint32_t xhalf = 1 << (xbits - 1);
uint32_t yhalf = 1 << (ybits - 1);
__m128i m127 = _mm_set1_epi16(127);
__m128i fg = _mm_setzero_si128();
for (int i = 0; i < 4; i++)
{
uint32_t x = (xfrac - xhalf) >> xbits;
uint32_t y = (yfrac - yhalf) >> ybits;
uint32_t p00 = texture[(y & ymask) + ((x & xmask) << yshift)];
uint32_t p01 = texture[(y + 1 & ymask) + ((x & xmask) << yshift)];
uint32_t p10 = texture[(y & ymask) + (((x + 1) & xmask) << yshift)];
uint32_t p11 = texture[(y + 1 & ymask) + (((x + 1) & xmask) << yshift)];
uint32_t inv_b = ((xfrac + xhalf) >> (xbits - 4)) & 15;
uint32_t inv_a = ((yfrac + yhalf) >> (ybits - 4)) & 15;
uint32_t a = 16 - inv_a;
uint32_t b = 16 - inv_b;
uint32_t ab = a * b;
uint32_t invab = inv_a * b;
uint32_t ainvb = a * inv_b;
uint32_t invainvb = inv_a * inv_b;
__m128i ab_invab = _mm_set_epi16(invab, invab, invab, invab, ab, ab, ab, ab);
__m128i ainvb_invainvb = _mm_set_epi16(invainvb, invainvb, invainvb, invainvb, ainvb, ainvb, ainvb, ainvb);
__m128i p0 = _mm_unpacklo_epi8(_mm_set_epi32(0, 0, p01, p00), _mm_setzero_si128());
__m128i p1 = _mm_unpacklo_epi8(_mm_set_epi32(0, 0, p11, p10), _mm_setzero_si128());
__m128i tmp = _mm_adds_epu16(_mm_mullo_epi16(p0, ab_invab), _mm_mullo_epi16(p1, ainvb_invainvb));
__m128i color = _mm_srli_epi16(_mm_adds_epu16(_mm_adds_epu16(_mm_srli_si128(tmp, 8), tmp), m127), 8);
fg = _mm_or_si128(_mm_srli_si128(fg, 4), _mm_slli_si128(_mm_packus_epi16(color, _mm_setzero_si128()), 12));
xfrac += xstep;
yfrac += ystep;
}
return fg;
}
#endif
// Calculate constants for a simple shade with gamma correction
#define AVX_LINEAR_SHADE_SIMPLE_INIT(light) \
__m256 mlight_hi = _mm256_set_ps(1.0f, light * (1.0f/256.0f), light * (1.0f/256.0f), light * (1.0f/256.0f), 1.0f, light * (1.0f/256.0f), light * (1.0f/256.0f), light * (1.0f/256.0f)); \

View File

@ -71,195 +71,284 @@ public:
uint32_t light = calc_light_multiplier(_light);
ShadeConstants shade_constants = _shade_constants;
if (_xbits == 6 && _ybits == 6)
fixed_t xmagnitude = abs((fixed_t)xstep) >> (32 - _xbits - FRACBITS);
fixed_t ymagnitude = abs((fixed_t)ystep) >> (32 - _ybits - FRACBITS);
fixed_t magnitude = xmagnitude + ymagnitude;
bool magnifying = !r_bilinear || magnitude >> (FRACBITS - 1) == 0;
if (magnifying)
{
// 64x64 is the most common case by far, so special case it.
int sse_count = count / 4;
count -= sse_count * 4;
if (shade_constants.simple_shade)
if (_xbits == 6 && _ybits == 6)
{
VEC_SHADE_SIMPLE_INIT(light);
// 64x64 is the most common case by far, so special case it.
while (sse_count--)
int sse_count = count / 4;
count -= sse_count * 4;
if (shade_constants.simple_shade)
{
VEC_SHADE_SIMPLE_INIT(light);
while (sse_count--)
{
// Current texture index in u,v.
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
uint32_t p0 = source[spot];
xfrac += xstep;
yfrac += ystep;
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
uint32_t p1 = source[spot];
xfrac += xstep;
yfrac += ystep;
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
uint32_t p2 = source[spot];
xfrac += xstep;
yfrac += ystep;
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
uint32_t p3 = source[spot];
xfrac += xstep;
yfrac += ystep;
// Lookup pixel from flat texture tile,
// re-index using light/colormap.
__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
VEC_SHADE_SIMPLE(fg);
_mm_storeu_si128((__m128i*)dest, fg);
// Next step in u,v.
dest += 4;
}
}
else
{
VEC_SHADE_INIT(light, shade_constants);
while (sse_count--)
{
// Current texture index in u,v.
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
uint32_t p0 = source[spot];
xfrac += xstep;
yfrac += ystep;
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
uint32_t p1 = source[spot];
xfrac += xstep;
yfrac += ystep;
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
uint32_t p2 = source[spot];
xfrac += xstep;
yfrac += ystep;
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
uint32_t p3 = source[spot];
xfrac += xstep;
yfrac += ystep;
// Lookup pixel from flat texture tile,
// re-index using light/colormap.
__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
VEC_SHADE(fg, shade_constants);
_mm_storeu_si128((__m128i*)dest, fg);
// Next step in u,v.
dest += 4;
}
}
if (count == 0)
return;
do
{
// Current texture index in u,v.
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
uint32_t p0 = source[spot];
xfrac += xstep;
yfrac += ystep;
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
uint32_t p1 = source[spot];
xfrac += xstep;
yfrac += ystep;
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
uint32_t p2 = source[spot];
xfrac += xstep;
yfrac += ystep;
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
uint32_t p3 = source[spot];
xfrac += xstep;
yfrac += ystep;
// Lookup pixel from flat texture tile,
// re-index using light/colormap.
__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
VEC_SHADE_SIMPLE(fg);
_mm_storeu_si128((__m128i*)dest, fg);
// Lookup pixel from flat texture tile
*dest++ = shade_bgra(source[spot], light, shade_constants);
// Next step in u,v.
dest += 4;
}
xfrac += xstep;
yfrac += ystep;
} while (--count);
}
else
{
VEC_SHADE_INIT(light, shade_constants);
BYTE yshift = 32 - _ybits;
BYTE xshift = yshift - _xbits;
int xmask = ((1 << _xbits) - 1) << _ybits;
while (sse_count--)
int sse_count = count / 4;
count -= sse_count * 4;
if (shade_constants.simple_shade)
{
VEC_SHADE_SIMPLE_INIT(light);
while (sse_count--)
{
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
uint32_t p0 = source[spot];
xfrac += xstep;
yfrac += ystep;
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
uint32_t p1 = source[spot];
xfrac += xstep;
yfrac += ystep;
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
uint32_t p2 = source[spot];
xfrac += xstep;
yfrac += ystep;
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
uint32_t p3 = source[spot];
xfrac += xstep;
yfrac += ystep;
// Lookup pixel from flat texture tile
__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
VEC_SHADE_SIMPLE(fg);
_mm_storeu_si128((__m128i*)dest, fg);
dest += 4;
}
}
else
{
VEC_SHADE_INIT(light, shade_constants);
while (sse_count--)
{
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
uint32_t p0 = source[spot];
xfrac += xstep;
yfrac += ystep;
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
uint32_t p1 = source[spot];
xfrac += xstep;
yfrac += ystep;
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
uint32_t p2 = source[spot];
xfrac += xstep;
yfrac += ystep;
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
uint32_t p3 = source[spot];
xfrac += xstep;
yfrac += ystep;
// Lookup pixel from flat texture tile
__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
VEC_SHADE(fg, shade_constants);
_mm_storeu_si128((__m128i*)dest, fg);
dest += 4;
}
}
if (count == 0)
return;
do
{
// Current texture index in u,v.
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
uint32_t p0 = source[spot];
xfrac += xstep;
yfrac += ystep;
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
uint32_t p1 = source[spot];
xfrac += xstep;
yfrac += ystep;
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
uint32_t p2 = source[spot];
xfrac += xstep;
yfrac += ystep;
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
uint32_t p3 = source[spot];
xfrac += xstep;
yfrac += ystep;
// Lookup pixel from flat texture tile,
// re-index using light/colormap.
__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
VEC_SHADE(fg, shade_constants);
_mm_storeu_si128((__m128i*)dest, fg);
// Lookup pixel from flat texture tile
*dest++ = shade_bgra(source[spot], light, shade_constants);
// Next step in u,v.
dest += 4;
}
xfrac += xstep;
yfrac += ystep;
} while (--count);
}
if (count == 0)
return;
do
{
// Current texture index in u,v.
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
// Lookup pixel from flat texture tile
*dest++ = shade_bgra(source[spot], light, shade_constants);
// Next step in u,v.
xfrac += xstep;
yfrac += ystep;
} while (--count);
}
else
{
BYTE yshift = 32 - _ybits;
BYTE xshift = yshift - _xbits;
int xmask = ((1 << _xbits) - 1) << _ybits;
int sse_count = count / 4;
count -= sse_count * 4;
if (shade_constants.simple_shade)
if (_xbits == 6 && _ybits == 6)
{
VEC_SHADE_SIMPLE_INIT(light);
// 64x64 is the most common case by far, so special case it.
while (sse_count--)
int sse_count = count / 4;
count -= sse_count * 4;
if (shade_constants.simple_shade)
{
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
uint32_t p0 = source[spot];
xfrac += xstep;
yfrac += ystep;
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
uint32_t p1 = source[spot];
xfrac += xstep;
yfrac += ystep;
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
uint32_t p2 = source[spot];
xfrac += xstep;
yfrac += ystep;
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
uint32_t p3 = source[spot];
xfrac += xstep;
yfrac += ystep;
// Lookup pixel from flat texture tile
__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
VEC_SHADE_SIMPLE(fg);
_mm_storeu_si128((__m128i*)dest, fg);
dest += 4;
VEC_SHADE_SIMPLE_INIT(light);
while (sse_count--)
{
__m128i fg = sample_bilinear4_sse(source, xfrac, yfrac, xstep, ystep, 26, 26);
VEC_SHADE_SIMPLE(fg);
_mm_storeu_si128((__m128i*)dest, fg);
dest += 4;
}
}
else
{
VEC_SHADE_INIT(light, shade_constants);
while (sse_count--)
{
__m128i fg = sample_bilinear4_sse(source, xfrac, yfrac, xstep, ystep, 26, 26);
VEC_SHADE(fg, shade_constants);
_mm_storeu_si128((__m128i*)dest, fg);
dest += 4;
}
}
if (count == 0)
return;
do
{
*dest++ = shade_bgra(sample_bilinear(source, xfrac, yfrac, 26, 26), light, shade_constants);
xfrac += xstep;
yfrac += ystep;
} while (--count);
}
else
{
VEC_SHADE_INIT(light, shade_constants);
int sse_count = count / 4;
count -= sse_count * 4;
while (sse_count--)
if (shade_constants.simple_shade)
{
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
uint32_t p0 = source[spot];
xfrac += xstep;
yfrac += ystep;
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
uint32_t p1 = source[spot];
xfrac += xstep;
yfrac += ystep;
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
uint32_t p2 = source[spot];
xfrac += xstep;
yfrac += ystep;
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
uint32_t p3 = source[spot];
xfrac += xstep;
yfrac += ystep;
// Lookup pixel from flat texture tile
__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
VEC_SHADE(fg, shade_constants);
_mm_storeu_si128((__m128i*)dest, fg);
dest += 4;
VEC_SHADE_SIMPLE_INIT(light);
while (sse_count--)
{
__m128i fg = sample_bilinear4_sse(source, xfrac, yfrac, xstep, ystep, 32 -_xbits, 32 - _ybits);
VEC_SHADE_SIMPLE(fg);
_mm_storeu_si128((__m128i*)dest, fg);
dest += 4;
}
}
else
{
VEC_SHADE_INIT(light, shade_constants);
while (sse_count--)
{
__m128i fg = sample_bilinear4_sse(source, xfrac, yfrac, xstep, ystep, 32 - _xbits, 32 - _ybits);
VEC_SHADE(fg, shade_constants);
_mm_storeu_si128((__m128i*)dest, fg);
dest += 4;
}
}
if (count == 0)
return;
do
{
*dest++ = shade_bgra(sample_bilinear(source, xfrac, yfrac, 32 - _xbits, 32 - _ybits), light, shade_constants);
xfrac += xstep;
yfrac += ystep;
} while (--count);
}
if (count == 0)
return;
do
{
// Current texture index in u,v.
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
// Lookup pixel from flat texture tile
*dest++ = shade_bgra(source[spot], light, shade_constants);
// Next step in u,v.
xfrac += xstep;
yfrac += ystep;
} while (--count);
}
}
};
@ -275,6 +364,8 @@ class VecCommand(Vlinec4RGBA) : public DrawerCommand
DWORD vplce[4];
DWORD vince[4];
const uint32 * RESTRICT bufplce[4];
const uint32_t * RESTRICT bufplce2[4];
uint32_t buftexturefracx[4];
public:
VecCommand(Vlinec4RGBA)()
@ -290,6 +381,8 @@ public:
vplce[i] = ::vplce[i];
vince[i] = ::vince[i];
bufplce[i] = (const uint32 *)::bufplce[i];
bufplce2[i] = (const uint32_t *)::bufplce2[i];
buftexturefracx[i] = ::buftexturefracx[i];
}
}
@ -319,57 +412,97 @@ public:
local_vince[i] *= thread->num_cores;
}
if (shade_constants.simple_shade)
if (bufplce2[0] == nullptr)
{
VEC_SHADE_SIMPLE_INIT4(light3, light2, light1, light0);
do
if (shade_constants.simple_shade)
{
DWORD place0 = local_vplce[0];
DWORD place1 = local_vplce[1];
DWORD place2 = local_vplce[2];
DWORD place3 = local_vplce[3];
VEC_SHADE_SIMPLE_INIT4(light3, light2, light1, light0);
do
{
DWORD place0 = local_vplce[0];
DWORD place1 = local_vplce[1];
DWORD place2 = local_vplce[2];
DWORD place3 = local_vplce[3];
uint32_t p0 = bufplce[0][place0 >> bits];
uint32_t p1 = bufplce[1][place1 >> bits];
uint32_t p2 = bufplce[2][place2 >> bits];
uint32_t p3 = bufplce[3][place3 >> bits];
uint32_t p0 = bufplce[0][place0 >> bits];
uint32_t p1 = bufplce[1][place1 >> bits];
uint32_t p2 = bufplce[2][place2 >> bits];
uint32_t p3 = bufplce[3][place3 >> bits];
local_vplce[0] = place0 + local_vince[0];
local_vplce[1] = place1 + local_vince[1];
local_vplce[2] = place2 + local_vince[2];
local_vplce[3] = place3 + local_vince[3];
local_vplce[0] = place0 + local_vince[0];
local_vplce[1] = place1 + local_vince[1];
local_vplce[2] = place2 + local_vince[2];
local_vplce[3] = place3 + local_vince[3];
__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
VEC_SHADE_SIMPLE(fg);
_mm_storeu_si128((__m128i*)dest, fg);
dest += pitch;
} while (--count);
__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
VEC_SHADE_SIMPLE(fg);
_mm_storeu_si128((__m128i*)dest, fg);
dest += pitch;
} while (--count);
}
else
{
VEC_SHADE_INIT4(light3, light2, light1, light0, shade_constants);
do
{
DWORD place0 = local_vplce[0];
DWORD place1 = local_vplce[1];
DWORD place2 = local_vplce[2];
DWORD place3 = local_vplce[3];
uint32_t p0 = bufplce[0][place0 >> bits];
uint32_t p1 = bufplce[1][place1 >> bits];
uint32_t p2 = bufplce[2][place2 >> bits];
uint32_t p3 = bufplce[3][place3 >> bits];
local_vplce[0] = place0 + local_vince[0];
local_vplce[1] = place1 + local_vince[1];
local_vplce[2] = place2 + local_vince[2];
local_vplce[3] = place3 + local_vince[3];
__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
VEC_SHADE(fg, shade_constants);
_mm_storeu_si128((__m128i*)dest, fg);
dest += pitch;
} while (--count);
}
}
else
{
VEC_SHADE_INIT4(light3, light2, light1, light0, shade_constants);
do
if (shade_constants.simple_shade)
{
DWORD place0 = local_vplce[0];
DWORD place1 = local_vplce[1];
DWORD place2 = local_vplce[2];
DWORD place3 = local_vplce[3];
VEC_SHADE_SIMPLE_INIT4(light3, light2, light1, light0);
do
{
__m128i fg = sample_bilinear4_sse(bufplce, bufplce2, buftexturefracx, local_vplce, bits);
uint32_t p0 = bufplce[0][place0 >> bits];
uint32_t p1 = bufplce[1][place1 >> bits];
uint32_t p2 = bufplce[2][place2 >> bits];
uint32_t p3 = bufplce[3][place3 >> bits];
local_vplce[0] = local_vplce[0] + local_vince[0];
local_vplce[1] = local_vplce[1] + local_vince[1];
local_vplce[2] = local_vplce[2] + local_vince[2];
local_vplce[3] = local_vplce[3] + local_vince[3];
local_vplce[0] = place0 + local_vince[0];
local_vplce[1] = place1 + local_vince[1];
local_vplce[2] = place2 + local_vince[2];
local_vplce[3] = place3 + local_vince[3];
VEC_SHADE_SIMPLE(fg);
_mm_storeu_si128((__m128i*)dest, fg);
dest += pitch;
} while (--count);
}
else
{
VEC_SHADE_INIT4(light3, light2, light1, light0, shade_constants);
do
{
__m128i fg = sample_bilinear4_sse(bufplce, bufplce2, buftexturefracx, local_vplce, bits);
__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
VEC_SHADE(fg, shade_constants);
_mm_storeu_si128((__m128i*)dest, fg);
dest += pitch;
} while (--count);
local_vplce[0] = local_vplce[0] + local_vince[0];
local_vplce[1] = local_vplce[1] + local_vince[1];
local_vplce[2] = local_vplce[2] + local_vince[2];
local_vplce[3] = local_vplce[3] + local_vince[3];
VEC_SHADE(fg, shade_constants);
_mm_storeu_si128((__m128i*)dest, fg);
dest += pitch;
} while (--count);
}
}
}
};
@ -385,6 +518,8 @@ class VecCommand(Mvlinec4RGBA) : public DrawerCommand
DWORD vplce[4];
DWORD vince[4];
const uint32 * RESTRICT bufplce[4];
const uint32 * RESTRICT bufplce2[4];
uint32_t buftexturefracx[4];
public:
VecCommand(Mvlinec4RGBA)()
@ -400,6 +535,8 @@ public:
vplce[i] = ::vplce[i];
vince[i] = ::vince[i];
bufplce[i] = (const uint32 *)::bufplce[i];
bufplce2[i] = (const uint32_t *)::bufplce2[i];
buftexturefracx[i] = ::buftexturefracx[i];
}
}
@ -429,61 +566,105 @@ public:
local_vince[i] *= thread->num_cores;
}
if (shade_constants.simple_shade)
if (bufplce2[0] == nullptr)
{
VEC_SHADE_SIMPLE_INIT4(light3, light2, light1, light0);
do
if (shade_constants.simple_shade)
{
DWORD place0 = local_vplce[0];
DWORD place1 = local_vplce[1];
DWORD place2 = local_vplce[2];
DWORD place3 = local_vplce[3];
VEC_SHADE_SIMPLE_INIT4(light3, light2, light1, light0);
do
{
DWORD place0 = local_vplce[0];
DWORD place1 = local_vplce[1];
DWORD place2 = local_vplce[2];
DWORD place3 = local_vplce[3];
uint32_t pix0 = bufplce[0][place0 >> bits];
uint32_t pix1 = bufplce[1][place1 >> bits];
uint32_t pix2 = bufplce[2][place2 >> bits];
uint32_t pix3 = bufplce[3][place3 >> bits];
uint32_t pix0 = bufplce[0][place0 >> bits];
uint32_t pix1 = bufplce[1][place1 >> bits];
uint32_t pix2 = bufplce[2][place2 >> bits];
uint32_t pix3 = bufplce[3][place3 >> bits];
local_vplce[0] = place0 + local_vince[0];
local_vplce[1] = place1 + local_vince[1];
local_vplce[2] = place2 + local_vince[2];
local_vplce[3] = place3 + local_vince[3];
local_vplce[0] = place0 + local_vince[0];
local_vplce[1] = place1 + local_vince[1];
local_vplce[2] = place2 + local_vince[2];
local_vplce[3] = place3 + local_vince[3];
__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
VEC_SHADE_SIMPLE(fg);
VEC_ALPHA_BLEND(fg, bg);
_mm_storeu_si128((__m128i*)dest, fg);
dest += pitch;
} while (--count);
__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
VEC_SHADE_SIMPLE(fg);
VEC_ALPHA_BLEND(fg, bg);
_mm_storeu_si128((__m128i*)dest, fg);
dest += pitch;
} while (--count);
}
else
{
VEC_SHADE_INIT4(light3, light2, light1, light0, shade_constants);
do
{
DWORD place0 = local_vplce[0];
DWORD place1 = local_vplce[1];
DWORD place2 = local_vplce[2];
DWORD place3 = local_vplce[3];
uint32_t pix0 = bufplce[0][place0 >> bits];
uint32_t pix1 = bufplce[1][place1 >> bits];
uint32_t pix2 = bufplce[2][place2 >> bits];
uint32_t pix3 = bufplce[3][place3 >> bits];
local_vplce[0] = place0 + local_vince[0];
local_vplce[1] = place1 + local_vince[1];
local_vplce[2] = place2 + local_vince[2];
local_vplce[3] = place3 + local_vince[3];
__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
VEC_SHADE(fg, shade_constants);
VEC_ALPHA_BLEND(fg, bg);
_mm_storeu_si128((__m128i*)dest, fg);
dest += pitch;
} while (--count);
}
}
else
{
VEC_SHADE_INIT4(light3, light2, light1, light0, shade_constants);
do
if (shade_constants.simple_shade)
{
DWORD place0 = local_vplce[0];
DWORD place1 = local_vplce[1];
DWORD place2 = local_vplce[2];
DWORD place3 = local_vplce[3];
VEC_SHADE_SIMPLE_INIT4(light3, light2, light1, light0);
do
{
__m128i fg = sample_bilinear4_sse(bufplce, bufplce2, buftexturefracx, local_vplce, bits);
uint32_t pix0 = bufplce[0][place0 >> bits];
uint32_t pix1 = bufplce[1][place1 >> bits];
uint32_t pix2 = bufplce[2][place2 >> bits];
uint32_t pix3 = bufplce[3][place3 >> bits];
local_vplce[0] = local_vplce[0] + local_vince[0];
local_vplce[1] = local_vplce[1] + local_vince[1];
local_vplce[2] = local_vplce[2] + local_vince[2];
local_vplce[3] = local_vplce[3] + local_vince[3];
local_vplce[0] = place0 + local_vince[0];
local_vplce[1] = place1 + local_vince[1];
local_vplce[2] = place2 + local_vince[2];
local_vplce[3] = place3 + local_vince[3];
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
VEC_SHADE_SIMPLE(fg);
VEC_ALPHA_BLEND(fg, bg);
_mm_storeu_si128((__m128i*)dest, fg);
dest += pitch;
} while (--count);
}
else
{
VEC_SHADE_INIT4(light3, light2, light1, light0, shade_constants);
do
{
__m128i fg = sample_bilinear4_sse(bufplce, bufplce2, buftexturefracx, local_vplce, bits);
__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
VEC_SHADE(fg, shade_constants);
VEC_ALPHA_BLEND(fg, bg);
_mm_storeu_si128((__m128i*)dest, fg);
dest += pitch;
} while (--count);
local_vplce[0] = local_vplce[0] + local_vince[0];
local_vplce[1] = local_vplce[1] + local_vince[1];
local_vplce[2] = local_vplce[2] + local_vince[2];
local_vplce[3] = local_vplce[3] + local_vince[3];
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
VEC_SHADE(fg, shade_constants);
VEC_ALPHA_BLEND(fg, bg);
_mm_storeu_si128((__m128i*)dest, fg);
dest += pitch;
} while (--count);
}
}
}
};