mirror of
https://github.com/ZDoom/qzdoom-gpl.git
synced 2025-01-19 05:30:49 +00:00
Added SSE versions of bilinear filtering
This commit is contained in:
parent
c70aa1fe99
commit
c1b5ba5b90
3 changed files with 611 additions and 291 deletions
|
@ -58,7 +58,7 @@ extern float rw_lightstep;
|
|||
extern int wallshade;
|
||||
|
||||
CVAR(Bool, r_multithreaded, true, 0)
|
||||
CVAR(Bool, r_bilinear, false, 0)
|
||||
CVAR(Bool, r_bilinear, true, 0)
|
||||
|
||||
#ifndef NO_SSE
|
||||
|
||||
|
@ -1680,43 +1680,70 @@ public:
|
|||
xstep = _xstep;
|
||||
ystep = _ystep;
|
||||
|
||||
if (_xbits == 6 && _ybits == 6)
|
||||
{
|
||||
// 64x64 is the most common case by far, so special case it.
|
||||
do
|
||||
{
|
||||
uint32_t texdata;
|
||||
fixed_t xmagnitude = abs((fixed_t)xstep) >> (32 - _xbits - FRACBITS);
|
||||
fixed_t ymagnitude = abs((fixed_t)ystep) >> (32 - _ybits - FRACBITS);
|
||||
fixed_t magnitude = xmagnitude + ymagnitude;
|
||||
|
||||
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
|
||||
texdata = source[spot];
|
||||
if (texdata != 0)
|
||||
bool magnifying = !r_bilinear || magnitude >> (FRACBITS - 1) == 0;
|
||||
if (magnifying)
|
||||
{
|
||||
if (_xbits == 6 && _ybits == 6)
|
||||
{
|
||||
// 64x64 is the most common case by far, so special case it.
|
||||
do
|
||||
{
|
||||
*dest = shade_bgra(texdata, light, shade_constants);
|
||||
}
|
||||
dest++;
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
} while (--count);
|
||||
uint32_t texdata;
|
||||
|
||||
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
|
||||
texdata = source[spot];
|
||||
*dest = alpha_blend(shade_bgra(texdata, light, shade_constants), *dest);
|
||||
dest++;
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
} while (--count);
|
||||
}
|
||||
else
|
||||
{
|
||||
BYTE yshift = 32 - _ybits;
|
||||
BYTE xshift = yshift - _xbits;
|
||||
int xmask = ((1 << _xbits) - 1) << _ybits;
|
||||
do
|
||||
{
|
||||
uint32_t texdata;
|
||||
|
||||
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
|
||||
texdata = source[spot];
|
||||
*dest = alpha_blend(shade_bgra(texdata, light, shade_constants), *dest);
|
||||
dest++;
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
} while (--count);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
BYTE yshift = 32 - _ybits;
|
||||
BYTE xshift = yshift - _xbits;
|
||||
int xmask = ((1 << _xbits) - 1) << _ybits;
|
||||
do
|
||||
if (_xbits == 6 && _ybits == 6)
|
||||
{
|
||||
uint32_t texdata;
|
||||
|
||||
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
|
||||
texdata = source[spot];
|
||||
if (texdata != 0)
|
||||
// 64x64 is the most common case by far, so special case it.
|
||||
do
|
||||
{
|
||||
*dest = shade_bgra(texdata, light, shade_constants);
|
||||
}
|
||||
dest++;
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
} while (--count);
|
||||
*dest++ = alpha_blend(shade_bgra(sample_bilinear(source, xfrac, yfrac, 26, 26), light, shade_constants), *dest);
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
} while (--count);
|
||||
}
|
||||
else
|
||||
{
|
||||
BYTE yshift = 32 - _ybits;
|
||||
BYTE xshift = yshift - _xbits;
|
||||
int xmask = ((1 << _xbits) - 1) << _ybits;
|
||||
do
|
||||
{
|
||||
*dest++ = alpha_blend(shade_bgra(sample_bilinear(source, xfrac, yfrac, 32 - _xbits, 32 - _ybits), light, shade_constants), *dest);
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
} while (--count);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
@ -2439,6 +2466,8 @@ class Mvlinec1RGBACommand : public DrawerCommand
|
|||
DWORD _texturefrac;
|
||||
int _count;
|
||||
const BYTE * RESTRICT _source;
|
||||
const BYTE * RESTRICT _source2;
|
||||
uint32_t _texturefracx;
|
||||
BYTE * RESTRICT _dest;
|
||||
int mvlinebits;
|
||||
int _pitch;
|
||||
|
@ -2452,6 +2481,8 @@ public:
|
|||
_texturefrac = dc_texturefrac;
|
||||
_count = dc_count;
|
||||
_source = dc_source;
|
||||
_source2 = dc_source2;
|
||||
_texturefracx = dc_texturefracx;
|
||||
_dest = dc_dest;
|
||||
mvlinebits = ::mvlinebits;
|
||||
_pitch = dc_pitch;
|
||||
|
@ -2468,6 +2499,8 @@ public:
|
|||
DWORD fracstep = _iscale * thread->num_cores;
|
||||
DWORD frac = _texturefrac + _iscale * thread->skipped_by_thread(_dest_y);
|
||||
const uint32 *source = (const uint32 *)_source;
|
||||
const uint32 *source2 = (const uint32 *)_source2;
|
||||
uint32_t texturefracx = _texturefracx;
|
||||
uint32_t *dest = thread->dest_for_thread(_dest_y, _pitch, (uint32_t*)_dest);
|
||||
int bits = mvlinebits;
|
||||
int pitch = _pitch * thread->num_cores;
|
||||
|
@ -2475,13 +2508,25 @@ public:
|
|||
uint32_t light = calc_light_multiplier(_light);
|
||||
ShadeConstants shade_constants = _shade_constants;
|
||||
|
||||
do
|
||||
if (_source2 == nullptr)
|
||||
{
|
||||
uint32_t pix = source[frac >> bits];
|
||||
*dest = alpha_blend(shade_bgra(pix, light, shade_constants), *dest);
|
||||
frac += fracstep;
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
do
|
||||
{
|
||||
uint32_t pix = source[frac >> bits];
|
||||
*dest = alpha_blend(shade_bgra(pix, light, shade_constants), *dest);
|
||||
frac += fracstep;
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
else
|
||||
{
|
||||
do
|
||||
{
|
||||
*dest = alpha_blend(shade_bgra(sample_bilinear(source, source2, texturefracx, frac, bits), light, shade_constants), *dest);
|
||||
frac += fracstep;
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -2496,6 +2541,8 @@ class Mvlinec4RGBACommand : public DrawerCommand
|
|||
DWORD vplce[4];
|
||||
DWORD vince[4];
|
||||
const uint32 * RESTRICT bufplce[4];
|
||||
const uint32 * RESTRICT bufplce2[4];
|
||||
uint32_t buftexturefracx[4];
|
||||
|
||||
public:
|
||||
Mvlinec4RGBACommand()
|
||||
|
@ -2511,6 +2558,8 @@ public:
|
|||
vplce[i] = ::vplce[i];
|
||||
vince[i] = ::vince[i];
|
||||
bufplce[i] = (const uint32 *)::bufplce[i];
|
||||
bufplce2[i] = (const uint32_t *)::bufplce2[i];
|
||||
buftexturefracx[i] = ::buftexturefracx[i];
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2541,15 +2590,29 @@ public:
|
|||
local_vince[i] *= thread->num_cores;
|
||||
}
|
||||
|
||||
do
|
||||
if (bufplce2[0] == nullptr)
|
||||
{
|
||||
uint32_t pix;
|
||||
pix = bufplce[0][(place = local_vplce[0]) >> bits]; dest[0] = alpha_blend(shade_bgra(pix, light0, shade_constants), dest[0]); local_vplce[0] = place + local_vince[0];
|
||||
pix = bufplce[1][(place = local_vplce[1]) >> bits]; dest[1] = alpha_blend(shade_bgra(pix, light1, shade_constants), dest[1]); local_vplce[1] = place + local_vince[1];
|
||||
pix = bufplce[2][(place = local_vplce[2]) >> bits]; dest[2] = alpha_blend(shade_bgra(pix, light2, shade_constants), dest[2]); local_vplce[2] = place + local_vince[2];
|
||||
pix = bufplce[3][(place = local_vplce[3]) >> bits]; dest[3] = alpha_blend(shade_bgra(pix, light3, shade_constants), dest[3]); local_vplce[3] = place + local_vince[3];
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
do
|
||||
{
|
||||
uint32_t pix;
|
||||
pix = bufplce[0][(place = local_vplce[0]) >> bits]; dest[0] = alpha_blend(shade_bgra(pix, light0, shade_constants), dest[0]); local_vplce[0] = place + local_vince[0];
|
||||
pix = bufplce[1][(place = local_vplce[1]) >> bits]; dest[1] = alpha_blend(shade_bgra(pix, light1, shade_constants), dest[1]); local_vplce[1] = place + local_vince[1];
|
||||
pix = bufplce[2][(place = local_vplce[2]) >> bits]; dest[2] = alpha_blend(shade_bgra(pix, light2, shade_constants), dest[2]); local_vplce[2] = place + local_vince[2];
|
||||
pix = bufplce[3][(place = local_vplce[3]) >> bits]; dest[3] = alpha_blend(shade_bgra(pix, light3, shade_constants), dest[3]); local_vplce[3] = place + local_vince[3];
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
else
|
||||
{
|
||||
do
|
||||
{
|
||||
dest[0] = alpha_blend(shade_bgra(sample_bilinear(bufplce[0], bufplce2[0], buftexturefracx[0], place = local_vplce[0], bits), light0, shade_constants), dest[0]); local_vplce[0] = place + local_vince[0];
|
||||
dest[1] = alpha_blend(shade_bgra(sample_bilinear(bufplce[1], bufplce2[1], buftexturefracx[1], place = local_vplce[1], bits), light1, shade_constants), dest[1]); local_vplce[1] = place + local_vince[1];
|
||||
dest[2] = alpha_blend(shade_bgra(sample_bilinear(bufplce[2], bufplce2[2], buftexturefracx[2], place = local_vplce[2], bits), light2, shade_constants), dest[2]); local_vplce[2] = place + local_vince[2];
|
||||
dest[3] = alpha_blend(shade_bgra(sample_bilinear(bufplce[3], bufplce2[3], buftexturefracx[3], place = local_vplce[3], bits), light3, shade_constants), dest[3]); local_vplce[3] = place + local_vince[3];
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -3719,10 +3782,7 @@ void R_DrawSpan_rgba()
|
|||
#ifdef NO_SSE
|
||||
DrawerCommandQueue::QueueCommand<DrawSpanRGBACommand>();
|
||||
#else
|
||||
if (!r_bilinear)
|
||||
DrawerCommandQueue::QueueCommand<DrawSpanRGBA_SSE_Command>();
|
||||
else
|
||||
DrawerCommandQueue::QueueCommand<DrawSpanRGBACommand>();
|
||||
DrawerCommandQueue::QueueCommand<DrawSpanRGBA_SSE_Command>();
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -3776,10 +3836,7 @@ void vlinec4_rgba()
|
|||
#ifdef NO_SSE
|
||||
DrawerCommandQueue::QueueCommand<Vlinec4RGBACommand>();
|
||||
#else
|
||||
if (!r_bilinear)
|
||||
DrawerCommandQueue::QueueCommand<Vlinec4RGBA_SSE_Command>();
|
||||
else
|
||||
DrawerCommandQueue::QueueCommand<Vlinec4RGBACommand>();
|
||||
DrawerCommandQueue::QueueCommand<Vlinec4RGBA_SSE_Command>();
|
||||
#endif
|
||||
for (int i = 0; i < 4; i++)
|
||||
vplce[i] += vince[i] * dc_count;
|
||||
|
|
|
@ -478,6 +478,88 @@ FORCEINLINE uint32_t sample_bilinear(const uint32_t *texture, dsfixed_t xfrac, d
|
|||
return (alpha << 24) | (red << 16) | (green << 8) | blue;
|
||||
}
|
||||
|
||||
#ifndef NO_SSE
|
||||
FORCEINLINE __m128i sample_bilinear4_sse(const uint32_t **col0, const uint32_t **col1, uint32_t texturefracx[4], uint32_t texturefracy[4], int ybits)
|
||||
{
|
||||
uint32_t half = 1 << (ybits - 1);
|
||||
|
||||
__m128i m127 = _mm_set1_epi16(127);
|
||||
__m128i fg = _mm_setzero_si128();
|
||||
for (int i = 0; i < 4; i++)
|
||||
{
|
||||
uint32_t y = (texturefracy[i] - half) >> ybits;
|
||||
|
||||
uint32_t inv_b = texturefracx[i];
|
||||
uint32_t inv_a = ((texturefracy[i] + half) >> (ybits - 4)) & 15;
|
||||
uint32_t a = 16 - inv_a;
|
||||
uint32_t b = 16 - inv_b;
|
||||
|
||||
uint32_t ab = a * b;
|
||||
uint32_t invab = inv_a * b;
|
||||
uint32_t ainvb = a * inv_b;
|
||||
uint32_t invainvb = inv_a * inv_b;
|
||||
__m128i ab_invab = _mm_set_epi16(invab, invab, invab, invab, ab, ab, ab, ab);
|
||||
__m128i ainvb_invainvb = _mm_set_epi16(invainvb, invainvb, invainvb, invainvb, ainvb, ainvb, ainvb, ainvb);
|
||||
|
||||
__m128i p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(col0[i] + y)), _mm_setzero_si128());
|
||||
__m128i p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(col1[i] + y)), _mm_setzero_si128());
|
||||
|
||||
__m128i tmp = _mm_adds_epu16(_mm_mullo_epi16(p0, ab_invab), _mm_mullo_epi16(p1, ainvb_invainvb));
|
||||
__m128i color = _mm_srli_epi16(_mm_adds_epu16(_mm_adds_epu16(_mm_srli_si128(tmp, 8), tmp), m127), 8);
|
||||
|
||||
fg = _mm_or_si128(_mm_srli_si128(fg, 4), _mm_slli_si128(_mm_packus_epi16(color, _mm_setzero_si128()), 12));
|
||||
}
|
||||
return fg;
|
||||
}
|
||||
|
||||
FORCEINLINE __m128i sample_bilinear4_sse(const uint32_t *texture, dsfixed_t &xfrac, dsfixed_t &yfrac, dsfixed_t xstep, dsfixed_t ystep, int xbits, int ybits)
|
||||
{
|
||||
int xshift = (32 - xbits);
|
||||
int yshift = (32 - ybits);
|
||||
int xmask = (1 << xshift) - 1;
|
||||
int ymask = (1 << yshift) - 1;
|
||||
uint32_t xhalf = 1 << (xbits - 1);
|
||||
uint32_t yhalf = 1 << (ybits - 1);
|
||||
|
||||
__m128i m127 = _mm_set1_epi16(127);
|
||||
__m128i fg = _mm_setzero_si128();
|
||||
for (int i = 0; i < 4; i++)
|
||||
{
|
||||
uint32_t x = (xfrac - xhalf) >> xbits;
|
||||
uint32_t y = (yfrac - yhalf) >> ybits;
|
||||
|
||||
uint32_t p00 = texture[(y & ymask) + ((x & xmask) << yshift)];
|
||||
uint32_t p01 = texture[(y + 1 & ymask) + ((x & xmask) << yshift)];
|
||||
uint32_t p10 = texture[(y & ymask) + (((x + 1) & xmask) << yshift)];
|
||||
uint32_t p11 = texture[(y + 1 & ymask) + (((x + 1) & xmask) << yshift)];
|
||||
|
||||
uint32_t inv_b = ((xfrac + xhalf) >> (xbits - 4)) & 15;
|
||||
uint32_t inv_a = ((yfrac + yhalf) >> (ybits - 4)) & 15;
|
||||
uint32_t a = 16 - inv_a;
|
||||
uint32_t b = 16 - inv_b;
|
||||
|
||||
uint32_t ab = a * b;
|
||||
uint32_t invab = inv_a * b;
|
||||
uint32_t ainvb = a * inv_b;
|
||||
uint32_t invainvb = inv_a * inv_b;
|
||||
__m128i ab_invab = _mm_set_epi16(invab, invab, invab, invab, ab, ab, ab, ab);
|
||||
__m128i ainvb_invainvb = _mm_set_epi16(invainvb, invainvb, invainvb, invainvb, ainvb, ainvb, ainvb, ainvb);
|
||||
|
||||
__m128i p0 = _mm_unpacklo_epi8(_mm_set_epi32(0, 0, p01, p00), _mm_setzero_si128());
|
||||
__m128i p1 = _mm_unpacklo_epi8(_mm_set_epi32(0, 0, p11, p10), _mm_setzero_si128());
|
||||
|
||||
__m128i tmp = _mm_adds_epu16(_mm_mullo_epi16(p0, ab_invab), _mm_mullo_epi16(p1, ainvb_invainvb));
|
||||
__m128i color = _mm_srli_epi16(_mm_adds_epu16(_mm_adds_epu16(_mm_srli_si128(tmp, 8), tmp), m127), 8);
|
||||
|
||||
fg = _mm_or_si128(_mm_srli_si128(fg, 4), _mm_slli_si128(_mm_packus_epi16(color, _mm_setzero_si128()), 12));
|
||||
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
}
|
||||
return fg;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Calculate constants for a simple shade with gamma correction
|
||||
#define AVX_LINEAR_SHADE_SIMPLE_INIT(light) \
|
||||
__m256 mlight_hi = _mm256_set_ps(1.0f, light * (1.0f/256.0f), light * (1.0f/256.0f), light * (1.0f/256.0f), 1.0f, light * (1.0f/256.0f), light * (1.0f/256.0f), light * (1.0f/256.0f)); \
|
||||
|
|
|
@ -71,195 +71,284 @@ public:
|
|||
uint32_t light = calc_light_multiplier(_light);
|
||||
ShadeConstants shade_constants = _shade_constants;
|
||||
|
||||
if (_xbits == 6 && _ybits == 6)
|
||||
fixed_t xmagnitude = abs((fixed_t)xstep) >> (32 - _xbits - FRACBITS);
|
||||
fixed_t ymagnitude = abs((fixed_t)ystep) >> (32 - _ybits - FRACBITS);
|
||||
fixed_t magnitude = xmagnitude + ymagnitude;
|
||||
|
||||
bool magnifying = !r_bilinear || magnitude >> (FRACBITS - 1) == 0;
|
||||
if (magnifying)
|
||||
{
|
||||
// 64x64 is the most common case by far, so special case it.
|
||||
|
||||
int sse_count = count / 4;
|
||||
count -= sse_count * 4;
|
||||
|
||||
if (shade_constants.simple_shade)
|
||||
if (_xbits == 6 && _ybits == 6)
|
||||
{
|
||||
VEC_SHADE_SIMPLE_INIT(light);
|
||||
// 64x64 is the most common case by far, so special case it.
|
||||
|
||||
while (sse_count--)
|
||||
int sse_count = count / 4;
|
||||
count -= sse_count * 4;
|
||||
|
||||
if (shade_constants.simple_shade)
|
||||
{
|
||||
VEC_SHADE_SIMPLE_INIT(light);
|
||||
|
||||
while (sse_count--)
|
||||
{
|
||||
// Current texture index in u,v.
|
||||
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
|
||||
uint32_t p0 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
|
||||
uint32_t p1 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
|
||||
uint32_t p2 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
|
||||
uint32_t p3 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
// Lookup pixel from flat texture tile,
|
||||
// re-index using light/colormap.
|
||||
__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
|
||||
VEC_SHADE_SIMPLE(fg);
|
||||
_mm_storeu_si128((__m128i*)dest, fg);
|
||||
|
||||
// Next step in u,v.
|
||||
dest += 4;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
VEC_SHADE_INIT(light, shade_constants);
|
||||
|
||||
while (sse_count--)
|
||||
{
|
||||
// Current texture index in u,v.
|
||||
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
|
||||
uint32_t p0 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
|
||||
uint32_t p1 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
|
||||
uint32_t p2 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
|
||||
uint32_t p3 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
// Lookup pixel from flat texture tile,
|
||||
// re-index using light/colormap.
|
||||
__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
|
||||
VEC_SHADE(fg, shade_constants);
|
||||
_mm_storeu_si128((__m128i*)dest, fg);
|
||||
|
||||
// Next step in u,v.
|
||||
dest += 4;
|
||||
}
|
||||
}
|
||||
|
||||
if (count == 0)
|
||||
return;
|
||||
|
||||
do
|
||||
{
|
||||
// Current texture index in u,v.
|
||||
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
|
||||
uint32_t p0 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
|
||||
uint32_t p1 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
|
||||
uint32_t p2 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
|
||||
uint32_t p3 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
// Lookup pixel from flat texture tile,
|
||||
// re-index using light/colormap.
|
||||
__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
|
||||
VEC_SHADE_SIMPLE(fg);
|
||||
_mm_storeu_si128((__m128i*)dest, fg);
|
||||
// Lookup pixel from flat texture tile
|
||||
*dest++ = shade_bgra(source[spot], light, shade_constants);
|
||||
|
||||
// Next step in u,v.
|
||||
dest += 4;
|
||||
}
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
} while (--count);
|
||||
}
|
||||
else
|
||||
{
|
||||
VEC_SHADE_INIT(light, shade_constants);
|
||||
BYTE yshift = 32 - _ybits;
|
||||
BYTE xshift = yshift - _xbits;
|
||||
int xmask = ((1 << _xbits) - 1) << _ybits;
|
||||
|
||||
while (sse_count--)
|
||||
int sse_count = count / 4;
|
||||
count -= sse_count * 4;
|
||||
|
||||
if (shade_constants.simple_shade)
|
||||
{
|
||||
VEC_SHADE_SIMPLE_INIT(light);
|
||||
|
||||
while (sse_count--)
|
||||
{
|
||||
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
|
||||
uint32_t p0 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
|
||||
uint32_t p1 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
|
||||
uint32_t p2 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
|
||||
uint32_t p3 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
// Lookup pixel from flat texture tile
|
||||
__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
|
||||
VEC_SHADE_SIMPLE(fg);
|
||||
_mm_storeu_si128((__m128i*)dest, fg);
|
||||
dest += 4;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
VEC_SHADE_INIT(light, shade_constants);
|
||||
|
||||
while (sse_count--)
|
||||
{
|
||||
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
|
||||
uint32_t p0 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
|
||||
uint32_t p1 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
|
||||
uint32_t p2 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
|
||||
uint32_t p3 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
// Lookup pixel from flat texture tile
|
||||
__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
|
||||
VEC_SHADE(fg, shade_constants);
|
||||
_mm_storeu_si128((__m128i*)dest, fg);
|
||||
dest += 4;
|
||||
}
|
||||
}
|
||||
|
||||
if (count == 0)
|
||||
return;
|
||||
|
||||
do
|
||||
{
|
||||
// Current texture index in u,v.
|
||||
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
|
||||
uint32_t p0 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
|
||||
|
||||
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
|
||||
uint32_t p1 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
|
||||
uint32_t p2 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
|
||||
uint32_t p3 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
// Lookup pixel from flat texture tile,
|
||||
// re-index using light/colormap.
|
||||
__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
|
||||
VEC_SHADE(fg, shade_constants);
|
||||
_mm_storeu_si128((__m128i*)dest, fg);
|
||||
// Lookup pixel from flat texture tile
|
||||
*dest++ = shade_bgra(source[spot], light, shade_constants);
|
||||
|
||||
// Next step in u,v.
|
||||
dest += 4;
|
||||
}
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
} while (--count);
|
||||
}
|
||||
|
||||
if (count == 0)
|
||||
return;
|
||||
|
||||
do
|
||||
{
|
||||
// Current texture index in u,v.
|
||||
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
|
||||
|
||||
// Lookup pixel from flat texture tile
|
||||
*dest++ = shade_bgra(source[spot], light, shade_constants);
|
||||
|
||||
// Next step in u,v.
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
} while (--count);
|
||||
}
|
||||
else
|
||||
{
|
||||
BYTE yshift = 32 - _ybits;
|
||||
BYTE xshift = yshift - _xbits;
|
||||
int xmask = ((1 << _xbits) - 1) << _ybits;
|
||||
|
||||
int sse_count = count / 4;
|
||||
count -= sse_count * 4;
|
||||
|
||||
if (shade_constants.simple_shade)
|
||||
if (_xbits == 6 && _ybits == 6)
|
||||
{
|
||||
VEC_SHADE_SIMPLE_INIT(light);
|
||||
// 64x64 is the most common case by far, so special case it.
|
||||
|
||||
while (sse_count--)
|
||||
int sse_count = count / 4;
|
||||
count -= sse_count * 4;
|
||||
|
||||
if (shade_constants.simple_shade)
|
||||
{
|
||||
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
|
||||
uint32_t p0 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
|
||||
uint32_t p1 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
|
||||
uint32_t p2 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
|
||||
uint32_t p3 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
// Lookup pixel from flat texture tile
|
||||
__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
|
||||
VEC_SHADE_SIMPLE(fg);
|
||||
_mm_storeu_si128((__m128i*)dest, fg);
|
||||
dest += 4;
|
||||
VEC_SHADE_SIMPLE_INIT(light);
|
||||
while (sse_count--)
|
||||
{
|
||||
__m128i fg = sample_bilinear4_sse(source, xfrac, yfrac, xstep, ystep, 26, 26);
|
||||
VEC_SHADE_SIMPLE(fg);
|
||||
_mm_storeu_si128((__m128i*)dest, fg);
|
||||
dest += 4;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
VEC_SHADE_INIT(light, shade_constants);
|
||||
while (sse_count--)
|
||||
{
|
||||
__m128i fg = sample_bilinear4_sse(source, xfrac, yfrac, xstep, ystep, 26, 26);
|
||||
VEC_SHADE(fg, shade_constants);
|
||||
_mm_storeu_si128((__m128i*)dest, fg);
|
||||
dest += 4;
|
||||
}
|
||||
}
|
||||
|
||||
if (count == 0)
|
||||
return;
|
||||
|
||||
do
|
||||
{
|
||||
*dest++ = shade_bgra(sample_bilinear(source, xfrac, yfrac, 26, 26), light, shade_constants);
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
} while (--count);
|
||||
}
|
||||
else
|
||||
{
|
||||
VEC_SHADE_INIT(light, shade_constants);
|
||||
int sse_count = count / 4;
|
||||
count -= sse_count * 4;
|
||||
|
||||
while (sse_count--)
|
||||
if (shade_constants.simple_shade)
|
||||
{
|
||||
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
|
||||
uint32_t p0 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
|
||||
uint32_t p1 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
|
||||
uint32_t p2 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
|
||||
uint32_t p3 = source[spot];
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
|
||||
// Lookup pixel from flat texture tile
|
||||
__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
|
||||
VEC_SHADE(fg, shade_constants);
|
||||
_mm_storeu_si128((__m128i*)dest, fg);
|
||||
dest += 4;
|
||||
VEC_SHADE_SIMPLE_INIT(light);
|
||||
while (sse_count--)
|
||||
{
|
||||
__m128i fg = sample_bilinear4_sse(source, xfrac, yfrac, xstep, ystep, 32 -_xbits, 32 - _ybits);
|
||||
VEC_SHADE_SIMPLE(fg);
|
||||
_mm_storeu_si128((__m128i*)dest, fg);
|
||||
dest += 4;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
VEC_SHADE_INIT(light, shade_constants);
|
||||
while (sse_count--)
|
||||
{
|
||||
__m128i fg = sample_bilinear4_sse(source, xfrac, yfrac, xstep, ystep, 32 - _xbits, 32 - _ybits);
|
||||
VEC_SHADE(fg, shade_constants);
|
||||
_mm_storeu_si128((__m128i*)dest, fg);
|
||||
dest += 4;
|
||||
}
|
||||
}
|
||||
|
||||
if (count == 0)
|
||||
return;
|
||||
|
||||
do
|
||||
{
|
||||
*dest++ = shade_bgra(sample_bilinear(source, xfrac, yfrac, 32 - _xbits, 32 - _ybits), light, shade_constants);
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
} while (--count);
|
||||
}
|
||||
|
||||
if (count == 0)
|
||||
return;
|
||||
|
||||
do
|
||||
{
|
||||
// Current texture index in u,v.
|
||||
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
|
||||
|
||||
// Lookup pixel from flat texture tile
|
||||
*dest++ = shade_bgra(source[spot], light, shade_constants);
|
||||
|
||||
// Next step in u,v.
|
||||
xfrac += xstep;
|
||||
yfrac += ystep;
|
||||
} while (--count);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
@ -275,6 +364,8 @@ class VecCommand(Vlinec4RGBA) : public DrawerCommand
|
|||
DWORD vplce[4];
|
||||
DWORD vince[4];
|
||||
const uint32 * RESTRICT bufplce[4];
|
||||
const uint32_t * RESTRICT bufplce2[4];
|
||||
uint32_t buftexturefracx[4];
|
||||
|
||||
public:
|
||||
VecCommand(Vlinec4RGBA)()
|
||||
|
@ -290,6 +381,8 @@ public:
|
|||
vplce[i] = ::vplce[i];
|
||||
vince[i] = ::vince[i];
|
||||
bufplce[i] = (const uint32 *)::bufplce[i];
|
||||
bufplce2[i] = (const uint32_t *)::bufplce2[i];
|
||||
buftexturefracx[i] = ::buftexturefracx[i];
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -319,57 +412,97 @@ public:
|
|||
local_vince[i] *= thread->num_cores;
|
||||
}
|
||||
|
||||
if (shade_constants.simple_shade)
|
||||
if (bufplce2[0] == nullptr)
|
||||
{
|
||||
VEC_SHADE_SIMPLE_INIT4(light3, light2, light1, light0);
|
||||
do
|
||||
if (shade_constants.simple_shade)
|
||||
{
|
||||
DWORD place0 = local_vplce[0];
|
||||
DWORD place1 = local_vplce[1];
|
||||
DWORD place2 = local_vplce[2];
|
||||
DWORD place3 = local_vplce[3];
|
||||
VEC_SHADE_SIMPLE_INIT4(light3, light2, light1, light0);
|
||||
do
|
||||
{
|
||||
DWORD place0 = local_vplce[0];
|
||||
DWORD place1 = local_vplce[1];
|
||||
DWORD place2 = local_vplce[2];
|
||||
DWORD place3 = local_vplce[3];
|
||||
|
||||
uint32_t p0 = bufplce[0][place0 >> bits];
|
||||
uint32_t p1 = bufplce[1][place1 >> bits];
|
||||
uint32_t p2 = bufplce[2][place2 >> bits];
|
||||
uint32_t p3 = bufplce[3][place3 >> bits];
|
||||
uint32_t p0 = bufplce[0][place0 >> bits];
|
||||
uint32_t p1 = bufplce[1][place1 >> bits];
|
||||
uint32_t p2 = bufplce[2][place2 >> bits];
|
||||
uint32_t p3 = bufplce[3][place3 >> bits];
|
||||
|
||||
local_vplce[0] = place0 + local_vince[0];
|
||||
local_vplce[1] = place1 + local_vince[1];
|
||||
local_vplce[2] = place2 + local_vince[2];
|
||||
local_vplce[3] = place3 + local_vince[3];
|
||||
local_vplce[0] = place0 + local_vince[0];
|
||||
local_vplce[1] = place1 + local_vince[1];
|
||||
local_vplce[2] = place2 + local_vince[2];
|
||||
local_vplce[3] = place3 + local_vince[3];
|
||||
|
||||
__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
|
||||
VEC_SHADE_SIMPLE(fg);
|
||||
_mm_storeu_si128((__m128i*)dest, fg);
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
|
||||
VEC_SHADE_SIMPLE(fg);
|
||||
_mm_storeu_si128((__m128i*)dest, fg);
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
else
|
||||
{
|
||||
VEC_SHADE_INIT4(light3, light2, light1, light0, shade_constants);
|
||||
do
|
||||
{
|
||||
DWORD place0 = local_vplce[0];
|
||||
DWORD place1 = local_vplce[1];
|
||||
DWORD place2 = local_vplce[2];
|
||||
DWORD place3 = local_vplce[3];
|
||||
|
||||
uint32_t p0 = bufplce[0][place0 >> bits];
|
||||
uint32_t p1 = bufplce[1][place1 >> bits];
|
||||
uint32_t p2 = bufplce[2][place2 >> bits];
|
||||
uint32_t p3 = bufplce[3][place3 >> bits];
|
||||
|
||||
local_vplce[0] = place0 + local_vince[0];
|
||||
local_vplce[1] = place1 + local_vince[1];
|
||||
local_vplce[2] = place2 + local_vince[2];
|
||||
local_vplce[3] = place3 + local_vince[3];
|
||||
|
||||
__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
|
||||
VEC_SHADE(fg, shade_constants);
|
||||
_mm_storeu_si128((__m128i*)dest, fg);
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
VEC_SHADE_INIT4(light3, light2, light1, light0, shade_constants);
|
||||
do
|
||||
if (shade_constants.simple_shade)
|
||||
{
|
||||
DWORD place0 = local_vplce[0];
|
||||
DWORD place1 = local_vplce[1];
|
||||
DWORD place2 = local_vplce[2];
|
||||
DWORD place3 = local_vplce[3];
|
||||
VEC_SHADE_SIMPLE_INIT4(light3, light2, light1, light0);
|
||||
do
|
||||
{
|
||||
__m128i fg = sample_bilinear4_sse(bufplce, bufplce2, buftexturefracx, local_vplce, bits);
|
||||
|
||||
uint32_t p0 = bufplce[0][place0 >> bits];
|
||||
uint32_t p1 = bufplce[1][place1 >> bits];
|
||||
uint32_t p2 = bufplce[2][place2 >> bits];
|
||||
uint32_t p3 = bufplce[3][place3 >> bits];
|
||||
local_vplce[0] = local_vplce[0] + local_vince[0];
|
||||
local_vplce[1] = local_vplce[1] + local_vince[1];
|
||||
local_vplce[2] = local_vplce[2] + local_vince[2];
|
||||
local_vplce[3] = local_vplce[3] + local_vince[3];
|
||||
|
||||
local_vplce[0] = place0 + local_vince[0];
|
||||
local_vplce[1] = place1 + local_vince[1];
|
||||
local_vplce[2] = place2 + local_vince[2];
|
||||
local_vplce[3] = place3 + local_vince[3];
|
||||
VEC_SHADE_SIMPLE(fg);
|
||||
_mm_storeu_si128((__m128i*)dest, fg);
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
else
|
||||
{
|
||||
VEC_SHADE_INIT4(light3, light2, light1, light0, shade_constants);
|
||||
do
|
||||
{
|
||||
__m128i fg = sample_bilinear4_sse(bufplce, bufplce2, buftexturefracx, local_vplce, bits);
|
||||
|
||||
__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
|
||||
VEC_SHADE(fg, shade_constants);
|
||||
_mm_storeu_si128((__m128i*)dest, fg);
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
local_vplce[0] = local_vplce[0] + local_vince[0];
|
||||
local_vplce[1] = local_vplce[1] + local_vince[1];
|
||||
local_vplce[2] = local_vplce[2] + local_vince[2];
|
||||
local_vplce[3] = local_vplce[3] + local_vince[3];
|
||||
|
||||
VEC_SHADE(fg, shade_constants);
|
||||
_mm_storeu_si128((__m128i*)dest, fg);
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
@ -385,6 +518,8 @@ class VecCommand(Mvlinec4RGBA) : public DrawerCommand
|
|||
DWORD vplce[4];
|
||||
DWORD vince[4];
|
||||
const uint32 * RESTRICT bufplce[4];
|
||||
const uint32 * RESTRICT bufplce2[4];
|
||||
uint32_t buftexturefracx[4];
|
||||
|
||||
public:
|
||||
VecCommand(Mvlinec4RGBA)()
|
||||
|
@ -400,6 +535,8 @@ public:
|
|||
vplce[i] = ::vplce[i];
|
||||
vince[i] = ::vince[i];
|
||||
bufplce[i] = (const uint32 *)::bufplce[i];
|
||||
bufplce2[i] = (const uint32_t *)::bufplce2[i];
|
||||
buftexturefracx[i] = ::buftexturefracx[i];
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -429,61 +566,105 @@ public:
|
|||
local_vince[i] *= thread->num_cores;
|
||||
}
|
||||
|
||||
if (shade_constants.simple_shade)
|
||||
if (bufplce2[0] == nullptr)
|
||||
{
|
||||
VEC_SHADE_SIMPLE_INIT4(light3, light2, light1, light0);
|
||||
do
|
||||
if (shade_constants.simple_shade)
|
||||
{
|
||||
DWORD place0 = local_vplce[0];
|
||||
DWORD place1 = local_vplce[1];
|
||||
DWORD place2 = local_vplce[2];
|
||||
DWORD place3 = local_vplce[3];
|
||||
VEC_SHADE_SIMPLE_INIT4(light3, light2, light1, light0);
|
||||
do
|
||||
{
|
||||
DWORD place0 = local_vplce[0];
|
||||
DWORD place1 = local_vplce[1];
|
||||
DWORD place2 = local_vplce[2];
|
||||
DWORD place3 = local_vplce[3];
|
||||
|
||||
uint32_t pix0 = bufplce[0][place0 >> bits];
|
||||
uint32_t pix1 = bufplce[1][place1 >> bits];
|
||||
uint32_t pix2 = bufplce[2][place2 >> bits];
|
||||
uint32_t pix3 = bufplce[3][place3 >> bits];
|
||||
uint32_t pix0 = bufplce[0][place0 >> bits];
|
||||
uint32_t pix1 = bufplce[1][place1 >> bits];
|
||||
uint32_t pix2 = bufplce[2][place2 >> bits];
|
||||
uint32_t pix3 = bufplce[3][place3 >> bits];
|
||||
|
||||
local_vplce[0] = place0 + local_vince[0];
|
||||
local_vplce[1] = place1 + local_vince[1];
|
||||
local_vplce[2] = place2 + local_vince[2];
|
||||
local_vplce[3] = place3 + local_vince[3];
|
||||
local_vplce[0] = place0 + local_vince[0];
|
||||
local_vplce[1] = place1 + local_vince[1];
|
||||
local_vplce[2] = place2 + local_vince[2];
|
||||
local_vplce[3] = place3 + local_vince[3];
|
||||
|
||||
__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
|
||||
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
|
||||
VEC_SHADE_SIMPLE(fg);
|
||||
VEC_ALPHA_BLEND(fg, bg);
|
||||
_mm_storeu_si128((__m128i*)dest, fg);
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
|
||||
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
|
||||
VEC_SHADE_SIMPLE(fg);
|
||||
VEC_ALPHA_BLEND(fg, bg);
|
||||
_mm_storeu_si128((__m128i*)dest, fg);
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
else
|
||||
{
|
||||
VEC_SHADE_INIT4(light3, light2, light1, light0, shade_constants);
|
||||
do
|
||||
{
|
||||
DWORD place0 = local_vplce[0];
|
||||
DWORD place1 = local_vplce[1];
|
||||
DWORD place2 = local_vplce[2];
|
||||
DWORD place3 = local_vplce[3];
|
||||
|
||||
uint32_t pix0 = bufplce[0][place0 >> bits];
|
||||
uint32_t pix1 = bufplce[1][place1 >> bits];
|
||||
uint32_t pix2 = bufplce[2][place2 >> bits];
|
||||
uint32_t pix3 = bufplce[3][place3 >> bits];
|
||||
|
||||
local_vplce[0] = place0 + local_vince[0];
|
||||
local_vplce[1] = place1 + local_vince[1];
|
||||
local_vplce[2] = place2 + local_vince[2];
|
||||
local_vplce[3] = place3 + local_vince[3];
|
||||
|
||||
__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
|
||||
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
|
||||
VEC_SHADE(fg, shade_constants);
|
||||
VEC_ALPHA_BLEND(fg, bg);
|
||||
_mm_storeu_si128((__m128i*)dest, fg);
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
VEC_SHADE_INIT4(light3, light2, light1, light0, shade_constants);
|
||||
do
|
||||
if (shade_constants.simple_shade)
|
||||
{
|
||||
DWORD place0 = local_vplce[0];
|
||||
DWORD place1 = local_vplce[1];
|
||||
DWORD place2 = local_vplce[2];
|
||||
DWORD place3 = local_vplce[3];
|
||||
VEC_SHADE_SIMPLE_INIT4(light3, light2, light1, light0);
|
||||
do
|
||||
{
|
||||
__m128i fg = sample_bilinear4_sse(bufplce, bufplce2, buftexturefracx, local_vplce, bits);
|
||||
|
||||
uint32_t pix0 = bufplce[0][place0 >> bits];
|
||||
uint32_t pix1 = bufplce[1][place1 >> bits];
|
||||
uint32_t pix2 = bufplce[2][place2 >> bits];
|
||||
uint32_t pix3 = bufplce[3][place3 >> bits];
|
||||
local_vplce[0] = local_vplce[0] + local_vince[0];
|
||||
local_vplce[1] = local_vplce[1] + local_vince[1];
|
||||
local_vplce[2] = local_vplce[2] + local_vince[2];
|
||||
local_vplce[3] = local_vplce[3] + local_vince[3];
|
||||
|
||||
local_vplce[0] = place0 + local_vince[0];
|
||||
local_vplce[1] = place1 + local_vince[1];
|
||||
local_vplce[2] = place2 + local_vince[2];
|
||||
local_vplce[3] = place3 + local_vince[3];
|
||||
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
|
||||
VEC_SHADE_SIMPLE(fg);
|
||||
VEC_ALPHA_BLEND(fg, bg);
|
||||
_mm_storeu_si128((__m128i*)dest, fg);
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
else
|
||||
{
|
||||
VEC_SHADE_INIT4(light3, light2, light1, light0, shade_constants);
|
||||
do
|
||||
{
|
||||
__m128i fg = sample_bilinear4_sse(bufplce, bufplce2, buftexturefracx, local_vplce, bits);
|
||||
|
||||
__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
|
||||
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
|
||||
VEC_SHADE(fg, shade_constants);
|
||||
VEC_ALPHA_BLEND(fg, bg);
|
||||
_mm_storeu_si128((__m128i*)dest, fg);
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
local_vplce[0] = local_vplce[0] + local_vince[0];
|
||||
local_vplce[1] = local_vplce[1] + local_vince[1];
|
||||
local_vplce[2] = local_vplce[2] + local_vince[2];
|
||||
local_vplce[3] = local_vplce[3] + local_vince[3];
|
||||
|
||||
__m128i bg = _mm_loadu_si128((const __m128i*)dest);
|
||||
VEC_SHADE(fg, shade_constants);
|
||||
VEC_ALPHA_BLEND(fg, bg);
|
||||
_mm_storeu_si128((__m128i*)dest, fg);
|
||||
dest += pitch;
|
||||
} while (--count);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
|
Loading…
Reference in a new issue