Added bilinear filtering

This commit is contained in:
Magnus Norddahl 2016-06-20 08:24:02 +02:00
parent 6daeb5a158
commit c70aa1fe99
5 changed files with 242 additions and 62 deletions

View file

@ -162,6 +162,8 @@ fixed_t dc_destalpha; // Alpha value used by dc_destblend
// first pixel in a column (possibly virtual)
const BYTE* dc_source;
// Second texture column used by the bilinear column drawer. The wall scanner
// copies it from SamplerSetup::source2; a null pointer makes the RGBA vline
// drawer take its unfiltered path.
const BYTE* dc_source2;
// Horizontal blend weight between dc_source and dc_source2, copied from
// SamplerSetup::texturefracx (masked to 0..15 there, i.e. 1/16 steps).
uint32_t dc_texturefracx;
BYTE* dc_dest;
int dc_count;
@ -171,6 +173,8 @@ DWORD vince[4];
BYTE* palookupoffse[4];
fixed_t palookuplight[4];
const BYTE* bufplce[4];
// Per-lane second texture column for the four-column bilinear drawer
// (filled from SamplerSetup::source2). The RGBA four-column drawer only
// tests entry 0 against nullptr to choose between filtered and unfiltered.
const BYTE* bufplce2[4];
// Per-lane horizontal blend weight (filled from SamplerSetup::texturefracx).
uint32_t buftexturefracx[4];
// just for profiling
int dccount;

View file

@ -71,6 +71,8 @@ extern "C" fixed_t dc_destalpha;
// first pixel in a column
extern "C" const BYTE* dc_source;
// Second texture column for bilinear filtering; nullptr when the column is
// drawn unfiltered (see SamplerSetup).
extern "C" const BYTE* dc_source2;
// Blend weight of dc_source2 relative to dc_source, in 1/16 steps.
extern "C" uint32_t dc_texturefracx;
extern "C" BYTE *dc_dest, *dc_destorg;
extern "C" int dc_count;
@ -80,6 +82,8 @@ extern "C" DWORD vince[4];
extern "C" BYTE* palookupoffse[4];
extern "C" fixed_t palookuplight[4];
extern "C" const BYTE* bufplce[4];
// Four-column counterparts of dc_source2 / dc_texturefracx: one second
// column pointer and one 1/16-step blend weight per lane.
extern "C" const BYTE* bufplce2[4];
extern "C" uint32_t buftexturefracx[4];
// [RH] Temporary buffer for column drawing
extern "C" BYTE *dc_temp;
@ -374,4 +378,36 @@ void R_SetDSColorMapLight(FColormap *base_colormap, float light, int shade);
void R_SetTranslationMap(lighttable_t *translation);
extern bool r_swtruecolor;
EXTERN_CVAR(Bool, r_bilinear);
// Texture sampler state needed for bilinear filtering
//
// Describes how one screen column samples its texture:
//   source       - texture column to read (the only one when unfiltered)
//   source2      - neighbouring column to blend with, or nullptr when the
//                  column is drawn without filtering
//   texturefracx - blend weight of source2 in 1/16 steps (0..15)
struct SamplerSetup
{
	// A default constructed sampler is in the well defined "unfiltered, no
	// column selected" state rather than holding indeterminate pointers.
	// Drawers use source2 == nullptr to pick the unfiltered path, so it must
	// never be left uninitialized.
	SamplerSetup() { }

	// Selects the column(s) for texture x position 'xoffset' (fixed point).
	// 'getcol' fetches a column of 'texture' by integer x index.
	SamplerSetup(fixed_t xoffset, bool magnifying, FTexture *texture, const BYTE*(*getcol)(FTexture *texture, int x));

	const BYTE *source = nullptr;
	const BYTE *source2 = nullptr;
	uint32_t texturefracx = 0;
};
// Chooses the texture column(s) and horizontal blend weight for one screen
// column.
//
// Filtering is skipped (single column, source2 == nullptr, weight 0) when the
// renderer is not in true color mode, when r_bilinear is off, or when the
// caller reports the texture as being magnified. Otherwise the two columns
// surrounding (xoffset - FRACUNIT / 2) are fetched and the top four
// fractional bits become the blend weight.
inline SamplerSetup::SamplerSetup(fixed_t xoffset, bool magnifying, FTexture *texture, const BYTE*(*getcol)(FTexture *texture, int x))
{
	const bool unfiltered = !r_swtruecolor || !r_bilinear || magnifying;
	if (unfiltered)
	{
		// Nearest column only
		source = getcol(texture, xoffset >> FRACBITS);
		source2 = nullptr;
		texturefracx = 0;
		return;
	}

	// Shift by half a unit so the weight is measured from the column center
	const int left_column = (xoffset - FRACUNIT / 2) >> FRACBITS;
	source = getcol(texture, left_column);
	source2 = getcol(texture, left_column + 1);

	// Adding half a unit instead of subtracting it differs by a whole unit,
	// so the fractional bits extracted here match left_column above.
	texturefracx = ((xoffset + FRACUNIT / 2) >> (FRACBITS - 4)) & 15;
}
#endif

View file

@ -58,6 +58,7 @@ extern float rw_lightstep;
extern int wallshade;
CVAR(Bool, r_multithreaded, true, 0)
CVAR(Bool, r_bilinear, false, 0)
#ifndef NO_SSE
@ -1547,41 +1548,72 @@ public:
uint32_t light = calc_light_multiplier(_light);
ShadeConstants shade_constants = _shade_constants;
if (_xbits == 6 && _ybits == 6)
fixed_t xmagnitude = abs((fixed_t)xstep) >> (32 - _xbits - FRACBITS);
fixed_t ymagnitude = abs((fixed_t)ystep) >> (32 - _ybits - FRACBITS);
fixed_t magnitude = xmagnitude + ymagnitude;
bool magnifying = !r_bilinear || magnitude >> (FRACBITS - 1) == 0;
if (magnifying)
{
// 64x64 is the most common case by far, so special case it.
do
if (_xbits == 6 && _ybits == 6)
{
// Current texture index in u,v.
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
// 64x64 is the most common case by far, so special case it.
// Lookup pixel from flat texture tile
*dest++ = shade_bgra(source[spot], light, shade_constants);
do
{
// Current texture index in u,v.
spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
// Next step in u,v.
xfrac += xstep;
yfrac += ystep;
} while (--count);
// Lookup pixel from flat texture tile
*dest++ = shade_bgra(source[spot], light, shade_constants);
// Next step in u,v.
xfrac += xstep;
yfrac += ystep;
} while (--count);
}
else
{
BYTE yshift = 32 - _ybits;
BYTE xshift = yshift - _xbits;
int xmask = ((1 << _xbits) - 1) << _ybits;
do
{
// Current texture index in u,v.
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
// Lookup pixel from flat texture tile
*dest++ = shade_bgra(source[spot], light, shade_constants);
// Next step in u,v.
xfrac += xstep;
yfrac += ystep;
} while (--count);
}
}
else
{
BYTE yshift = 32 - _ybits;
BYTE xshift = yshift - _xbits;
int xmask = ((1 << _xbits) - 1) << _ybits;
do
if (_xbits == 6 && _ybits == 6)
{
// Current texture index in u,v.
spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
// 64x64 is the most common case by far, so special case it.
// Lookup pixel from flat texture tile
*dest++ = shade_bgra(source[spot], light, shade_constants);
// Next step in u,v.
xfrac += xstep;
yfrac += ystep;
} while (--count);
do
{
*dest++ = shade_bgra(sample_bilinear(source, xfrac, yfrac, 26, 26), light, shade_constants);
xfrac += xstep;
yfrac += ystep;
} while (--count);
}
else
{
do
{
*dest++ = shade_bgra(sample_bilinear(source, xfrac, yfrac, 32 - _xbits, 32 - _ybits), light, shade_constants);
xfrac += xstep;
yfrac += ystep;
} while (--count);
}
}
}
};
@ -2253,6 +2285,8 @@ class Vlinec1RGBACommand : public DrawerCommand
DWORD _texturefrac;
int _count;
const BYTE * RESTRICT _source;
const BYTE * RESTRICT _source2;
uint32_t _texturefracx;
BYTE * RESTRICT _dest;
int vlinebits;
int _pitch;
@ -2266,6 +2300,8 @@ public:
_texturefrac = dc_texturefrac;
_count = dc_count;
_source = dc_source;
_source2 = dc_source2;
_texturefracx = dc_texturefracx;
_dest = dc_dest;
vlinebits = ::vlinebits;
_pitch = dc_pitch;
@ -2282,6 +2318,8 @@ public:
DWORD fracstep = _iscale * thread->num_cores;
DWORD frac = _texturefrac + _iscale * thread->skipped_by_thread(_dest_y);
const uint32 *source = (const uint32 *)_source;
const uint32 *source2 = (const uint32 *)_source2;
uint32_t texturefracx = _texturefracx;
uint32_t *dest = thread->dest_for_thread(_dest_y, _pitch, (uint32_t*)_dest);
int bits = vlinebits;
int pitch = _pitch * thread->num_cores;
@ -2289,12 +2327,24 @@ public:
uint32_t light = calc_light_multiplier(_light);
ShadeConstants shade_constants = _shade_constants;
do
if (_source2 == nullptr)
{
*dest = shade_bgra(source[frac >> bits], light, shade_constants);
frac += fracstep;
dest += pitch;
} while (--count);
do
{
*dest = shade_bgra(source[frac >> bits], light, shade_constants);
frac += fracstep;
dest += pitch;
} while (--count);
}
else
{
do
{
*dest = shade_bgra(sample_bilinear(source, source2, texturefracx, frac, bits), light, shade_constants);
frac += fracstep;
dest += pitch;
} while (--count);
}
}
};
@ -2308,7 +2358,9 @@ class Vlinec4RGBACommand : public DrawerCommand
fixed_t palookuplight[4];
DWORD vplce[4];
DWORD vince[4];
const uint32 * RESTRICT bufplce[4];
const uint32_t * RESTRICT bufplce[4];
const uint32_t * RESTRICT bufplce2[4];
uint32_t buftexturefracx[4];
public:
Vlinec4RGBACommand()
@ -2323,7 +2375,9 @@ public:
palookuplight[i] = ::palookuplight[i];
vplce[i] = ::vplce[i];
vince[i] = ::vince[i];
bufplce[i] = (const uint32 *)::bufplce[i];
bufplce[i] = (const uint32_t *)::bufplce[i];
bufplce2[i] = (const uint32_t *)::bufplce2[i];
buftexturefracx[i] = ::buftexturefracx[i];
}
}
@ -2354,14 +2408,28 @@ public:
local_vince[i] *= thread->num_cores;
}
do
if (bufplce2[0] == nullptr)
{
dest[0] = shade_bgra(bufplce[0][(place = local_vplce[0]) >> bits], light0, shade_constants); local_vplce[0] = place + local_vince[0];
dest[1] = shade_bgra(bufplce[1][(place = local_vplce[1]) >> bits], light1, shade_constants); local_vplce[1] = place + local_vince[1];
dest[2] = shade_bgra(bufplce[2][(place = local_vplce[2]) >> bits], light2, shade_constants); local_vplce[2] = place + local_vince[2];
dest[3] = shade_bgra(bufplce[3][(place = local_vplce[3]) >> bits], light3, shade_constants); local_vplce[3] = place + local_vince[3];
dest += pitch;
} while (--count);
do
{
dest[0] = shade_bgra(bufplce[0][(place = local_vplce[0]) >> bits], light0, shade_constants); local_vplce[0] = place + local_vince[0];
dest[1] = shade_bgra(bufplce[1][(place = local_vplce[1]) >> bits], light1, shade_constants); local_vplce[1] = place + local_vince[1];
dest[2] = shade_bgra(bufplce[2][(place = local_vplce[2]) >> bits], light2, shade_constants); local_vplce[2] = place + local_vince[2];
dest[3] = shade_bgra(bufplce[3][(place = local_vplce[3]) >> bits], light3, shade_constants); local_vplce[3] = place + local_vince[3];
dest += pitch;
} while (--count);
}
else
{
do
{
dest[0] = shade_bgra(sample_bilinear(bufplce[0], bufplce2[0], buftexturefracx[0], place = local_vplce[0], bits), light0, shade_constants); local_vplce[0] = place + local_vince[0];
dest[1] = shade_bgra(sample_bilinear(bufplce[1], bufplce2[1], buftexturefracx[1], place = local_vplce[1], bits), light1, shade_constants); local_vplce[1] = place + local_vince[1];
dest[2] = shade_bgra(sample_bilinear(bufplce[2], bufplce2[2], buftexturefracx[2], place = local_vplce[2], bits), light2, shade_constants); local_vplce[2] = place + local_vince[2];
dest[3] = shade_bgra(sample_bilinear(bufplce[3], bufplce2[3], buftexturefracx[3], place = local_vplce[3], bits), light3, shade_constants); local_vplce[3] = place + local_vince[3];
dest += pitch;
} while (--count);
}
}
};
@ -3651,7 +3719,10 @@ void R_DrawSpan_rgba()
#ifdef NO_SSE
DrawerCommandQueue::QueueCommand<DrawSpanRGBACommand>();
#else
DrawerCommandQueue::QueueCommand<DrawSpanRGBA_SSE_Command>();
if (!r_bilinear)
DrawerCommandQueue::QueueCommand<DrawSpanRGBA_SSE_Command>();
else
DrawerCommandQueue::QueueCommand<DrawSpanRGBACommand>();
#endif
}
@ -3705,7 +3776,10 @@ void vlinec4_rgba()
#ifdef NO_SSE
DrawerCommandQueue::QueueCommand<Vlinec4RGBACommand>();
#else
DrawerCommandQueue::QueueCommand<Vlinec4RGBA_SSE_Command>();
if (!r_bilinear)
DrawerCommandQueue::QueueCommand<Vlinec4RGBA_SSE_Command>();
else
DrawerCommandQueue::QueueCommand<Vlinec4RGBACommand>();
#endif
for (int i = 0; i < 4; i++)
vplce[i] += vince[i] * dc_count;

View file

@ -426,6 +426,58 @@ FORCEINLINE uint32_t alpha_blend(uint32_t fg, uint32_t bg)
return 0xff000000 | (red << 16) | (green << 8) | blue;
}
// Bilinear sample taken between two adjacent texture columns.
//
//   col0, col1   - the two columns to blend; both must be valid pointers
//   texturefracx - weight of col1 in 1/16 steps (callers pass 0..15)
//   texturefracy - fixed point vertical texture position
//   ybits        - number of fractional bits in texturefracy; the bits above
//                  them form the texel index
//
// Returns the blended pixel packed as (alpha << 24) | (red << 16) | (green << 8) | blue.
//
// The second row index wraps around within the index range implied by ybits.
// Without the wrap, sampling in the last texel of a column would read
// col[y + 1] one element past the end of the column.
// NOTE(review): this assumes the column holds 2^(32 - ybits) texels; columns
// that are shorter than that are not protected here - confirm against callers.
FORCEINLINE uint32_t sample_bilinear(const uint32_t *col0, const uint32_t *col1, uint32_t texturefracx, uint32_t texturefracy, int ybits)
{
	const uint32_t half = 1 << (ybits - 1);

	// Largest texel index representable above the fractional bits
	const uint32_t ymask = 0xffffffffu >> ybits;

	// Offset by half a texel so weights are measured from texel centers.
	// The subtraction may wrap below zero, which lands on the last texel.
	const uint32_t y0 = (texturefracy - half) >> ybits;
	const uint32_t y1 = (y0 + 1) & ymask;

	const uint32_t p00 = col0[y0];
	const uint32_t p01 = col0[y1];
	const uint32_t p10 = col1[y0];
	const uint32_t p11 = col1[y1];

	const uint32_t inv_b = texturefracx;
	const uint32_t inv_a = ((texturefracy + half) >> (ybits - 4)) & 15;
	const uint32_t a = 16 - inv_a;
	const uint32_t b = 16 - inv_b;

	// The four weights always sum to 256, hence the final >> 8
	const uint32_t w00 = a * b;
	const uint32_t w01 = inv_a * b;
	const uint32_t w10 = a * inv_b;
	const uint32_t w11 = inv_a * inv_b;

	const uint32_t red = (RPART(p00) * w00 + RPART(p01) * w01 + RPART(p10) * w10 + RPART(p11) * w11 + 127) >> 8;
	const uint32_t green = (GPART(p00) * w00 + GPART(p01) * w01 + GPART(p10) * w10 + GPART(p11) * w11 + 127) >> 8;
	const uint32_t blue = (BPART(p00) * w00 + BPART(p01) * w01 + BPART(p10) * w10 + BPART(p11) * w11 + 127) >> 8;
	const uint32_t alpha = (APART(p00) * w00 + APART(p01) * w01 + APART(p10) * w10 + APART(p11) * w11 + 127) >> 8;

	return (alpha << 24) | (red << 16) | (green << 8) | blue;
}
// Bilinear sample from a flat texture addressed as texture[y + (x << ybits_of_index)].
//
//   texture      - texel array; x selects the high part of the index, y the low part
//   xfrac, yfrac - fixed point texture coordinates
//   xbits, ybits - number of FRACTIONAL bits in xfrac / yfrac (the span drawer
//                  passes 32 minus the texture size bits), so each axis has
//                  32 - bits index bits
//
// Both axes wrap around at the texture edge. Returns the blended pixel packed
// as (alpha << 24) | (red << 16) | (green << 8) | blue.
FORCEINLINE uint32_t sample_bilinear(const uint32_t *texture, dsfixed_t xfrac, dsfixed_t yfrac, int xbits, int ybits)
{
	// Index width and wrap mask per axis
	const int index_bits_x = (32 - xbits);
	const int index_bits_y = (32 - ybits);
	const int wrap_x = (1 << index_bits_x) - 1;
	const int wrap_y = (1 << index_bits_y) - 1;

	// Move by half a texel so weights are measured from texel centers
	const uint32_t half_x = 1 << (xbits - 1);
	const uint32_t half_y = 1 << (ybits - 1);
	const uint32_t x0 = (xfrac - half_x) >> xbits;
	const uint32_t y0 = (yfrac - half_y) >> ybits;

	// Wrapped offsets of the two rows and two columns involved
	const uint32_t row0 = y0 & wrap_y;
	const uint32_t row1 = (y0 + 1) & wrap_y;
	const uint32_t column0 = (x0 & wrap_x) << index_bits_y;
	const uint32_t column1 = ((x0 + 1) & wrap_x) << index_bits_y;

	const uint32_t p00 = texture[row0 + column0];
	const uint32_t p01 = texture[row1 + column0];
	const uint32_t p10 = texture[row0 + column1];
	const uint32_t p11 = texture[row1 + column1];

	// Top four fractional bits of each coordinate give the 1/16 step weights
	const uint32_t inv_b = ((xfrac + half_x) >> (xbits - 4)) & 15;
	const uint32_t inv_a = ((yfrac + half_y) >> (ybits - 4)) & 15;
	const uint32_t a = 16 - inv_a;
	const uint32_t b = 16 - inv_b;

	// The four weights always sum to 256, hence the final >> 8
	const uint32_t w00 = a * b;
	const uint32_t w01 = inv_a * b;
	const uint32_t w10 = a * inv_b;
	const uint32_t w11 = inv_a * inv_b;

	const uint32_t red = (RPART(p00) * w00 + RPART(p01) * w01 + RPART(p10) * w10 + RPART(p11) * w11 + 127) >> 8;
	const uint32_t green = (GPART(p00) * w00 + GPART(p01) * w01 + GPART(p10) * w10 + GPART(p11) * w11 + 127) >> 8;
	const uint32_t blue = (BPART(p00) * w00 + BPART(p01) * w01 + BPART(p10) * w10 + BPART(p11) * w11 + 127) >> 8;
	const uint32_t alpha = (APART(p00) * w00 + APART(p01) * w01 + APART(p10) * w10 + APART(p11) * w11 + 127) >> 8;

	return (alpha << 24) | (red << 16) | (green << 8) | blue;
}
// Calculate constants for a simple shade with gamma correction
#define AVX_LINEAR_SHADE_SIMPLE_INIT(light) \
__m256 mlight_hi = _mm256_set_ps(1.0f, light * (1.0f/256.0f), light * (1.0f/256.0f), light * (1.0f/256.0f), 1.0f, light * (1.0f/256.0f), light * (1.0f/256.0f), light * (1.0f/256.0f)); \

View file

@ -58,6 +58,8 @@
CVAR(Bool, r_np2, true, 0)
EXTERN_CVAR(Bool, r_bilinear)
//CVAR (Int, ty, 8, 0)
//CVAR (Int, tx, 8, 0)
@ -1066,14 +1068,16 @@ void R_RenderFakeWallRange (drawseg_t *ds, int x1, int x2)
}
// Draw a column with support for non-power-of-two ranges
uint32_t wallscan_drawcol1(int x, int y1, int y2, uint32_t uv_start, uint32_t uv_step, uint32_t uv_max, const BYTE *source, DWORD(*draw1column)())
uint32_t wallscan_drawcol1(int x, int y1, int y2, uint32_t uv_start, uint32_t uv_step, uint32_t uv_max, const SamplerSetup &sampler, DWORD(*draw1column)())
{
int pixelsize = r_swtruecolor ? 4 : 1;
if (uv_max == 0) // power of two
{
int count = y2 - y1;
dc_source = source;
dc_source = sampler.source;
dc_source2 = sampler.source2;
dc_texturefracx = sampler.texturefracx;
dc_dest = (ylookup[y1] + x) * pixelsize + dc_destorg;
dc_count = count;
dc_iscale = uv_step;
@ -1097,7 +1101,9 @@ uint32_t wallscan_drawcol1(int x, int y1, int y2, uint32_t uv_start, uint32_t uv
next_uv_wrap++;
uint32_t count = MIN(left, next_uv_wrap);
dc_source = source;
dc_source = sampler.source;
dc_source2 = sampler.source2;
dc_texturefracx = sampler.texturefracx;
dc_dest = (ylookup[y1] + x) * pixelsize + dc_destorg;
dc_count = count;
dc_iscale = uv_step;
@ -1115,7 +1121,7 @@ uint32_t wallscan_drawcol1(int x, int y1, int y2, uint32_t uv_start, uint32_t uv
}
// Draw four columns with support for non-power-of-two ranges
void wallscan_drawcol4(int x, int y1, int y2, uint32_t *uv_pos, uint32_t *uv_step, uint32_t uv_max, const BYTE **source, void(*draw4columns)())
void wallscan_drawcol4(int x, int y1, int y2, uint32_t *uv_pos, uint32_t *uv_step, uint32_t uv_max, const SamplerSetup *sampler, void(*draw4columns)())
{
int pixelsize = r_swtruecolor ? 4 : 1;
if (uv_max == 0) // power of two, no wrap handling needed
@ -1123,7 +1129,9 @@ void wallscan_drawcol4(int x, int y1, int y2, uint32_t *uv_pos, uint32_t *uv_ste
int count = y2 - y1;
for (int i = 0; i < 4; i++)
{
bufplce[i] = source[i];
bufplce[i] = sampler[i].source;
bufplce2[i] = sampler[i].source2;
buftexturefracx[i] = sampler[i].texturefracx;
vplce[i] = uv_pos[i];
vince[i] = uv_step[i];
@ -1139,7 +1147,11 @@ void wallscan_drawcol4(int x, int y1, int y2, uint32_t *uv_pos, uint32_t *uv_ste
{
dc_dest = (ylookup[y1] + x) * pixelsize + dc_destorg;
for (int i = 0; i < 4; i++)
bufplce[i] = source[i];
{
bufplce[i] = sampler[i].source;
bufplce2[i] = sampler[i].source2;
buftexturefracx[i] = sampler[i].texturefracx;
}
uint32_t left = y2 - y1;
while (left > 0)
@ -1249,12 +1261,11 @@ void wallscan_any(
if (!fixed)
R_SetColorMapLight(basecolormap, light, wallshade);
const BYTE *source = getcol(rw_pic, (lwal[x] + xoffset) >> FRACBITS);
uint32_t uv_start, uv_step;
calc_uv_start_and_step(y1, swal[x], yrepeat, uv_height, fracbits, uv_start, uv_step);
wallscan_drawcol1(x, y1, y2, uv_start, uv_step, uv_max, source, draw1column);
SamplerSetup sampler(lwal[x] + xoffset, uv_step >> (fracbits - 1) == 0, rw_pic, getcol);
wallscan_drawcol1(x, y1, y2, uv_start, uv_step, uv_max, sampler, draw1column);
}
// The aligned columns
@ -1264,10 +1275,6 @@ void wallscan_any(
int y1[4] = { uwal[x], uwal[x + 1], uwal[x + 2], uwal[x + 3] };
int y2[4] = { dwal[x], dwal[x + 1], dwal[x + 2], dwal[x + 3] };
const BYTE *source[4];
for (int i = 0; i < 4; i++)
source[i] = getcol(rw_pic, (lwal[x + i] + xoffset) >> FRACBITS);
float lights[4];
for (int i = 0; i < 4; i++)
{
@ -1276,8 +1283,16 @@ void wallscan_any(
}
uint32_t uv_pos[4], uv_step[4];
int magnifying = 0;
for (int i = 0; i < 4; i++)
{
calc_uv_start_and_step(y1[i], swal[x + i], yrepeat, uv_height, fracbits, uv_pos[i], uv_step[i]);
magnifying |= uv_step[i] >> (fracbits - 1);
}
SamplerSetup sampler[4];
for (int i = 0; i < 4; i++)
sampler[i] = SamplerSetup(lwal[x + i] + xoffset, magnifying == 0, rw_pic, getcol);
// Figure out where we vertically can start and stop drawing 4 columns in one go
int middle_y1 = y1[0];
@ -1305,7 +1320,7 @@ void wallscan_any(
if (!fixed)
R_SetColorMapLight(basecolormap, lights[i], wallshade);
wallscan_drawcol1(x + i, y1[i], y2[i], uv_pos[i], uv_step[i], uv_max, source[i], draw1column);
wallscan_drawcol1(x + i, y1[i], y2[i], uv_pos[i], uv_step[i], uv_max, sampler[i], draw1column);
}
continue;
}
@ -1317,7 +1332,7 @@ void wallscan_any(
R_SetColorMapLight(basecolormap, lights[i], wallshade);
if (y1[i] < middle_y1)
uv_pos[i] = wallscan_drawcol1(x + i, y1[i], middle_y1, uv_pos[i], uv_step[i], uv_max, source[i], draw1column);
uv_pos[i] = wallscan_drawcol1(x + i, y1[i], middle_y1, uv_pos[i], uv_step[i], uv_max, sampler[i], draw1column);
}
// Draw the area where all 4 columns are active
@ -1337,7 +1352,7 @@ void wallscan_any(
}
}
}
wallscan_drawcol4(x, middle_y1, middle_y2, uv_pos, uv_step, uv_max, source, draw4columns);
wallscan_drawcol4(x, middle_y1, middle_y2, uv_pos, uv_step, uv_max, sampler, draw4columns);
// Draw the last rows where not all 4 columns are active
for (int i = 0; i < 4; i++)
@ -1346,7 +1361,7 @@ void wallscan_any(
R_SetColorMapLight(basecolormap, lights[i], wallshade);
if (middle_y2 < y2[i])
uv_pos[i] = wallscan_drawcol1(x + i, middle_y2, y2[i], uv_pos[i], uv_step[i], uv_max, source[i], draw1column);
uv_pos[i] = wallscan_drawcol1(x + i, middle_y2, y2[i], uv_pos[i], uv_step[i], uv_max, sampler[i], draw1column);
}
}
@ -1361,12 +1376,11 @@ void wallscan_any(
if (!fixed)
R_SetColorMapLight(basecolormap, light, wallshade);
const BYTE *source = getcol(rw_pic, (lwal[x] + xoffset) >> FRACBITS);
uint32_t uv_start, uv_step;
calc_uv_start_and_step(y1, swal[x], yrepeat, uv_height, fracbits, uv_start, uv_step);
wallscan_drawcol1(x, y1, y2, uv_start, uv_step, uv_max, source, draw1column);
SamplerSetup sampler(lwal[x] + xoffset, uv_step >> (fracbits - 1) == 0, rw_pic, getcol);
wallscan_drawcol1(x, y1, y2, uv_start, uv_step, uv_max, sampler, draw1column);
}
NetUpdate ();