diff --git a/src/r_draw.cpp b/src/r_draw.cpp index 7829e2b77f..83c4ac8d40 100644 --- a/src/r_draw.cpp +++ b/src/r_draw.cpp @@ -162,6 +162,8 @@ fixed_t dc_destalpha; // Alpha value used by dc_destblend // first pixel in a column (possibly virtual) const BYTE* dc_source; +const BYTE* dc_source2; +uint32_t dc_texturefracx; BYTE* dc_dest; int dc_count; @@ -171,6 +173,8 @@ DWORD vince[4]; BYTE* palookupoffse[4]; fixed_t palookuplight[4]; const BYTE* bufplce[4]; +const BYTE* bufplce2[4]; +uint32_t buftexturefracx[4]; // just for profiling int dccount; diff --git a/src/r_draw.h b/src/r_draw.h index 99ee4d10d9..d5ecbd289d 100644 --- a/src/r_draw.h +++ b/src/r_draw.h @@ -71,6 +71,8 @@ extern "C" fixed_t dc_destalpha; // first pixel in a column extern "C" const BYTE* dc_source; +extern "C" const BYTE* dc_source2; +extern "C" uint32_t dc_texturefracx; extern "C" BYTE *dc_dest, *dc_destorg; extern "C" int dc_count; @@ -80,6 +82,8 @@ extern "C" DWORD vince[4]; extern "C" BYTE* palookupoffse[4]; extern "C" fixed_t palookuplight[4]; extern "C" const BYTE* bufplce[4]; +extern "C" const BYTE* bufplce2[4]; +extern "C" uint32_t buftexturefracx[4]; // [RH] Temporary buffer for column drawing extern "C" BYTE *dc_temp; @@ -374,4 +378,36 @@ void R_SetDSColorMapLight(FColormap *base_colormap, float light, int shade); void R_SetTranslationMap(lighttable_t *translation); +extern bool r_swtruecolor; +EXTERN_CVAR(Bool, r_bilinear); + +// Texture sampler state needed for bilinear filtering +struct SamplerSetup +{ + SamplerSetup() { } + SamplerSetup(fixed_t xoffset, bool magnifying, FTexture *texture, const BYTE*(*getcol)(FTexture *texture, int x)); + + const BYTE *source; + const BYTE *source2; + uint32_t texturefracx; +}; + +inline SamplerSetup::SamplerSetup(fixed_t xoffset, bool magnifying, FTexture *texture, const BYTE*(*getcol)(FTexture *texture, int x)) +{ + // Only do bilinear filtering if enabled and not a magnifying filter + if (!r_swtruecolor || !r_bilinear || magnifying) + { + source = getcol(texture, xoffset >> FRACBITS); + source2 = nullptr; + texturefracx = 0; + } + else + { + int tx = (xoffset - FRACUNIT / 2) >> FRACBITS; + source = getcol(texture, tx); + source2 = getcol(texture, tx + 1); + texturefracx = ((xoffset + FRACUNIT / 2) >> (FRACBITS - 4)) & 15; + } +} + #endif diff --git a/src/r_draw_rgba.cpp b/src/r_draw_rgba.cpp index f317a34d60..d85d9994bb 100644 --- a/src/r_draw_rgba.cpp +++ b/src/r_draw_rgba.cpp @@ -58,6 +58,7 @@ extern float rw_lightstep; extern int wallshade; CVAR(Bool, r_multithreaded, true, 0) +CVAR(Bool, r_bilinear, false, 0) #ifndef NO_SSE @@ -1547,41 +1548,72 @@ public: uint32_t light = calc_light_multiplier(_light); ShadeConstants shade_constants = _shade_constants; - if (_xbits == 6 && _ybits == 6) + fixed_t xmagnitude = abs((fixed_t)xstep) >> (32 - _xbits - FRACBITS); + fixed_t ymagnitude = abs((fixed_t)ystep) >> (32 - _ybits - FRACBITS); + fixed_t magnitude = xmagnitude + ymagnitude; + + bool magnifying = !r_bilinear || magnitude >> (FRACBITS - 1) == 0; + if (magnifying) { - // 64x64 is the most common case by far, so special case it. - - do + if (_xbits == 6 && _ybits == 6) { - // Current texture index in u,v. - spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6)); + // 64x64 is the most common case by far, so special case it. - // Lookup pixel from flat texture tile - *dest++ = shade_bgra(source[spot], light, shade_constants); + do + { + // Current texture index in u,v. + spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6)); - // Next step in u,v. - xfrac += xstep; - yfrac += ystep; - } while (--count); + // Lookup pixel from flat texture tile + *dest++ = shade_bgra(source[spot], light, shade_constants); + + // Next step in u,v. + xfrac += xstep; + yfrac += ystep; + } while (--count); + } + else + { + BYTE yshift = 32 - _ybits; + BYTE xshift = yshift - _xbits; + int xmask = ((1 << _xbits) - 1) << _ybits; + + do + { + // Current texture index in u,v. + spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + + // Lookup pixel from flat texture tile + *dest++ = shade_bgra(source[spot], light, shade_constants); + + // Next step in u,v. + xfrac += xstep; + yfrac += ystep; + } while (--count); + } } else { - BYTE yshift = 32 - _ybits; - BYTE xshift = yshift - _xbits; - int xmask = ((1 << _xbits) - 1) << _ybits; - - do + if (_xbits == 6 && _ybits == 6) { - // Current texture index in u,v. - spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + // 64x64 is the most common case by far, so special case it. - // Lookup pixel from flat texture tile - *dest++ = shade_bgra(source[spot], light, shade_constants); - - // Next step in u,v. - xfrac += xstep; - yfrac += ystep; - } while (--count); + do + { + *dest++ = shade_bgra(sample_bilinear(source, xfrac, yfrac, 26, 26), light, shade_constants); + xfrac += xstep; + yfrac += ystep; + } while (--count); + } + else + { + do + { + *dest++ = shade_bgra(sample_bilinear(source, xfrac, yfrac, 32 - _xbits, 32 - _ybits), light, shade_constants); + xfrac += xstep; + yfrac += ystep; + } while (--count); + } } } }; @@ -2253,6 +2285,8 @@ class Vlinec1RGBACommand : public DrawerCommand DWORD _texturefrac; int _count; const BYTE * RESTRICT _source; + const BYTE * RESTRICT _source2; + uint32_t _texturefracx; BYTE * RESTRICT _dest; int vlinebits; int _pitch; @@ -2266,6 +2300,8 @@ public: _texturefrac = dc_texturefrac; _count = dc_count; _source = dc_source; + _source2 = dc_source2; + _texturefracx = dc_texturefracx; _dest = dc_dest; vlinebits = ::vlinebits; _pitch = dc_pitch; @@ -2282,6 +2318,8 @@ public: DWORD fracstep = _iscale * thread->num_cores; DWORD frac = _texturefrac + _iscale * thread->skipped_by_thread(_dest_y); const uint32 *source = (const uint32 *)_source; + const uint32 *source2 = (const uint32 *)_source2; + uint32_t texturefracx = _texturefracx; uint32_t *dest = thread->dest_for_thread(_dest_y, _pitch, (uint32_t*)_dest); int bits = vlinebits; int pitch = _pitch * thread->num_cores; @@ -2289,12 +2327,24 @@ public: uint32_t light = calc_light_multiplier(_light); ShadeConstants shade_constants = _shade_constants; - do + if (_source2 == nullptr) { - *dest = shade_bgra(source[frac >> bits], light, shade_constants); - frac += fracstep; - dest += pitch; - } while (--count); + do + { + *dest = shade_bgra(source[frac >> bits], light, shade_constants); + frac += fracstep; + dest += pitch; + } while (--count); + } + else + { + do + { + *dest = shade_bgra(sample_bilinear(source, source2, texturefracx, frac, bits), light, shade_constants); + frac += fracstep; + dest += pitch; + } while (--count); + } } }; @@ -2308,7 +2358,9 @@ class Vlinec4RGBACommand : public DrawerCommand fixed_t palookuplight[4]; DWORD vplce[4]; DWORD vince[4]; - const uint32 * RESTRICT bufplce[4]; + const uint32_t * RESTRICT bufplce[4]; + const uint32_t * RESTRICT bufplce2[4]; + uint32_t buftexturefracx[4]; public: Vlinec4RGBACommand() @@ -2323,7 +2375,9 @@ public: palookuplight[i] = ::palookuplight[i]; vplce[i] = ::vplce[i]; vince[i] = ::vince[i]; - bufplce[i] = (const uint32 *)::bufplce[i]; + bufplce[i] = (const uint32_t *)::bufplce[i]; + bufplce2[i] = (const uint32_t *)::bufplce2[i]; + buftexturefracx[i] = ::buftexturefracx[i]; } } @@ -2354,14 +2408,28 @@ public: local_vince[i] *= thread->num_cores; } - do + if (bufplce2[0] == nullptr) { - dest[0] = shade_bgra(bufplce[0][(place = local_vplce[0]) >> bits], light0, shade_constants); local_vplce[0] = place + local_vince[0]; - dest[1] = shade_bgra(bufplce[1][(place = local_vplce[1]) >> bits], light1, shade_constants); local_vplce[1] = place + local_vince[1]; - dest[2] = shade_bgra(bufplce[2][(place = local_vplce[2]) >> bits], light2, shade_constants); local_vplce[2] = place + local_vince[2]; - dest[3] = shade_bgra(bufplce[3][(place = local_vplce[3]) >> bits], light3, shade_constants); local_vplce[3] = place + local_vince[3]; - dest += pitch; - } while (--count); + do + { + dest[0] = shade_bgra(bufplce[0][(place = local_vplce[0]) >> bits], light0, shade_constants); local_vplce[0] = place + local_vince[0]; + dest[1] = shade_bgra(bufplce[1][(place = local_vplce[1]) >> bits], light1, shade_constants); local_vplce[1] = place + local_vince[1]; + dest[2] = shade_bgra(bufplce[2][(place = local_vplce[2]) >> bits], light2, shade_constants); local_vplce[2] = place + local_vince[2]; + dest[3] = shade_bgra(bufplce[3][(place = local_vplce[3]) >> bits], light3, shade_constants); local_vplce[3] = place + local_vince[3]; + dest += pitch; + } while (--count); + } + else + { + do + { + dest[0] = shade_bgra(sample_bilinear(bufplce[0], bufplce2[0], buftexturefracx[0], place = local_vplce[0], bits), light0, shade_constants); local_vplce[0] = place + local_vince[0]; + dest[1] = shade_bgra(sample_bilinear(bufplce[1], bufplce2[1], buftexturefracx[1], place = local_vplce[1], bits), light1, shade_constants); local_vplce[1] = place + local_vince[1]; + dest[2] = shade_bgra(sample_bilinear(bufplce[2], bufplce2[2], buftexturefracx[2], place = local_vplce[2], bits), light2, shade_constants); local_vplce[2] = place + local_vince[2]; + dest[3] = shade_bgra(sample_bilinear(bufplce[3], bufplce2[3], buftexturefracx[3], place = local_vplce[3], bits), light3, shade_constants); local_vplce[3] = place + local_vince[3]; + dest += pitch; + } while (--count); + } } }; @@ -3651,7 +3719,10 @@ void R_DrawSpan_rgba() #ifdef NO_SSE DrawerCommandQueue::QueueCommand(); #else - DrawerCommandQueue::QueueCommand(); + if (!r_bilinear) + DrawerCommandQueue::QueueCommand(); + else + DrawerCommandQueue::QueueCommand(); #endif } @@ -3705,7 +3776,10 @@ void vlinec4_rgba() #ifdef NO_SSE DrawerCommandQueue::QueueCommand(); #else - DrawerCommandQueue::QueueCommand(); + if (!r_bilinear) + DrawerCommandQueue::QueueCommand(); + else + DrawerCommandQueue::QueueCommand(); #endif for (int i = 0; i < 4; i++) vplce[i] += vince[i] * dc_count; diff --git a/src/r_draw_rgba.h b/src/r_draw_rgba.h index 2527e84a61..a266ce878a 100644 --- a/src/r_draw_rgba.h +++ b/src/r_draw_rgba.h @@ -426,6 +426,58 @@ FORCEINLINE uint32_t alpha_blend(uint32_t fg, uint32_t bg) return 0xff000000 | (red << 16) | (green << 8) | blue; } +FORCEINLINE uint32_t sample_bilinear(const uint32_t *col0, const uint32_t *col1, uint32_t texturefracx, uint32_t texturefracy, int ybits) +{ + uint32_t half = 1 << (ybits - 1); + uint32_t y = (texturefracy - half) >> ybits; + + uint32_t p00 = col0[y]; + uint32_t p01 = col0[y + 1]; + uint32_t p10 = col1[y]; + uint32_t p11 = col1[y + 1]; + + uint32_t inv_b = texturefracx; + uint32_t inv_a = ((texturefracy + half) >> (ybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t red = (RPART(p00) * a * b + RPART(p01) * inv_a * b + RPART(p10) * a * inv_b + RPART(p11) * inv_a * inv_b + 127) >> 8; + uint32_t green = (GPART(p00) * a * b + GPART(p01) * inv_a * b + GPART(p10) * a * inv_b + GPART(p11) * inv_a * inv_b + 127) >> 8; + uint32_t blue = (BPART(p00) * a * b + BPART(p01) * inv_a * b + BPART(p10) * a * inv_b + BPART(p11) * inv_a * inv_b + 127) >> 8; + uint32_t alpha = (APART(p00) * a * b + APART(p01) * inv_a * b + APART(p10) * a * inv_b + APART(p11) * inv_a * inv_b + 127) >> 8; + + return (alpha << 24) | (red << 16) | (green << 8) | blue; +} + +FORCEINLINE uint32_t sample_bilinear(const uint32_t *texture, dsfixed_t xfrac, dsfixed_t yfrac, int xbits, int ybits) +{ + int xshift = (32 - xbits); + int yshift = (32 - ybits); + int xmask = (1 << xshift) - 1; + int ymask = (1 << yshift) - 1; + uint32_t xhalf = 1 << (xbits - 1); + uint32_t yhalf = 1 << (ybits - 1); + uint32_t x = (xfrac - xhalf) >> xbits; + uint32_t y = (yfrac - yhalf) >> ybits; + + uint32_t p00 = texture[(y & ymask) + ((x & xmask) << yshift)]; + uint32_t p01 = texture[(y + 1 & ymask) + ((x & xmask) << yshift)]; + uint32_t p10 = texture[(y & ymask) + (((x + 1) & xmask) << yshift)]; + uint32_t p11 = texture[(y + 1 & ymask) + (((x + 1) & xmask) << yshift)]; + + uint32_t inv_b = ((xfrac + xhalf) >> (xbits - 4)) & 15; + uint32_t inv_a = ((yfrac + yhalf) >> (ybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t red = (RPART(p00) * a * b + RPART(p01) * inv_a * b + RPART(p10) * a * inv_b + RPART(p11) * inv_a * inv_b + 127) >> 8; + uint32_t green = (GPART(p00) * a * b + GPART(p01) * inv_a * b + GPART(p10) * a * inv_b + GPART(p11) * inv_a * inv_b + 127) >> 8; + uint32_t blue = (BPART(p00) * a * b + BPART(p01) * inv_a * b + BPART(p10) * a * inv_b + BPART(p11) * inv_a * inv_b + 127) >> 8; + uint32_t alpha = (APART(p00) * a * b + APART(p01) * inv_a * b + APART(p10) * a * inv_b + APART(p11) * inv_a * inv_b + 127) >> 8; + + return (alpha << 24) | (red << 16) | (green << 8) | blue; +} + // Calculate constants for a simple shade with gamma correction #define AVX_LINEAR_SHADE_SIMPLE_INIT(light) \ __m256 mlight_hi = _mm256_set_ps(1.0f, light * (1.0f/256.0f), light * (1.0f/256.0f), light * (1.0f/256.0f), 1.0f, light * (1.0f/256.0f), light * (1.0f/256.0f), light * (1.0f/256.0f)); \ diff --git a/src/r_segs.cpp b/src/r_segs.cpp index ad242b2f91..d71487bb9d 100644 --- a/src/r_segs.cpp +++ b/src/r_segs.cpp @@ -58,6 +58,8 @@ CVAR(Bool, r_np2, true, 0) +EXTERN_CVAR(Bool, r_bilinear) + //CVAR (Int, ty, 8, 0) //CVAR (Int, tx, 8, 0) @@ -1066,14 +1068,16 @@ void R_RenderFakeWallRange (drawseg_t *ds, int x1, int x2) } // Draw a column with support for non-power-of-two ranges -uint32_t wallscan_drawcol1(int x, int y1, int y2, uint32_t uv_start, uint32_t uv_step, uint32_t uv_max, const BYTE *source, DWORD(*draw1column)()) +uint32_t wallscan_drawcol1(int x, int y1, int y2, uint32_t uv_start, uint32_t uv_step, uint32_t uv_max, const SamplerSetup &sampler, DWORD(*draw1column)()) { int pixelsize = r_swtruecolor ? 4 : 1; if (uv_max == 0) // power of two { int count = y2 - y1; - dc_source = source; + dc_source = sampler.source; + dc_source2 = sampler.source2; + dc_texturefracx = sampler.texturefracx; dc_dest = (ylookup[y1] + x) * pixelsize + dc_destorg; dc_count = count; dc_iscale = uv_step; @@ -1097,7 +1101,9 @@ uint32_t wallscan_drawcol1(int x, int y1, int y2, uint32_t uv_start, uint32_t uv next_uv_wrap++; uint32_t count = MIN(left, next_uv_wrap); - dc_source = source; + dc_source = sampler.source; + dc_source2 = sampler.source2; + dc_texturefracx = sampler.texturefracx; dc_dest = (ylookup[y1] + x) * pixelsize + dc_destorg; dc_count = count; dc_iscale = uv_step; @@ -1115,7 +1121,7 @@ uint32_t wallscan_drawcol1(int x, int y1, int y2, uint32_t uv_start, uint32_t uv } // Draw four columns with support for non-power-of-two ranges -void wallscan_drawcol4(int x, int y1, int y2, uint32_t *uv_pos, uint32_t *uv_step, uint32_t uv_max, const BYTE **source, void(*draw4columns)()) +void wallscan_drawcol4(int x, int y1, int y2, uint32_t *uv_pos, uint32_t *uv_step, uint32_t uv_max, const SamplerSetup *sampler, void(*draw4columns)()) { int pixelsize = r_swtruecolor ? 4 : 1; if (uv_max == 0) // power of two, no wrap handling needed @@ -1123,7 +1129,9 @@ void wallscan_drawcol4(int x, int y1, int y2, uint32_t *uv_pos, uint32_t *uv_ste int count = y2 - y1; for (int i = 0; i < 4; i++) { - bufplce[i] = source[i]; + bufplce[i] = sampler[i].source; + bufplce2[i] = sampler[i].source2; + buftexturefracx[i] = sampler[i].texturefracx; vplce[i] = uv_pos[i]; vince[i] = uv_step[i]; @@ -1139,7 +1147,11 @@ void wallscan_drawcol4(int x, int y1, int y2, uint32_t *uv_pos, uint32_t *uv_ste { dc_dest = (ylookup[y1] + x) * pixelsize + dc_destorg; for (int i = 0; i < 4; i++) - bufplce[i] = source[i]; + { + bufplce[i] = sampler[i].source; + bufplce2[i] = sampler[i].source2; + buftexturefracx[i] = sampler[i].texturefracx; + } uint32_t left = y2 - y1; while (left > 0) @@ -1249,12 +1261,11 @@ void wallscan_any( if (!fixed) R_SetColorMapLight(basecolormap, light, wallshade); - const BYTE *source = getcol(rw_pic, (lwal[x] + xoffset) >> FRACBITS); - uint32_t uv_start, uv_step; calc_uv_start_and_step(y1, swal[x], yrepeat, uv_height, fracbits, uv_start, uv_step); - wallscan_drawcol1(x, y1, y2, uv_start, uv_step, uv_max, source, draw1column); + SamplerSetup sampler(lwal[x] + xoffset, uv_step >> (fracbits - 1) == 0, rw_pic, getcol); + wallscan_drawcol1(x, y1, y2, uv_start, uv_step, uv_max, sampler, draw1column); } // The aligned columns @@ -1264,10 +1275,6 @@ void wallscan_any( int y1[4] = { uwal[x], uwal[x + 1], uwal[x + 2], uwal[x + 3] }; int y2[4] = { dwal[x], dwal[x + 1], dwal[x + 2], dwal[x + 3] }; - const BYTE *source[4]; - for (int i = 0; i < 4; i++) - source[i] = getcol(rw_pic, (lwal[x + i] + xoffset) >> FRACBITS); - float lights[4]; for (int i = 0; i < 4; i++) { @@ -1276,8 +1283,16 @@ void wallscan_any( } uint32_t uv_pos[4], uv_step[4]; + int magnifying = 0; for (int i = 0; i < 4; i++) + { calc_uv_start_and_step(y1[i], swal[x + i], yrepeat, uv_height, fracbits, uv_pos[i], uv_step[i]); + magnifying |= uv_step[i] >> (fracbits - 1); + } + + SamplerSetup sampler[4]; + for (int i = 0; i < 4; i++) + sampler[i] = SamplerSetup(lwal[x + i] + xoffset, magnifying == 0, rw_pic, getcol); // Figure out where we vertically can start and stop drawing 4 columns in one go int middle_y1 = y1[0]; @@ -1305,7 +1320,7 @@ void wallscan_any( if (!fixed) R_SetColorMapLight(basecolormap, lights[i], wallshade); - wallscan_drawcol1(x + i, y1[i], y2[i], uv_pos[i], uv_step[i], uv_max, source[i], draw1column); + wallscan_drawcol1(x + i, y1[i], y2[i], uv_pos[i], uv_step[i], uv_max, sampler[i], draw1column); } continue; } @@ -1317,7 +1332,7 @@ void wallscan_any( R_SetColorMapLight(basecolormap, lights[i], wallshade); if (y1[i] < middle_y1) - uv_pos[i] = wallscan_drawcol1(x + i, y1[i], middle_y1, uv_pos[i], uv_step[i], uv_max, source[i], draw1column); + uv_pos[i] = wallscan_drawcol1(x + i, y1[i], middle_y1, uv_pos[i], uv_step[i], uv_max, sampler[i], draw1column); } // Draw the area where all 4 columns are active @@ -1337,7 +1352,7 @@ void wallscan_any( } } } - wallscan_drawcol4(x, middle_y1, middle_y2, uv_pos, uv_step, uv_max, source, draw4columns); + wallscan_drawcol4(x, middle_y1, middle_y2, uv_pos, uv_step, uv_max, sampler, draw4columns); // Draw the last rows where not all 4 columns are active for (int i = 0; i < 4; i++) @@ -1346,7 +1361,7 @@ void wallscan_any( R_SetColorMapLight(basecolormap, lights[i], wallshade); if (middle_y2 < y2[i]) - uv_pos[i] = wallscan_drawcol1(x + i, middle_y2, y2[i], uv_pos[i], uv_step[i], uv_max, source[i], draw1column); + uv_pos[i] = wallscan_drawcol1(x + i, middle_y2, y2[i], uv_pos[i], uv_step[i], uv_max, sampler[i], draw1column); } } @@ -1361,12 +1376,11 @@ void wallscan_any( if (!fixed) R_SetColorMapLight(basecolormap, light, wallshade); - const BYTE *source = getcol(rw_pic, (lwal[x] + xoffset) >> FRACBITS); - uint32_t uv_start, uv_step; calc_uv_start_and_step(y1, swal[x], yrepeat, uv_height, fracbits, uv_start, uv_step); - wallscan_drawcol1(x, y1, y2, uv_start, uv_step, uv_max, source, draw1column); + SamplerSetup sampler(lwal[x] + xoffset, uv_step >> (fracbits - 1) == 0, rw_pic, getcol); + wallscan_drawcol1(x, y1, y2, uv_start, uv_step, uv_max, sampler, draw1column); } NetUpdate ();