mirror of
https://github.com/ZDoom/gzdoom-gles.git
synced 2024-11-24 21:21:04 +00:00
- updated xBRZ scaler to 1.8
Removed all C++17 features: std::clamp(), attribute [[likely]], terse static asserts, ... https://sourceforge.net/projects/xbrz/files/xBRZ/xBRZ_1.8.zip/download
This commit is contained in:
parent
a7a899ca14
commit
ffe8aaa091
3 changed files with 257 additions and 168 deletions
|
@ -27,7 +27,7 @@ using namespace xbrz;
|
||||||
namespace
|
namespace
|
||||||
{
|
{
|
||||||
template <unsigned int M, unsigned int N> inline
|
template <unsigned int M, unsigned int N> inline
|
||||||
uint32_t gradientRGB(uint32_t pixFront, uint32_t pixBack) //blend front color with opacity M / N over opaque background: http://en.wikipedia.org/wiki/Alpha_compositing#Alpha_blending
|
uint32_t gradientRGB(uint32_t pixFront, uint32_t pixBack) //blend front color with opacity M / N over opaque background: https://en.wikipedia.org/wiki/Alpha_compositing#Alpha_blending
|
||||||
{
|
{
|
||||||
static_assert(0 < M && M < N && N <= 1000, "");
|
static_assert(0 < M && M < N && N <= 1000, "");
|
||||||
|
|
||||||
|
@ -153,7 +153,7 @@ double distRGB(uint32_t pix1, uint32_t pix2)
|
||||||
inline
|
inline
|
||||||
double distYCbCr(uint32_t pix1, uint32_t pix2, double lumaWeight)
|
double distYCbCr(uint32_t pix1, uint32_t pix2, double lumaWeight)
|
||||||
{
|
{
|
||||||
//http://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion
|
//https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion
|
||||||
//YCbCr conversion is a matrix multiplication => take advantage of linearity by subtracting first!
|
//YCbCr conversion is a matrix multiplication => take advantage of linearity by subtracting first!
|
||||||
const int r_diff = static_cast<int>(getRed (pix1)) - getRed (pix2); //we may delay division by 255 to after matrix multiplication
|
const int r_diff = static_cast<int>(getRed (pix1)) - getRed (pix2); //we may delay division by 255 to after matrix multiplication
|
||||||
const int g_diff = static_cast<int>(getGreen(pix1)) - getGreen(pix2); //
|
const int g_diff = static_cast<int>(getGreen(pix1)) - getGreen(pix2); //
|
||||||
|
@ -254,24 +254,32 @@ struct BlendResult
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
struct Kernel_3x3
|
||||||
|
{
|
||||||
|
uint32_t
|
||||||
|
a, b, c,
|
||||||
|
d, e, f,
|
||||||
|
g, h, i;
|
||||||
|
};
|
||||||
|
|
||||||
struct Kernel_4x4 //kernel for preprocessing step
|
struct Kernel_4x4 //kernel for preprocessing step
|
||||||
{
|
{
|
||||||
uint32_t
|
uint32_t
|
||||||
/**/a, b, c, d,
|
a, b, c, //
|
||||||
/**/e, f, g, h,
|
e, f, g, // support reinterpret_cast from Kernel_4x4 => Kernel_3x3
|
||||||
/**/i, j, k, l,
|
i, j, k, //
|
||||||
/**/m, n, o, p;
|
m, n, o,
|
||||||
|
d, h, l, p;
|
||||||
};
|
};
|
||||||
|
|
||||||
/*
|
/* input kernel area naming convention:
|
||||||
input kernel area naming convention:
|
|
||||||
-----------------
|
-----------------
|
||||||
| A | B | C | D |
|
| A | B | C | D |
|
||||||
----|---|---|---|
|
|---|---|---|---|
|
||||||
| E | F | G | H | //evaluate the four corners between F, G, J, K
|
| E | F | G | H | evaluate the four corners between F, G, J, K
|
||||||
----|---|---|---| //input pixel is at position F
|
|---|---|---|---| input pixel is at position F
|
||||||
| I | J | K | L |
|
| I | J | K | L |
|
||||||
----|---|---|---|
|
|---|---|---|---|
|
||||||
| M | N | O | P |
|
| M | N | O | P |
|
||||||
-----------------
|
-----------------
|
||||||
*/
|
*/
|
||||||
|
@ -318,14 +326,6 @@ BlendResult preProcessCorners(const Kernel_4x4& ker, const xbrz::ScalerCfg& cfg)
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
struct Kernel_3x3
|
|
||||||
{
|
|
||||||
uint32_t
|
|
||||||
/**/a, b, c,
|
|
||||||
/**/d, e, f,
|
|
||||||
/**/g, h, i;
|
|
||||||
};
|
|
||||||
|
|
||||||
#define DEF_GETTER(x) template <RotationDegree rotDeg> uint32_t inline get_##x(const Kernel_3x3& ker) { return ker.x; }
|
#define DEF_GETTER(x) template <RotationDegree rotDeg> uint32_t inline get_##x(const Kernel_3x3& ker) { return ker.x; }
|
||||||
//we cannot and NEED NOT write "ker.##x" since ## concatenates preprocessor tokens but "." is not a token
|
//we cannot and NEED NOT write "ker.##x" since ## concatenates preprocessor tokens but "." is not a token
|
||||||
DEF_GETTER(a) DEF_GETTER(b) DEF_GETTER(c)
|
DEF_GETTER(a) DEF_GETTER(b) DEF_GETTER(c)
|
||||||
|
@ -358,12 +358,16 @@ inline BlendType getTopR (unsigned char b) { return static_cast<BlendType>(0x3
|
||||||
inline BlendType getBottomR(unsigned char b) { return static_cast<BlendType>(0x3 & (b >> 4)); }
|
inline BlendType getBottomR(unsigned char b) { return static_cast<BlendType>(0x3 & (b >> 4)); }
|
||||||
inline BlendType getBottomL(unsigned char b) { return static_cast<BlendType>(0x3 & (b >> 6)); }
|
inline BlendType getBottomL(unsigned char b) { return static_cast<BlendType>(0x3 & (b >> 6)); }
|
||||||
|
|
||||||
inline void setTopL (unsigned char& b, BlendType bt) { b |= bt; } //buffer is assumed to be initialized before preprocessing!
|
inline void clearAddTopL(unsigned char& b, BlendType bt) { b = static_cast<unsigned char>(bt); }
|
||||||
inline void setTopR (unsigned char& b, BlendType bt) { b |= (bt << 2); }
|
inline void addTopR (unsigned char& b, BlendType bt) { b |= (bt << 2); } //buffer is assumed to be initialized before preprocessing!
|
||||||
inline void setBottomR(unsigned char& b, BlendType bt) { b |= (bt << 4); }
|
inline void addBottomR (unsigned char& b, BlendType bt) { b |= (bt << 4); } //e.g. via clearAddTopL()
|
||||||
inline void setBottomL(unsigned char& b, BlendType bt) { b |= (bt << 6); }
|
inline void addBottomL (unsigned char& b, BlendType bt) { b |= (bt << 6); } //
|
||||||
|
|
||||||
inline bool blendingNeeded(unsigned char b) { return b != 0; }
|
inline bool blendingNeeded(unsigned char b)
|
||||||
|
{
|
||||||
|
static_assert(BLEND_NONE == 0, "");
|
||||||
|
return b != 0;
|
||||||
|
}
|
||||||
|
|
||||||
template <RotationDegree rotDeg> inline
|
template <RotationDegree rotDeg> inline
|
||||||
unsigned char rotateBlendInfo(unsigned char b) { return b; }
|
unsigned char rotateBlendInfo(unsigned char b) { return b; }
|
||||||
|
@ -372,13 +376,12 @@ template <> inline unsigned char rotateBlendInfo<ROT_180>(unsigned char b) { ret
|
||||||
template <> inline unsigned char rotateBlendInfo<ROT_270>(unsigned char b) { return ((b << 6) | (b >> 2)) & 0xff; }
|
template <> inline unsigned char rotateBlendInfo<ROT_270>(unsigned char b) { return ((b << 6) | (b >> 2)) & 0xff; }
|
||||||
|
|
||||||
|
|
||||||
/*
|
/* input kernel area naming convention:
|
||||||
input kernel area naming convention:
|
|
||||||
-------------
|
-------------
|
||||||
| A | B | C |
|
| A | B | C |
|
||||||
----|---|---|
|
|---|---|---|
|
||||||
| D | E | F | //input pixel is at position E
|
| D | E | F | input pixel is at position E
|
||||||
----|---|---|
|
|---|---|---|
|
||||||
| G | H | I |
|
| G | H | I |
|
||||||
-------------
|
-------------
|
||||||
*/
|
*/
|
||||||
|
@ -472,7 +475,80 @@ void blendPixel(const Kernel_3x3& ker,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template <class Scaler, class ColorDistance> //scaler policy: see "Scaler2x" reference implementation
|
class OobReaderTransparent
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
OobReaderTransparent(const uint32_t* src, int srcWidth, int srcHeight, int y) :
|
||||||
|
s_m1(0 <= y - 1 && y - 1 < srcHeight ? src + srcWidth * (y - 1) : nullptr),
|
||||||
|
s_0 (0 <= y && y < srcHeight ? src + srcWidth * y : nullptr),
|
||||||
|
s_p1(0 <= y + 1 && y + 1 < srcHeight ? src + srcWidth * (y + 1) : nullptr),
|
||||||
|
s_p2(0 <= y + 2 && y + 2 < srcHeight ? src + srcWidth * (y + 2) : nullptr),
|
||||||
|
srcWidth_(srcWidth) {}
|
||||||
|
|
||||||
|
void readDhlp(Kernel_4x4& ker, int x) const //(x, y) is at kernel position F
|
||||||
|
{
|
||||||
|
const int x_p2 = x + 2;
|
||||||
|
|
||||||
|
if (0 <= x_p2 && x_p2 < srcWidth_)
|
||||||
|
{
|
||||||
|
ker.d = s_m1 ? s_m1[x_p2] : 0;
|
||||||
|
ker.h = s_0 ? s_0 [x_p2] : 0;
|
||||||
|
ker.l = s_p1 ? s_p1[x_p2] : 0;
|
||||||
|
ker.p = s_p2 ? s_p2[x_p2] : 0;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
ker.d = 0;
|
||||||
|
ker.h = 0;
|
||||||
|
ker.l = 0;
|
||||||
|
ker.p = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
const uint32_t* const s_m1;
|
||||||
|
const uint32_t* const s_0;
|
||||||
|
const uint32_t* const s_p1;
|
||||||
|
const uint32_t* const s_p2;
|
||||||
|
const int srcWidth_;
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
constexpr inline T xbrz_clamp(const T in, const T min, const T max)
|
||||||
|
{
|
||||||
|
return in <= min ? min : in >= max ? max : in;
|
||||||
|
}
|
||||||
|
|
||||||
|
class OobReaderDuplicate
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
OobReaderDuplicate(const uint32_t* src, int srcWidth, int srcHeight, int y) :
|
||||||
|
s_m1(src + srcWidth * xbrz_clamp(y - 1, 0, srcHeight - 1)),
|
||||||
|
s_0 (src + srcWidth * xbrz_clamp(y, 0, srcHeight - 1)),
|
||||||
|
s_p1(src + srcWidth * xbrz_clamp(y + 1, 0, srcHeight - 1)),
|
||||||
|
s_p2(src + srcWidth * xbrz_clamp(y + 2, 0, srcHeight - 1)),
|
||||||
|
srcWidth_(srcWidth) {}
|
||||||
|
|
||||||
|
void readDhlp(Kernel_4x4& ker, int x) const //(x, y) is at kernel position F
|
||||||
|
{
|
||||||
|
const int x_p2 = xbrz_clamp(x + 2, 0, srcWidth_ - 1);
|
||||||
|
ker.d = s_m1[x_p2];
|
||||||
|
ker.h = s_0 [x_p2];
|
||||||
|
ker.l = s_p1[x_p2];
|
||||||
|
ker.p = s_p2[x_p2];
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
const uint32_t* const s_m1;
|
||||||
|
const uint32_t* const s_0;
|
||||||
|
const uint32_t* const s_p1;
|
||||||
|
const uint32_t* const s_p2;
|
||||||
|
const int srcWidth_;
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
template <class Scaler, class ColorDistance, class OobReader> //scaler policy: see "Scaler2x" reference implementation
|
||||||
void scaleImage(const uint32_t* src, uint32_t* trg, int srcWidth, int srcHeight, const xbrz::ScalerCfg& cfg, int yFirst, int yLast)
|
void scaleImage(const uint32_t* src, uint32_t* trg, int srcWidth, int srcHeight, const xbrz::ScalerCfg& cfg, int yFirst, int yLast)
|
||||||
{
|
{
|
||||||
yFirst = std::max(yFirst, 0);
|
yFirst = std::max(yFirst, 0);
|
||||||
|
@ -482,64 +558,72 @@ void scaleImage(const uint32_t* src, uint32_t* trg, int srcWidth, int srcHeight,
|
||||||
|
|
||||||
const int trgWidth = srcWidth * Scaler::scale;
|
const int trgWidth = srcWidth * Scaler::scale;
|
||||||
|
|
||||||
//"use" space at the end of the image as temporary buffer for "on the fly preprocessing": we even could use larger area of
|
//(ab)use space of "sizeof(uint32_t) * srcWidth * Scaler::scale" at the end of the image as temporary
|
||||||
//"sizeof(uint32_t) * srcWidth * (yLast - yFirst)" bytes without risk of accidental overwriting before accessing
|
//buffer for "on the fly preprocessing" without risk of accidental overwriting before accessing
|
||||||
const int bufferSize = srcWidth;
|
unsigned char* const preProcBuf = reinterpret_cast<unsigned char*>(trg + yLast * Scaler::scale * trgWidth) - srcWidth;
|
||||||
unsigned char* preProcBuffer = reinterpret_cast<unsigned char*>(trg + yLast * Scaler::scale * trgWidth) - bufferSize;
|
|
||||||
std::fill(preProcBuffer, preProcBuffer + bufferSize, '\0');
|
|
||||||
static_assert(BLEND_NONE == 0, "");
|
|
||||||
|
|
||||||
//initialize preprocessing buffer for first row of current stripe: detect upper left and right corner blending
|
//initialize preprocessing buffer for first row of current stripe: detect upper left and right corner blending
|
||||||
//this cannot be optimized for adjacent processing stripes; we must not allow for a memory race condition!
|
//this cannot be optimized for adjacent processing stripes; we must not allow for a memory race condition!
|
||||||
if (yFirst > 0)
|
|
||||||
{
|
{
|
||||||
const int y = yFirst - 1;
|
const OobReader oobReader(src, srcWidth, srcHeight, yFirst - 1);
|
||||||
|
|
||||||
const uint32_t* s_m1 = src + srcWidth * std::max(y - 1, 0);
|
//initialize at position x = -1
|
||||||
const uint32_t* s_0 = src + srcWidth * y; //center line
|
Kernel_4x4 ker4 = {};
|
||||||
const uint32_t* s_p1 = src + srcWidth * std::min(y + 1, srcHeight - 1);
|
oobReader.readDhlp(ker4, -4); //hack: read a, e, i, m at x = -1
|
||||||
const uint32_t* s_p2 = src + srcWidth * std::min(y + 2, srcHeight - 1);
|
ker4.a = ker4.d;
|
||||||
|
ker4.e = ker4.h;
|
||||||
|
ker4.i = ker4.l;
|
||||||
|
ker4.m = ker4.p;
|
||||||
|
|
||||||
|
oobReader.readDhlp(ker4, -3);
|
||||||
|
ker4.b = ker4.d;
|
||||||
|
ker4.f = ker4.h;
|
||||||
|
ker4.j = ker4.l;
|
||||||
|
ker4.n = ker4.p;
|
||||||
|
|
||||||
|
oobReader.readDhlp(ker4, -2);
|
||||||
|
ker4.c = ker4.d;
|
||||||
|
ker4.g = ker4.h;
|
||||||
|
ker4.k = ker4.l;
|
||||||
|
ker4.o = ker4.p;
|
||||||
|
|
||||||
|
oobReader.readDhlp(ker4, -1);
|
||||||
|
|
||||||
|
{
|
||||||
|
const BlendResult res = preProcessCorners<ColorDistance>(ker4, cfg);
|
||||||
|
clearAddTopL(preProcBuf[0], res.blend_k); //set 1st known corner for (0, yFirst)
|
||||||
|
}
|
||||||
|
|
||||||
for (int x = 0; x < srcWidth; ++x)
|
for (int x = 0; x < srcWidth; ++x)
|
||||||
{
|
{
|
||||||
const int x_m1 = std::max(x - 1, 0);
|
ker4.a = ker4.b; //shift previous kernel to the left
|
||||||
const int x_p1 = std::min(x + 1, srcWidth - 1);
|
ker4.e = ker4.f; // -----------------
|
||||||
const int x_p2 = std::min(x + 2, srcWidth - 1);
|
ker4.i = ker4.j; // | A | B | C | D |
|
||||||
|
ker4.m = ker4.n; // |---|---|---|---|
|
||||||
|
/**/ // | E | F | G | H | (x, yFirst - 1) is at position F
|
||||||
|
ker4.b = ker4.c; // |---|---|---|---|
|
||||||
|
ker4.f = ker4.g; // | I | J | K | L |
|
||||||
|
ker4.j = ker4.k; // |---|---|---|---|
|
||||||
|
ker4.n = ker4.o; // | M | N | O | P |
|
||||||
|
/**/ // -----------------
|
||||||
|
ker4.c = ker4.d;
|
||||||
|
ker4.g = ker4.h;
|
||||||
|
ker4.k = ker4.l;
|
||||||
|
ker4.o = ker4.p;
|
||||||
|
|
||||||
Kernel_4x4 ker = {}; //perf: initialization is negligible
|
oobReader.readDhlp(ker4, x);
|
||||||
ker.a = s_m1[x_m1]; //read sequentially from memory as far as possible
|
|
||||||
ker.b = s_m1[x];
|
|
||||||
ker.c = s_m1[x_p1];
|
|
||||||
ker.d = s_m1[x_p2];
|
|
||||||
|
|
||||||
ker.e = s_0[x_m1];
|
/* preprocessing blend result:
|
||||||
ker.f = s_0[x];
|
---------
|
||||||
ker.g = s_0[x_p1];
|
| F | G | evaluate corner between F, G, J, K
|
||||||
ker.h = s_0[x_p2];
|
|---+---| current input pixel is at position F
|
||||||
|
| J | K |
|
||||||
|
--------- */
|
||||||
|
const BlendResult res = preProcessCorners<ColorDistance>(ker4, cfg);
|
||||||
|
addTopR(preProcBuf[x], res.blend_j); //set 2nd known corner for (x, yFirst)
|
||||||
|
|
||||||
ker.i = s_p1[x_m1];
|
if (x + 1 < srcWidth)
|
||||||
ker.j = s_p1[x];
|
clearAddTopL(preProcBuf[x + 1], res.blend_k); //set 1st known corner for (x + 1, yFirst)
|
||||||
ker.k = s_p1[x_p1];
|
|
||||||
ker.l = s_p1[x_p2];
|
|
||||||
|
|
||||||
ker.m = s_p2[x_m1];
|
|
||||||
ker.n = s_p2[x];
|
|
||||||
ker.o = s_p2[x_p1];
|
|
||||||
ker.p = s_p2[x_p2];
|
|
||||||
|
|
||||||
const BlendResult res = preProcessCorners<ColorDistance>(ker, cfg);
|
|
||||||
/*
|
|
||||||
preprocessing blend result:
|
|
||||||
---------
|
|
||||||
| F | G | //evalute corner between F, G, J, K
|
|
||||||
----|---| //input pixel is at position F
|
|
||||||
| J | K |
|
|
||||||
---------
|
|
||||||
*/
|
|
||||||
setTopR(preProcBuffer[x], res.blend_j);
|
|
||||||
|
|
||||||
if (x + 1 < bufferSize)
|
|
||||||
setTopL(preProcBuffer[x + 1], res.blend_k);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
//------------------------------------------------------------------------------------
|
//------------------------------------------------------------------------------------
|
||||||
|
@ -548,91 +632,92 @@ void scaleImage(const uint32_t* src, uint32_t* trg, int srcWidth, int srcHeight,
|
||||||
{
|
{
|
||||||
uint32_t* out = trg + Scaler::scale * y * trgWidth; //consider MT "striped" access
|
uint32_t* out = trg + Scaler::scale * y * trgWidth; //consider MT "striped" access
|
||||||
|
|
||||||
const uint32_t* s_m1 = src + srcWidth * std::max(y - 1, 0);
|
const OobReader oobReader(src, srcWidth, srcHeight, y);
|
||||||
const uint32_t* s_0 = src + srcWidth * y; //center line
|
|
||||||
const uint32_t* s_p1 = src + srcWidth * std::min(y + 1, srcHeight - 1);
|
//initialize at position x = -1
|
||||||
const uint32_t* s_p2 = src + srcWidth * std::min(y + 2, srcHeight - 1);
|
Kernel_4x4 ker4 = {};
|
||||||
|
oobReader.readDhlp(ker4, -4); //hack: read a, e, i, m at x = -1
|
||||||
|
ker4.a = ker4.d;
|
||||||
|
ker4.e = ker4.h;
|
||||||
|
ker4.i = ker4.l;
|
||||||
|
ker4.m = ker4.p;
|
||||||
|
|
||||||
|
oobReader.readDhlp(ker4, -3);
|
||||||
|
ker4.b = ker4.d;
|
||||||
|
ker4.f = ker4.h;
|
||||||
|
ker4.j = ker4.l;
|
||||||
|
ker4.n = ker4.p;
|
||||||
|
|
||||||
|
oobReader.readDhlp(ker4, -2);
|
||||||
|
ker4.c = ker4.d;
|
||||||
|
ker4.g = ker4.h;
|
||||||
|
ker4.k = ker4.l;
|
||||||
|
ker4.o = ker4.p;
|
||||||
|
|
||||||
|
oobReader.readDhlp(ker4, -1);
|
||||||
|
|
||||||
unsigned char blend_xy1 = 0; //corner blending for current (x, y + 1) position
|
unsigned char blend_xy1 = 0; //corner blending for current (x, y + 1) position
|
||||||
|
{
|
||||||
|
const BlendResult res = preProcessCorners<ColorDistance>(ker4, cfg);
|
||||||
|
clearAddTopL(blend_xy1, res.blend_k); //set 1st known corner for (0, y + 1) and buffer for use on next column
|
||||||
|
|
||||||
|
addBottomL(preProcBuf[0], res.blend_g); //set 3rd known corner for (0, y)
|
||||||
|
}
|
||||||
|
|
||||||
for (int x = 0; x < srcWidth; ++x, out += Scaler::scale)
|
for (int x = 0; x < srcWidth; ++x, out += Scaler::scale)
|
||||||
{
|
{
|
||||||
#if defined _MSC_VER && !defined NDEBUG
|
#if defined _MSC_VER && !defined NDEBUG
|
||||||
breakIntoDebugger = debugPixelX == x && debugPixelY == y;
|
breakIntoDebugger = debugPixelX == x && debugPixelY == y;
|
||||||
#endif
|
#endif
|
||||||
//all those bounds checks have only insignificant impact on performance!
|
ker4.a = ker4.b; //shift previous kernel to the left
|
||||||
const int x_m1 = std::max(x - 1, 0); //perf: prefer array indexing to additional pointers!
|
ker4.e = ker4.f; // -----------------
|
||||||
const int x_p1 = std::min(x + 1, srcWidth - 1);
|
ker4.i = ker4.j; // | A | B | C | D |
|
||||||
const int x_p2 = std::min(x + 2, srcWidth - 1);
|
ker4.m = ker4.n; // |---|---|---|---|
|
||||||
|
/**/ // | E | F | G | H | (x, y) is at position F
|
||||||
|
ker4.b = ker4.c; // |---|---|---|---|
|
||||||
|
ker4.f = ker4.g; // | I | J | K | L |
|
||||||
|
ker4.j = ker4.k; // |---|---|---|---|
|
||||||
|
ker4.n = ker4.o; // | M | N | O | P |
|
||||||
|
/**/ // -----------------
|
||||||
|
ker4.c = ker4.d;
|
||||||
|
ker4.g = ker4.h;
|
||||||
|
ker4.k = ker4.l;
|
||||||
|
ker4.o = ker4.p;
|
||||||
|
|
||||||
Kernel_4x4 ker4 = {}; //perf: initialization is negligible
|
oobReader.readDhlp(ker4, x);
|
||||||
|
|
||||||
ker4.a = s_m1[x_m1]; //read sequentially from memory as far as possible
|
|
||||||
ker4.b = s_m1[x];
|
|
||||||
ker4.c = s_m1[x_p1];
|
|
||||||
ker4.d = s_m1[x_p2];
|
|
||||||
|
|
||||||
ker4.e = s_0[x_m1];
|
|
||||||
ker4.f = s_0[x];
|
|
||||||
ker4.g = s_0[x_p1];
|
|
||||||
ker4.h = s_0[x_p2];
|
|
||||||
|
|
||||||
ker4.i = s_p1[x_m1];
|
|
||||||
ker4.j = s_p1[x];
|
|
||||||
ker4.k = s_p1[x_p1];
|
|
||||||
ker4.l = s_p1[x_p2];
|
|
||||||
|
|
||||||
ker4.m = s_p2[x_m1];
|
|
||||||
ker4.n = s_p2[x];
|
|
||||||
ker4.o = s_p2[x_p1];
|
|
||||||
ker4.p = s_p2[x_p2];
|
|
||||||
|
|
||||||
//evaluate the four corners on bottom-right of current pixel
|
//evaluate the four corners on bottom-right of current pixel
|
||||||
unsigned char blend_xy = 0; //for current (x, y) position
|
unsigned char blend_xy = preProcBuf[x]; //for current (x, y) position
|
||||||
{
|
{
|
||||||
|
/* preprocessing blend result:
|
||||||
|
---------
|
||||||
|
| F | G | evaluate corner between F, G, J, K
|
||||||
|
|---+---| current input pixel is at position F
|
||||||
|
| J | K |
|
||||||
|
--------- */
|
||||||
const BlendResult res = preProcessCorners<ColorDistance>(ker4, cfg);
|
const BlendResult res = preProcessCorners<ColorDistance>(ker4, cfg);
|
||||||
/*
|
addBottomR(blend_xy, res.blend_f); //all four corners of (x, y) have been determined at this point due to processing sequence!
|
||||||
preprocessing blend result:
|
|
||||||
---------
|
|
||||||
| F | G | //evalute corner between F, G, J, K
|
|
||||||
----|---| //current input pixel is at position F
|
|
||||||
| J | K |
|
|
||||||
---------
|
|
||||||
*/
|
|
||||||
blend_xy = preProcBuffer[x];
|
|
||||||
setBottomR(blend_xy, res.blend_f); //all four corners of (x, y) have been determined at this point due to processing sequence!
|
|
||||||
|
|
||||||
setTopR(blend_xy1, res.blend_j); //set 2nd known corner for (x, y + 1)
|
addTopR(blend_xy1, res.blend_j); //set 2nd known corner for (x, y + 1)
|
||||||
preProcBuffer[x] = blend_xy1; //store on current buffer position for use on next row
|
preProcBuf[x] = blend_xy1; //store on current buffer position for use on next row
|
||||||
|
|
||||||
blend_xy1 = 0;
|
if (x + 1 < srcWidth)
|
||||||
setTopL(blend_xy1, res.blend_k); //set 1st known corner for (x + 1, y + 1) and buffer for use on next column
|
{
|
||||||
|
//blend_xy1 -> blend_x1y1
|
||||||
|
clearAddTopL(blend_xy1, res.blend_k); //set 1st known corner for (x + 1, y + 1) and buffer for use on next column
|
||||||
|
|
||||||
if (x + 1 < bufferSize) //set 3rd known corner for (x + 1, y)
|
addBottomL(preProcBuf[x + 1], res.blend_g); //set 3rd known corner for (x + 1, y)
|
||||||
setBottomL(preProcBuffer[x + 1], res.blend_g);
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//fill block of size scale * scale with the given color
|
//fill block of size scale * scale with the given color
|
||||||
fillBlock(out, trgWidth * sizeof(uint32_t), ker4.f, Scaler::scale, Scaler::scale);
|
fillBlock(out, trgWidth * sizeof(uint32_t), ker4.f, Scaler::scale, Scaler::scale);
|
||||||
//place *after* preprocessing step, to not overwrite the results while processing the the last pixel!
|
//place *after* preprocessing step, to not overwrite the results while processing the last pixel!
|
||||||
|
|
||||||
//blend four corners of current pixel
|
//blend all four corners of current pixel
|
||||||
if (blendingNeeded(blend_xy)) //good 5% perf-improvement
|
if (blendingNeeded(blend_xy))
|
||||||
{
|
{
|
||||||
Kernel_3x3 ker3 = {}; //perf: initialization is negligible
|
const auto& ker3 = reinterpret_cast<const Kernel_3x3&>(ker4); //"The Things We Do for Perf"
|
||||||
|
|
||||||
ker3.a = ker4.a;
|
|
||||||
ker3.b = ker4.b;
|
|
||||||
ker3.c = ker4.c;
|
|
||||||
|
|
||||||
ker3.d = ker4.e;
|
|
||||||
ker3.e = ker4.f;
|
|
||||||
ker3.f = ker4.g;
|
|
||||||
|
|
||||||
ker3.g = ker4.i;
|
|
||||||
ker3.h = ker4.j;
|
|
||||||
ker3.i = ker4.k;
|
|
||||||
|
|
||||||
blendPixel<Scaler, ColorDistance, ROT_0 >(ker3, out, trgWidth, blend_xy, cfg);
|
blendPixel<Scaler, ColorDistance, ROT_0 >(ker3, out, trgWidth, blend_xy, cfg);
|
||||||
blendPixel<Scaler, ColorDistance, ROT_90 >(ker3, out, trgWidth, blend_xy, cfg);
|
blendPixel<Scaler, ColorDistance, ROT_90 >(ker3, out, trgWidth, blend_xy, cfg);
|
||||||
blendPixel<Scaler, ColorDistance, ROT_180>(ker3, out, trgWidth, blend_xy, cfg);
|
blendPixel<Scaler, ColorDistance, ROT_180>(ker3, out, trgWidth, blend_xy, cfg);
|
||||||
|
@ -1095,15 +1180,15 @@ void xbrz::scale(size_t factor, const uint32_t* src, uint32_t* trg, int srcWidth
|
||||||
switch (factor)
|
switch (factor)
|
||||||
{
|
{
|
||||||
case 2:
|
case 2:
|
||||||
return scaleImage<Scaler2x<ColorGradientRGB>, ColorDistanceRGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
return scaleImage<Scaler2x<ColorGradientRGB>, ColorDistanceRGB, OobReaderDuplicate>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
||||||
case 3:
|
case 3:
|
||||||
return scaleImage<Scaler3x<ColorGradientRGB>, ColorDistanceRGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
return scaleImage<Scaler3x<ColorGradientRGB>, ColorDistanceRGB, OobReaderDuplicate>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
||||||
case 4:
|
case 4:
|
||||||
return scaleImage<Scaler4x<ColorGradientRGB>, ColorDistanceRGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
return scaleImage<Scaler4x<ColorGradientRGB>, ColorDistanceRGB, OobReaderDuplicate>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
||||||
case 5:
|
case 5:
|
||||||
return scaleImage<Scaler5x<ColorGradientRGB>, ColorDistanceRGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
return scaleImage<Scaler5x<ColorGradientRGB>, ColorDistanceRGB, OobReaderDuplicate>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
||||||
case 6:
|
case 6:
|
||||||
return scaleImage<Scaler6x<ColorGradientRGB>, ColorDistanceRGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
return scaleImage<Scaler6x<ColorGradientRGB>, ColorDistanceRGB, OobReaderDuplicate>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
@ -1111,15 +1196,15 @@ void xbrz::scale(size_t factor, const uint32_t* src, uint32_t* trg, int srcWidth
|
||||||
switch (factor)
|
switch (factor)
|
||||||
{
|
{
|
||||||
case 2:
|
case 2:
|
||||||
return scaleImage<Scaler2x<ColorGradientARGB>, ColorDistanceARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
return scaleImage<Scaler2x<ColorGradientARGB>, ColorDistanceARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
||||||
case 3:
|
case 3:
|
||||||
return scaleImage<Scaler3x<ColorGradientARGB>, ColorDistanceARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
return scaleImage<Scaler3x<ColorGradientARGB>, ColorDistanceARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
||||||
case 4:
|
case 4:
|
||||||
return scaleImage<Scaler4x<ColorGradientARGB>, ColorDistanceARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
return scaleImage<Scaler4x<ColorGradientARGB>, ColorDistanceARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
||||||
case 5:
|
case 5:
|
||||||
return scaleImage<Scaler5x<ColorGradientARGB>, ColorDistanceARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
return scaleImage<Scaler5x<ColorGradientARGB>, ColorDistanceARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
||||||
case 6:
|
case 6:
|
||||||
return scaleImage<Scaler6x<ColorGradientARGB>, ColorDistanceARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
return scaleImage<Scaler6x<ColorGradientARGB>, ColorDistanceARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
@ -1127,15 +1212,15 @@ void xbrz::scale(size_t factor, const uint32_t* src, uint32_t* trg, int srcWidth
|
||||||
switch (factor)
|
switch (factor)
|
||||||
{
|
{
|
||||||
case 2:
|
case 2:
|
||||||
return scaleImage<Scaler2x<ColorGradientARGB>, ColorDistanceUnbufferedARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
return scaleImage<Scaler2x<ColorGradientARGB>, ColorDistanceUnbufferedARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
||||||
case 3:
|
case 3:
|
||||||
return scaleImage<Scaler3x<ColorGradientARGB>, ColorDistanceUnbufferedARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
return scaleImage<Scaler3x<ColorGradientARGB>, ColorDistanceUnbufferedARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
||||||
case 4:
|
case 4:
|
||||||
return scaleImage<Scaler4x<ColorGradientARGB>, ColorDistanceUnbufferedARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
return scaleImage<Scaler4x<ColorGradientARGB>, ColorDistanceUnbufferedARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
||||||
case 5:
|
case 5:
|
||||||
return scaleImage<Scaler5x<ColorGradientARGB>, ColorDistanceUnbufferedARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
return scaleImage<Scaler5x<ColorGradientARGB>, ColorDistanceUnbufferedARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
||||||
case 6:
|
case 6:
|
||||||
return scaleImage<Scaler6x<ColorGradientARGB>, ColorDistanceUnbufferedARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
return scaleImage<Scaler6x<ColorGradientARGB>, ColorDistanceUnbufferedARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
|
@ -22,6 +22,7 @@
|
||||||
#include <limits>
|
#include <limits>
|
||||||
#include "xbrz_config.h"
|
#include "xbrz_config.h"
|
||||||
|
|
||||||
|
|
||||||
namespace xbrz
|
namespace xbrz
|
||||||
{
|
{
|
||||||
/*
|
/*
|
||||||
|
@ -50,7 +51,6 @@ const int SCALE_FACTOR_MAX = 6;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
-> map source (srcWidth * srcHeight) to target (scale * width x scale * height) image, optionally processing a half-open slice of rows [yFirst, yLast) only
|
-> map source (srcWidth * srcHeight) to target (scale * width x scale * height) image, optionally processing a half-open slice of rows [yFirst, yLast) only
|
||||||
-> support for source/target pitch in bytes!
|
|
||||||
-> if your emulator changes only a few image slices during each cycle (e.g. DOSBox) then there's no need to run xBRZ on the complete image:
|
-> if your emulator changes only a few image slices during each cycle (e.g. DOSBox) then there's no need to run xBRZ on the complete image:
|
||||||
Just make sure you enlarge the source image slice by 2 rows on top and 2 on bottom (this is the additional range the xBRZ algorithm is using during analysis)
|
Just make sure you enlarge the source image slice by 2 rows on top and 2 on bottom (this is the additional range the xBRZ algorithm is using during analysis)
|
||||||
CAVEAT: If there are multiple changed slices, make sure they do not overlap after adding these additional rows in order to avoid a memory race condition
|
CAVEAT: If there are multiple changed slices, make sure they do not overlap after adding these additional rows in order to avoid a memory race condition
|
||||||
|
|
|
@ -56,7 +56,7 @@ Pix* byteAdvance(Pix* ptr, int bytes)
|
||||||
|
|
||||||
//fill block with the given color
|
//fill block with the given color
|
||||||
template <class Pix> inline
|
template <class Pix> inline
|
||||||
void fillBlock(Pix* trg, int pitch, Pix col, int blockWidth, int blockHeight)
|
void fillBlock(Pix* trg, int pitch /*[bytes]*/, Pix col, int blockWidth, int blockHeight)
|
||||||
{
|
{
|
||||||
//for (int y = 0; y < blockHeight; ++y, trg = byteAdvance(trg, pitch))
|
//for (int y = 0; y < blockHeight; ++y, trg = byteAdvance(trg, pitch))
|
||||||
// std::fill(trg, trg + blockWidth, col);
|
// std::fill(trg, trg + blockWidth, col);
|
||||||
|
@ -69,8 +69,8 @@ void fillBlock(Pix* trg, int pitch, Pix col, int blockWidth, int blockHeight)
|
||||||
|
|
||||||
//nearest-neighbor (going over target image - slow for upscaling, since source is read multiple times missing out on cache! Fast for similar image sizes!)
|
//nearest-neighbor (going over target image - slow for upscaling, since source is read multiple times missing out on cache! Fast for similar image sizes!)
|
||||||
template <class PixSrc, class PixTrg, class PixConverter>
|
template <class PixSrc, class PixTrg, class PixConverter>
|
||||||
void nearestNeighborScale(const PixSrc* src, int srcWidth, int srcHeight, int srcPitch,
|
void nearestNeighborScale(const PixSrc* src, int srcWidth, int srcHeight, int srcPitch /*[bytes]*/,
|
||||||
/**/ PixTrg* trg, int trgWidth, int trgHeight, int trgPitch,
|
/**/ PixTrg* trg, int trgWidth, int trgHeight, int trgPitch /*[bytes]*/,
|
||||||
int yFirst, int yLast, PixConverter pixCvrt /*convert PixSrc to PixTrg*/)
|
int yFirst, int yLast, PixConverter pixCvrt /*convert PixSrc to PixTrg*/)
|
||||||
{
|
{
|
||||||
static_assert(std::is_integral<PixSrc>::value, "PixSrc* is expected to be cast-able to char*");
|
static_assert(std::is_integral<PixSrc>::value, "PixSrc* is expected to be cast-able to char*");
|
||||||
|
@ -106,8 +106,8 @@ void nearestNeighborScale(const PixSrc* src, int srcWidth, int srcHeight, int sr
|
||||||
|
|
||||||
//nearest-neighbor (going over source image - fast for upscaling, since source is read only once
|
//nearest-neighbor (going over source image - fast for upscaling, since source is read only once
|
||||||
template <class PixSrc, class PixTrg, class PixConverter>
|
template <class PixSrc, class PixTrg, class PixConverter>
|
||||||
void nearestNeighborScaleOverSource(const PixSrc* src, int srcWidth, int srcHeight, int srcPitch,
|
void nearestNeighborScaleOverSource(const PixSrc* src, int srcWidth, int srcHeight, int srcPitch /*[bytes]*/,
|
||||||
/**/ PixTrg* trg, int trgWidth, int trgHeight, int trgPitch,
|
/**/ PixTrg* trg, int trgWidth, int trgHeight, int trgPitch /*[bytes]*/,
|
||||||
int yFirst, int yLast, PixConverter pixCvrt /*convert PixSrc to PixTrg*/)
|
int yFirst, int yLast, PixConverter pixCvrt /*convert PixSrc to PixTrg*/)
|
||||||
{
|
{
|
||||||
static_assert(std::is_integral<PixSrc>::value, "PixSrc* is expected to be cast-able to char*");
|
static_assert(std::is_integral<PixSrc>::value, "PixSrc* is expected to be cast-able to char*");
|
||||||
|
@ -187,10 +187,10 @@ void bilinearScale(const uint32_t* src, int srcWidth, int srcHeight, int srcPitc
|
||||||
// -> pre-calculation gives significant boost; std::vector<> memory allocation is negligible!
|
// -> pre-calculation gives significant boost; std::vector<> memory allocation is negligible!
|
||||||
struct CoeffsX
|
struct CoeffsX
|
||||||
{
|
{
|
||||||
int x1;
|
int x1 = 0;
|
||||||
int x2;
|
int x2 = 0;
|
||||||
double xx1;
|
double xx1 = 0;
|
||||||
double x2x;
|
double x2x = 0;
|
||||||
};
|
};
|
||||||
std::vector<CoeffsX> buf(trgWidth);
|
std::vector<CoeffsX> buf(trgWidth);
|
||||||
for (int x = 0; x < trgWidth; ++x)
|
for (int x = 0; x < trgWidth; ++x)
|
||||||
|
@ -202,7 +202,11 @@ void bilinearScale(const uint32_t* src, int srcWidth, int srcHeight, int srcPitc
|
||||||
const double xx1 = x / scaleX - x1;
|
const double xx1 = x / scaleX - x1;
|
||||||
const double x2x = 1 - xx1;
|
const double x2x = 1 - xx1;
|
||||||
|
|
||||||
buf[x] = { x1, x2, xx1, x2x };
|
CoeffsX& bx = buf[x];
|
||||||
|
bx.x1 = x1;
|
||||||
|
bx.x2 = x2;
|
||||||
|
bx.xx1 = xx1;
|
||||||
|
bx.x2x = x2x;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int y = yFirst; y < yLast; ++y)
|
for (int y = yFirst; y < yLast; ++y)
|
||||||
|
@ -231,7 +235,7 @@ void bilinearScale(const uint32_t* src, int srcWidth, int srcHeight, int srcPitc
|
||||||
const double x2xyy1 = x2x * yy1;
|
const double x2xyy1 = x2x * yy1;
|
||||||
const double xx1yy1 = xx1 * yy1;
|
const double xx1yy1 = xx1 * yy1;
|
||||||
|
|
||||||
auto interpolate = [=](int offset)
|
auto interpolate = [=](int offset) -> double
|
||||||
{
|
{
|
||||||
/* https://en.wikipedia.org/wiki/Bilinear_interpolation
|
/* https://en.wikipedia.org/wiki/Bilinear_interpolation
|
||||||
(c11(x2 - x) + c21(x - x1)) * (y2 - y ) +
|
(c11(x2 - x) + c21(x - x1)) * (y2 - y ) +
|
||||||
|
|
Loading…
Reference in a new issue