mirror of
https://github.com/ZDoom/gzdoom-gles.git
synced 2024-11-24 21:21:04 +00:00
- updated xBRZ scaler to 1.8
Removed all C++17 features: std::clamp(), attribute [[likely]], terse static asserts, ... https://sourceforge.net/projects/xbrz/files/xBRZ/xBRZ_1.8.zip/download
This commit is contained in:
parent
a7a899ca14
commit
ffe8aaa091
3 changed files with 257 additions and 168 deletions
|
@ -27,7 +27,7 @@ using namespace xbrz;
|
|||
namespace
|
||||
{
|
||||
template <unsigned int M, unsigned int N> inline
|
||||
uint32_t gradientRGB(uint32_t pixFront, uint32_t pixBack) //blend front color with opacity M / N over opaque background: http://en.wikipedia.org/wiki/Alpha_compositing#Alpha_blending
|
||||
uint32_t gradientRGB(uint32_t pixFront, uint32_t pixBack) //blend front color with opacity M / N over opaque background: https://en.wikipedia.org/wiki/Alpha_compositing#Alpha_blending
|
||||
{
|
||||
static_assert(0 < M && M < N && N <= 1000, "");
|
||||
|
||||
|
@ -153,7 +153,7 @@ double distRGB(uint32_t pix1, uint32_t pix2)
|
|||
inline
|
||||
double distYCbCr(uint32_t pix1, uint32_t pix2, double lumaWeight)
|
||||
{
|
||||
//http://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion
|
||||
//https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion
|
||||
//YCbCr conversion is a matrix multiplication => take advantage of linearity by subtracting first!
|
||||
const int r_diff = static_cast<int>(getRed (pix1)) - getRed (pix2); //we may delay division by 255 to after matrix multiplication
|
||||
const int g_diff = static_cast<int>(getGreen(pix1)) - getGreen(pix2); //
|
||||
|
@ -254,24 +254,32 @@ struct BlendResult
|
|||
};
|
||||
|
||||
|
||||
//3x3 pixel neighborhood used by the blending step; the input pixel is at position e:
//
//    a b c
//    d e f
//    g h i
//
//Keep the members in exactly this order and all of type uint32_t: scaleImage() obtains its Kernel_3x3 via
//reinterpret_cast from a Kernel_4x4, which only works while these nine members line up with the first nine
//members of Kernel_4x4.
struct Kernel_3x3
{
    uint32_t
    a, b, c,
    d, e, f,
    g, h, i;
};
|
||||
|
||||
struct Kernel_4x4 //kernel for preprocessing step
|
||||
{
|
||||
uint32_t
|
||||
/**/a, b, c, d,
|
||||
/**/e, f, g, h,
|
||||
/**/i, j, k, l,
|
||||
/**/m, n, o, p;
|
||||
a, b, c, //
|
||||
e, f, g, // support reinterpret_cast from Kernel_4x4 => Kernel_3x3
|
||||
i, j, k, //
|
||||
m, n, o,
|
||||
d, h, l, p;
|
||||
};
|
||||
|
||||
/*
|
||||
input kernel area naming convention:
|
||||
/* input kernel area naming convention:
|
||||
-----------------
|
||||
| A | B | C | D |
|
||||
----|---|---|---|
|
||||
| E | F | G | H | //evaluate the four corners between F, G, J, K
|
||||
----|---|---|---| //input pixel is at position F
|
||||
|---|---|---|---|
|
||||
| E | F | G | H | evaluate the four corners between F, G, J, K
|
||||
|---|---|---|---| input pixel is at position F
|
||||
| I | J | K | L |
|
||||
----|---|---|---|
|
||||
|---|---|---|---|
|
||||
| M | N | O | P |
|
||||
-----------------
|
||||
*/
|
||||
|
@ -318,14 +326,6 @@ BlendResult preProcessCorners(const Kernel_4x4& ker, const xbrz::ScalerCfg& cfg)
|
|||
return result;
|
||||
}
|
||||
|
||||
struct Kernel_3x3
|
||||
{
|
||||
uint32_t
|
||||
/**/a, b, c,
|
||||
/**/d, e, f,
|
||||
/**/g, h, i;
|
||||
};
|
||||
|
||||
#define DEF_GETTER(x) template <RotationDegree rotDeg> uint32_t inline get_##x(const Kernel_3x3& ker) { return ker.x; }
|
||||
//we cannot and NEED NOT write "ker.##x" since ## concatenates preprocessor tokens but "." is not a token
|
||||
DEF_GETTER(a) DEF_GETTER(b) DEF_GETTER(c)
|
||||
|
@ -358,12 +358,16 @@ inline BlendType getTopR (unsigned char b) { return static_cast<BlendType>(0x3
|
|||
inline BlendType getBottomR(unsigned char b) { return static_cast<BlendType>(0x3 & (b >> 4)); }
|
||||
inline BlendType getBottomL(unsigned char b) { return static_cast<BlendType>(0x3 & (b >> 6)); }
|
||||
|
||||
inline void setTopL (unsigned char& b, BlendType bt) { b |= bt; } //buffer is assumed to be initialized before preprocessing!
|
||||
inline void setTopR (unsigned char& b, BlendType bt) { b |= (bt << 2); }
|
||||
inline void setBottomR(unsigned char& b, BlendType bt) { b |= (bt << 4); }
|
||||
inline void setBottomL(unsigned char& b, BlendType bt) { b |= (bt << 6); }
|
||||
inline void clearAddTopL(unsigned char& b, BlendType bt) { b = static_cast<unsigned char>(bt); }
|
||||
inline void addTopR (unsigned char& b, BlendType bt) { b |= (bt << 2); } //buffer is assumed to be initialized before preprocessing!
|
||||
inline void addBottomR (unsigned char& b, BlendType bt) { b |= (bt << 4); } //e.g. via clearAddTopL()
|
||||
inline void addBottomL (unsigned char& b, BlendType bt) { b |= (bt << 6); } //
|
||||
|
||||
inline bool blendingNeeded(unsigned char b) { return b != 0; }
|
||||
inline bool blendingNeeded(unsigned char b)
|
||||
{
|
||||
static_assert(BLEND_NONE == 0, "");
|
||||
return b != 0;
|
||||
}
|
||||
|
||||
template <RotationDegree rotDeg> inline
|
||||
unsigned char rotateBlendInfo(unsigned char b) { return b; }
|
||||
|
@ -372,13 +376,12 @@ template <> inline unsigned char rotateBlendInfo<ROT_180>(unsigned char b) { ret
|
|||
template <> inline unsigned char rotateBlendInfo<ROT_270>(unsigned char b) { return ((b << 6) | (b >> 2)) & 0xff; }
|
||||
|
||||
|
||||
/*
|
||||
input kernel area naming convention:
|
||||
/* input kernel area naming convention:
|
||||
-------------
|
||||
| A | B | C |
|
||||
----|---|---|
|
||||
| D | E | F | //input pixel is at position E
|
||||
----|---|---|
|
||||
|---|---|---|
|
||||
| D | E | F | input pixel is at position E
|
||||
|---|---|---|
|
||||
| G | H | I |
|
||||
-------------
|
||||
*/
|
||||
|
@ -472,7 +475,80 @@ void blendPixel(const Kernel_3x3& ker,
|
|||
}
|
||||
|
||||
|
||||
template <class Scaler, class ColorDistance> //scaler policy: see "Scaler2x" reference implementation
|
||||
class OobReaderTransparent
|
||||
{
|
||||
public:
|
||||
OobReaderTransparent(const uint32_t* src, int srcWidth, int srcHeight, int y) :
|
||||
s_m1(0 <= y - 1 && y - 1 < srcHeight ? src + srcWidth * (y - 1) : nullptr),
|
||||
s_0 (0 <= y && y < srcHeight ? src + srcWidth * y : nullptr),
|
||||
s_p1(0 <= y + 1 && y + 1 < srcHeight ? src + srcWidth * (y + 1) : nullptr),
|
||||
s_p2(0 <= y + 2 && y + 2 < srcHeight ? src + srcWidth * (y + 2) : nullptr),
|
||||
srcWidth_(srcWidth) {}
|
||||
|
||||
void readDhlp(Kernel_4x4& ker, int x) const //(x, y) is at kernel position F
|
||||
{
|
||||
const int x_p2 = x + 2;
|
||||
|
||||
if (0 <= x_p2 && x_p2 < srcWidth_)
|
||||
{
|
||||
ker.d = s_m1 ? s_m1[x_p2] : 0;
|
||||
ker.h = s_0 ? s_0 [x_p2] : 0;
|
||||
ker.l = s_p1 ? s_p1[x_p2] : 0;
|
||||
ker.p = s_p2 ? s_p2[x_p2] : 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
ker.d = 0;
|
||||
ker.h = 0;
|
||||
ker.l = 0;
|
||||
ker.p = 0;
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
const uint32_t* const s_m1;
|
||||
const uint32_t* const s_0;
|
||||
const uint32_t* const s_p1;
|
||||
const uint32_t* const s_p2;
|
||||
const int srcWidth_;
|
||||
};
|
||||
|
||||
|
||||
//Restricts "in" to the closed interval [min, max]: returns min if in <= min, max if in >= max, otherwise in.
//Stands in for std::clamp(), which is C++17 and therefore not used in this code base.
//The lower bound is tested first, so for an inverted interval (min > max) any in <= min yields min.
template <typename T>
constexpr inline T xbrz_clamp(const T in, const T min, const T max)
{
    return in <= min ? min : in >= max ? max : in;
}
|
||||
|
||||
class OobReaderDuplicate
|
||||
{
|
||||
public:
|
||||
OobReaderDuplicate(const uint32_t* src, int srcWidth, int srcHeight, int y) :
|
||||
s_m1(src + srcWidth * xbrz_clamp(y - 1, 0, srcHeight - 1)),
|
||||
s_0 (src + srcWidth * xbrz_clamp(y, 0, srcHeight - 1)),
|
||||
s_p1(src + srcWidth * xbrz_clamp(y + 1, 0, srcHeight - 1)),
|
||||
s_p2(src + srcWidth * xbrz_clamp(y + 2, 0, srcHeight - 1)),
|
||||
srcWidth_(srcWidth) {}
|
||||
|
||||
void readDhlp(Kernel_4x4& ker, int x) const //(x, y) is at kernel position F
|
||||
{
|
||||
const int x_p2 = xbrz_clamp(x + 2, 0, srcWidth_ - 1);
|
||||
ker.d = s_m1[x_p2];
|
||||
ker.h = s_0 [x_p2];
|
||||
ker.l = s_p1[x_p2];
|
||||
ker.p = s_p2[x_p2];
|
||||
}
|
||||
|
||||
private:
|
||||
const uint32_t* const s_m1;
|
||||
const uint32_t* const s_0;
|
||||
const uint32_t* const s_p1;
|
||||
const uint32_t* const s_p2;
|
||||
const int srcWidth_;
|
||||
};
|
||||
|
||||
|
||||
template <class Scaler, class ColorDistance, class OobReader> //scaler policy: see "Scaler2x" reference implementation
|
||||
void scaleImage(const uint32_t* src, uint32_t* trg, int srcWidth, int srcHeight, const xbrz::ScalerCfg& cfg, int yFirst, int yLast)
|
||||
{
|
||||
yFirst = std::max(yFirst, 0);
|
||||
|
@ -482,64 +558,72 @@ void scaleImage(const uint32_t* src, uint32_t* trg, int srcWidth, int srcHeight,
|
|||
|
||||
const int trgWidth = srcWidth * Scaler::scale;
|
||||
|
||||
//"use" space at the end of the image as temporary buffer for "on the fly preprocessing": we even could use larger area of
|
||||
//"sizeof(uint32_t) * srcWidth * (yLast - yFirst)" bytes without risk of accidental overwriting before accessing
|
||||
const int bufferSize = srcWidth;
|
||||
unsigned char* preProcBuffer = reinterpret_cast<unsigned char*>(trg + yLast * Scaler::scale * trgWidth) - bufferSize;
|
||||
std::fill(preProcBuffer, preProcBuffer + bufferSize, '\0');
|
||||
static_assert(BLEND_NONE == 0, "");
|
||||
//(ab)use space of "sizeof(uint32_t) * srcWidth * Scaler::scale" at the end of the image as temporary
|
||||
//buffer for "on the fly preprocessing" without risk of accidental overwriting before accessing
|
||||
unsigned char* const preProcBuf = reinterpret_cast<unsigned char*>(trg + yLast * Scaler::scale * trgWidth) - srcWidth;
|
||||
|
||||
//initialize preprocessing buffer for first row of current stripe: detect upper left and right corner blending
|
||||
//this cannot be optimized for adjacent processing stripes; we must not allow for a memory race condition!
|
||||
if (yFirst > 0)
|
||||
{
|
||||
const int y = yFirst - 1;
|
||||
const OobReader oobReader(src, srcWidth, srcHeight, yFirst - 1);
|
||||
|
||||
const uint32_t* s_m1 = src + srcWidth * std::max(y - 1, 0);
|
||||
const uint32_t* s_0 = src + srcWidth * y; //center line
|
||||
const uint32_t* s_p1 = src + srcWidth * std::min(y + 1, srcHeight - 1);
|
||||
const uint32_t* s_p2 = src + srcWidth * std::min(y + 2, srcHeight - 1);
|
||||
//initialize at position x = -1
|
||||
Kernel_4x4 ker4 = {};
|
||||
oobReader.readDhlp(ker4, -4); //hack: read a, e, i, m at x = -1
|
||||
ker4.a = ker4.d;
|
||||
ker4.e = ker4.h;
|
||||
ker4.i = ker4.l;
|
||||
ker4.m = ker4.p;
|
||||
|
||||
oobReader.readDhlp(ker4, -3);
|
||||
ker4.b = ker4.d;
|
||||
ker4.f = ker4.h;
|
||||
ker4.j = ker4.l;
|
||||
ker4.n = ker4.p;
|
||||
|
||||
oobReader.readDhlp(ker4, -2);
|
||||
ker4.c = ker4.d;
|
||||
ker4.g = ker4.h;
|
||||
ker4.k = ker4.l;
|
||||
ker4.o = ker4.p;
|
||||
|
||||
oobReader.readDhlp(ker4, -1);
|
||||
|
||||
{
|
||||
const BlendResult res = preProcessCorners<ColorDistance>(ker4, cfg);
|
||||
clearAddTopL(preProcBuf[0], res.blend_k); //set 1st known corner for (0, yFirst)
|
||||
}
|
||||
|
||||
for (int x = 0; x < srcWidth; ++x)
|
||||
{
|
||||
const int x_m1 = std::max(x - 1, 0);
|
||||
const int x_p1 = std::min(x + 1, srcWidth - 1);
|
||||
const int x_p2 = std::min(x + 2, srcWidth - 1);
|
||||
ker4.a = ker4.b; //shift previous kernel to the left
|
||||
ker4.e = ker4.f; // -----------------
|
||||
ker4.i = ker4.j; // | A | B | C | D |
|
||||
ker4.m = ker4.n; // |---|---|---|---|
|
||||
/**/ // | E | F | G | H | (x, yFirst - 1) is at position F
|
||||
ker4.b = ker4.c; // |---|---|---|---|
|
||||
ker4.f = ker4.g; // | I | J | K | L |
|
||||
ker4.j = ker4.k; // |---|---|---|---|
|
||||
ker4.n = ker4.o; // | M | N | O | P |
|
||||
/**/ // -----------------
|
||||
ker4.c = ker4.d;
|
||||
ker4.g = ker4.h;
|
||||
ker4.k = ker4.l;
|
||||
ker4.o = ker4.p;
|
||||
|
||||
Kernel_4x4 ker = {}; //perf: initialization is negligible
|
||||
ker.a = s_m1[x_m1]; //read sequentially from memory as far as possible
|
||||
ker.b = s_m1[x];
|
||||
ker.c = s_m1[x_p1];
|
||||
ker.d = s_m1[x_p2];
|
||||
oobReader.readDhlp(ker4, x);
|
||||
|
||||
ker.e = s_0[x_m1];
|
||||
ker.f = s_0[x];
|
||||
ker.g = s_0[x_p1];
|
||||
ker.h = s_0[x_p2];
|
||||
/* preprocessing blend result:
|
||||
---------
|
||||
| F | G | evaluate corner between F, G, J, K
|
||||
|---+---| current input pixel is at position F
|
||||
| J | K |
|
||||
--------- */
|
||||
const BlendResult res = preProcessCorners<ColorDistance>(ker4, cfg);
|
||||
addTopR(preProcBuf[x], res.blend_j); //set 2nd known corner for (x, yFirst)
|
||||
|
||||
ker.i = s_p1[x_m1];
|
||||
ker.j = s_p1[x];
|
||||
ker.k = s_p1[x_p1];
|
||||
ker.l = s_p1[x_p2];
|
||||
|
||||
ker.m = s_p2[x_m1];
|
||||
ker.n = s_p2[x];
|
||||
ker.o = s_p2[x_p1];
|
||||
ker.p = s_p2[x_p2];
|
||||
|
||||
const BlendResult res = preProcessCorners<ColorDistance>(ker, cfg);
|
||||
/*
|
||||
preprocessing blend result:
|
||||
---------
|
||||
| F | G | //evalute corner between F, G, J, K
|
||||
----|---| //input pixel is at position F
|
||||
| J | K |
|
||||
---------
|
||||
*/
|
||||
setTopR(preProcBuffer[x], res.blend_j);
|
||||
|
||||
if (x + 1 < bufferSize)
|
||||
setTopL(preProcBuffer[x + 1], res.blend_k);
|
||||
if (x + 1 < srcWidth)
|
||||
clearAddTopL(preProcBuf[x + 1], res.blend_k); //set 1st known corner for (x + 1, yFirst)
|
||||
}
|
||||
}
|
||||
//------------------------------------------------------------------------------------
|
||||
|
@ -548,91 +632,92 @@ void scaleImage(const uint32_t* src, uint32_t* trg, int srcWidth, int srcHeight,
|
|||
{
|
||||
uint32_t* out = trg + Scaler::scale * y * trgWidth; //consider MT "striped" access
|
||||
|
||||
const uint32_t* s_m1 = src + srcWidth * std::max(y - 1, 0);
|
||||
const uint32_t* s_0 = src + srcWidth * y; //center line
|
||||
const uint32_t* s_p1 = src + srcWidth * std::min(y + 1, srcHeight - 1);
|
||||
const uint32_t* s_p2 = src + srcWidth * std::min(y + 2, srcHeight - 1);
|
||||
const OobReader oobReader(src, srcWidth, srcHeight, y);
|
||||
|
||||
//initialize at position x = -1
|
||||
Kernel_4x4 ker4 = {};
|
||||
oobReader.readDhlp(ker4, -4); //hack: read a, e, i, m at x = -1
|
||||
ker4.a = ker4.d;
|
||||
ker4.e = ker4.h;
|
||||
ker4.i = ker4.l;
|
||||
ker4.m = ker4.p;
|
||||
|
||||
oobReader.readDhlp(ker4, -3);
|
||||
ker4.b = ker4.d;
|
||||
ker4.f = ker4.h;
|
||||
ker4.j = ker4.l;
|
||||
ker4.n = ker4.p;
|
||||
|
||||
oobReader.readDhlp(ker4, -2);
|
||||
ker4.c = ker4.d;
|
||||
ker4.g = ker4.h;
|
||||
ker4.k = ker4.l;
|
||||
ker4.o = ker4.p;
|
||||
|
||||
oobReader.readDhlp(ker4, -1);
|
||||
|
||||
unsigned char blend_xy1 = 0; //corner blending for current (x, y + 1) position
|
||||
{
|
||||
const BlendResult res = preProcessCorners<ColorDistance>(ker4, cfg);
|
||||
clearAddTopL(blend_xy1, res.blend_k); //set 1st known corner for (0, y + 1) and buffer for use on next column
|
||||
|
||||
addBottomL(preProcBuf[0], res.blend_g); //set 3rd known corner for (0, y)
|
||||
}
|
||||
|
||||
for (int x = 0; x < srcWidth; ++x, out += Scaler::scale)
|
||||
{
|
||||
#if defined _MSC_VER && !defined NDEBUG
|
||||
breakIntoDebugger = debugPixelX == x && debugPixelY == y;
|
||||
#endif
|
||||
//all those bounds checks have only insignificant impact on performance!
|
||||
const int x_m1 = std::max(x - 1, 0); //perf: prefer array indexing to additional pointers!
|
||||
const int x_p1 = std::min(x + 1, srcWidth - 1);
|
||||
const int x_p2 = std::min(x + 2, srcWidth - 1);
|
||||
ker4.a = ker4.b; //shift previous kernel to the left
|
||||
ker4.e = ker4.f; // -----------------
|
||||
ker4.i = ker4.j; // | A | B | C | D |
|
||||
ker4.m = ker4.n; // |---|---|---|---|
|
||||
/**/ // | E | F | G | H | (x, y) is at position F
|
||||
ker4.b = ker4.c; // |---|---|---|---|
|
||||
ker4.f = ker4.g; // | I | J | K | L |
|
||||
ker4.j = ker4.k; // |---|---|---|---|
|
||||
ker4.n = ker4.o; // | M | N | O | P |
|
||||
/**/ // -----------------
|
||||
ker4.c = ker4.d;
|
||||
ker4.g = ker4.h;
|
||||
ker4.k = ker4.l;
|
||||
ker4.o = ker4.p;
|
||||
|
||||
Kernel_4x4 ker4 = {}; //perf: initialization is negligible
|
||||
|
||||
ker4.a = s_m1[x_m1]; //read sequentially from memory as far as possible
|
||||
ker4.b = s_m1[x];
|
||||
ker4.c = s_m1[x_p1];
|
||||
ker4.d = s_m1[x_p2];
|
||||
|
||||
ker4.e = s_0[x_m1];
|
||||
ker4.f = s_0[x];
|
||||
ker4.g = s_0[x_p1];
|
||||
ker4.h = s_0[x_p2];
|
||||
|
||||
ker4.i = s_p1[x_m1];
|
||||
ker4.j = s_p1[x];
|
||||
ker4.k = s_p1[x_p1];
|
||||
ker4.l = s_p1[x_p2];
|
||||
|
||||
ker4.m = s_p2[x_m1];
|
||||
ker4.n = s_p2[x];
|
||||
ker4.o = s_p2[x_p1];
|
||||
ker4.p = s_p2[x_p2];
|
||||
oobReader.readDhlp(ker4, x);
|
||||
|
||||
//evaluate the four corners on bottom-right of current pixel
|
||||
unsigned char blend_xy = 0; //for current (x, y) position
|
||||
unsigned char blend_xy = preProcBuf[x]; //for current (x, y) position
|
||||
{
|
||||
/* preprocessing blend result:
|
||||
---------
|
||||
| F | G | evaluate corner between F, G, J, K
|
||||
|---+---| current input pixel is at position F
|
||||
| J | K |
|
||||
--------- */
|
||||
const BlendResult res = preProcessCorners<ColorDistance>(ker4, cfg);
|
||||
/*
|
||||
preprocessing blend result:
|
||||
---------
|
||||
| F | G | //evalute corner between F, G, J, K
|
||||
----|---| //current input pixel is at position F
|
||||
| J | K |
|
||||
---------
|
||||
*/
|
||||
blend_xy = preProcBuffer[x];
|
||||
setBottomR(blend_xy, res.blend_f); //all four corners of (x, y) have been determined at this point due to processing sequence!
|
||||
addBottomR(blend_xy, res.blend_f); //all four corners of (x, y) have been determined at this point due to processing sequence!
|
||||
|
||||
setTopR(blend_xy1, res.blend_j); //set 2nd known corner for (x, y + 1)
|
||||
preProcBuffer[x] = blend_xy1; //store on current buffer position for use on next row
|
||||
addTopR(blend_xy1, res.blend_j); //set 2nd known corner for (x, y + 1)
|
||||
preProcBuf[x] = blend_xy1; //store on current buffer position for use on next row
|
||||
|
||||
blend_xy1 = 0;
|
||||
setTopL(blend_xy1, res.blend_k); //set 1st known corner for (x + 1, y + 1) and buffer for use on next column
|
||||
if (x + 1 < srcWidth)
|
||||
{
|
||||
//blend_xy1 -> blend_x1y1
|
||||
clearAddTopL(blend_xy1, res.blend_k); //set 1st known corner for (x + 1, y + 1) and buffer for use on next column
|
||||
|
||||
if (x + 1 < bufferSize) //set 3rd known corner for (x + 1, y)
|
||||
setBottomL(preProcBuffer[x + 1], res.blend_g);
|
||||
addBottomL(preProcBuf[x + 1], res.blend_g); //set 3rd known corner for (x + 1, y)
|
||||
}
|
||||
}
|
||||
|
||||
//fill block of size scale * scale with the given color
|
||||
fillBlock(out, trgWidth * sizeof(uint32_t), ker4.f, Scaler::scale, Scaler::scale);
|
||||
//place *after* preprocessing step, to not overwrite the results while processing the the last pixel!
|
||||
//place *after* preprocessing step, to not overwrite the results while processing the last pixel!
|
||||
|
||||
//blend four corners of current pixel
|
||||
if (blendingNeeded(blend_xy)) //good 5% perf-improvement
|
||||
//blend all four corners of current pixel
|
||||
if (blendingNeeded(blend_xy))
|
||||
{
|
||||
Kernel_3x3 ker3 = {}; //perf: initialization is negligible
|
||||
|
||||
ker3.a = ker4.a;
|
||||
ker3.b = ker4.b;
|
||||
ker3.c = ker4.c;
|
||||
|
||||
ker3.d = ker4.e;
|
||||
ker3.e = ker4.f;
|
||||
ker3.f = ker4.g;
|
||||
|
||||
ker3.g = ker4.i;
|
||||
ker3.h = ker4.j;
|
||||
ker3.i = ker4.k;
|
||||
|
||||
const auto& ker3 = reinterpret_cast<const Kernel_3x3&>(ker4); //"The Things We Do for Perf"
|
||||
blendPixel<Scaler, ColorDistance, ROT_0 >(ker3, out, trgWidth, blend_xy, cfg);
|
||||
blendPixel<Scaler, ColorDistance, ROT_90 >(ker3, out, trgWidth, blend_xy, cfg);
|
||||
blendPixel<Scaler, ColorDistance, ROT_180>(ker3, out, trgWidth, blend_xy, cfg);
|
||||
|
@ -1095,15 +1180,15 @@ void xbrz::scale(size_t factor, const uint32_t* src, uint32_t* trg, int srcWidth
|
|||
switch (factor)
|
||||
{
|
||||
case 2:
|
||||
return scaleImage<Scaler2x<ColorGradientRGB>, ColorDistanceRGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
||||
return scaleImage<Scaler2x<ColorGradientRGB>, ColorDistanceRGB, OobReaderDuplicate>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
||||
case 3:
|
||||
return scaleImage<Scaler3x<ColorGradientRGB>, ColorDistanceRGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
||||
return scaleImage<Scaler3x<ColorGradientRGB>, ColorDistanceRGB, OobReaderDuplicate>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
||||
case 4:
|
||||
return scaleImage<Scaler4x<ColorGradientRGB>, ColorDistanceRGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
||||
return scaleImage<Scaler4x<ColorGradientRGB>, ColorDistanceRGB, OobReaderDuplicate>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
||||
case 5:
|
||||
return scaleImage<Scaler5x<ColorGradientRGB>, ColorDistanceRGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
||||
return scaleImage<Scaler5x<ColorGradientRGB>, ColorDistanceRGB, OobReaderDuplicate>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
||||
case 6:
|
||||
return scaleImage<Scaler6x<ColorGradientRGB>, ColorDistanceRGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
||||
return scaleImage<Scaler6x<ColorGradientRGB>, ColorDistanceRGB, OobReaderDuplicate>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
||||
}
|
||||
break;
|
||||
|
||||
|
@ -1111,15 +1196,15 @@ void xbrz::scale(size_t factor, const uint32_t* src, uint32_t* trg, int srcWidth
|
|||
switch (factor)
|
||||
{
|
||||
case 2:
|
||||
return scaleImage<Scaler2x<ColorGradientARGB>, ColorDistanceARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
||||
return scaleImage<Scaler2x<ColorGradientARGB>, ColorDistanceARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
||||
case 3:
|
||||
return scaleImage<Scaler3x<ColorGradientARGB>, ColorDistanceARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
||||
return scaleImage<Scaler3x<ColorGradientARGB>, ColorDistanceARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
||||
case 4:
|
||||
return scaleImage<Scaler4x<ColorGradientARGB>, ColorDistanceARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
||||
return scaleImage<Scaler4x<ColorGradientARGB>, ColorDistanceARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
||||
case 5:
|
||||
return scaleImage<Scaler5x<ColorGradientARGB>, ColorDistanceARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
||||
return scaleImage<Scaler5x<ColorGradientARGB>, ColorDistanceARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
||||
case 6:
|
||||
return scaleImage<Scaler6x<ColorGradientARGB>, ColorDistanceARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
||||
return scaleImage<Scaler6x<ColorGradientARGB>, ColorDistanceARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
||||
}
|
||||
break;
|
||||
|
||||
|
@ -1127,15 +1212,15 @@ void xbrz::scale(size_t factor, const uint32_t* src, uint32_t* trg, int srcWidth
|
|||
switch (factor)
|
||||
{
|
||||
case 2:
|
||||
return scaleImage<Scaler2x<ColorGradientARGB>, ColorDistanceUnbufferedARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
||||
return scaleImage<Scaler2x<ColorGradientARGB>, ColorDistanceUnbufferedARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
||||
case 3:
|
||||
return scaleImage<Scaler3x<ColorGradientARGB>, ColorDistanceUnbufferedARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
||||
return scaleImage<Scaler3x<ColorGradientARGB>, ColorDistanceUnbufferedARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
||||
case 4:
|
||||
return scaleImage<Scaler4x<ColorGradientARGB>, ColorDistanceUnbufferedARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
||||
return scaleImage<Scaler4x<ColorGradientARGB>, ColorDistanceUnbufferedARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
||||
case 5:
|
||||
return scaleImage<Scaler5x<ColorGradientARGB>, ColorDistanceUnbufferedARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
||||
return scaleImage<Scaler5x<ColorGradientARGB>, ColorDistanceUnbufferedARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
||||
case 6:
|
||||
return scaleImage<Scaler6x<ColorGradientARGB>, ColorDistanceUnbufferedARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
||||
return scaleImage<Scaler6x<ColorGradientARGB>, ColorDistanceUnbufferedARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
|
|
@ -22,6 +22,7 @@
|
|||
#include <limits>
|
||||
#include "xbrz_config.h"
|
||||
|
||||
|
||||
namespace xbrz
|
||||
{
|
||||
/*
|
||||
|
@ -50,7 +51,6 @@ const int SCALE_FACTOR_MAX = 6;
|
|||
|
||||
/*
|
||||
-> map source (srcWidth * srcHeight) to target (scale * width x scale * height) image, optionally processing a half-open slice of rows [yFirst, yLast) only
|
||||
-> support for source/target pitch in bytes!
|
||||
-> if your emulator changes only a few image slices during each cycle (e.g. DOSBox) then there's no need to run xBRZ on the complete image:
|
||||
Just make sure you enlarge the source image slice by 2 rows on top and 2 on bottom (this is the additional range the xBRZ algorithm is using during analysis)
|
||||
CAVEAT: If there are multiple changed slices, make sure they do not overlap after adding these additional rows in order to avoid a memory race condition
|
||||
|
|
|
@ -56,7 +56,7 @@ Pix* byteAdvance(Pix* ptr, int bytes)
|
|||
|
||||
//fill block with the given color
|
||||
template <class Pix> inline
|
||||
void fillBlock(Pix* trg, int pitch, Pix col, int blockWidth, int blockHeight)
|
||||
void fillBlock(Pix* trg, int pitch /*[bytes]*/, Pix col, int blockWidth, int blockHeight)
|
||||
{
|
||||
//for (int y = 0; y < blockHeight; ++y, trg = byteAdvance(trg, pitch))
|
||||
// std::fill(trg, trg + blockWidth, col);
|
||||
|
@ -69,8 +69,8 @@ void fillBlock(Pix* trg, int pitch, Pix col, int blockWidth, int blockHeight)
|
|||
|
||||
//nearest-neighbor (going over target image - slow for upscaling, since source is read multiple times missing out on cache! Fast for similar image sizes!)
|
||||
template <class PixSrc, class PixTrg, class PixConverter>
|
||||
void nearestNeighborScale(const PixSrc* src, int srcWidth, int srcHeight, int srcPitch,
|
||||
/**/ PixTrg* trg, int trgWidth, int trgHeight, int trgPitch,
|
||||
void nearestNeighborScale(const PixSrc* src, int srcWidth, int srcHeight, int srcPitch /*[bytes]*/,
|
||||
/**/ PixTrg* trg, int trgWidth, int trgHeight, int trgPitch /*[bytes]*/,
|
||||
int yFirst, int yLast, PixConverter pixCvrt /*convert PixSrc to PixTrg*/)
|
||||
{
|
||||
static_assert(std::is_integral<PixSrc>::value, "PixSrc* is expected to be cast-able to char*");
|
||||
|
@ -106,8 +106,8 @@ void nearestNeighborScale(const PixSrc* src, int srcWidth, int srcHeight, int sr
|
|||
|
||||
//nearest-neighbor (going over source image - fast for upscaling, since source is read only once
|
||||
template <class PixSrc, class PixTrg, class PixConverter>
|
||||
void nearestNeighborScaleOverSource(const PixSrc* src, int srcWidth, int srcHeight, int srcPitch,
|
||||
/**/ PixTrg* trg, int trgWidth, int trgHeight, int trgPitch,
|
||||
void nearestNeighborScaleOverSource(const PixSrc* src, int srcWidth, int srcHeight, int srcPitch /*[bytes]*/,
|
||||
/**/ PixTrg* trg, int trgWidth, int trgHeight, int trgPitch /*[bytes]*/,
|
||||
int yFirst, int yLast, PixConverter pixCvrt /*convert PixSrc to PixTrg*/)
|
||||
{
|
||||
static_assert(std::is_integral<PixSrc>::value, "PixSrc* is expected to be cast-able to char*");
|
||||
|
@ -187,10 +187,10 @@ void bilinearScale(const uint32_t* src, int srcWidth, int srcHeight, int srcPitc
|
|||
// -> pre-calculation gives significant boost; std::vector<> memory allocation is negligible!
|
||||
struct CoeffsX
|
||||
{
|
||||
int x1;
|
||||
int x2;
|
||||
double xx1;
|
||||
double x2x;
|
||||
int x1 = 0;
|
||||
int x2 = 0;
|
||||
double xx1 = 0;
|
||||
double x2x = 0;
|
||||
};
|
||||
std::vector<CoeffsX> buf(trgWidth);
|
||||
for (int x = 0; x < trgWidth; ++x)
|
||||
|
@ -202,7 +202,11 @@ void bilinearScale(const uint32_t* src, int srcWidth, int srcHeight, int srcPitc
|
|||
const double xx1 = x / scaleX - x1;
|
||||
const double x2x = 1 - xx1;
|
||||
|
||||
buf[x] = { x1, x2, xx1, x2x };
|
||||
CoeffsX& bx = buf[x];
|
||||
bx.x1 = x1;
|
||||
bx.x2 = x2;
|
||||
bx.xx1 = xx1;
|
||||
bx.x2x = x2x;
|
||||
}
|
||||
|
||||
for (int y = yFirst; y < yLast; ++y)
|
||||
|
@ -231,7 +235,7 @@ void bilinearScale(const uint32_t* src, int srcWidth, int srcHeight, int srcPitc
|
|||
const double x2xyy1 = x2x * yy1;
|
||||
const double xx1yy1 = xx1 * yy1;
|
||||
|
||||
auto interpolate = [=](int offset)
|
||||
auto interpolate = [=](int offset) -> double
|
||||
{
|
||||
/* https://en.wikipedia.org/wiki/Bilinear_interpolation
|
||||
(c11(x2 - x) + c21(x - x1)) * (y2 - y ) +
|
||||
|
|
Loading…
Reference in a new issue