- updated xBRZ scaler to 1.8

Removed all C++17 features: std::clamp(), attribute [[likely]], terse static asserts, ...

https://sourceforge.net/projects/xbrz/files/xBRZ/xBRZ_1.8.zip/download
This commit is contained in:
alexey.lysiuk 2019-12-05 22:42:11 +02:00
parent a7a899ca14
commit ffe8aaa091
3 changed files with 257 additions and 168 deletions

View file

@ -27,7 +27,7 @@ using namespace xbrz;
namespace namespace
{ {
template <unsigned int M, unsigned int N> inline template <unsigned int M, unsigned int N> inline
uint32_t gradientRGB(uint32_t pixFront, uint32_t pixBack) //blend front color with opacity M / N over opaque background: http://en.wikipedia.org/wiki/Alpha_compositing#Alpha_blending uint32_t gradientRGB(uint32_t pixFront, uint32_t pixBack) //blend front color with opacity M / N over opaque background: https://en.wikipedia.org/wiki/Alpha_compositing#Alpha_blending
{ {
static_assert(0 < M && M < N && N <= 1000, ""); static_assert(0 < M && M < N && N <= 1000, "");
@ -153,7 +153,7 @@ double distRGB(uint32_t pix1, uint32_t pix2)
inline inline
double distYCbCr(uint32_t pix1, uint32_t pix2, double lumaWeight) double distYCbCr(uint32_t pix1, uint32_t pix2, double lumaWeight)
{ {
//http://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion //https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion
//YCbCr conversion is a matrix multiplication => take advantage of linearity by subtracting first! //YCbCr conversion is a matrix multiplication => take advantage of linearity by subtracting first!
const int r_diff = static_cast<int>(getRed (pix1)) - getRed (pix2); //we may delay division by 255 to after matrix multiplication const int r_diff = static_cast<int>(getRed (pix1)) - getRed (pix2); //we may delay division by 255 to after matrix multiplication
const int g_diff = static_cast<int>(getGreen(pix1)) - getGreen(pix2); // const int g_diff = static_cast<int>(getGreen(pix1)) - getGreen(pix2); //
@ -254,24 +254,32 @@ struct BlendResult
}; };
//3x3 neighborhood of source pixels handed to the blending stage; the input pixel sits at position e.
//Field order a..i must not change: scaleImage views a Kernel_4x4 through a reinterpret_cast to this type,
//which only works while both structs start with the same nine uint32_t members.
struct Kernel_3x3
{
    uint32_t a, b, c; //top row
    uint32_t d, e, f; //middle row (e = input pixel)
    uint32_t g, h, i; //bottom row
};
struct Kernel_4x4 //kernel for preprocessing step struct Kernel_4x4 //kernel for preprocessing step
{ {
uint32_t uint32_t
/**/a, b, c, d, a, b, c, //
/**/e, f, g, h, e, f, g, // support reinterpret_cast from Kernel_4x4 => Kernel_3x3
/**/i, j, k, l, i, j, k, //
/**/m, n, o, p; m, n, o,
d, h, l, p;
}; };
/* /* input kernel area naming convention:
input kernel area naming convention:
----------------- -----------------
| A | B | C | D | | A | B | C | D |
----|---|---|---| |---|---|---|---|
| E | F | G | H | //evaluate the four corners between F, G, J, K | E | F | G | H | evaluate the four corners between F, G, J, K
----|---|---|---| //input pixel is at position F |---|---|---|---| input pixel is at position F
| I | J | K | L | | I | J | K | L |
----|---|---|---| |---|---|---|---|
| M | N | O | P | | M | N | O | P |
----------------- -----------------
*/ */
@ -318,14 +326,6 @@ BlendResult preProcessCorners(const Kernel_4x4& ker, const xbrz::ScalerCfg& cfg)
return result; return result;
} }
struct Kernel_3x3
{
uint32_t
/**/a, b, c,
/**/d, e, f,
/**/g, h, i;
};
#define DEF_GETTER(x) template <RotationDegree rotDeg> uint32_t inline get_##x(const Kernel_3x3& ker) { return ker.x; } #define DEF_GETTER(x) template <RotationDegree rotDeg> uint32_t inline get_##x(const Kernel_3x3& ker) { return ker.x; }
//we cannot and NEED NOT write "ker.##x" since ## concatenates preprocessor tokens but "." is not a token //we cannot and NEED NOT write "ker.##x" since ## concatenates preprocessor tokens but "." is not a token
DEF_GETTER(a) DEF_GETTER(b) DEF_GETTER(c) DEF_GETTER(a) DEF_GETTER(b) DEF_GETTER(c)
@ -358,12 +358,16 @@ inline BlendType getTopR (unsigned char b) { return static_cast<BlendType>(0x3
inline BlendType getBottomR(unsigned char b) { return static_cast<BlendType>(0x3 & (b >> 4)); } inline BlendType getBottomR(unsigned char b) { return static_cast<BlendType>(0x3 & (b >> 4)); }
inline BlendType getBottomL(unsigned char b) { return static_cast<BlendType>(0x3 & (b >> 6)); } inline BlendType getBottomL(unsigned char b) { return static_cast<BlendType>(0x3 & (b >> 6)); }
inline void setTopL (unsigned char& b, BlendType bt) { b |= bt; } //buffer is assumed to be initialized before preprocessing! inline void clearAddTopL(unsigned char& b, BlendType bt) { b = static_cast<unsigned char>(bt); }
inline void setTopR (unsigned char& b, BlendType bt) { b |= (bt << 2); } inline void addTopR (unsigned char& b, BlendType bt) { b |= (bt << 2); } //buffer is assumed to be initialized before preprocessing!
inline void setBottomR(unsigned char& b, BlendType bt) { b |= (bt << 4); } inline void addBottomR (unsigned char& b, BlendType bt) { b |= (bt << 4); } //e.g. via clearAddTopL()
inline void setBottomL(unsigned char& b, BlendType bt) { b |= (bt << 6); } inline void addBottomL (unsigned char& b, BlendType bt) { b |= (bt << 6); } //
inline bool blendingNeeded(unsigned char b) { return b != 0; } inline bool blendingNeeded(unsigned char b)
{
static_assert(BLEND_NONE == 0, "");
return b != 0;
}
template <RotationDegree rotDeg> inline template <RotationDegree rotDeg> inline
unsigned char rotateBlendInfo(unsigned char b) { return b; } unsigned char rotateBlendInfo(unsigned char b) { return b; }
@ -372,13 +376,12 @@ template <> inline unsigned char rotateBlendInfo<ROT_180>(unsigned char b) { ret
template <> inline unsigned char rotateBlendInfo<ROT_270>(unsigned char b) { return ((b << 6) | (b >> 2)) & 0xff; } template <> inline unsigned char rotateBlendInfo<ROT_270>(unsigned char b) { return ((b << 6) | (b >> 2)) & 0xff; }
/* /* input kernel area naming convention:
input kernel area naming convention:
------------- -------------
| A | B | C | | A | B | C |
----|---|---| |---|---|---|
| D | E | F | //input pixel is at position E | D | E | F | input pixel is at position E
----|---|---| |---|---|---|
| G | H | I | | G | H | I |
------------- -------------
*/ */
@ -472,7 +475,80 @@ void blendPixel(const Kernel_3x3& ker,
} }
template <class Scaler, class ColorDistance> //scaler policy: see "Scaler2x" reference implementation class OobReaderTransparent
{
//Out-of-bounds reader policy: every pixel that lies outside the source image is reported as 0
//(presumably meant as a fully transparent ARGB value, going by the class name - confirm against the color format in use).
public:
//Caches pointers to the source rows y - 1, y, y + 1 and y + 2.
//A row index outside [0, srcHeight) is stored as nullptr so that readDhlp() can substitute 0 for it.
OobReaderTransparent(const uint32_t* src, int srcWidth, int srcHeight, int y) :
s_m1(0 <= y - 1 && y - 1 < srcHeight ? src + srcWidth * (y - 1) : nullptr),
s_0 (0 <= y && y < srcHeight ? src + srcWidth * y : nullptr),
s_p1(0 <= y + 1 && y + 1 < srcHeight ? src + srcWidth * (y + 1) : nullptr),
s_p2(0 <= y + 2 && y + 2 < srcHeight ? src + srcWidth * (y + 2) : nullptr),
srcWidth_(srcWidth) {}
//Fills only the kernel members d, h, l, p with the pixels found at column x + 2 of the four cached rows;
//all other kernel members are left untouched. x may be negative: the range check below handles it.
void readDhlp(Kernel_4x4& ker, int x) const //(x, y) is at kernel position F
{
const int x_p2 = x + 2;
if (0 <= x_p2 && x_p2 < srcWidth_)
{
//column is inside the image: each value still falls back to 0 when its row is out of bounds (nullptr)
ker.d = s_m1 ? s_m1[x_p2] : 0;
ker.h = s_0 ? s_0 [x_p2] : 0;
ker.l = s_p1 ? s_p1[x_p2] : 0;
ker.p = s_p2 ? s_p2[x_p2] : 0;
}
else
{
//column is outside the image: all four values are 0, no memory is read
ker.d = 0;
ker.h = 0;
ker.l = 0;
ker.p = 0;
}
}
private:
//start of source row y - 1, or nullptr if that row does not exist
const uint32_t* const s_m1;
//start of source row y, or nullptr if that row does not exist
const uint32_t* const s_0;
//start of source row y + 1, or nullptr if that row does not exist
const uint32_t* const s_p1;
//start of source row y + 2, or nullptr if that row does not exist
const uint32_t* const s_p2;
//source image width in pixels; upper bound for the column range check in readDhlp()
const int srcWidth_;
};
//Limits "in" to the closed range [min, max]: yields min if in <= min, max if in >= max, otherwise in itself.
//Stand-in for std::clamp(); written as one return expression so it stays usable as a constexpr function
//without relying on multi-statement constexpr bodies.
template <typename T>
constexpr inline T xbrz_clamp(const T in, const T min, const T max)
{
    return (in <= min) ? min
         : (in >= max) ? max
         :               in;
}
//Out-of-bounds reader policy: coordinates outside the source image are clamped to the nearest valid
//row/column, i.e. the border pixels are duplicated outwards.
class OobReaderDuplicate
{
public:
    //Caches pointers to the (clamped) source rows y - 1, y, y + 1 and y + 2.
    OobReaderDuplicate(const uint32_t* src, int srcWidth, int srcHeight, int y) :
        s_m1(clampedRow(src, srcWidth, srcHeight, y - 1)),
        s_0 (clampedRow(src, srcWidth, srcHeight, y)),
        s_p1(clampedRow(src, srcWidth, srcHeight, y + 1)),
        s_p2(clampedRow(src, srcWidth, srcHeight, y + 2)),
        srcWidth_(srcWidth) {}

    //Fills only the kernel members d, h, l, p with the pixels at (clamped) column x + 2 of the four cached rows.
    void readDhlp(Kernel_4x4& ker, int x) const //(x, y) is at kernel position F
    {
        const int col = xbrz_clamp(x + 2, 0, srcWidth_ - 1);

        ker.d = s_m1[col];
        ker.h = s_0 [col];
        ker.l = s_p1[col];
        ker.p = s_p2[col];
    }

private:
    //start of the source row "row" after clamping it to [0, srcHeight - 1]
    static const uint32_t* clampedRow(const uint32_t* src, int srcWidth, int srcHeight, int row)
    {
        return src + srcWidth * xbrz_clamp(row, 0, srcHeight - 1);
    }

    const uint32_t* const s_m1; //row y - 1 (clamped)
    const uint32_t* const s_0;  //row y     (clamped)
    const uint32_t* const s_p1; //row y + 1 (clamped)
    const uint32_t* const s_p2; //row y + 2 (clamped)
    const int srcWidth_;        //source image width in pixels
};
template <class Scaler, class ColorDistance, class OobReader> //scaler policy: see "Scaler2x" reference implementation
void scaleImage(const uint32_t* src, uint32_t* trg, int srcWidth, int srcHeight, const xbrz::ScalerCfg& cfg, int yFirst, int yLast) void scaleImage(const uint32_t* src, uint32_t* trg, int srcWidth, int srcHeight, const xbrz::ScalerCfg& cfg, int yFirst, int yLast)
{ {
yFirst = std::max(yFirst, 0); yFirst = std::max(yFirst, 0);
@ -482,64 +558,72 @@ void scaleImage(const uint32_t* src, uint32_t* trg, int srcWidth, int srcHeight,
const int trgWidth = srcWidth * Scaler::scale; const int trgWidth = srcWidth * Scaler::scale;
//"use" space at the end of the image as temporary buffer for "on the fly preprocessing": we even could use larger area of //(ab)use space of "sizeof(uint32_t) * srcWidth * Scaler::scale" at the end of the image as temporary
//"sizeof(uint32_t) * srcWidth * (yLast - yFirst)" bytes without risk of accidental overwriting before accessing //buffer for "on the fly preprocessing" without risk of accidental overwriting before accessing
const int bufferSize = srcWidth; unsigned char* const preProcBuf = reinterpret_cast<unsigned char*>(trg + yLast * Scaler::scale * trgWidth) - srcWidth;
unsigned char* preProcBuffer = reinterpret_cast<unsigned char*>(trg + yLast * Scaler::scale * trgWidth) - bufferSize;
std::fill(preProcBuffer, preProcBuffer + bufferSize, '\0');
static_assert(BLEND_NONE == 0, "");
//initialize preprocessing buffer for first row of current stripe: detect upper left and right corner blending //initialize preprocessing buffer for first row of current stripe: detect upper left and right corner blending
//this cannot be optimized for adjacent processing stripes; we must not allow for a memory race condition! //this cannot be optimized for adjacent processing stripes; we must not allow for a memory race condition!
if (yFirst > 0)
{ {
const int y = yFirst - 1; const OobReader oobReader(src, srcWidth, srcHeight, yFirst - 1);
const uint32_t* s_m1 = src + srcWidth * std::max(y - 1, 0); //initialize at position x = -1
const uint32_t* s_0 = src + srcWidth * y; //center line Kernel_4x4 ker4 = {};
const uint32_t* s_p1 = src + srcWidth * std::min(y + 1, srcHeight - 1); oobReader.readDhlp(ker4, -4); //hack: read a, e, i, m at x = -1
const uint32_t* s_p2 = src + srcWidth * std::min(y + 2, srcHeight - 1); ker4.a = ker4.d;
ker4.e = ker4.h;
ker4.i = ker4.l;
ker4.m = ker4.p;
oobReader.readDhlp(ker4, -3);
ker4.b = ker4.d;
ker4.f = ker4.h;
ker4.j = ker4.l;
ker4.n = ker4.p;
oobReader.readDhlp(ker4, -2);
ker4.c = ker4.d;
ker4.g = ker4.h;
ker4.k = ker4.l;
ker4.o = ker4.p;
oobReader.readDhlp(ker4, -1);
{
const BlendResult res = preProcessCorners<ColorDistance>(ker4, cfg);
clearAddTopL(preProcBuf[0], res.blend_k); //set 1st known corner for (0, yFirst)
}
for (int x = 0; x < srcWidth; ++x) for (int x = 0; x < srcWidth; ++x)
{ {
const int x_m1 = std::max(x - 1, 0); ker4.a = ker4.b; //shift previous kernel to the left
const int x_p1 = std::min(x + 1, srcWidth - 1); ker4.e = ker4.f; // -----------------
const int x_p2 = std::min(x + 2, srcWidth - 1); ker4.i = ker4.j; // | A | B | C | D |
ker4.m = ker4.n; // |---|---|---|---|
/**/ // | E | F | G | H | (x, yFirst - 1) is at position F
ker4.b = ker4.c; // |---|---|---|---|
ker4.f = ker4.g; // | I | J | K | L |
ker4.j = ker4.k; // |---|---|---|---|
ker4.n = ker4.o; // | M | N | O | P |
/**/ // -----------------
ker4.c = ker4.d;
ker4.g = ker4.h;
ker4.k = ker4.l;
ker4.o = ker4.p;
Kernel_4x4 ker = {}; //perf: initialization is negligible oobReader.readDhlp(ker4, x);
ker.a = s_m1[x_m1]; //read sequentially from memory as far as possible
ker.b = s_m1[x];
ker.c = s_m1[x_p1];
ker.d = s_m1[x_p2];
ker.e = s_0[x_m1]; /* preprocessing blend result:
ker.f = s_0[x];
ker.g = s_0[x_p1];
ker.h = s_0[x_p2];
ker.i = s_p1[x_m1];
ker.j = s_p1[x];
ker.k = s_p1[x_p1];
ker.l = s_p1[x_p2];
ker.m = s_p2[x_m1];
ker.n = s_p2[x];
ker.o = s_p2[x_p1];
ker.p = s_p2[x_p2];
const BlendResult res = preProcessCorners<ColorDistance>(ker, cfg);
/*
preprocessing blend result:
--------- ---------
| F | G | //evalute corner between F, G, J, K | F | G | evaluate corner between F, G, J, K
----|---| //input pixel is at position F |---+---| current input pixel is at position F
| J | K | | J | K |
--------- --------- */
*/ const BlendResult res = preProcessCorners<ColorDistance>(ker4, cfg);
setTopR(preProcBuffer[x], res.blend_j); addTopR(preProcBuf[x], res.blend_j); //set 2nd known corner for (x, yFirst)
if (x + 1 < bufferSize) if (x + 1 < srcWidth)
setTopL(preProcBuffer[x + 1], res.blend_k); clearAddTopL(preProcBuf[x + 1], res.blend_k); //set 1st known corner for (x + 1, yFirst)
} }
} }
//------------------------------------------------------------------------------------ //------------------------------------------------------------------------------------
@ -548,91 +632,92 @@ void scaleImage(const uint32_t* src, uint32_t* trg, int srcWidth, int srcHeight,
{ {
uint32_t* out = trg + Scaler::scale * y * trgWidth; //consider MT "striped" access uint32_t* out = trg + Scaler::scale * y * trgWidth; //consider MT "striped" access
const uint32_t* s_m1 = src + srcWidth * std::max(y - 1, 0); const OobReader oobReader(src, srcWidth, srcHeight, y);
const uint32_t* s_0 = src + srcWidth * y; //center line
const uint32_t* s_p1 = src + srcWidth * std::min(y + 1, srcHeight - 1); //initialize at position x = -1
const uint32_t* s_p2 = src + srcWidth * std::min(y + 2, srcHeight - 1); Kernel_4x4 ker4 = {};
oobReader.readDhlp(ker4, -4); //hack: read a, e, i, m at x = -1
ker4.a = ker4.d;
ker4.e = ker4.h;
ker4.i = ker4.l;
ker4.m = ker4.p;
oobReader.readDhlp(ker4, -3);
ker4.b = ker4.d;
ker4.f = ker4.h;
ker4.j = ker4.l;
ker4.n = ker4.p;
oobReader.readDhlp(ker4, -2);
ker4.c = ker4.d;
ker4.g = ker4.h;
ker4.k = ker4.l;
ker4.o = ker4.p;
oobReader.readDhlp(ker4, -1);
unsigned char blend_xy1 = 0; //corner blending for current (x, y + 1) position unsigned char blend_xy1 = 0; //corner blending for current (x, y + 1) position
{
const BlendResult res = preProcessCorners<ColorDistance>(ker4, cfg);
clearAddTopL(blend_xy1, res.blend_k); //set 1st known corner for (0, y + 1) and buffer for use on next column
addBottomL(preProcBuf[0], res.blend_g); //set 3rd known corner for (0, y)
}
for (int x = 0; x < srcWidth; ++x, out += Scaler::scale) for (int x = 0; x < srcWidth; ++x, out += Scaler::scale)
{ {
#if defined _MSC_VER && !defined NDEBUG #if defined _MSC_VER && !defined NDEBUG
breakIntoDebugger = debugPixelX == x && debugPixelY == y; breakIntoDebugger = debugPixelX == x && debugPixelY == y;
#endif #endif
//all those bounds checks have only insignificant impact on performance! ker4.a = ker4.b; //shift previous kernel to the left
const int x_m1 = std::max(x - 1, 0); //perf: prefer array indexing to additional pointers! ker4.e = ker4.f; // -----------------
const int x_p1 = std::min(x + 1, srcWidth - 1); ker4.i = ker4.j; // | A | B | C | D |
const int x_p2 = std::min(x + 2, srcWidth - 1); ker4.m = ker4.n; // |---|---|---|---|
/**/ // | E | F | G | H | (x, y) is at position F
ker4.b = ker4.c; // |---|---|---|---|
ker4.f = ker4.g; // | I | J | K | L |
ker4.j = ker4.k; // |---|---|---|---|
ker4.n = ker4.o; // | M | N | O | P |
/**/ // -----------------
ker4.c = ker4.d;
ker4.g = ker4.h;
ker4.k = ker4.l;
ker4.o = ker4.p;
Kernel_4x4 ker4 = {}; //perf: initialization is negligible oobReader.readDhlp(ker4, x);
ker4.a = s_m1[x_m1]; //read sequentially from memory as far as possible
ker4.b = s_m1[x];
ker4.c = s_m1[x_p1];
ker4.d = s_m1[x_p2];
ker4.e = s_0[x_m1];
ker4.f = s_0[x];
ker4.g = s_0[x_p1];
ker4.h = s_0[x_p2];
ker4.i = s_p1[x_m1];
ker4.j = s_p1[x];
ker4.k = s_p1[x_p1];
ker4.l = s_p1[x_p2];
ker4.m = s_p2[x_m1];
ker4.n = s_p2[x];
ker4.o = s_p2[x_p1];
ker4.p = s_p2[x_p2];
//evaluate the four corners on bottom-right of current pixel //evaluate the four corners on bottom-right of current pixel
unsigned char blend_xy = 0; //for current (x, y) position unsigned char blend_xy = preProcBuf[x]; //for current (x, y) position
{ {
const BlendResult res = preProcessCorners<ColorDistance>(ker4, cfg); /* preprocessing blend result:
/*
preprocessing blend result:
--------- ---------
| F | G | //evalute corner between F, G, J, K | F | G | evaluate corner between F, G, J, K
----|---| //current input pixel is at position F |---+---| current input pixel is at position F
| J | K | | J | K |
--------- --------- */
*/ const BlendResult res = preProcessCorners<ColorDistance>(ker4, cfg);
blend_xy = preProcBuffer[x]; addBottomR(blend_xy, res.blend_f); //all four corners of (x, y) have been determined at this point due to processing sequence!
setBottomR(blend_xy, res.blend_f); //all four corners of (x, y) have been determined at this point due to processing sequence!
setTopR(blend_xy1, res.blend_j); //set 2nd known corner for (x, y + 1) addTopR(blend_xy1, res.blend_j); //set 2nd known corner for (x, y + 1)
preProcBuffer[x] = blend_xy1; //store on current buffer position for use on next row preProcBuf[x] = blend_xy1; //store on current buffer position for use on next row
blend_xy1 = 0; if (x + 1 < srcWidth)
setTopL(blend_xy1, res.blend_k); //set 1st known corner for (x + 1, y + 1) and buffer for use on next column {
//blend_xy1 -> blend_x1y1
clearAddTopL(blend_xy1, res.blend_k); //set 1st known corner for (x + 1, y + 1) and buffer for use on next column
if (x + 1 < bufferSize) //set 3rd known corner for (x + 1, y) addBottomL(preProcBuf[x + 1], res.blend_g); //set 3rd known corner for (x + 1, y)
setBottomL(preProcBuffer[x + 1], res.blend_g); }
} }
//fill block of size scale * scale with the given color //fill block of size scale * scale with the given color
fillBlock(out, trgWidth * sizeof(uint32_t), ker4.f, Scaler::scale, Scaler::scale); fillBlock(out, trgWidth * sizeof(uint32_t), ker4.f, Scaler::scale, Scaler::scale);
//place *after* preprocessing step, to not overwrite the results while processing the the last pixel! //place *after* preprocessing step, to not overwrite the results while processing the last pixel!
//blend four corners of current pixel //blend all four corners of current pixel
if (blendingNeeded(blend_xy)) //good 5% perf-improvement if (blendingNeeded(blend_xy))
{ {
Kernel_3x3 ker3 = {}; //perf: initialization is negligible const auto& ker3 = reinterpret_cast<const Kernel_3x3&>(ker4); //"The Things We Do for Perf"
ker3.a = ker4.a;
ker3.b = ker4.b;
ker3.c = ker4.c;
ker3.d = ker4.e;
ker3.e = ker4.f;
ker3.f = ker4.g;
ker3.g = ker4.i;
ker3.h = ker4.j;
ker3.i = ker4.k;
blendPixel<Scaler, ColorDistance, ROT_0 >(ker3, out, trgWidth, blend_xy, cfg); blendPixel<Scaler, ColorDistance, ROT_0 >(ker3, out, trgWidth, blend_xy, cfg);
blendPixel<Scaler, ColorDistance, ROT_90 >(ker3, out, trgWidth, blend_xy, cfg); blendPixel<Scaler, ColorDistance, ROT_90 >(ker3, out, trgWidth, blend_xy, cfg);
blendPixel<Scaler, ColorDistance, ROT_180>(ker3, out, trgWidth, blend_xy, cfg); blendPixel<Scaler, ColorDistance, ROT_180>(ker3, out, trgWidth, blend_xy, cfg);
@ -1095,15 +1180,15 @@ void xbrz::scale(size_t factor, const uint32_t* src, uint32_t* trg, int srcWidth
switch (factor) switch (factor)
{ {
case 2: case 2:
return scaleImage<Scaler2x<ColorGradientRGB>, ColorDistanceRGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast); return scaleImage<Scaler2x<ColorGradientRGB>, ColorDistanceRGB, OobReaderDuplicate>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
case 3: case 3:
return scaleImage<Scaler3x<ColorGradientRGB>, ColorDistanceRGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast); return scaleImage<Scaler3x<ColorGradientRGB>, ColorDistanceRGB, OobReaderDuplicate>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
case 4: case 4:
return scaleImage<Scaler4x<ColorGradientRGB>, ColorDistanceRGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast); return scaleImage<Scaler4x<ColorGradientRGB>, ColorDistanceRGB, OobReaderDuplicate>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
case 5: case 5:
return scaleImage<Scaler5x<ColorGradientRGB>, ColorDistanceRGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast); return scaleImage<Scaler5x<ColorGradientRGB>, ColorDistanceRGB, OobReaderDuplicate>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
case 6: case 6:
return scaleImage<Scaler6x<ColorGradientRGB>, ColorDistanceRGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast); return scaleImage<Scaler6x<ColorGradientRGB>, ColorDistanceRGB, OobReaderDuplicate>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
} }
break; break;
@ -1111,15 +1196,15 @@ void xbrz::scale(size_t factor, const uint32_t* src, uint32_t* trg, int srcWidth
switch (factor) switch (factor)
{ {
case 2: case 2:
return scaleImage<Scaler2x<ColorGradientARGB>, ColorDistanceARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast); return scaleImage<Scaler2x<ColorGradientARGB>, ColorDistanceARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
case 3: case 3:
return scaleImage<Scaler3x<ColorGradientARGB>, ColorDistanceARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast); return scaleImage<Scaler3x<ColorGradientARGB>, ColorDistanceARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
case 4: case 4:
return scaleImage<Scaler4x<ColorGradientARGB>, ColorDistanceARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast); return scaleImage<Scaler4x<ColorGradientARGB>, ColorDistanceARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
case 5: case 5:
return scaleImage<Scaler5x<ColorGradientARGB>, ColorDistanceARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast); return scaleImage<Scaler5x<ColorGradientARGB>, ColorDistanceARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
case 6: case 6:
return scaleImage<Scaler6x<ColorGradientARGB>, ColorDistanceARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast); return scaleImage<Scaler6x<ColorGradientARGB>, ColorDistanceARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
} }
break; break;
@ -1127,15 +1212,15 @@ void xbrz::scale(size_t factor, const uint32_t* src, uint32_t* trg, int srcWidth
switch (factor) switch (factor)
{ {
case 2: case 2:
return scaleImage<Scaler2x<ColorGradientARGB>, ColorDistanceUnbufferedARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast); return scaleImage<Scaler2x<ColorGradientARGB>, ColorDistanceUnbufferedARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
case 3: case 3:
return scaleImage<Scaler3x<ColorGradientARGB>, ColorDistanceUnbufferedARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast); return scaleImage<Scaler3x<ColorGradientARGB>, ColorDistanceUnbufferedARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
case 4: case 4:
return scaleImage<Scaler4x<ColorGradientARGB>, ColorDistanceUnbufferedARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast); return scaleImage<Scaler4x<ColorGradientARGB>, ColorDistanceUnbufferedARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
case 5: case 5:
return scaleImage<Scaler5x<ColorGradientARGB>, ColorDistanceUnbufferedARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast); return scaleImage<Scaler5x<ColorGradientARGB>, ColorDistanceUnbufferedARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
case 6: case 6:
return scaleImage<Scaler6x<ColorGradientARGB>, ColorDistanceUnbufferedARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast); return scaleImage<Scaler6x<ColorGradientARGB>, ColorDistanceUnbufferedARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
} }
break; break;
} }

View file

@ -22,6 +22,7 @@
#include <limits> #include <limits>
#include "xbrz_config.h" #include "xbrz_config.h"
namespace xbrz namespace xbrz
{ {
/* /*
@ -50,7 +51,6 @@ const int SCALE_FACTOR_MAX = 6;
/* /*
-> map source (srcWidth * srcHeight) to target (scale * width x scale * height) image, optionally processing a half-open slice of rows [yFirst, yLast) only -> map source (srcWidth * srcHeight) to target (scale * width x scale * height) image, optionally processing a half-open slice of rows [yFirst, yLast) only
-> support for source/target pitch in bytes!
-> if your emulator changes only a few image slices during each cycle (e.g. DOSBox) then there's no need to run xBRZ on the complete image: -> if your emulator changes only a few image slices during each cycle (e.g. DOSBox) then there's no need to run xBRZ on the complete image:
Just make sure you enlarge the source image slice by 2 rows on top and 2 on bottom (this is the additional range the xBRZ algorithm is using during analysis) Just make sure you enlarge the source image slice by 2 rows on top and 2 on bottom (this is the additional range the xBRZ algorithm is using during analysis)
CAVEAT: If there are multiple changed slices, make sure they do not overlap after adding these additional rows in order to avoid a memory race condition CAVEAT: If there are multiple changed slices, make sure they do not overlap after adding these additional rows in order to avoid a memory race condition

View file

@ -56,7 +56,7 @@ Pix* byteAdvance(Pix* ptr, int bytes)
//fill block with the given color //fill block with the given color
template <class Pix> inline template <class Pix> inline
void fillBlock(Pix* trg, int pitch, Pix col, int blockWidth, int blockHeight) void fillBlock(Pix* trg, int pitch /*[bytes]*/, Pix col, int blockWidth, int blockHeight)
{ {
//for (int y = 0; y < blockHeight; ++y, trg = byteAdvance(trg, pitch)) //for (int y = 0; y < blockHeight; ++y, trg = byteAdvance(trg, pitch))
// std::fill(trg, trg + blockWidth, col); // std::fill(trg, trg + blockWidth, col);
@ -69,8 +69,8 @@ void fillBlock(Pix* trg, int pitch, Pix col, int blockWidth, int blockHeight)
//nearest-neighbor (going over target image - slow for upscaling, since source is read multiple times missing out on cache! Fast for similar image sizes!) //nearest-neighbor (going over target image - slow for upscaling, since source is read multiple times missing out on cache! Fast for similar image sizes!)
template <class PixSrc, class PixTrg, class PixConverter> template <class PixSrc, class PixTrg, class PixConverter>
void nearestNeighborScale(const PixSrc* src, int srcWidth, int srcHeight, int srcPitch, void nearestNeighborScale(const PixSrc* src, int srcWidth, int srcHeight, int srcPitch /*[bytes]*/,
/**/ PixTrg* trg, int trgWidth, int trgHeight, int trgPitch, /**/ PixTrg* trg, int trgWidth, int trgHeight, int trgPitch /*[bytes]*/,
int yFirst, int yLast, PixConverter pixCvrt /*convert PixSrc to PixTrg*/) int yFirst, int yLast, PixConverter pixCvrt /*convert PixSrc to PixTrg*/)
{ {
static_assert(std::is_integral<PixSrc>::value, "PixSrc* is expected to be cast-able to char*"); static_assert(std::is_integral<PixSrc>::value, "PixSrc* is expected to be cast-able to char*");
@ -106,8 +106,8 @@ void nearestNeighborScale(const PixSrc* src, int srcWidth, int srcHeight, int sr
//nearest-neighbor (going over source image - fast for upscaling, since source is read only once //nearest-neighbor (going over source image - fast for upscaling, since source is read only once
template <class PixSrc, class PixTrg, class PixConverter> template <class PixSrc, class PixTrg, class PixConverter>
void nearestNeighborScaleOverSource(const PixSrc* src, int srcWidth, int srcHeight, int srcPitch, void nearestNeighborScaleOverSource(const PixSrc* src, int srcWidth, int srcHeight, int srcPitch /*[bytes]*/,
/**/ PixTrg* trg, int trgWidth, int trgHeight, int trgPitch, /**/ PixTrg* trg, int trgWidth, int trgHeight, int trgPitch /*[bytes]*/,
int yFirst, int yLast, PixConverter pixCvrt /*convert PixSrc to PixTrg*/) int yFirst, int yLast, PixConverter pixCvrt /*convert PixSrc to PixTrg*/)
{ {
static_assert(std::is_integral<PixSrc>::value, "PixSrc* is expected to be cast-able to char*"); static_assert(std::is_integral<PixSrc>::value, "PixSrc* is expected to be cast-able to char*");
@ -187,10 +187,10 @@ void bilinearScale(const uint32_t* src, int srcWidth, int srcHeight, int srcPitc
// -> pre-calculation gives significant boost; std::vector<> memory allocation is negligible! // -> pre-calculation gives significant boost; std::vector<> memory allocation is negligible!
struct CoeffsX struct CoeffsX
{ {
int x1; int x1 = 0;
int x2; int x2 = 0;
double xx1; double xx1 = 0;
double x2x; double x2x = 0;
}; };
std::vector<CoeffsX> buf(trgWidth); std::vector<CoeffsX> buf(trgWidth);
for (int x = 0; x < trgWidth; ++x) for (int x = 0; x < trgWidth; ++x)
@ -202,7 +202,11 @@ void bilinearScale(const uint32_t* src, int srcWidth, int srcHeight, int srcPitc
const double xx1 = x / scaleX - x1; const double xx1 = x / scaleX - x1;
const double x2x = 1 - xx1; const double x2x = 1 - xx1;
buf[x] = { x1, x2, xx1, x2x }; CoeffsX& bx = buf[x];
bx.x1 = x1;
bx.x2 = x2;
bx.xx1 = xx1;
bx.x2x = x2x;
} }
for (int y = yFirst; y < yLast; ++y) for (int y = yFirst; y < yLast; ++y)
@ -231,7 +235,7 @@ void bilinearScale(const uint32_t* src, int srcWidth, int srcHeight, int srcPitc
const double x2xyy1 = x2x * yy1; const double x2xyy1 = x2x * yy1;
const double xx1yy1 = xx1 * yy1; const double xx1yy1 = xx1 * yy1;
auto interpolate = [=](int offset) auto interpolate = [=](int offset) -> double
{ {
/* https://en.wikipedia.org/wiki/Bilinear_interpolation /* https://en.wikipedia.org/wiki/Bilinear_interpolation
(c11(x2 - x) + c21(x - x1)) * (y2 - y ) + (c11(x2 - x) + c21(x - x1)) * (y2 - y ) +