- update xBRZ upscaler to version 1.6

Fixed build with all supported toolchains (changes were needed because of the incomplete implementation of C++14 in MSVC 2015 and GCC 4.9)
Removed obsolete header comments and support for C++98
Disabled Windows-only debug features

https://sourceforge.net/projects/xbrz/
https://sourceforge.net/projects/xbrz/files/xBRZ/xBRZ_1.6.zip
This commit is contained in:
alexey.lysiuk 2018-08-03 17:25:39 +03:00 committed by Rachael Alexanderson
parent d65d462268
commit 30c3f4f597
5 changed files with 559 additions and 284 deletions

View file

@ -285,13 +285,13 @@ static unsigned char *xbrzHelper( void (*xbrzFunction) ( size_t, const uint32_t*
parallel_for(inHeight, thresholdHeight, [=](int sliceY) parallel_for(inHeight, thresholdHeight, [=](int sliceY)
{ {
xbrzFunction(N, reinterpret_cast<uint32_t*>(inputBuffer), reinterpret_cast<uint32_t*>(newBuffer), xbrzFunction(N, reinterpret_cast<uint32_t*>(inputBuffer), reinterpret_cast<uint32_t*>(newBuffer),
inWidth, inHeight, xbrz::ARGB, xbrz::ScalerCfg(), sliceY, sliceY + thresholdHeight); inWidth, inHeight, xbrz::ColorFormat::ARGB, xbrz::ScalerCfg(), sliceY, sliceY + thresholdHeight);
}); });
} }
else else
{ {
xbrzFunction(N, reinterpret_cast<uint32_t*>(inputBuffer), reinterpret_cast<uint32_t*>(newBuffer), xbrzFunction(N, reinterpret_cast<uint32_t*>(inputBuffer), reinterpret_cast<uint32_t*>(newBuffer),
inWidth, inHeight, xbrz::ARGB, xbrz::ScalerCfg(), 0, std::numeric_limits<int>::max()); inWidth, inHeight, xbrz::ColorFormat::ARGB, xbrz::ScalerCfg(), 0, std::numeric_limits<int>::max());
} }
delete[] inputBuffer; delete[] inputBuffer;

View file

@ -1,63 +1,41 @@
// **************************************************************************** // ****************************************************************************
// * This file is part of the HqMAME project. It is distributed under * // * This file is part of the xBRZ project. It is distributed under *
// * GNU General Public License: http://www.gnu.org/licenses/gpl-3.0 * // * GNU General Public License: https://www.gnu.org/licenses/gpl-3.0 *
// * Copyright (C) Zenju (zenju AT gmx DOT de) - All Rights Reserved * // * Copyright (C) Zenju (zenju AT gmx DOT de) - All Rights Reserved *
// * * // * *
// * Additionally and as a special exception, the author gives permission * // * Additionally and as a special exception, the author gives permission *
// * to link the code of this program with the MAME library (or with modified * // * to link the code of this program with the following libraries *
// * versions of MAME that use the same license as MAME), and distribute * // * (or with modified versions that use the same licenses), and distribute *
// * linked combinations including the two. You must obey the GNU General * // * linked combinations including the two: MAME, FreeFileSync, Snes9x, ePSXe *
// * Public License in all respects for all of the code used other than MAME. * // * You must obey the GNU General Public License in all respects for all of *
// * the code used other than MAME, FreeFileSync, Snes9x, ePSXe. *
// * If you modify this file, you may extend this exception to your version * // * If you modify this file, you may extend this exception to your version *
// * of the file, but you are not obligated to do so. If you do not wish to * // * of the file, but you are not obligated to do so. If you do not wish to *
// * do so, delete this exception statement from your version. * // * do so, delete this exception statement from your version. *
// * *
// * An explicit permission was granted to use xBRZ in combination with ZDoom *
// * and derived projects as long as it is used for non-commercial purposes. *
// * *
// * Backported to C++98 by Alexey Lysiuk *
// **************************************************************************** // ****************************************************************************
#include "xbrz.h" #include "xbrz.h"
#include <cassert> #include <cassert>
#include <cmath>
#include <algorithm>
#include <vector> #include <vector>
#include <algorithm>
#include <cmath> //std::sqrt
#include "xbrz_tools.h"
using namespace xbrz;
#if __cplusplus <= 199711
#define static_assert(VAL, MSG) static_assertion<VAL>();
template<bool> struct static_assertion;
template<> struct static_assertion<true> {};
#endif // __cplusplus <= 199711
namespace namespace
{ {
template <uint32_t N> inline
unsigned char getByte(uint32_t val) { return static_cast<unsigned char>((val >> (8 * N)) & 0xff); }
inline unsigned char getAlpha(uint32_t pix) { return getByte<3>(pix); }
inline unsigned char getRed (uint32_t pix) { return getByte<2>(pix); }
inline unsigned char getGreen(uint32_t pix) { return getByte<1>(pix); }
inline unsigned char getBlue (uint32_t pix) { return getByte<0>(pix); }
inline uint32_t makePixel( unsigned char r, unsigned char g, unsigned char b) { return (r << 16) | (g << 8) | b; }
inline uint32_t makePixel(unsigned char a, unsigned char r, unsigned char g, unsigned char b) { return (a << 24) | (r << 16) | (g << 8) | b; }
template <unsigned int M, unsigned int N> inline template <unsigned int M, unsigned int N> inline
uint32_t gradientRGB(uint32_t pixFront, uint32_t pixBack) //blend front color with opacity M / N over opaque background: http://en.wikipedia.org/wiki/Alpha_compositing#Alpha_blending uint32_t gradientRGB(uint32_t pixFront, uint32_t pixBack) //blend front color with opacity M / N over opaque background: http://en.wikipedia.org/wiki/Alpha_compositing#Alpha_blending
{ {
static_assert(0 < M && M < N && N <= 1000, ""); static_assert(0 < M && M < N && N <= 1000, "");
#define calcColor(colFront, colBack) \ auto calcColor = [](unsigned char colFront, unsigned char colBack) -> unsigned char { return (colFront * M + colBack * (N - M)) / N; };
(((colFront) * M + (colBack) * (N - M)) / N)
return makePixel(calcColor(getRed (pixFront), getRed (pixBack)), return makePixel(calcColor(getRed (pixFront), getRed (pixBack)),
calcColor(getGreen(pixFront), getGreen(pixBack)), calcColor(getGreen(pixFront), getGreen(pixBack)),
calcColor(getBlue (pixFront), getBlue (pixBack))); calcColor(getBlue (pixFront), getBlue (pixBack)));
#undef calcColor
} }
@ -72,15 +50,15 @@ uint32_t gradientARGB(uint32_t pixFront, uint32_t pixBack) //find intermediate c
if (weightSum == 0) if (weightSum == 0)
return 0; return 0;
#define calcColor(colFront, colBack) \ auto calcColor = [=](unsigned char colFront, unsigned char colBack)
static_cast<unsigned char>(((colFront) * weightFront + (colBack) * weightBack) / weightSum) {
return static_cast<unsigned char>((colFront * weightFront + colBack * weightBack) / weightSum);
};
return makePixel(static_cast<unsigned char>(weightSum / N), return makePixel(static_cast<unsigned char>(weightSum / N),
calcColor(getRed (pixFront), getRed (pixBack)), calcColor(getRed (pixFront), getRed (pixBack)),
calcColor(getGreen(pixFront), getGreen(pixBack)), calcColor(getGreen(pixFront), getGreen(pixBack)),
calcColor(getBlue (pixFront), getBlue (pixBack))); calcColor(getBlue (pixFront), getBlue (pixBack)));
#undef calcColor
} }
@ -96,26 +74,6 @@ uint32_t gradientARGB(uint32_t pixFront, uint32_t pixBack) //find intermediate c
// //
uint32_t* byteAdvance( uint32_t* ptr, int bytes) { return reinterpret_cast< uint32_t*>(reinterpret_cast< char*>(ptr) + bytes); }
const uint32_t* byteAdvance(const uint32_t* ptr, int bytes) { return reinterpret_cast<const uint32_t*>(reinterpret_cast<const char*>(ptr) + bytes); }
//fill block with the given color
inline
void fillBlock(uint32_t* trg, int pitch, uint32_t col, int blockWidth, int blockHeight)
{
//for (int y = 0; y < blockHeight; ++y, trg = byteAdvance(trg, pitch))
// std::fill(trg, trg + blockWidth, col);
for (int y = 0; y < blockHeight; ++y, trg = byteAdvance(trg, pitch))
for (int x = 0; x < blockWidth; ++x)
trg[x] = col;
}
inline
void fillBlock(uint32_t* trg, int pitch, uint32_t col, int n) { fillBlock(trg, pitch, col, n, n); }
#ifdef _MSC_VER #ifdef _MSC_VER
#define FORCE_INLINE __forceinline #define FORCE_INLINE __forceinline
#elif defined __GNUC__ #elif defined __GNUC__
@ -178,7 +136,7 @@ template <class T> inline
T square(T value) { return value * value; } T square(T value) { return value * value; }
#if 0
inline inline
double distRGB(uint32_t pix1, uint32_t pix2) double distRGB(uint32_t pix1, uint32_t pix2)
{ {
@ -189,6 +147,7 @@ double distRGB(uint32_t pix1, uint32_t pix2)
//euklidean RGB distance //euklidean RGB distance
return std::sqrt(square(r_diff) + square(g_diff) + square(b_diff)); return std::sqrt(square(r_diff) + square(g_diff) + square(b_diff));
} }
#endif
inline inline
@ -218,26 +177,20 @@ double distYCbCr(uint32_t pix1, uint32_t pix2, double lumaWeight)
} }
struct DistYCbCrBuffer //30% perf boost compared to distYCbCr()! inline
double distYCbCrBuffered(uint32_t pix1, uint32_t pix2)
{ {
public: //30% perf boost compared to plain distYCbCr()!
static double dist(uint32_t pix1, uint32_t pix2) //consumes 64 MB memory; using double is only 2% faster, but takes 128 MB
static const std::vector<float> diffToDist = []
{ {
#if defined _MSC_VER && _MSC_VER < 1900 std::vector<float> tmp;
#error function scope static initialization is not yet thread-safe!
#endif
static const DistYCbCrBuffer inst;
return inst.distImpl(pix1, pix2);
}
private:
DistYCbCrBuffer() : buffer(256 * 256 * 256)
{
for (uint32_t i = 0; i < 256 * 256 * 256; ++i) //startup time: 114 ms on Intel Core i5 (four cores) for (uint32_t i = 0; i < 256 * 256 * 256; ++i) //startup time: 114 ms on Intel Core i5 (four cores)
{ {
const int r_diff = getByte<2>(i) * 2 - 255; const int r_diff = getByte<2>(i) * 2 - 0xFF;
const int g_diff = getByte<1>(i) * 2 - 255; const int g_diff = getByte<1>(i) * 2 - 0xFF;
const int b_diff = getByte<0>(i) * 2 - 255; const int b_diff = getByte<0>(i) * 2 - 0xFF;
const double k_b = 0.0593; //ITU-R BT.2020 conversion const double k_b = 0.0593; //ITU-R BT.2020 conversion
const double k_r = 0.2627; // const double k_r = 0.2627; //
@ -250,28 +203,31 @@ private:
const double c_b = scale_b * (b_diff - y); const double c_b = scale_b * (b_diff - y);
const double c_r = scale_r * (r_diff - y); const double c_r = scale_r * (r_diff - y);
buffer[i] = static_cast<float>(std::sqrt(square(y) + square(c_b) + square(c_r))); tmp.push_back(static_cast<float>(std::sqrt(square(y) + square(c_b) + square(c_r))));
} }
} return tmp;
}();
double distImpl(uint32_t pix1, uint32_t pix2) const //if (pix1 == pix2) -> 8% perf degradation!
{ // return 0;
//if (pix1 == pix2) -> 8% perf degradation! //if (pix1 < pix2)
// return 0; // std::swap(pix1, pix2); -> 30% perf degradation!!!
//if (pix1 > pix2) #if 1
// std::swap(pix1, pix2); -> 30% perf degradation!!! const int r_diff = static_cast<int>(getRed (pix1)) - getRed (pix2);
const int g_diff = static_cast<int>(getGreen(pix1)) - getGreen(pix2);
const int b_diff = static_cast<int>(getBlue (pix1)) - getBlue (pix2);
const int r_diff = static_cast<int>(getRed (pix1)) - getRed (pix2); return diffToDist[(((r_diff + 0xFF) / 2) << 16) | //slightly reduce precision (division by 2) to squeeze value into single byte
const int g_diff = static_cast<int>(getGreen(pix1)) - getGreen(pix2); (((g_diff + 0xFF) / 2) << 8) |
const int b_diff = static_cast<int>(getBlue (pix1)) - getBlue (pix2); (( b_diff + 0xFF) / 2)];
#else //not noticeably faster:
const int r_diff_tmp = ((pix1 & 0xFF0000) + 0xFF0000 - (pix2 & 0xFF0000)) / 2;
const int g_diff_tmp = ((pix1 & 0x00FF00) + 0x00FF00 - (pix2 & 0x00FF00)) / 2; //slightly reduce precision (division by 2) to squeeze value into single byte
const int b_diff_tmp = ((pix1 & 0x0000FF) + 0x0000FF - (pix2 & 0x0000FF)) / 2;
return buffer[(((r_diff + 255) / 2) << 16) | //slightly reduce precision (division by 2) to squeeze value into single byte return diffToDist[(r_diff_tmp & 0xFF0000) | (g_diff_tmp & 0x00FF00) | (b_diff_tmp & 0x0000FF)];
(((g_diff + 255) / 2) << 8) | #endif
(( b_diff + 255) / 2)]; }
}
std::vector<float> buffer; //consumes 64 MB memory; using double is only 2% faster, but takes 128 MB
};
enum BlendType enum BlendType
@ -323,15 +279,12 @@ BlendResult preProcessCorners(const Kernel_4x4& ker, const xbrz::ScalerCfg& cfg)
ker.g == ker.k)) ker.g == ker.k))
return result; return result;
#define dist(pix1, pix2) \ auto dist = [&](uint32_t pix1, uint32_t pix2) { return ColorDistance::dist(pix1, pix2, cfg.luminanceWeight); };
ColorDistance::dist((pix1), (pix2), cfg.luminanceWeight)
const int weight = 4; const int weight = 4;
double jg = dist(ker.i, ker.f) + dist(ker.f, ker.c) + dist(ker.n, ker.k) + dist(ker.k, ker.h) + weight * dist(ker.j, ker.g); double jg = dist(ker.i, ker.f) + dist(ker.f, ker.c) + dist(ker.n, ker.k) + dist(ker.k, ker.h) + weight * dist(ker.j, ker.g);
double fk = dist(ker.e, ker.j) + dist(ker.j, ker.o) + dist(ker.b, ker.g) + dist(ker.g, ker.l) + weight * dist(ker.f, ker.k); double fk = dist(ker.e, ker.j) + dist(ker.j, ker.o) + dist(ker.b, ker.g) + dist(ker.g, ker.l) + weight * dist(ker.f, ker.k);
#undef dist
if (jg < fk) //test sample: 70% of values max(jg, fk) / min(jg, fk) are between 1.1 and 3.7 with median being 1.8 if (jg < fk) //test sample: 70% of values max(jg, fk) / min(jg, fk) are between 1.1 and 3.7 with median being 1.8
{ {
const bool dominantGradient = cfg.dominantDirectionThreshold * jg < fk; const bool dominantGradient = cfg.dominantDirectionThreshold * jg < fk;
@ -383,12 +336,12 @@ DEF_GETTER(g, c) DEF_GETTER(h, b) DEF_GETTER(i, a)
#define DEF_GETTER(x, y) template <> inline uint32_t get_##x<ROT_270>(const Kernel_3x3& ker) { return ker.y; } #define DEF_GETTER(x, y) template <> inline uint32_t get_##x<ROT_270>(const Kernel_3x3& ker) { return ker.y; }
DEF_GETTER(a, c) DEF_GETTER(b, f) DEF_GETTER(c, i) DEF_GETTER(a, c) DEF_GETTER(b, f) DEF_GETTER(c, i)
DEF_GETTER(d, b) DEF_GETTER(e, e) DEF_GETTER(f, h) DEF_GETTER(d, b) DEF_GETTER(e, e) DEF_GETTER(f, h)
DEF_GETTER(g, a) DEF_GETTER(h, d) DEF_GETTER(i, g) DEF_GETTER(g, a) DEF_GETTER(h, d) DEF_GETTER(i, g)
#undef DEF_GETTER #undef DEF_GETTER
//compress four blend types into a single byte //compress four blend types into a single byte
inline BlendType getTopL (unsigned char b) { return static_cast<BlendType>(0x3 & b); } //inline BlendType getTopL (unsigned char b) { return static_cast<BlendType>(0x3 & b); }
inline BlendType getTopR (unsigned char b) { return static_cast<BlendType>(0x3 & (b >> 2)); } inline BlendType getTopR (unsigned char b) { return static_cast<BlendType>(0x3 & (b >> 2)); }
inline BlendType getBottomR(unsigned char b) { return static_cast<BlendType>(0x3 & (b >> 4)); } inline BlendType getBottomR(unsigned char b) { return static_cast<BlendType>(0x3 & (b >> 4)); }
inline BlendType getBottomL(unsigned char b) { return static_cast<BlendType>(0x3 & (b >> 6)); } inline BlendType getBottomL(unsigned char b) { return static_cast<BlendType>(0x3 & (b >> 6)); }
@ -407,6 +360,13 @@ template <> inline unsigned char rotateBlendInfo<ROT_180>(unsigned char b) { ret
template <> inline unsigned char rotateBlendInfo<ROT_270>(unsigned char b) { return ((b << 6) | (b >> 2)) & 0xff; } template <> inline unsigned char rotateBlendInfo<ROT_270>(unsigned char b) { return ((b << 6) | (b >> 2)) & 0xff; }
#if 0 //#ifndef NDEBUG
int debugPixelX = -1;
int debugPixelY = 12;
__declspec(thread) bool breakIntoDebugger = false;
#endif
/* /*
input kernel area naming convention: input kernel area naming convention:
------------- -------------
@ -434,40 +394,37 @@ void blendPixel(const Kernel_3x3& ker,
#define h get_h<rotDeg>(ker) #define h get_h<rotDeg>(ker)
#define i get_i<rotDeg>(ker) #define i get_i<rotDeg>(ker)
#if 0 //#ifndef NDEBUG
if (breakIntoDebugger)
__debugbreak(); //__asm int 3;
#endif
(void)a; //silence Clang's -Wunused-function
const unsigned char blend = rotateBlendInfo<rotDeg>(blendInfo); const unsigned char blend = rotateBlendInfo<rotDeg>(blendInfo);
if (getBottomR(blend) >= BLEND_NORMAL) if (getBottomR(blend) >= BLEND_NORMAL)
{ {
struct LineBlend auto eq = [&](uint32_t pix1, uint32_t pix2) { return ColorDistance::dist(pix1, pix2, cfg.luminanceWeight) < cfg.equalColorTolerance; };
auto dist = [&](uint32_t pix1, uint32_t pix2) { return ColorDistance::dist(pix1, pix2, cfg.luminanceWeight); };
const bool doLineBlend = [&]() -> bool
{ {
static bool Eval(const Kernel_3x3& ker, const xbrz::ScalerCfg& cfg, const unsigned char blend) if (getBottomR(blend) >= BLEND_DOMINANT)
{
if (getBottomR(blend) >= BLEND_DOMINANT)
return true;
#define eq(pix1, pix2) \
(ColorDistance::dist((pix1), (pix2), cfg.luminanceWeight) < cfg.equalColorTolerance)
//make sure there is no second blending in an adjacent rotation for this pixel: handles insular pixels, mario eyes
if (getTopR(blend) != BLEND_NONE && !eq(e, g)) //but support double-blending for 90 degree corners
return false;
if (getBottomL(blend) != BLEND_NONE && !eq(e, c))
return false;
//no full blending for L-shapes; blend corner only (handles "mario mushroom eyes")
if (!eq(e, i) && eq(g, h) && eq(h , i) && eq(i, f) && eq(f, c))
return false;
#undef eq
return true; return true;
}
};
const bool doLineBlend = LineBlend::Eval(ker, cfg, blend); //make sure there is no second blending in an adjacent rotation for this pixel: handles insular pixels, mario eyes
if (getTopR(blend) != BLEND_NONE && !eq(e, g)) //but support double-blending for 90° corners
return false;
if (getBottomL(blend) != BLEND_NONE && !eq(e, c))
return false;
#define dist(pix1, pix2) \ //no full blending for L-shapes; blend corner only (handles "mario mushroom eyes")
ColorDistance::dist((pix1), (pix2), cfg.luminanceWeight) if (!eq(e, i) && eq(g, h) && eq(h, i) && eq(i, f) && eq(f, c))
return false;
return true;
}();
const uint32_t px = dist(e, f) <= dist(e, h) ? f : h; //choose most similar color const uint32_t px = dist(e, f) <= dist(e, h) ? f : h; //choose most similar color
@ -493,15 +450,13 @@ void blendPixel(const Kernel_3x3& ker,
if (haveSteepLine) if (haveSteepLine)
Scaler::blendLineSteep(px, out); Scaler::blendLineSteep(px, out);
else else
Scaler::blendLineDiagonal(px,out); Scaler::blendLineDiagonal(px, out);
} }
} }
else else
Scaler::blendCorner(px, out); Scaler::blendCorner(px, out);
} }
#undef dist
#undef a #undef a
#undef b #undef b
#undef c #undef c
@ -528,7 +483,7 @@ void scaleImage(const uint32_t* src, uint32_t* trg, int srcWidth, int srcHeight,
//"sizeof(uint32_t) * srcWidth * (yLast - yFirst)" bytes without risk of accidental overwriting before accessing //"sizeof(uint32_t) * srcWidth * (yLast - yFirst)" bytes without risk of accidental overwriting before accessing
const int bufferSize = srcWidth; const int bufferSize = srcWidth;
unsigned char* preProcBuffer = reinterpret_cast<unsigned char*>(trg + yLast * Scaler::scale * trgWidth) - bufferSize; unsigned char* preProcBuffer = reinterpret_cast<unsigned char*>(trg + yLast * Scaler::scale * trgWidth) - bufferSize;
std::fill(preProcBuffer, preProcBuffer + bufferSize, 0); std::fill(preProcBuffer, preProcBuffer + bufferSize, '\0');
static_assert(BLEND_NONE == 0, ""); static_assert(BLEND_NONE == 0, "");
//initialize preprocessing buffer for first row of current stripe: detect upper left and right corner blending //initialize preprocessing buffer for first row of current stripe: detect upper left and right corner blending
@ -599,6 +554,9 @@ void scaleImage(const uint32_t* src, uint32_t* trg, int srcWidth, int srcHeight,
for (int x = 0; x < srcWidth; ++x, out += Scaler::scale) for (int x = 0; x < srcWidth; ++x, out += Scaler::scale)
{ {
#if 0 //#ifndef NDEBUG
breakIntoDebugger = debugPixelX == x && debugPixelY == y;
#endif
//all those bounds checks have only insignificant impact on performance! //all those bounds checks have only insignificant impact on performance!
const int x_m1 = std::max(x - 1, 0); //perf: prefer array indexing to additional pointers! const int x_m1 = std::max(x - 1, 0); //perf: prefer array indexing to additional pointers!
const int x_p1 = std::min(x + 1, srcWidth - 1); const int x_p1 = std::min(x + 1, srcWidth - 1);
@ -652,7 +610,8 @@ void scaleImage(const uint32_t* src, uint32_t* trg, int srcWidth, int srcHeight,
} }
//fill block of size scale * scale with the given color //fill block of size scale * scale with the given color
fillBlock(out, trgWidth * sizeof(uint32_t), ker4.f, Scaler::scale); //place *after* preprocessing step, to not overwrite the results while processing the the last pixel! fillBlock(out, trgWidth * sizeof(uint32_t), ker4.f, Scaler::scale, Scaler::scale);
//place *after* preprocessing step, to not overwrite the results while processing the last pixel!
//blend four corners of current pixel //blend four corners of current pixel
if (blendingNeeded(blend_xy)) //good 5% perf-improvement if (blendingNeeded(blend_xy)) //good 5% perf-improvement
@ -1047,7 +1006,7 @@ struct ColorDistanceRGB
{ {
static double dist(uint32_t pix1, uint32_t pix2, double luminanceWeight) static double dist(uint32_t pix1, uint32_t pix2, double luminanceWeight)
{ {
return DistYCbCrBuffer::dist(pix1, pix2); return distYCbCrBuffered(pix1, pix2);
//if (pix1 == pix2) //about 4% perf boost //if (pix1 == pix2) //about 4% perf boost
// return 0; // return 0;
@ -1064,20 +1023,36 @@ struct ColorDistanceARGB
/* /*
Requirements for a color distance handling alpha channel: with a1, a2 in [0, 1] Requirements for a color distance handling alpha channel: with a1, a2 in [0, 1]
1. if a1 = a2, distance should be: a1 * distYCbCr() 1. if a1 = a2, distance should be: a1 * distYCbCr()
2. if a1 = 0, distance should be: a2 * distYCbCr(black, white) = a2 * 255 2. if a1 = 0, distance should be: a2 * distYCbCr(black, white) = a2 * 255
3. if a1 = 1, ??? maybe: 255 * (1 - a2) + a2 * distYCbCr() 3. if a1 = 1, ??? maybe: 255 * (1 - a2) + a2 * distYCbCr()
*/ */
//return std::min(a1, a2) * DistYCbCrBuffer::dist(pix1, pix2) + 255 * abs(a1 - a2); //return std::min(a1, a2) * distYCbCrBuffered(pix1, pix2) + 255 * abs(a1 - a2);
//=> following code is 15% faster: //=> following code is 15% faster:
const double d = DistYCbCrBuffer::dist(pix1, pix2); const double d = distYCbCrBuffered(pix1, pix2);
if (a1 < a2) if (a1 < a2)
return a1 * d + 255 * (a2 - a1); return a1 * d + 255 * (a2 - a1);
else else
return a2 * d + 255 * (a1 - a2); return a2 * d + 255 * (a1 - a2);
//alternative? return std::sqrt(a1 * a2 * square(DistYCbCrBuffer::dist(pix1, pix2)) + square(255 * (a1 - a2))); //alternative? return std::sqrt(a1 * a2 * square(distYCbCrBuffered(pix1, pix2)) + square(255 * (a1 - a2)));
}
};
struct ColorDistanceUnbufferedARGB
{
static double dist(uint32_t pix1, uint32_t pix2, double luminanceWeight)
{
const double a1 = getAlpha(pix1) / 255.0 ;
const double a2 = getAlpha(pix2) / 255.0 ;
const double d = distYCbCr(pix1, pix2, luminanceWeight);
if (a1 < a2)
return a1 * d + 255 * (a2 - a1);
else
return a2 * d + 255 * (a1 - a2);
} }
}; };
@ -1104,9 +1079,26 @@ struct ColorGradientARGB
void xbrz::scale(size_t factor, const uint32_t* src, uint32_t* trg, int srcWidth, int srcHeight, ColorFormat colFmt, const xbrz::ScalerCfg& cfg, int yFirst, int yLast) void xbrz::scale(size_t factor, const uint32_t* src, uint32_t* trg, int srcWidth, int srcHeight, ColorFormat colFmt, const xbrz::ScalerCfg& cfg, int yFirst, int yLast)
{ {
static_assert(SCALE_FACTOR_MAX == 6, "");
switch (colFmt) switch (colFmt)
{ {
case ARGB: case ColorFormat::RGB:
switch (factor)
{
case 2:
return scaleImage<Scaler2x<ColorGradientRGB>, ColorDistanceRGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
case 3:
return scaleImage<Scaler3x<ColorGradientRGB>, ColorDistanceRGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
case 4:
return scaleImage<Scaler4x<ColorGradientRGB>, ColorDistanceRGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
case 5:
return scaleImage<Scaler5x<ColorGradientRGB>, ColorDistanceRGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
case 6:
return scaleImage<Scaler6x<ColorGradientRGB>, ColorDistanceRGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
}
break;
case ColorFormat::ARGB:
switch (factor) switch (factor)
{ {
case 2: case 2:
@ -1122,19 +1114,19 @@ void xbrz::scale(size_t factor, const uint32_t* src, uint32_t* trg, int srcWidth
} }
break; break;
case RGB: case ColorFormat::ARGB_UNBUFFERED:
switch (factor) switch (factor)
{ {
case 2: case 2:
return scaleImage<Scaler2x<ColorGradientRGB>, ColorDistanceRGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast); return scaleImage<Scaler2x<ColorGradientARGB>, ColorDistanceUnbufferedARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
case 3: case 3:
return scaleImage<Scaler3x<ColorGradientRGB>, ColorDistanceRGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast); return scaleImage<Scaler3x<ColorGradientARGB>, ColorDistanceUnbufferedARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
case 4: case 4:
return scaleImage<Scaler4x<ColorGradientRGB>, ColorDistanceRGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast); return scaleImage<Scaler4x<ColorGradientARGB>, ColorDistanceUnbufferedARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
case 5: case 5:
return scaleImage<Scaler5x<ColorGradientRGB>, ColorDistanceRGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast); return scaleImage<Scaler5x<ColorGradientARGB>, ColorDistanceUnbufferedARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
case 6: case 6:
return scaleImage<Scaler6x<ColorGradientRGB>, ColorDistanceRGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast); return scaleImage<Scaler6x<ColorGradientARGB>, ColorDistanceUnbufferedARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
} }
break; break;
} }
@ -1146,84 +1138,133 @@ bool xbrz::equalColorTest(uint32_t col1, uint32_t col2, ColorFormat colFmt, doub
{ {
switch (colFmt) switch (colFmt)
{ {
case ARGB: case ColorFormat::RGB:
return ColorDistanceARGB::dist(col1, col2, luminanceWeight) < equalColorTolerance;
case RGB:
return ColorDistanceRGB::dist(col1, col2, luminanceWeight) < equalColorTolerance; return ColorDistanceRGB::dist(col1, col2, luminanceWeight) < equalColorTolerance;
case ColorFormat::ARGB:
return ColorDistanceARGB::dist(col1, col2, luminanceWeight) < equalColorTolerance;
case ColorFormat::ARGB_UNBUFFERED:
return ColorDistanceUnbufferedARGB::dist(col1, col2, luminanceWeight) < equalColorTolerance;
} }
assert(false); assert(false);
return false; return false;
} }
void xbrz::nearestNeighborScale(const uint32_t* src, int srcWidth, int srcHeight, int srcPitch, void xbrz::bilinearScale(const uint32_t* src, int srcWidth, int srcHeight,
uint32_t* trg, int trgWidth, int trgHeight, int trgPitch, /**/ uint32_t* trg, int trgWidth, int trgHeight)
SliceType st, int yFirst, int yLast)
{ {
if (srcPitch < srcWidth * static_cast<int>(sizeof(uint32_t)) || bilinearScale(src, srcWidth, srcHeight, srcWidth * sizeof(uint32_t),
trgPitch < trgWidth * static_cast<int>(sizeof(uint32_t))) trg, trgWidth, trgHeight, trgWidth * sizeof(uint32_t),
{ 0, trgHeight, [](uint32_t pix) { return pix; });
assert(false);
return;
}
switch (st)
{
case NN_SCALE_SLICE_SOURCE:
//nearest-neighbor (going over source image - fast for upscaling, since source is read only once
yFirst = std::max(yFirst, 0);
yLast = std::min(yLast, srcHeight);
if (yFirst >= yLast || trgWidth <= 0 || trgHeight <= 0) return;
for (int y = yFirst; y < yLast; ++y)
{
//mathematically: ySrc = floor(srcHeight * yTrg / trgHeight)
// => search for integers in: [ySrc, ySrc + 1) * trgHeight / srcHeight
//keep within for loop to support MT input slices!
const int yTrg_first = ( y * trgHeight + srcHeight - 1) / srcHeight; //=ceil(y * trgHeight / srcHeight)
const int yTrg_last = ((y + 1) * trgHeight + srcHeight - 1) / srcHeight; //=ceil(((y + 1) * trgHeight) / srcHeight)
const int blockHeight = yTrg_last - yTrg_first;
if (blockHeight > 0)
{
const uint32_t* srcLine = byteAdvance(src, y * srcPitch);
uint32_t* trgLine = byteAdvance(trg, yTrg_first * trgPitch);
int xTrg_first = 0;
for (int x = 0; x < srcWidth; ++x)
{
int xTrg_last = ((x + 1) * trgWidth + srcWidth - 1) / srcWidth;
const int blockWidth = xTrg_last - xTrg_first;
if (blockWidth > 0)
{
xTrg_first = xTrg_last;
fillBlock(trgLine, trgPitch, srcLine[x], blockWidth, blockHeight);
trgLine += blockWidth;
}
}
}
}
break;
case NN_SCALE_SLICE_TARGET:
//nearest-neighbor (going over target image - slow for upscaling, since source is read multiple times missing out on cache! Fast for similar image sizes!)
yFirst = std::max(yFirst, 0);
yLast = std::min(yLast, trgHeight);
if (yFirst >= yLast || srcHeight <= 0 || srcWidth <= 0) return;
for (int y = yFirst; y < yLast; ++y)
{
uint32_t* trgLine = byteAdvance(trg, y * trgPitch);
const int ySrc = srcHeight * y / trgHeight;
const uint32_t* srcLine = byteAdvance(src, ySrc * srcPitch);
for (int x = 0; x < trgWidth; ++x)
{
const int xSrc = srcWidth * x / trgWidth;
trgLine[x] = srcLine[xSrc];
}
}
break;
}
} }
//Convenience overload: nearest-neighbor scaling of a whole image without explicit pitch arguments.
//Forwards to another nearestNeighborScale() overload (not visible in this file; presumably the
//template from xbrz_tools.h - confirm), passing:
// - width * sizeof(uint32_t) for source and target (presumably the row pitch in bytes, i.e. tightly packed rows)
// - 0 and trgHeight (presumably the first/last target row to process, i.e. the full image)
// - an identity lambda, so every pixel value is passed through unchanged
void xbrz::nearestNeighborScale(const uint32_t* src, int srcWidth, int srcHeight,
/**/ uint32_t* trg, int trgWidth, int trgHeight)
{
nearestNeighborScale(src, srcWidth, srcHeight, srcWidth * sizeof(uint32_t),
trg, trgWidth, trgHeight, trgWidth * sizeof(uint32_t),
0, trgHeight, [](uint32_t pix) { return pix; });
}
#if 0
//#include <ppl.h>
//Multi-threaded bilinear scaling on the CPU (currently compiled out by the enclosing "#if 0").
//Splits the index range [0, trgHeight) into slices of TASK_GRANULARITY, runs xbrz::bilinearScale()
//for each slice as a separate task of a concurrency::task_group and waits until all tasks are done.
//Requires <ppl.h> (see the commented-out include above).
void bilinearScaleCpu(const uint32_t* src, int srcWidth, int srcHeight,
/**/ uint32_t* trg, int trgWidth, int trgHeight)
{
const int TASK_GRANULARITY = 16; //slice size handed to a single task
concurrency::task_group tg;
for (int i = 0; i < trgHeight; i += TASK_GRANULARITY)
tg.run([=] //capture by value: each task needs its own copy of the loop index "i"
{
const int iLast = std::min(i + TASK_GRANULARITY, trgHeight); //clamp the last slice to trgHeight
xbrz::bilinearScale(src, srcWidth, srcHeight, srcWidth * sizeof(uint32_t),
trg, trgWidth, trgHeight, trgWidth * sizeof(uint32_t),
i, iLast, [](uint32_t pix) { return pix; }); //identity lambda: pixel values are passed through unchanged
});
tg.wait(); //block until every slice has been processed
}
//Perf: AMP vs CPU: merely ~10% shorter runtime (scaling 1280x800 -> 1920x1080)
//#include <amp.h>
//Bilinear scaling implemented with C++ AMP (currently compiled out by the enclosing "#if 0").
//Wraps src and trg in two-dimensional array_views and computes every target pixel in a
//parallel_for_each over trgView.extent. Each of the four 8-bit channels is interpolated separately
//from the 2x2 source neighborhood, rounded and re-packed into a 32-bit pixel.
//Rows are addressed as srcWidth/trgWidth elements wide; custom pitches are not supported (see TODO below).
//Returns without writing anything if srcHeight or srcWidth is not positive.
void bilinearScaleAmp(const uint32_t* src, int srcWidth, int srcHeight, //throw concurrency::runtime_exception
/**/ uint32_t* trg, int trgWidth, int trgHeight)
{
//C++ AMP reference: https://msdn.microsoft.com/en-us/library/hh289390.aspx
//introduction to C++ AMP: https://msdn.microsoft.com/en-us/magazine/hh882446.aspx
using namespace concurrency;
//TODO: pitch
if (srcHeight <= 0 || srcWidth <= 0) return; //also protects the divisions by srcWidth/srcHeight below
const float scaleX = static_cast<float>(trgWidth ) / srcWidth;
const float scaleY = static_cast<float>(trgHeight) / srcHeight;
array_view<const uint32_t, 2> srcView(srcHeight, srcWidth, src);
array_view< uint32_t, 2> trgView(trgHeight, trgWidth, trg);
trgView.discard_data(); //target is write-only here; presumably avoids copying its old contents to the accelerator - confirm against the AMP docs
parallel_for_each(trgView.extent, [=](index<2> idx) restrict(amp) //throw ?
{
const int y = idx[0]; //target row
const int x = idx[1]; //target column
//Perf notes:
// -> float-based calculation is (almost 2x) faster than double!
// -> no noticeable improvement via tiling: https://msdn.microsoft.com/en-us/magazine/hh882447.aspx
// -> no noticeable improvement with restrict(amp,cpu)
// -> iterating over y-axis only is significantly slower!
// -> pre-calculating x,y-dependent variables in a buffer + array_view<> is ~ 20 % slower!
const int y1 = srcHeight * y / trgHeight; //first source row (integer division)
int y2 = y1 + 1; //following source row...
if (y2 == srcHeight) --y2; //...clamped to the last row
const float yy1 = y / scaleY - y1; //weight of row y2
const float y2y = 1 - yy1; //weight of row y1
//-------------------------------------
const int x1 = srcWidth * x / trgWidth; //first source column (integer division)
int x2 = x1 + 1; //following source column...
if (x2 == srcWidth) --x2; //...clamped to the last column
const float xx1 = x / scaleX - x1; //weight of column x2
const float x2x = 1 - xx1; //weight of column x1
//-------------------------------------
//combined weights of the four neighbor pixels
const float x2xy2y = x2x * y2y;
const float xx1y2y = xx1 * y2y;
const float x2xyy1 = x2x * yy1;
const float xx1yy1 = xx1 * yy1;
//weighted sum of one 8-bit channel; "offset" is the channel's byte index within the pixel (0 = lowest byte)
auto interpolate = [=](int offset)
{
/*
https://en.wikipedia.org/wiki/Bilinear_interpolation
(c11(x2 - x) + c21(x - x1)) * (y2 - y ) +
(c12(x2 - x) + c22(x - x1)) * (y - y1)
*/
const auto c11 = (srcView(y1, x1) >> (8 * offset)) & 0xff;
const auto c21 = (srcView(y1, x2) >> (8 * offset)) & 0xff;
const auto c12 = (srcView(y2, x1) >> (8 * offset)) & 0xff;
const auto c22 = (srcView(y2, x2) >> (8 * offset)) & 0xff;
return c11 * x2xy2y + c21 * xx1y2y +
c12 * x2xyy1 + c22 * xx1yy1;
};
const float bi = interpolate(0);
const float gi = interpolate(1);
const float ri = interpolate(2);
const float ai = interpolate(3);
//adding 0.5 before the truncating cast rounds to the nearest integer
const auto b = static_cast<uint32_t>(bi + 0.5f);
const auto g = static_cast<uint32_t>(gi + 0.5f);
const auto r = static_cast<uint32_t>(ri + 0.5f);
const auto a = static_cast<uint32_t>(ai + 0.5f);
trgView(y, x) = (a << 24) | (r << 16) | (g << 8) | b; //re-pack: a in the highest byte, b in the lowest
});
trgView.synchronize(); //throw ?
}
#endif

View file

@ -1,28 +1,24 @@
// **************************************************************************** // ****************************************************************************
// * This file is part of the HqMAME project. It is distributed under * // * This file is part of the xBRZ project. It is distributed under *
// * GNU General Public License: http://www.gnu.org/licenses/gpl-3.0 * // * GNU General Public License: https://www.gnu.org/licenses/gpl-3.0 *
// * Copyright (C) Zenju (zenju AT gmx DOT de) - All Rights Reserved * // * Copyright (C) Zenju (zenju AT gmx DOT de) - All Rights Reserved *
// * * // * *
// * Additionally and as a special exception, the author gives permission * // * Additionally and as a special exception, the author gives permission *
// * to link the code of this program with the MAME library (or with modified * // * to link the code of this program with the following libraries *
// * versions of MAME that use the same license as MAME), and distribute * // * (or with modified versions that use the same licenses), and distribute *
// * linked combinations including the two. You must obey the GNU General * // * linked combinations including the two: MAME, FreeFileSync, Snes9x, ePSXe *
// * Public License in all respects for all of the code used other than MAME. * // * You must obey the GNU General Public License in all respects for all of *
// * the code used other than MAME, FreeFileSync, Snes9x, ePSXe. *
// * If you modify this file, you may extend this exception to your version * // * If you modify this file, you may extend this exception to your version *
// * of the file, but you are not obligated to do so. If you do not wish to * // * of the file, but you are not obligated to do so. If you do not wish to *
// * do so, delete this exception statement from your version. * // * do so, delete this exception statement from your version. *
// * *
// * An explicit permission was granted to use xBRZ in combination with ZDoom *
// * and derived projects as long as it is used for non-commercial purposes. *
// * *
// * Backported to C++98 by Alexey Lysiuk *
// **************************************************************************** // ****************************************************************************
#ifndef XBRZ_HEADER_3847894708239054 #ifndef XBRZ_HEADER_3847894708239054
#define XBRZ_HEADER_3847894708239054 #define XBRZ_HEADER_3847894708239054
#include <cstddef> //size_t #include <cstddef> //size_t
#include <stdint.h> //uint32_t #include <cstdint> //uint32_t
#include <limits> #include <limits>
#include "xbrz_config.h" #include "xbrz_config.h"
@ -43,60 +39,41 @@ http://board.byuu.org/viewtopic.php?f=10&t=2248
- support scaling up to 6xBRZ - support scaling up to 6xBRZ
*/ */
enum ColorFormat //from high bits -> low bits, 8 bit per channel enum class ColorFormat //from high bits -> low bits, 8 bit per channel
{ {
RGB, //8 bit for each red, green, blue, upper 8 bits unused RGB, //8 bit for each red, green, blue, upper 8 bits unused
ARGB, //including alpha channel, BGRA byte order on little-endian machines ARGB, //including alpha channel, BGRA byte order on little-endian machines
ARGB_UNBUFFERED, //like ARGB, but without the one-time buffer creation overhead (ca. 100 - 300 ms) at the expense of a slightly slower scaling time
}; };
const int SCALE_FACTOR_MAX = 6;
/* /*
-> map source (srcWidth * srcHeight) to target (scale * width x scale * height) image, optionally processing a half-open slice of rows [yFirst, yLast) only -> map source (srcWidth * srcHeight) to target (scale * width x scale * height) image, optionally processing a half-open slice of rows [yFirst, yLast) only
-> support for source/target pitch in bytes! -> support for source/target pitch in bytes!
-> if your emulator changes only a few image slices during each cycle (e.g. DOSBox) then there's no need to run xBRZ on the complete image: -> if your emulator changes only a few image slices during each cycle (e.g. DOSBox) then there's no need to run xBRZ on the complete image:
Just make sure you enlarge the source image slice by 2 rows on top and 2 on bottom (this is the additional range the xBRZ algorithm is using during analysis) Just make sure you enlarge the source image slice by 2 rows on top and 2 on bottom (this is the additional range the xBRZ algorithm is using during analysis)
Caveat: If there are multiple changed slices, make sure they do not overlap after adding these additional rows in order to avoid a memory race condition CAVEAT: If there are multiple changed slices, make sure they do not overlap after adding these additional rows in order to avoid a memory race condition
in the target image data if you are using multiple threads for processing each enlarged slice! in the target image data if you are using multiple threads for processing each enlarged slice!
THREAD-SAFETY: - parts of the same image may be scaled by multiple threads as long as the [yFirst, yLast) ranges do not overlap! THREAD-SAFETY: - parts of the same image may be scaled by multiple threads as long as the [yFirst, yLast) ranges do not overlap!
- there is a minor inefficiency for the first row of a slice, so avoid processing single rows only; suggestion: process 8-16 rows at least - there is a minor inefficiency for the first row of a slice, so avoid processing single rows only; suggestion: process at least 8-16 rows
*/ */
#ifdef max void scale(size_t factor, //valid range: 2 - SCALE_FACTOR_MAX
#undef max
#endif
void scale(size_t factor, //valid range: 2 - 6
const uint32_t* src, uint32_t* trg, int srcWidth, int srcHeight, const uint32_t* src, uint32_t* trg, int srcWidth, int srcHeight,
ColorFormat colFmt, ColorFormat colFmt,
const ScalerCfg& cfg = ScalerCfg(), const ScalerCfg& cfg = ScalerCfg(),
int yFirst = 0, int yLast = std::numeric_limits<int>::max()); //slice of source image int yFirst = 0, int yLast = std::numeric_limits<int>::max()); //slice of source image
void nearestNeighborScale(const uint32_t* src, int srcWidth, int srcHeight, void bilinearScale(const uint32_t* src, int srcWidth, int srcHeight,
uint32_t* trg, int trgWidth, int trgHeight); /**/ uint32_t* trg, int trgWidth, int trgHeight);
void nearestNeighborScale(const uint32_t* src, int srcWidth, int srcHeight,
/**/ uint32_t* trg, int trgWidth, int trgHeight);
enum SliceType
{
NN_SCALE_SLICE_SOURCE,
NN_SCALE_SLICE_TARGET,
};
void nearestNeighborScale(const uint32_t* src, int srcWidth, int srcHeight, int srcPitch, //pitch in bytes!
uint32_t* trg, int trgWidth, int trgHeight, int trgPitch,
SliceType st, int yFirst, int yLast);
//parameter tuning //parameter tuning
bool equalColorTest(uint32_t col1, uint32_t col2, ColorFormat colFmt, double luminanceWeight, double equalColorTolerance); bool equalColorTest(uint32_t col1, uint32_t col2, ColorFormat colFmt, double luminanceWeight, double equalColorTolerance);
//########################### implementation ###########################
inline
void nearestNeighborScale(const uint32_t* src, int srcWidth, int srcHeight,
uint32_t* trg, int trgWidth, int trgHeight)
{
nearestNeighborScale(src, srcWidth, srcHeight, srcWidth * sizeof(uint32_t),
trg, trgWidth, trgHeight, trgWidth * sizeof(uint32_t),
NN_SCALE_SLICE_TARGET, 0, trgHeight);
}
} }
#endif #endif

View file

@ -1,21 +1,17 @@
// **************************************************************************** // ****************************************************************************
// * This file is part of the HqMAME project. It is distributed under * // * This file is part of the xBRZ project. It is distributed under *
// * GNU General Public License: http://www.gnu.org/licenses/gpl-3.0 * // * GNU General Public License: https://www.gnu.org/licenses/gpl-3.0 *
// * Copyright (C) Zenju (zenju AT gmx DOT de) - All Rights Reserved * // * Copyright (C) Zenju (zenju AT gmx DOT de) - All Rights Reserved *
// * * // * *
// * Additionally and as a special exception, the author gives permission * // * Additionally and as a special exception, the author gives permission *
// * to link the code of this program with the MAME library (or with modified * // * to link the code of this program with the following libraries *
// * versions of MAME that use the same license as MAME), and distribute * // * (or with modified versions that use the same licenses), and distribute *
// * linked combinations including the two. You must obey the GNU General * // * linked combinations including the two: MAME, FreeFileSync, Snes9x, ePSXe *
// * Public License in all respects for all of the code used other than MAME. * // * You must obey the GNU General Public License in all respects for all of *
// * the code used other than MAME, FreeFileSync, Snes9x, ePSXe. *
// * If you modify this file, you may extend this exception to your version * // * If you modify this file, you may extend this exception to your version *
// * of the file, but you are not obligated to do so. If you do not wish to * // * of the file, but you are not obligated to do so. If you do not wish to *
// * do so, delete this exception statement from your version. * // * do so, delete this exception statement from your version. *
// * *
// * An explicit permission was granted to use xBRZ in combination with ZDoom *
// * and derived projects as long as it is used for non-commercial purposes. *
// * *
// * Backported to C++98 by Alexey Lysiuk *
// **************************************************************************** // ****************************************************************************
#ifndef XBRZ_CONFIG_HEADER_284578425345 #ifndef XBRZ_CONFIG_HEADER_284578425345
@ -27,18 +23,11 @@ namespace xbrz
{ {
struct ScalerCfg struct ScalerCfg
{ {
ScalerCfg() : double luminanceWeight = 1;
luminanceWeight(1), double equalColorTolerance = 30;
equalColorTolerance(30), double dominantDirectionThreshold = 3.6;
dominantDirectionThreshold(3.6), double steepDirectionThreshold = 2.2;
steepDirectionThreshold(2.2), double newTestAttribute = 0; //unused; test new parameters
newTestAttribute(0) {}
double luminanceWeight;
double equalColorTolerance;
double dominantDirectionThreshold;
double steepDirectionThreshold;
double newTestAttribute; //unused; test new parameters
}; };
} }

View file

@ -0,0 +1,268 @@
// ****************************************************************************
// * This file is part of the xBRZ project. It is distributed under *
// * GNU General Public License: https://www.gnu.org/licenses/gpl-3.0 *
// * Copyright (C) Zenju (zenju AT gmx DOT de) - All Rights Reserved *
// * *
// * Additionally and as a special exception, the author gives permission *
// * to link the code of this program with the following libraries *
// * (or with modified versions that use the same licenses), and distribute *
// * linked combinations including the two: MAME, FreeFileSync, Snes9x, ePSXe *
// * You must obey the GNU General Public License in all respects for all of *
// * the code used other than MAME, FreeFileSync, Snes9x, ePSXe. *
// * If you modify this file, you may extend this exception to your version *
// * of the file, but you are not obligated to do so. If you do not wish to *
// * do so, delete this exception statement from your version. *
// ****************************************************************************
#ifndef XBRZ_TOOLS_H_825480175091875
#define XBRZ_TOOLS_H_825480175091875
#include <cassert>
#include <cstdint> //uint16_t, uint32_t
#include <algorithm>
#include <type_traits>
#include <vector>
namespace xbrz
{
//extract byte number N of a 32-bit value; N = 0 selects the least significant byte
template <uint32_t N> inline
unsigned char getByte(uint32_t val)
{
    const uint32_t shifted = val >> (8 * N);
    return static_cast<unsigned char>(shifted & 0xff);
}

//channel accessors: alpha is read from byte 3, red from byte 2, green from byte 1, blue from byte 0
inline
unsigned char getAlpha(uint32_t pix)
{
    return getByte<3>(pix);
}

inline
unsigned char getRed(uint32_t pix)
{
    return getByte<2>(pix);
}

inline
unsigned char getGreen(uint32_t pix)
{
    return getByte<1>(pix);
}

inline
unsigned char getBlue(uint32_t pix)
{
    return getByte<0>(pix);
}
//compose a 32-bit pixel from 8-bit channels: alpha in bits 24-31, red in bits 16-23, green in bits 8-15, blue in bits 0-7
inline uint32_t makePixel(unsigned char a, unsigned char r, unsigned char g, unsigned char b)
{
    //widen to uint32_t *before* shifting: an unsigned char operand is promoted to (signed) int,
    //so "a << 24" would shift into the sign bit for a >= 128
    return (static_cast<uint32_t>(a) << 24) |
           (static_cast<uint32_t>(r) << 16) |
           (static_cast<uint32_t>(g) <<  8) |
           /**/static_cast<uint32_t>(b);
}

//same as above without alpha channel: the upper 8 bits of the result are zero
inline uint32_t makePixel(unsigned char r, unsigned char g, unsigned char b)
{
    return (static_cast<uint32_t>(r) << 16) |
           (static_cast<uint32_t>(g) <<  8) |
           /**/static_cast<uint32_t>(b);
}
//expand a 15-bit pixel (5 bits per channel) to 24 bits: each channel is moved to the top of its 8-bit slot, the low 3 bits stay zero
inline uint32_t rgb555to888(uint16_t pix)
{
    const uint32_t red   = (pix & 0x7C00) << 9;
    const uint32_t green = (pix & 0x03E0) << 6;
    const uint32_t blue  = (pix & 0x001F) << 3;
    return red | green | blue;
}

//expand a 16-bit pixel (5/6/5 bits per channel) to 24 bits: each channel is moved to the top of its 8-bit slot
inline uint32_t rgb565to888(uint16_t pix)
{
    const uint32_t red   = (pix & 0xF800) << 8;
    const uint32_t green = (pix & 0x07E0) << 5;
    const uint32_t blue  = (pix & 0x001F) << 3;
    return red | green | blue;
}

//reduce a 24-bit pixel to 15 bits by keeping the upper 5 bits of every channel
inline uint16_t rgb888to555(uint32_t pix)
{
    const uint32_t red   = (pix & 0xF80000) >> 9;
    const uint32_t green = (pix & 0x00F800) >> 6;
    const uint32_t blue  = (pix & 0x0000F8) >> 3;
    return static_cast<uint16_t>(red | green | blue);
}

//reduce a 24-bit pixel to 16 bits by keeping the upper 5 bits of red and blue and the upper 6 bits of green
inline uint16_t rgb888to565(uint32_t pix)
{
    const uint32_t red   = (pix & 0xF80000) >> 8;
    const uint32_t green = (pix & 0x00FC00) >> 5;
    const uint32_t blue  = (pix & 0x0000F8) >> 3;
    return static_cast<uint16_t>(red | green | blue);
}
//offset a pixel pointer by a distance given in bytes (rather than in pixels); the constness of Pix is preserved
template <class Pix> inline
Pix* byteAdvance(Pix* ptr, int bytes)
{
    using PixNonConst = typename std::remove_cv<Pix>::type;
    static_assert(std::is_integral<PixNonConst>::value, "Pix* is expected to be cast-able to char*");

    //plain "char" when Pix carries no cv-qualifier, "const char" otherwise
    using ByteType = typename std::conditional<std::is_same<Pix, PixNonConst>::value, char, const char>::type;

    ByteType* const rawPos = reinterpret_cast<ByteType*>(ptr);
    return reinterpret_cast<Pix*>(rawPos + bytes);
}
//write the color "col" into a block of blockWidth x blockHeight pixels starting at "trg"; consecutive rows are "pitch" bytes apart
//note: a hand-written inner loop is used here; the original author left an equivalent std::fill() variant commented out
template <class Pix> inline
void fillBlock(Pix* trg, int pitch, Pix col, int blockWidth, int blockHeight)
{
    Pix* row = trg;
    for (int rowsLeft = blockHeight; rowsLeft > 0; --rowsLeft)
    {
        for (int x = 0; x < blockWidth; ++x)
            row[x] = col;

        row = byteAdvance(row, pitch);
    }
}
//nearest-neighbor scaling, iterating over the target image
// -> slow for upscaling: the source is read multiple times, missing out on cache; fast for similar image sizes
// - srcPitch/trgPitch: distance between the starts of consecutive rows in bytes
// - [yFirst, yLast): half-open range of *target* rows to process; clamped to [0, trgHeight)
// - pixCvrt: converts a PixSrc into a PixTrg
template <class PixSrc, class PixTrg, class PixConverter>
void nearestNeighborScale(const PixSrc* src, int srcWidth, int srcHeight, int srcPitch,
                          PixTrg* trg, int trgWidth, int trgHeight, int trgPitch,
                          int yFirst, int yLast, PixConverter pixCvrt /*convert PixSrc to PixTrg*/)
{
    static_assert(std::is_integral<PixSrc>::value, "PixSrc* is expected to be cast-able to char*");
    static_assert(std::is_integral<PixTrg>::value, "PixTrg* is expected to be cast-able to char*");
    static_assert(std::is_same<decltype(pixCvrt(PixSrc())), PixTrg>::value, "PixConverter returning wrong pixel format");

    //a pitch must be large enough to hold a full row of pixels
    const bool pitchesValid = srcPitch >= srcWidth * static_cast<int>(sizeof(PixSrc)) &&
                              trgPitch >= trgWidth * static_cast<int>(sizeof(PixTrg));
    if (!pitchesValid)
    {
        assert(false);
        return;
    }

    const int rowBegin = std::max(yFirst, 0);
    const int rowEnd   = std::min(yLast, trgHeight);
    if (rowBegin >= rowEnd)
        return;
    if (srcHeight <= 0 || srcWidth <= 0)
        return;

    for (int yTrg = rowBegin; yTrg < rowEnd; ++yTrg)
    {
        const int ySrc = srcHeight * yTrg / trgHeight;

        const PixSrc* const srcRow = byteAdvance(src, ySrc * srcPitch);
        PixTrg*       const trgRow = byteAdvance(trg, yTrg * trgPitch);

        for (int xTrg = 0; xTrg < trgWidth; ++xTrg)
        {
            const int xSrc = srcWidth * xTrg / trgWidth;
            trgRow[xTrg] = pixCvrt(srcRow[xSrc]);
        }
    }
}
//nearest-neighbor scaling, iterating over the source image (fast for upscaling, since the source is read only once)
// - srcPitch/trgPitch: distance between the starts of consecutive rows in bytes
// - [yFirst, yLast): half-open range of *source* rows to process; clamped to [0, srcHeight)
// - pixCvrt: converts a PixSrc into a PixTrg
//each source pixel is converted once and written into the block of target pixels that maps back to it
template <class PixSrc, class PixTrg, class PixConverter>
void nearestNeighborScaleOverSource(const PixSrc* src, int srcWidth, int srcHeight, int srcPitch,
                                    PixTrg* trg, int trgWidth, int trgHeight, int trgPitch,
                                    int yFirst, int yLast, PixConverter pixCvrt /*convert PixSrc to PixTrg*/)
{
    static_assert(std::is_integral<PixSrc>::value, "PixSrc* is expected to be cast-able to char*");
    static_assert(std::is_integral<PixTrg>::value, "PixTrg* is expected to be cast-able to char*");
    static_assert(std::is_same<decltype(pixCvrt(PixSrc())), PixTrg>::value, "PixConverter returning wrong pixel format");

    //a pitch must be large enough to hold a full row of pixels
    const bool pitchesValid = srcPitch >= srcWidth * static_cast<int>(sizeof(PixSrc)) &&
                              trgPitch >= trgWidth * static_cast<int>(sizeof(PixTrg));
    if (!pitchesValid)
    {
        assert(false);
        return;
    }

    const int rowBegin = std::max(yFirst, 0);
    const int rowEnd   = std::min(yLast, srcHeight);
    if (rowBegin >= rowEnd)
        return;
    if (trgWidth <= 0 || trgHeight <= 0)
        return;

    for (int ySrc = rowBegin; ySrc < rowEnd; ++ySrc)
    {
        //mathematically: ySrc = floor(srcHeight * yTrg / trgHeight)
        // => the matching target rows are the integers in: [ySrc, ySrc + 1) * trgHeight / srcHeight
        //calculated inside the loop (not incrementally) to support MT input slices!
        const int yTrgBegin = ( ySrc      * trgHeight + srcHeight - 1) / srcHeight; //=ceil(ySrc * trgHeight / srcHeight)
        const int yTrgEnd   = ((ySrc + 1) * trgHeight + srcHeight - 1) / srcHeight; //=ceil((ySrc + 1) * trgHeight / srcHeight)
        const int blockHeight = yTrgEnd - yTrgBegin;
        if (blockHeight <= 0)
            continue; //no target row maps to this source row

        const PixSrc* const srcRow = byteAdvance(src, ySrc * srcPitch);
        PixTrg* trgPos = byteAdvance(trg, yTrgBegin * trgPitch);

        int xTrgBegin = 0;
        for (int xSrc = 0; xSrc < srcWidth; ++xSrc)
        {
            const int xTrgEnd = ((xSrc + 1) * trgWidth + srcWidth - 1) / srcWidth;
            const int blockWidth = xTrgEnd - xTrgBegin;
            if (blockWidth <= 0)
                continue; //no target column maps to this source column

            xTrgBegin = xTrgEnd;

            const auto fillColor = pixCvrt(srcRow[xSrc]);
            fillBlock(trgPos, trgPitch, fillColor, blockWidth, blockHeight);
            trgPos += blockWidth;
        }
    }
}
//bilinear interpolation of a 32-bit source image, iterating over the target image
// -> the four 8-bit channels of each source pixel are interpolated independently and recombined into a uint32_t
// - srcPitch/trgPitch: distance between the starts of consecutive rows in bytes
// - [yFirst, yLast): half-open range of *target* rows to process; clamped to [0, trgHeight)
// - pixCvrt: converts each interpolated uint32_t pixel into a PixTrg
//returns without writing anything if a pitch is smaller than its row (after assert(false)),
//or if the row range or any image dimension is empty
template <class PixTrg, class PixConverter>
void bilinearScale(const uint32_t* src, int srcWidth, int srcHeight, int srcPitch,
                   /**/  PixTrg* trg, int trgWidth, int trgHeight, int trgPitch,
                   int yFirst, int yLast, PixConverter pixCvrt /*convert uint32_t to PixTrg*/)
{
    static_assert(std::is_integral<PixTrg>::value, "PixTrg* is expected to be cast-able to char*");
    static_assert(std::is_same<decltype(pixCvrt(uint32_t())), PixTrg>::value, "PixConverter returning wrong pixel format");

    if (srcPitch < srcWidth * static_cast<int>(sizeof(uint32_t)) ||
        trgPitch < trgWidth * static_cast<int>(sizeof(PixTrg)))
    {
        assert(false);
        return;
    }

    yFirst = std::max(yFirst, 0);
    yLast  = std::min(yLast, trgHeight);

    //trgWidth must be checked, too: a negative value would pass the pitch test above and
    //then be converted to a huge size when allocating the coefficient buffer below
    if (yFirst >= yLast || srcHeight <= 0 || srcWidth <= 0 || trgWidth <= 0) return;

    const double scaleX = static_cast<double>(trgWidth ) / srcWidth;
    const double scaleY = static_cast<double>(trgHeight) / srcHeight;

    //perf notes:
    //    -> double-based calculation is (slightly) faster than float
    //    -> precalculation gives significant boost; std::vector<> memory allocation is negligible!
    struct CoeffsX
    {
        int    x1;  //left source column
        int    x2;  //right source column (== x1 at the right image border)
        double xx1; //weight of column x2
        double x2x; //weight of column x1
    };
    std::vector<CoeffsX> buf(trgWidth);

    //the horizontal coefficients are identical for every row => calculate them only once
    for (int x = 0; x < trgWidth; ++x)
    {
        const int x1 = srcWidth * x / trgWidth;
        int x2 = x1 + 1;
        if (x2 == srcWidth) --x2; //stay inside the image at the right border

        const double xx1 = x / scaleX - x1;
        const double x2x = 1 - xx1;

        buf[x] = { x1, x2, xx1, x2x };
    }

    for (int y = yFirst; y < yLast; ++y)
    {
        const int y1 = srcHeight * y / trgHeight;
        int y2 = y1 + 1;
        if (y2 == srcHeight) --y2; //stay inside the image at the bottom border

        const double yy1 = y / scaleY - y1;
        const double y2y = 1 - yy1;

        const uint32_t* const srcLine     = byteAdvance(src, y1 * srcPitch);
        const uint32_t* const srcLineNext = byteAdvance(src, y2 * srcPitch);
        PixTrg*         const trgLine     = byteAdvance(trg, y  * trgPitch);

        for (int x = 0; x < trgWidth; ++x)
        {
            //perf: do NOT "simplify" the variable layout without measurement!
            const int    x1  = buf[x].x1;
            const int    x2  = buf[x].x2;
            const double xx1 = buf[x].xx1;
            const double x2x = buf[x].x2x;

            //weights of the four neighboring source pixels
            const double x2xy2y = x2x * y2y;
            const double xx1y2y = xx1 * y2y;
            const double x2xyy1 = x2x * yy1;
            const double xx1yy1 = xx1 * yy1;

            //interpolate a single channel; offset = index of the channel's byte (0 = least significant)
            auto interpolate = [=](int offset)
            {
                /*
                    https://en.wikipedia.org/wiki/Bilinear_interpolation
                    (c11(x2 - x) + c21(x - x1)) * (y2 - y ) +
                    (c12(x2 - x) + c22(x - x1)) * (y  - y1)
                */
                const auto c11 = (srcLine    [x1] >> (8 * offset)) & 0xff;
                const auto c21 = (srcLine    [x2] >> (8 * offset)) & 0xff;
                const auto c12 = (srcLineNext[x1] >> (8 * offset)) & 0xff;
                const auto c22 = (srcLineNext[x2] >> (8 * offset)) & 0xff;

                return c11 * x2xy2y + c21 * xx1y2y +
                       c12 * x2xyy1 + c22 * xx1yy1;
            };

            const double bi = interpolate(0);
            const double gi = interpolate(1);
            const double ri = interpolate(2);
            const double ai = interpolate(3);

            //adding 0.5 before truncation rounds to the nearest integer
            const auto b = static_cast<uint32_t>(bi + 0.5);
            const auto g = static_cast<uint32_t>(gi + 0.5);
            const auto r = static_cast<uint32_t>(ri + 0.5);
            const auto a = static_cast<uint32_t>(ai + 0.5);

            const uint32_t trgPix = (a << 24) | (r << 16) | (g << 8) | b;

            trgLine[x] = pixCvrt(trgPix);
        }
    }
}
}
#endif //XBRZ_TOOLS_H_825480175091875