From 5fa5b062d6b5b4edb1784fc388e41da418435e5f Mon Sep 17 00:00:00 2001 From: Magnus Norddahl Date: Mon, 20 Feb 2017 22:49:52 +0100 Subject: [PATCH] Added php script for the span drawers --- src/swrenderer/drawers/r_draw_rgba.cpp | 49 + src/swrenderer/drawers/r_draw_rgba.h | 12 +- src/swrenderer/drawers/r_draw_span32.h | 5174 ++++++++++++++++++++++ src/swrenderer/drawers/r_draw_span32.php | 388 ++ 4 files changed, 5617 insertions(+), 6 deletions(-) create mode 100644 src/swrenderer/drawers/r_draw_span32.h create mode 100644 src/swrenderer/drawers/r_draw_span32.php diff --git a/src/swrenderer/drawers/r_draw_rgba.cpp b/src/swrenderer/drawers/r_draw_rgba.cpp index d2baacaae..532aa0940 100644 --- a/src/swrenderer/drawers/r_draw_rgba.cpp +++ b/src/swrenderer/drawers/r_draw_rgba.cpp @@ -42,6 +42,7 @@ #include "swrenderer/scene/r_light.h" #include "r_draw_wall32.h" #include "r_draw_sprite32.h" +#include "r_draw_span32.h" #include "gi.h" #include "stats.h" @@ -246,6 +247,54 @@ namespace swrenderer Queue->Push(args); } + void SWTruecolorDrawers::DrawSpan(const SpanDrawerArgs &args) /* Pushes a command built from args onto Queue, testing r_phpdrawers to pick the branch. NOTE(review): both branches below are textually identical in this copy of the patch; a template argument on Push naming the command class may have been lost in extraction - confirm against the repository before relying on this text. */ + { + if (r_phpdrawers) + Queue->Push(args); + else + Queue->Push(args); + } + + void SWTruecolorDrawers::DrawSpanMasked(const SpanDrawerArgs &args) /* Same r_phpdrawers dispatch shape as DrawSpan; the review note on DrawSpan applies here as well. */ + { + if (r_phpdrawers) + Queue->Push(args); + else + Queue->Push(args); + } + + void SWTruecolorDrawers::DrawSpanTranslucent(const SpanDrawerArgs &args) /* Same r_phpdrawers dispatch shape as DrawSpan; the review note on DrawSpan applies here as well. */ + { + if (r_phpdrawers) + Queue->Push(args); + else + Queue->Push(args); + } + + void SWTruecolorDrawers::DrawSpanMaskedTranslucent(const SpanDrawerArgs &args) /* Same r_phpdrawers dispatch shape as DrawSpan; the review note on DrawSpan applies here as well. */ + { + if (r_phpdrawers) + Queue->Push(args); + else + Queue->Push(args); + } + + void SWTruecolorDrawers::DrawSpanAddClamp(const SpanDrawerArgs &args) /* Same r_phpdrawers dispatch shape as DrawSpan; the review note on DrawSpan applies here as well. */ + { + if (r_phpdrawers) + Queue->Push(args); + else + Queue->Push(args); + } + + void SWTruecolorDrawers::DrawSpanMaskedAddClamp(const SpanDrawerArgs &args) /* Same r_phpdrawers dispatch shape as DrawSpan; the review note on DrawSpan applies here as well. */ + { + if (r_phpdrawers) + Queue->Push(args); + else + Queue->Push(args); + } + DrawSpanLLVMCommand::DrawSpanLLVMCommand(const 
SpanDrawerArgs &drawerargs) { auto shade_constants = drawerargs.ColormapConstants(); diff --git a/src/swrenderer/drawers/r_draw_rgba.h b/src/swrenderer/drawers/r_draw_rgba.h index 383ba13a5..52452fab0 100644 --- a/src/swrenderer/drawers/r_draw_rgba.h +++ b/src/swrenderer/drawers/r_draw_rgba.h @@ -384,12 +384,12 @@ namespace swrenderer void DrawSubClampTranslatedColumn(const SpriteDrawerArgs &args) override; void DrawRevSubClampColumn(const SpriteDrawerArgs &args) override; void DrawRevSubClampTranslatedColumn(const SpriteDrawerArgs &args) override; - void DrawSpan(const SpanDrawerArgs &args) override { Queue->Push(args); } - void DrawSpanMasked(const SpanDrawerArgs &args) override { Queue->Push(args); } - void DrawSpanTranslucent(const SpanDrawerArgs &args) override { Queue->Push(args); } - void DrawSpanMaskedTranslucent(const SpanDrawerArgs &args) override { Queue->Push(args); } - void DrawSpanAddClamp(const SpanDrawerArgs &args) override { Queue->Push(args); } - void DrawSpanMaskedAddClamp(const SpanDrawerArgs &args) override { Queue->Push(args); } + void DrawSpan(const SpanDrawerArgs &args) override; /* Declaration only: the inline bodies on the six removed lines above are replaced by out-of-line definitions (this patch adds SWTruecolorDrawers::DrawSpan and its five siblings to r_draw_rgba.cpp). NOTE(review): the enclosing class name is not visible in this hunk - presumably SWTruecolorDrawers; confirm. */ + void DrawSpanMasked(const SpanDrawerArgs &args) override; + void DrawSpanTranslucent(const SpanDrawerArgs &args) override; + void DrawSpanMaskedTranslucent(const SpanDrawerArgs &args) override; + void DrawSpanAddClamp(const SpanDrawerArgs &args) override; + void DrawSpanMaskedAddClamp(const SpanDrawerArgs &args) override; void FillSpan(const SpanDrawerArgs &args) override { Queue->Push(args); } void DrawTiltedSpan(const SpanDrawerArgs &args, int y, int x1, int x2, const FVector3 &plane_sz, const FVector3 &plane_su, const FVector3 &plane_sv, bool plane_shade, int planeshade, float planelightfloat, fixed_t pviewx, fixed_t pviewy, FDynamicColormap *basecolormap) override diff --git a/src/swrenderer/drawers/r_draw_span32.h b/src/swrenderer/drawers/r_draw_span32.h new file mode 100644 index 000000000..6f282e7bd --- /dev/null +++ b/src/swrenderer/drawers/r_draw_span32.h 
@@ -0,0 +1,5174 @@ +/* +** Drawer commands for spans +** Copyright (c) 2016 Magnus Norddahl +** +** This software is provided 'as-is', without any express or implied +** warranty. In no event will the authors be held liable for any damages +** arising from the use of this software. +** +** Permission is granted to anyone to use this software for any purpose, +** including commercial applications, and to alter it and redistribute it +** freely, subject to the following restrictions: +** +** 1. The origin of this software must not be misrepresented; you must not +** claim that you wrote the original software. If you use this software +** in a product, an acknowledgment in the product documentation would be +** appreciated but is not required. +** 2. Altered source versions must be plainly marked as such, and must not be +** misrepresented as being the original software. +** 3. This notice may not be removed or altered from any source distribution. +** +*/ + +/* + Warning: this C++ source file has been auto-generated. Please modify the original php script that generated it. 
+*/ + +#pragma once + +#include "swrenderer/drawers/r_draw_rgba.h" +#include "swrenderer/viewport/r_spandrawer.h" + +namespace swrenderer +{ + class DrawSpan32Command : public DrawerCommand + { + protected: + SpanDrawerArgs args; + + public: + DrawSpan32Command(const SpanDrawerArgs &drawerargs) : args(drawerargs) { } + + void Execute(DrawerThread *thread) override + { + if (thread->line_skipped_by_thread(args.DestY())) return; + + uint32_t xbits = args.TextureWidthBits(); + uint32_t ybits = args.TextureHeightBits(); + uint32_t xstep = args.TextureUStep(); + uint32_t ystep = args.TextureVStep(); + uint32_t xfrac = args.TextureUPos(); + uint32_t yfrac = args.TextureVPos(); + uint32_t yshift = 32 - ybits; + uint32_t xshift = yshift - xbits; + uint32_t xmask = ((1 << xbits) - 1) << ybits; + + const uint32_t *source = (const uint32_t*)args.TexturePixels(); + + double lod = args.TextureLOD(); + bool mipmapped = args.MipmappedTexture(); + + bool magnifying = lod < 0.0; + if (r_mipmap && mipmapped) + { + int level = (int)lod; + while (level > 0) + { + if (xbits <= 2 || ybits <= 2) + break; + + source += (1 << (xbits)) * (1 << (ybits)); + xbits -= 1; + ybits -= 1; + level--; + } + } + + bool is_nearest_filter = !((magnifying && r_magfilter) || (!magnifying && r_minfilter)); + + auto shade_constants = args.ColormapConstants(); + if (shade_constants.simple_shade) + { + if (is_nearest_filter) + { + bool is_64x64 = xbits == 6 && ybits == 6; + if (is_64x64) + { + // Shade constants + int light = 256 - (args.Light() >> (FRACBITS - 8)); + __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); + __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); + + int count = args.DestX2() - args.DestX1() + 1; + int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); + uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); + + uint32_t srcalpha = 
args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); + + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) + { + int offset = index * 2; + + // Sample + unsigned int ifgcolor[2]; + { + int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + { + int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); + unsigned int sampleout = source[sample_index]; + ifgcolor[1] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + + // Blend + __m128i outcolor = fgcolor; + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + + _mm_storel_epi64((__m128i*)(dest + offset), outcolor); + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; + int offset = index; + + // Sample + unsigned int ifgcolor[2]; + int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + + // Blend + __m128i outcolor = fgcolor; + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + + dest[offset] = _mm_cvtsi128_si32(outcolor); + } + } + else + { + // Shade constants + int light = 256 - (args.Light() >> (FRACBITS - 8)); + __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); + __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); + + int count = args.DestX2() - args.DestX1() + 1; + int pitch = 
RenderViewport::Instance()->RenderTarget->GetPitch(); + uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); + + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); + + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) + { + int offset = index * 2; + + // Sample + unsigned int ifgcolor[2]; + { + int sample_index = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + { + int sample_index = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + unsigned int sampleout = source[sample_index]; + ifgcolor[1] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + + // Blend + __m128i outcolor = fgcolor; + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + + _mm_storel_epi64((__m128i*)(dest + offset), outcolor); + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; + int offset = index; + + // Sample + unsigned int ifgcolor[2]; + int sample_index = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + + // Blend + __m128i outcolor = fgcolor; + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + + dest[offset] = _mm_cvtsi128_si32(outcolor); + } + } + } + else + { + bool is_64x64 = xbits == 6 && ybits == 6; + if (is_64x64) + { + // Shade constants + int light = 256 - (args.Light() >> (FRACBITS - 8)); + __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, 
light, light); + __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); + + int count = args.DestX2() - args.DestX1() + 1; + int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); + uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); + + xfrac -= 1 << (31 - xbits); + yfrac -= 1 << (31 - ybits); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); + + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) + { + int offset = index * 2; + + // Sample + unsigned int ifgcolor[2]; + { + uint32_t xxbits = 26; + uint32_t yybits = 26; + uint32_t xxshift = (32 - xxbits); + uint32_t yyshift = (32 - yybits); + uint32_t xxmask = (1 << xxshift) - 1; + uint32_t yymask = (1 << yyshift) - 1; + uint32_t x = xfrac >> xxbits; + uint32_t y = yfrac >> yybits; + + uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; + uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; + + uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; + uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha 
<< 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + { + uint32_t xxbits = 26; + uint32_t yybits = 26; + uint32_t xxshift = (32 - xxbits); + uint32_t yyshift = (32 - yybits); + uint32_t xxmask = (1 << xxshift) - 1; + uint32_t yymask = (1 << yyshift) - 1; + uint32_t x = xfrac >> xxbits; + uint32_t y = yfrac >> yybits; + + uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; + uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; + + uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; + uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[1] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + + // Blend + __m128i outcolor = fgcolor; + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + + _mm_storel_epi64((__m128i*)(dest + offset), outcolor); + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; + int offset = index; + + // Sample + 
unsigned int ifgcolor[2]; + uint32_t xxbits = 26; + uint32_t yybits = 26; + uint32_t xxshift = (32 - xxbits); + uint32_t yyshift = (32 - yybits); + uint32_t xxmask = (1 << xxshift) - 1; + uint32_t yymask = (1 << yyshift) - 1; + uint32_t x = xfrac >> xxbits; + uint32_t y = yfrac >> yybits; + + uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; + uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; + + uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; + uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + + // Blend + __m128i outcolor = fgcolor; + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + + dest[offset] = _mm_cvtsi128_si32(outcolor); + } + } + else + { + // Shade constants + int light = 256 - (args.Light() >> (FRACBITS - 8)); + __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); + __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 
- light, 256 - light, 0, 256 - light, 256 - light, 256 - light); + + int count = args.DestX2() - args.DestX1() + 1; + int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); + uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); + + xfrac -= 1 << (31 - xbits); + yfrac -= 1 << (31 - ybits); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); + + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) + { + int offset = index * 2; + + // Sample + unsigned int ifgcolor[2]; + { + uint32_t xxbits = 32 - xbits; + uint32_t yybits = 32 - ybits; + uint32_t xxshift = (32 - xxbits); + uint32_t yyshift = (32 - yybits); + uint32_t xxmask = (1 << xxshift) - 1; + uint32_t yymask = (1 << yyshift) - 1; + uint32_t x = xfrac >> xxbits; + uint32_t y = yfrac >> yybits; + + uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; + uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; + + uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; + uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + 
ifgcolor[0] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + { + uint32_t xxbits = 32 - xbits; + uint32_t yybits = 32 - ybits; + uint32_t xxshift = (32 - xxbits); + uint32_t yyshift = (32 - yybits); + uint32_t xxmask = (1 << xxshift) - 1; + uint32_t yymask = (1 << yyshift) - 1; + uint32_t x = xfrac >> xxbits; + uint32_t y = yfrac >> yybits; + + uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; + uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; + + uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; + uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[1] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + + // Blend + __m128i outcolor = fgcolor; + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + + _mm_storel_epi64((__m128i*)(dest + offset), outcolor); + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; + int offset = index; + + // Sample + unsigned int ifgcolor[2]; + uint32_t 
xxbits = 32 - xbits; + uint32_t yybits = 32 - ybits; + uint32_t xxshift = (32 - xxbits); + uint32_t yyshift = (32 - yybits); + uint32_t xxmask = (1 << xxshift) - 1; + uint32_t yymask = (1 << yyshift) - 1; + uint32_t x = xfrac >> xxbits; + uint32_t y = yfrac >> yybits; + + uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; + uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; + + uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; + uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + + // Blend + __m128i outcolor = fgcolor; + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + + dest[offset] = _mm_cvtsi128_si32(outcolor); + } + } + } + } + else + { + if (is_nearest_filter) + { + bool is_64x64 = xbits == 6 && ybits == 6; + if (is_64x64) + { + // Shade constants + int light = 256 - (args.Light() >> (FRACBITS - 8)); + __m128i mlight = _mm_set_epi16(256, light, light, light, 
256, light, light, light); + __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); + __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); + __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); + shade_fade = _mm_mullo_epi16(shade_fade, inv_light); + __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); + int desaturate = shade_constants.desaturate; + + int count = args.DestX2() - args.DestX1() + 1; + int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); + uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); + + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); + + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) + { + int offset = index * 2; + + // Sample + unsigned int ifgcolor[2]; + { + int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + { + int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); + unsigned int sampleout = source[sample_index]; + ifgcolor[1] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), 
_mm_setzero_si128()); + + // Shade + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + + // Blend + __m128i outcolor = fgcolor; + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + + _mm_storel_epi64((__m128i*)(dest + offset), outcolor); + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; + int offset = index; + + // Sample + unsigned int ifgcolor[2]; + int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = 
_mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + + // Blend + __m128i outcolor = fgcolor; + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + + dest[offset] = _mm_cvtsi128_si32(outcolor); + } + } + else + { + // Shade constants + int light = 256 - (args.Light() >> (FRACBITS - 8)); + __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); + __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); + __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); + __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); + shade_fade = _mm_mullo_epi16(shade_fade, inv_light); + __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); + int desaturate = shade_constants.desaturate; + + int count = args.DestX2() - args.DestX1() + 1; + int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); + uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); + + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); + + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) + { + int offset = index * 2; + + // Sample + unsigned int ifgcolor[2]; + { + int 
sample_index = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + { + int sample_index = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + unsigned int sampleout = source[sample_index]; + ifgcolor[1] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + + // Blend + __m128i outcolor = fgcolor; + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + + _mm_storel_epi64((__m128i*)(dest + offset), outcolor); + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; + int offset = index; + + // Sample + unsigned int ifgcolor[2]; + int sample_index = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; 
+ + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + + // Blend + __m128i outcolor = fgcolor; + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + + dest[offset] = _mm_cvtsi128_si32(outcolor); + } + } + } + else + { + bool is_64x64 = xbits == 6 && ybits == 6; + if (is_64x64) + { + // Shade constants + int light = 256 - (args.Light() >> (FRACBITS - 8)); + __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); + __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); + __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); + __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); + shade_fade = _mm_mullo_epi16(shade_fade, inv_light); + __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); + int desaturate = shade_constants.desaturate; + + int 
count = args.DestX2() - args.DestX1() + 1; + int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); + uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); + + xfrac -= 1 << (31 - xbits); + yfrac -= 1 << (31 - ybits); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); + + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) + { + int offset = index * 2; + + // Sample + unsigned int ifgcolor[2]; + { + uint32_t xxbits = 26; + uint32_t yybits = 26; + uint32_t xxshift = (32 - xxbits); + uint32_t yyshift = (32 - yybits); + uint32_t xxmask = (1 << xxshift) - 1; + uint32_t yymask = (1 << yyshift) - 1; + uint32_t x = xfrac >> xxbits; + uint32_t y = yfrac >> yybits; + + uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; + uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; + + uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; + uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + { + uint32_t xxbits = 26; + 
uint32_t yybits = 26; + uint32_t xxshift = (32 - xxbits); + uint32_t yyshift = (32 - yybits); + uint32_t xxmask = (1 << xxshift) - 1; + uint32_t yymask = (1 << yyshift) - 1; + uint32_t x = xfrac >> xxbits; + uint32_t y = yfrac >> yybits; + + uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; + uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; + + uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; + uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[1] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + 
+ fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + + // Blend + __m128i outcolor = fgcolor; + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + + _mm_storel_epi64((__m128i*)(dest + offset), outcolor); + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; + int offset = index; + + // Sample + unsigned int ifgcolor[2]; + uint32_t xxbits = 26; + uint32_t yybits = 26; + uint32_t xxshift = (32 - xxbits); + uint32_t yyshift = (32 - yybits); + uint32_t xxmask = (1 << xxshift) - 1; + uint32_t yymask = (1 << yyshift) - 1; + uint32_t x = xfrac >> xxbits; + uint32_t y = yfrac >> yybits; + + uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; + uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; + + uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; + uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i 
fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + + // Blend + __m128i outcolor = fgcolor; + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + + dest[offset] = _mm_cvtsi128_si32(outcolor); + } + } + else + { + // Shade constants + int light = 256 - (args.Light() >> (FRACBITS - 8)); + __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); + __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); + __m128i inv_desaturate = _mm_set_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); // set (not setr): alpha lanes 3/7 get 256, matching mlight + __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); + shade_fade = _mm_mullo_epi16(shade_fade, inv_light); + __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red,
shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); + int desaturate = shade_constants.desaturate; + + int count = args.DestX2() - args.DestX1() + 1; + int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); + uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); + + xfrac -= 1 << (31 - xbits); + yfrac -= 1 << (31 - ybits); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); + + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) + { + int offset = index * 2; + + // Sample + unsigned int ifgcolor[2]; + { + uint32_t xxbits = 32 - xbits; + uint32_t yybits = 32 - ybits; + uint32_t xxshift = (32 - xxbits); + uint32_t yyshift = (32 - yybits); + uint32_t xxmask = (1 << xxshift) - 1; + uint32_t yymask = (1 << yyshift) - 1; + uint32_t x = xfrac >> xxbits; + uint32_t y = yfrac >> yybits; + + uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; + uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; + + uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; + uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + 
APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + { + uint32_t xxbits = 32 - xbits; + uint32_t yybits = 32 - ybits; + uint32_t xxshift = (32 - xxbits); + uint32_t yyshift = (32 - yybits); + uint32_t xxmask = (1 << xxshift) - 1; + uint32_t yymask = (1 << yyshift) - 1; + uint32_t x = xfrac >> xxbits; + uint32_t y = yfrac >> yybits; + + uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; + uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; + + uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; + uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[1] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + 
int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + + // Blend + __m128i outcolor = fgcolor; + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + + _mm_storel_epi64((__m128i*)(dest + offset), outcolor); + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; + int offset = index; + + // Sample + unsigned int ifgcolor[2]; + uint32_t xxbits = 32 - xbits; + uint32_t yybits = 32 - ybits; + uint32_t xxshift = (32 - xxbits); + uint32_t yyshift = (32 - yybits); + uint32_t xxmask = (1 << xxshift) - 1; + uint32_t yymask = (1 << yyshift) - 1; + uint32_t x = xfrac >> xxbits; + uint32_t y = yfrac >> yybits; + + uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; + uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; + + uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; + uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + 
uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + + // Blend + __m128i outcolor = fgcolor; + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + + dest[offset] = _mm_cvtsi128_si32(outcolor); + } + } + } + } + } + + FString DebugInfo() override { return "DrawSpan32Command"; } + }; + + class DrawSpanMasked32Command : public DrawerCommand + { + protected: + SpanDrawerArgs args; + + public: + DrawSpanMasked32Command(const SpanDrawerArgs &drawerargs) : args(drawerargs) { } + + void Execute(DrawerThread *thread) override + { + if (thread->line_skipped_by_thread(args.DestY())) return; + + uint32_t xbits = args.TextureWidthBits(); + uint32_t ybits = args.TextureHeightBits(); + uint32_t xstep = args.TextureUStep(); + uint32_t ystep = args.TextureVStep(); + uint32_t xfrac = args.TextureUPos(); + uint32_t yfrac = args.TextureVPos(); + uint32_t yshift = 32 - ybits; + uint32_t xshift = yshift 
- xbits; + uint32_t xmask = ((1 << xbits) - 1) << ybits; + + const uint32_t *source = (const uint32_t*)args.TexturePixels(); + + double lod = args.TextureLOD(); + bool mipmapped = args.MipmappedTexture(); + + bool magnifying = lod < 0.0; + if (r_mipmap && mipmapped) + { + int level = (int)lod; + while (level > 0) + { + if (xbits <= 2 || ybits <= 2) + break; + + source += (1 << (xbits)) * (1 << (ybits)); + xbits -= 1; + ybits -= 1; + level--; + } + } + + bool is_nearest_filter = !((magnifying && r_magfilter) || (!magnifying && r_minfilter)); + + auto shade_constants = args.ColormapConstants(); + if (shade_constants.simple_shade) + { + if (is_nearest_filter) + { + bool is_64x64 = xbits == 6 && ybits == 6; + if (is_64x64) + { + // Shade constants + int light = 256 - (args.Light() >> (FRACBITS - 8)); + __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); + __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); + + int count = args.DestX2() - args.DestX1() + 1; + int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); + uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); + + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); + + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) + { + int offset = index * 2; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + { + int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + { + int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); + unsigned int sampleout = source[sample_index]; + ifgcolor[1] = sampleout; + xfrac += xstep; + yfrac += 
ystep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + + // Blend + __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_shufflehi_epi16(alpha, _MM_SHUFFLE(3,3,3,3)); // shuffle alpha, not fgcolor: keeps pixel 0's alpha lanes + alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 + __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); + + fgcolor = _mm_mullo_epi16(fgcolor, alpha); + bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); + __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + _mm_storel_epi64((__m128i*)(dest + offset), outcolor); + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; + int offset = index; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + + // Blend + __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_shufflehi_epi16(alpha, _MM_SHUFFLE(3,3,3,3)); // shuffle alpha, not fgcolor: keeps pixel 0's alpha lanes + alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 + __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); + + fgcolor = _mm_mullo_epi16(fgcolor, alpha); + bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); + __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + dest[offset] =
_mm_cvtsi128_si32(outcolor); + } + } + else + { + // Shade constants + int light = 256 - (args.Light() >> (FRACBITS - 8)); + __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); + __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); + + int count = args.DestX2() - args.DestX1() + 1; + int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); + uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); + + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); + + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) + { + int offset = index * 2; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + { + int sample_index = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + { + int sample_index = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + unsigned int sampleout = source[sample_index]; + ifgcolor[1] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + + // Blend + __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_shufflehi_epi16(alpha, _MM_SHUFFLE(3,3,3,3)); // shuffle alpha, not fgcolor: keeps pixel 0's alpha lanes + alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 + __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); + + fgcolor = _mm_mullo_epi16(fgcolor, alpha); + bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); + __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor =
_mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + _mm_storel_epi64((__m128i*)(dest + offset), outcolor); + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; + int offset = index; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + int sample_index = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + + // Blend + __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_shufflehi_epi16(alpha, _MM_SHUFFLE(3,3,3,3)); // shuffle alpha, not fgcolor: keeps pixel 0's alpha lanes + alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 + __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); + + fgcolor = _mm_mullo_epi16(fgcolor, alpha); + bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); + __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + dest[offset] = _mm_cvtsi128_si32(outcolor); + } + } + } + else + { + bool is_64x64 = xbits == 6 && ybits == 6; + if (is_64x64) + { + // Shade constants + int light = 256 - (args.Light() >> (FRACBITS - 8)); + __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); + __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); + + int count = args.DestX2() - args.DestX1() + 1; + int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); + uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); + + xfrac -= 1 << (31 - xbits); + yfrac -= 1 << (31 - ybits); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS -
8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); + + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) + { + int offset = index * 2; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + { + uint32_t xxbits = 26; + uint32_t yybits = 26; + uint32_t xxshift = (32 - xxbits); + uint32_t yyshift = (32 - yybits); + uint32_t xxmask = (1 << xxshift) - 1; + uint32_t yymask = (1 << yyshift) - 1; + uint32_t x = xfrac >> xxbits; + uint32_t y = yfrac >> yybits; + + uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; + uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; + + uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; + uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + { + uint32_t xxbits = 26; + uint32_t yybits = 26; + uint32_t xxshift = (32 - xxbits); + uint32_t yyshift = (32 - yybits); + uint32_t xxmask = (1 << xxshift) - 1; + uint32_t yymask = (1 << yyshift) - 1; + uint32_t x = xfrac >> xxbits; + uint32_t 
y = yfrac >> yybits; + + uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; + uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; + + uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; + uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[1] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + + // Blend + __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_shufflehi_epi16(alpha, _MM_SHUFFLE(3,3,3,3)); // shuffle alpha, not fgcolor: keeps pixel 0's alpha lanes + alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 + __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); + + fgcolor = _mm_mullo_epi16(fgcolor, alpha); + bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); + __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + _mm_storel_epi64((__m128i*)(dest + offset), outcolor);
+ } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; + int offset = index; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + uint32_t xxbits = 26; + uint32_t yybits = 26; + uint32_t xxshift = (32 - xxbits); + uint32_t yyshift = (32 - yybits); + uint32_t xxmask = (1 << xxshift) - 1; + uint32_t yymask = (1 << yyshift) - 1; + uint32_t x = xfrac >> xxbits; + uint32_t y = yfrac >> yybits; + + uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; + uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; + + uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; + uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + + // Blend + __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_shufflehi_epi16(alpha, _MM_SHUFFLE(3,3,3,3)); // shuffle alpha, not fgcolor: keeps pixel 0's alpha lanes + alpha = _mm_add_epi16(alpha,
_mm_srli_epi16(alpha, 7)); // 255 -> 256 + __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); + + fgcolor = _mm_mullo_epi16(fgcolor, alpha); + bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); + __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + dest[offset] = _mm_cvtsi128_si32(outcolor); + } + } + else + { + // Shade constants + int light = 256 - (args.Light() >> (FRACBITS - 8)); + __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); + __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); + + int count = args.DestX2() - args.DestX1() + 1; + int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); + uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); + + xfrac -= 1 << (31 - xbits); + yfrac -= 1 << (31 - ybits); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); + + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) + { + int offset = index * 2; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + { + uint32_t xxbits = 32 - xbits; + uint32_t yybits = 32 - ybits; + uint32_t xxshift = (32 - xxbits); + uint32_t yyshift = (32 - yybits); + uint32_t xxmask = (1 << xxshift) - 1; + uint32_t yymask = (1 << yyshift) - 1; + uint32_t x = xfrac >> xxbits; + uint32_t y = yfrac >> yybits; + + uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; + uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; + + uint32_t 
inv_b = (xfrac >> (xxbits - 4)) & 15; + uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + { + uint32_t xxbits = 32 - xbits; + uint32_t yybits = 32 - ybits; + uint32_t xxshift = (32 - xxbits); + uint32_t yyshift = (32 - yybits); + uint32_t xxmask = (1 << xxshift) - 1; + uint32_t yymask = (1 << yyshift) - 1; + uint32_t x = xfrac >> xxbits; + uint32_t y = yfrac >> yybits; + + uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; + uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; + + uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; + uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 
127) >> 8; + uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[1] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + + // Blend + __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 + __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); + + fgcolor = _mm_mullo_epi16(fgcolor, alpha); + bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); + __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + _mm_storel_epi64((__m128i*)(dest + offset), outcolor); + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; + int offset = index; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + uint32_t xxbits = 32 - xbits; + uint32_t yybits = 32 - ybits; + uint32_t xxshift = (32 - xxbits); + uint32_t yyshift = (32 - yybits); + uint32_t xxmask = (1 << xxshift) - 1; + uint32_t yymask = (1 << yyshift) - 1; + uint32_t x = xfrac >> xxbits; + uint32_t y = yfrac >> yybits; + + uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; + uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; + + uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; + uint32_t inv_a = (yfrac >> 
(yybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + + // Blend + __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 + __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); + + fgcolor = _mm_mullo_epi16(fgcolor, alpha); + bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); + __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + dest[offset] = _mm_cvtsi128_si32(outcolor); + } + } + } + } + else + { + if (is_nearest_filter) + { + bool is_64x64 = xbits == 6 && ybits == 6; + if (is_64x64) + { + // Shade constants + int light = 256 - (args.Light() >> (FRACBITS - 8)); + __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); + __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); + __m128i inv_desaturate = 
_mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); + __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); + shade_fade = _mm_mullo_epi16(shade_fade, inv_light); + __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); + int desaturate = shade_constants.desaturate; + + int count = args.DestX2() - args.DestX1() + 1; + int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); + uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); + + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); + + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) + { + int offset = index * 2; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + { + int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + { + int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); + unsigned int sampleout = source[sample_index]; + ifgcolor[1] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + int blue0 = BPART(ifgcolor[0]); + int 
green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + + // Blend + __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 + __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); + + fgcolor = _mm_mullo_epi16(fgcolor, alpha); + bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); + __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + _mm_storel_epi64((__m128i*)(dest + offset), outcolor); + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; + int offset = index; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = 
((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + + // Blend + __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 + __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); + + fgcolor = _mm_mullo_epi16(fgcolor, alpha); + bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); + __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + dest[offset] = _mm_cvtsi128_si32(outcolor); + } + } + else + { + // Shade constants + int light = 256 - (args.Light() >> (FRACBITS - 8)); + __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); + __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); + __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); + __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, 
shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); + shade_fade = _mm_mullo_epi16(shade_fade, inv_light); + __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); + int desaturate = shade_constants.desaturate; + + int count = args.DestX2() - args.DestX1() + 1; + int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); + uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); + + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); + + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) + { + int offset = index * 2; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + { + int sample_index = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + { + int sample_index = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + unsigned int sampleout = source[sample_index]; + ifgcolor[1] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 
0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + + // Blend + __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 + __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); + + fgcolor = _mm_mullo_epi16(fgcolor, alpha); + bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); + __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + _mm_storel_epi64((__m128i*)(dest + offset), outcolor); + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; + int offset = index; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + int sample_index = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = 
_mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + + // Blend + __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 + __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); + + fgcolor = _mm_mullo_epi16(fgcolor, alpha); + bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); + __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + dest[offset] = _mm_cvtsi128_si32(outcolor); + } + } + } + else + { + bool is_64x64 = xbits == 6 && ybits == 6; + if (is_64x64) + { + // Shade constants + int light = 256 - (args.Light() >> (FRACBITS - 8)); + __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); + __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); + __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); + __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); + shade_fade = _mm_mullo_epi16(shade_fade, inv_light); + __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, 
shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); + int desaturate = shade_constants.desaturate; + + int count = args.DestX2() - args.DestX1() + 1; + int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); + uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); + + xfrac -= 1 << (31 - xbits); + yfrac -= 1 << (31 - ybits); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); + + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) + { + int offset = index * 2; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + { + uint32_t xxbits = 26; + uint32_t yybits = 26; + uint32_t xxshift = (32 - xxbits); + uint32_t yyshift = (32 - yybits); + uint32_t xxmask = (1 << xxshift) - 1; + uint32_t yymask = (1 << yyshift) - 1; + uint32_t x = xfrac >> xxbits; + uint32_t y = yfrac >> yybits; + + uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; + uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; + + uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; + uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t salpha = (APART(p00) * (a * b) + 
APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + { + uint32_t xxbits = 26; + uint32_t yybits = 26; + uint32_t xxshift = (32 - xxbits); + uint32_t yyshift = (32 - yybits); + uint32_t xxmask = (1 << xxshift) - 1; + uint32_t yymask = (1 << yyshift) - 1; + uint32_t x = xfrac >> xxbits; + uint32_t y = yfrac >> yybits; + + uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; + uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; + + uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; + uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[1] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = 
BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + + // Blend + __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 + __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); + + fgcolor = _mm_mullo_epi16(fgcolor, alpha); + bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); + __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + _mm_storel_epi64((__m128i*)(dest + offset), outcolor); + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; + int offset = index; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + uint32_t xxbits = 26; + uint32_t yybits = 26; + uint32_t xxshift = (32 - xxbits); + uint32_t yyshift = (32 - yybits); + uint32_t xxmask = (1 << xxshift) - 1; + uint32_t yymask = (1 << yyshift) - 1; + uint32_t x = xfrac >> xxbits; + uint32_t y = yfrac >> yybits; + + uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; + uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; + + 
uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; + uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + + // Blend + __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 + __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); + 
+ fgcolor = _mm_mullo_epi16(fgcolor, alpha); + bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); + __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + dest[offset] = _mm_cvtsi128_si32(outcolor); + } + } + else + { + // Shade constants + int light = 256 - (args.Light() >> (FRACBITS - 8)); + __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); + __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); + __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); + __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); + shade_fade = _mm_mullo_epi16(shade_fade, inv_light); + __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); + int desaturate = shade_constants.desaturate; + + int count = args.DestX2() - args.DestX1() + 1; + int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); + uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); + + xfrac -= 1 << (31 - xbits); + yfrac -= 1 << (31 - ybits); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); + + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) + { 
+ int offset = index * 2; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + { + uint32_t xxbits = 32 - xbits; + uint32_t yybits = 32 - ybits; + uint32_t xxshift = (32 - xxbits); + uint32_t yyshift = (32 - yybits); + uint32_t xxmask = (1 << xxshift) - 1; + uint32_t yymask = (1 << yyshift) - 1; + uint32_t x = xfrac >> xxbits; + uint32_t y = yfrac >> yybits; + + uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; + uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; + + uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; + uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + { + uint32_t xxbits = 32 - xbits; + uint32_t yybits = 32 - ybits; + uint32_t xxshift = (32 - xxbits); + uint32_t yyshift = (32 - yybits); + uint32_t xxmask = (1 << xxshift) - 1; + uint32_t yymask = (1 << yyshift) - 1; + uint32_t x = xfrac >> xxbits; + uint32_t y = yfrac >> yybits; + + uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p01 = 
source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; + uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; + + uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; + uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[1] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + + // Blend + __m128i alpha = 
_mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 + __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); + + fgcolor = _mm_mullo_epi16(fgcolor, alpha); + bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); + __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + _mm_storel_epi64((__m128i*)(dest + offset), outcolor); + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; + int offset = index; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + uint32_t xxbits = 32 - xbits; + uint32_t yybits = 32 - ybits; + uint32_t xxshift = (32 - xxbits); + uint32_t yyshift = (32 - yybits); + uint32_t xxmask = (1 << xxshift) - 1; + uint32_t yymask = (1 << yyshift) - 1; + uint32_t x = xfrac >> xxbits; + uint32_t y = yfrac >> yybits; + + uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; + uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; + + uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; + uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t 
salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + + // Blend + __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + /* NOTE(review): the next statement shuffles fgcolor again instead of alpha, so the low-half broadcast made just above is overwritten. 16-bit lanes 0-3 of alpha then hold this pixel's own blue, green, red and alpha values, and only the upper pixel gets a uniform alpha. This tail path stores just the low pixel, so it is blended with per-channel weights. Probably intended: _mm_shufflehi_epi16(alpha, ...) - confirm, and if this header is produced by r_draw_span32.php, correct it there. */ alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 + __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); + + fgcolor = _mm_mullo_epi16(fgcolor, alpha); + bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); + __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + dest[offset] = _mm_cvtsi128_si32(outcolor); + } + } + } + } + } + + /* Returns the constant string "DrawSpanMasked32Command"; presumably used to label this command type in debug output - confirm against callers of DebugInfo. */ FString DebugInfo() override { return "DrawSpanMasked32Command"; } + }; + + /* Command class derived from DrawerCommand. It keeps its own SpanDrawerArgs member (args), which Execute reads. */ class DrawSpanTranslucent32Command : public DrawerCommand + { + protected: + SpanDrawerArgs args; + + public: +
DrawSpanTranslucent32Command(const SpanDrawerArgs &drawerargs) : args(drawerargs) { } + + void Execute(DrawerThread *thread) override + { + if (thread->line_skipped_by_thread(args.DestY())) return; + + uint32_t xbits = args.TextureWidthBits(); + uint32_t ybits = args.TextureHeightBits(); + uint32_t xstep = args.TextureUStep(); + uint32_t ystep = args.TextureVStep(); + uint32_t xfrac = args.TextureUPos(); + uint32_t yfrac = args.TextureVPos(); + uint32_t yshift = 32 - ybits; + uint32_t xshift = yshift - xbits; + uint32_t xmask = ((1 << xbits) - 1) << ybits; + + const uint32_t *source = (const uint32_t*)args.TexturePixels(); + + double lod = args.TextureLOD(); + bool mipmapped = args.MipmappedTexture(); + + bool magnifying = lod < 0.0; + if (r_mipmap && mipmapped) + { + int level = (int)lod; + while (level > 0) + { + if (xbits <= 2 || ybits <= 2) + break; + + source += (1 << (xbits)) * (1 << (ybits)); + xbits -= 1; + ybits -= 1; + level--; + } + } + + bool is_nearest_filter = !((magnifying && r_magfilter) || (!magnifying && r_minfilter)); + + auto shade_constants = args.ColormapConstants(); + if (shade_constants.simple_shade) + { + if (is_nearest_filter) + { + bool is_64x64 = xbits == 6 && ybits == 6; + if (is_64x64) + { + // Shade constants + int light = 256 - (args.Light() >> (FRACBITS - 8)); + __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); + __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); + + int count = args.DestX2() - args.DestX1() + 1; + int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); + uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); + + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); + + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) + { + int offset = index * 2; + __m128i bgcolor = 
_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + { + int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + { + int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); + unsigned int sampleout = source[sample_index]; + ifgcolor[1] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + + // Blend + __m128i fgalpha = _mm_set1_epi16(srcalpha); + __m128i bgalpha = _mm_set1_epi16(destalpha); + + fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); + bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); + + __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); + __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); + + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + _mm_storel_epi64((__m128i*)(dest + offset), outcolor); + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; + int offset = index; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + 
__m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + + // Blend + __m128i fgalpha = _mm_set1_epi16(srcalpha); + __m128i bgalpha = _mm_set1_epi16(destalpha); + + fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); + bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); + + __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); + __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); + + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + dest[offset] = _mm_cvtsi128_si32(outcolor); + } + } + else + { + // Shade constants + int light = 256 - (args.Light() >> (FRACBITS - 8)); + __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); + __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); + + int count = args.DestX2() - args.DestX1() + 1; + int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); + uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); + + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); + + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) + { + int offset = index * 2; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + { + int sample_index = ((xfrac >> xshift) & xmask) + 
(yfrac >> yshift); + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + { + int sample_index = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + unsigned int sampleout = source[sample_index]; + ifgcolor[1] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + + // Blend + __m128i fgalpha = _mm_set1_epi16(srcalpha); + __m128i bgalpha = _mm_set1_epi16(destalpha); + + fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); + bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); + + __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); + __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); + + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + _mm_storel_epi64((__m128i*)(dest + offset), outcolor); + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; + int offset = index; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + int sample_index = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + + // Blend + __m128i fgalpha = 
_mm_set1_epi16(srcalpha); + __m128i bgalpha = _mm_set1_epi16(destalpha); + + fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); + bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); + + __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); + __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); + + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + dest[offset] = _mm_cvtsi128_si32(outcolor); + } + } + } + else + { + bool is_64x64 = xbits == 6 && ybits == 6; + if (is_64x64) + { + // Shade constants + int light = 256 - (args.Light() >> (FRACBITS - 8)); + __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); + __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); + + int count = args.DestX2() - args.DestX1() + 1; + int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); + uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); + + xfrac -= 1 << (31 - xbits); + yfrac -= 1 << (31 - ybits); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); + + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) + { + int offset = index * 2; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + { + uint32_t xxbits = 26; + uint32_t yybits = 26; + uint32_t xxshift = (32 - xxbits); + uint32_t yyshift = (32 - yybits); + uint32_t 
xxmask = (1 << xxshift) - 1; + uint32_t yymask = (1 << yyshift) - 1; + uint32_t x = xfrac >> xxbits; + uint32_t y = yfrac >> yybits; + + uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; + uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; + + uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; + uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + { + uint32_t xxbits = 26; + uint32_t yybits = 26; + uint32_t xxshift = (32 - xxbits); + uint32_t yyshift = (32 - yybits); + uint32_t xxmask = (1 << xxshift) - 1; + uint32_t yymask = (1 << yyshift) - 1; + uint32_t x = xfrac >> xxbits; + uint32_t y = yfrac >> yybits; + + uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; + uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; + + uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; + uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; + uint32_t a = 16 - inv_a; + 
uint32_t b = 16 - inv_b; + + uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[1] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + + // Blend + __m128i fgalpha = _mm_set1_epi16(srcalpha); + __m128i bgalpha = _mm_set1_epi16(destalpha); + + fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); + bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); + + __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); + __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); + + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + _mm_storel_epi64((__m128i*)(dest + offset), outcolor); + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; + int offset = index; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); + + // Sample + 
unsigned int ifgcolor[2]; + uint32_t xxbits = 26; + uint32_t yybits = 26; + uint32_t xxshift = (32 - xxbits); + uint32_t yyshift = (32 - yybits); + uint32_t xxmask = (1 << xxshift) - 1; + uint32_t yymask = (1 << yyshift) - 1; + uint32_t x = xfrac >> xxbits; + uint32_t y = yfrac >> yybits; + + uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; + uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; + + uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; + uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + + // Blend + __m128i fgalpha = _mm_set1_epi16(srcalpha); + __m128i bgalpha = _mm_set1_epi16(destalpha); + + fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); + bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); + + __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); + __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, 
_mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); + + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + dest[offset] = _mm_cvtsi128_si32(outcolor); + } + } + else + { + // Shade constants + int light = 256 - (args.Light() >> (FRACBITS - 8)); + __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); + __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); + + int count = args.DestX2() - args.DestX1() + 1; + int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); + uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); + + xfrac -= 1 << (31 - xbits); + yfrac -= 1 << (31 - ybits); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); + + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) + { + int offset = index * 2; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + { + uint32_t xxbits = 32 - xbits; + uint32_t yybits = 32 - ybits; + uint32_t xxshift = (32 - xxbits); + uint32_t yyshift = (32 - yybits); + uint32_t xxmask = (1 << xxshift) - 1; + uint32_t yymask = (1 << yyshift) - 1; + uint32_t x = xfrac >> xxbits; + uint32_t y = yfrac >> yybits; + + uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; + uint32_t p11 = source[(((y + 1) & yymask) + (((x 
+ 1) & xxmask) << yyshift))]; + + uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; + uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + { + uint32_t xxbits = 32 - xbits; + uint32_t yybits = 32 - ybits; + uint32_t xxshift = (32 - xxbits); + uint32_t yyshift = (32 - yybits); + uint32_t xxmask = (1 << xxshift) - 1; + uint32_t yymask = (1 << yyshift) - 1; + uint32_t x = xfrac >> xxbits; + uint32_t y = yfrac >> yybits; + + uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; + uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; + + uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; + uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a 
* inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[1] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + + // Blend + __m128i fgalpha = _mm_set1_epi16(srcalpha); + __m128i bgalpha = _mm_set1_epi16(destalpha); + + fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); + bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); + + __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); + __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); + + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + _mm_storel_epi64((__m128i*)(dest + offset), outcolor); + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; + int offset = index; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + uint32_t xxbits = 32 - xbits; + uint32_t yybits = 32 - ybits; + uint32_t xxshift = (32 - xxbits); + uint32_t yyshift = (32 - yybits); + uint32_t xxmask = (1 << xxshift) - 1; + uint32_t yymask = (1 << yyshift) - 1; + uint32_t x = xfrac >> xxbits; + uint32_t y = yfrac >> yybits; + + uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; + uint32_t 
p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; + uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; + + uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; + uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + + // Blend + __m128i fgalpha = _mm_set1_epi16(srcalpha); + __m128i bgalpha = _mm_set1_epi16(destalpha); + + fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); + bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); + + __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); + __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); + + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); + outcolor = _mm_packus_epi16(outcolor, 
_mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + dest[offset] = _mm_cvtsi128_si32(outcolor); + } + } + } + } + else + { + if (is_nearest_filter) + { + bool is_64x64 = xbits == 6 && ybits == 6; + if (is_64x64) + { + // Shade constants + int light = 256 - (args.Light() >> (FRACBITS - 8)); + __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); + __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); + __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); + __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); + shade_fade = _mm_mullo_epi16(shade_fade, inv_light); + __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); + int desaturate = shade_constants.desaturate; + + int count = args.DestX2() - args.DestX1() + 1; + int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); + uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); + + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); + + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) + { + int offset = index * 2; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); + + // Sample + unsigned int 
ifgcolor[2]; + { + int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + { + int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); + unsigned int sampleout = source[sample_index]; + ifgcolor[1] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + + // Blend + __m128i fgalpha = _mm_set1_epi16(srcalpha); + __m128i bgalpha = _mm_set1_epi16(destalpha); + + fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); + bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); + + __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); + __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); + + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); 
+ outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + _mm_storel_epi64((__m128i*)(dest + offset), outcolor); + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; + int offset = index; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + + // Blend + __m128i fgalpha = _mm_set1_epi16(srcalpha); + __m128i bgalpha = _mm_set1_epi16(destalpha); + + fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); + bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); + + __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); + __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i 
out_hi = _mm_add_epi32(fg_hi, bg_hi); + + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + dest[offset] = _mm_cvtsi128_si32(outcolor); + } + } + else + { + // Shade constants + int light = 256 - (args.Light() >> (FRACBITS - 8)); + __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); + __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); + __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); + __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); + shade_fade = _mm_mullo_epi16(shade_fade, inv_light); + __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); + int desaturate = shade_constants.desaturate; + + int count = args.DestX2() - args.DestX1() + 1; + int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); + uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); + + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); + + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) + { + int offset = index * 2; + __m128i bgcolor = 
_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + { + int sample_index = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + { + int sample_index = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + unsigned int sampleout = source[sample_index]; + ifgcolor[1] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + + // Blend + __m128i fgalpha = _mm_set1_epi16(srcalpha); + __m128i bgalpha = _mm_set1_epi16(destalpha); + + fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); + bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); + + __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); + __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); + + out_lo = _mm_srai_epi32(out_lo, 8); + 
out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + _mm_storel_epi64((__m128i*)(dest + offset), outcolor); + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; + int offset = index; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + int sample_index = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + + // Blend + __m128i fgalpha = _mm_set1_epi16(srcalpha); + __m128i bgalpha = _mm_set1_epi16(destalpha); + + fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); + bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); + + __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); + __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, 
_mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); + + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + dest[offset] = _mm_cvtsi128_si32(outcolor); + } + } + } + else + { + bool is_64x64 = xbits == 6 && ybits == 6; + if (is_64x64) + { + // Shade constants + int light = 256 - (args.Light() >> (FRACBITS - 8)); + __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); + __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); + __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); + __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); + shade_fade = _mm_mullo_epi16(shade_fade, inv_light); + __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); + int desaturate = shade_constants.desaturate; + + int count = args.DestX2() - args.DestX1() + 1; + int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); + uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); + + xfrac -= 1 << (31 - xbits); + yfrac -= 1 << (31 - ybits); + uint32_t srcalpha = args.SrcAlpha() >> 
(FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); + + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) + { + int offset = index * 2; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + { + uint32_t xxbits = 26; + uint32_t yybits = 26; + uint32_t xxshift = (32 - xxbits); + uint32_t yyshift = (32 - yybits); + uint32_t xxmask = (1 << xxshift) - 1; + uint32_t yymask = (1 << yyshift) - 1; + uint32_t x = xfrac >> xxbits; + uint32_t y = yfrac >> yybits; + + uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; + uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; + + uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; + uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + { + uint32_t xxbits = 26; + uint32_t yybits = 26; + uint32_t xxshift = (32 - xxbits); + uint32_t yyshift = (32 - yybits); + uint32_t xxmask = (1 << xxshift) - 1; + uint32_t yymask = (1 << yyshift) - 1; + uint32_t x = xfrac >> xxbits; 
+ uint32_t y = yfrac >> yybits; + + uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; + uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; + + uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; + uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[1] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, 
fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + + // Blend + __m128i fgalpha = _mm_set1_epi16(srcalpha); + __m128i bgalpha = _mm_set1_epi16(destalpha); + + fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); + bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); + + __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); + __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); + + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + _mm_storel_epi64((__m128i*)(dest + offset), outcolor); + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; + int offset = index; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + uint32_t xxbits = 26; + uint32_t yybits = 26; + uint32_t xxshift = (32 - xxbits); + uint32_t yyshift = (32 - yybits); + uint32_t xxmask = (1 << xxshift) - 1; + uint32_t yymask = (1 << yyshift) - 1; + uint32_t x = xfrac >> xxbits; + uint32_t y = yfrac >> yybits; + + uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; + uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; + + uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; + uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + 
RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + + // Blend + __m128i fgalpha = _mm_set1_epi16(srcalpha); + __m128i bgalpha = _mm_set1_epi16(destalpha); + + fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); + bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); + + __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); + __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = 
_mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); + + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + dest[offset] = _mm_cvtsi128_si32(outcolor); + } + } + else + { + // Shade constants + int light = 256 - (args.Light() >> (FRACBITS - 8)); + __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); + __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); + __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); + __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); + shade_fade = _mm_mullo_epi16(shade_fade, inv_light); + __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); + int desaturate = shade_constants.desaturate; + + int count = args.DestX2() - args.DestX1() + 1; + int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); + uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); + + xfrac -= 1 << (31 - xbits); + yfrac -= 1 << (31 - ybits); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); + + int ssecount = count / 2; + for (int 
index = 0; index < ssecount; index++) + { + int offset = index * 2; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + { + uint32_t xxbits = 32 - xbits; + uint32_t yybits = 32 - ybits; + uint32_t xxshift = (32 - xxbits); + uint32_t yyshift = (32 - yybits); + uint32_t xxmask = (1 << xxshift) - 1; + uint32_t yymask = (1 << yyshift) - 1; + uint32_t x = xfrac >> xxbits; + uint32_t y = yfrac >> yybits; + + uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; + uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; + + uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; + uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + { + uint32_t xxbits = 32 - xbits; + uint32_t yybits = 32 - ybits; + uint32_t xxshift = (32 - xxbits); + uint32_t yyshift = (32 - yybits); + uint32_t xxmask = (1 << xxshift) - 1; + uint32_t yymask = (1 << yyshift) - 1; + uint32_t x = xfrac >> xxbits; + uint32_t y = yfrac >> yybits; + + uint32_t p00 = source[((y & yymask) + ((x & 
xxmask) << yyshift))]; + uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; + uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; + + uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; + uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[1] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + + 
// Blend + __m128i fgalpha = _mm_set1_epi16(srcalpha); + __m128i bgalpha = _mm_set1_epi16(destalpha); + + fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); + bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); + + __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); + __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); + + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + _mm_storel_epi64((__m128i*)(dest + offset), outcolor); + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; + int offset = index; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + uint32_t xxbits = 32 - xbits; + uint32_t yybits = 32 - ybits; + uint32_t xxshift = (32 - xxbits); + uint32_t yyshift = (32 - yybits); + uint32_t xxmask = (1 << xxshift) - 1; + uint32_t yymask = (1 << yyshift) - 1; + uint32_t x = xfrac >> xxbits; + uint32_t y = yfrac >> yybits; + + uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; + uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; + + uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; + uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t 
sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + + // Blend + __m128i fgalpha = _mm_set1_epi16(srcalpha); + __m128i bgalpha = _mm_set1_epi16(destalpha); + + fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); + bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); + + __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); + __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); + + out_lo 
= _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + dest[offset] = _mm_cvtsi128_si32(outcolor); + } + } + } + } + } + + FString DebugInfo() override { return "DrawSpanTranslucent32Command"; } + }; + + class DrawSpanAddClamp32Command : public DrawerCommand + { + protected: + SpanDrawerArgs args; + + public: + DrawSpanAddClamp32Command(const SpanDrawerArgs &drawerargs) : args(drawerargs) { } + + void Execute(DrawerThread *thread) override + { + if (thread->line_skipped_by_thread(args.DestY())) return; + + uint32_t xbits = args.TextureWidthBits(); + uint32_t ybits = args.TextureHeightBits(); + uint32_t xstep = args.TextureUStep(); + uint32_t ystep = args.TextureVStep(); + uint32_t xfrac = args.TextureUPos(); + uint32_t yfrac = args.TextureVPos(); + uint32_t yshift = 32 - ybits; + uint32_t xshift = yshift - xbits; + uint32_t xmask = ((1 << xbits) - 1) << ybits; + + const uint32_t *source = (const uint32_t*)args.TexturePixels(); + + double lod = args.TextureLOD(); + bool mipmapped = args.MipmappedTexture(); + + bool magnifying = lod < 0.0; + if (r_mipmap && mipmapped) + { + int level = (int)lod; + while (level > 0) + { + if (xbits <= 2 || ybits <= 2) + break; + + source += (1 << (xbits)) * (1 << (ybits)); + xbits -= 1; + ybits -= 1; + level--; + } + } + + bool is_nearest_filter = !((magnifying && r_magfilter) || (!magnifying && r_minfilter)); + + auto shade_constants = args.ColormapConstants(); + if (shade_constants.simple_shade) + { + if (is_nearest_filter) + { + bool is_64x64 = xbits == 6 && ybits == 6; + if (is_64x64) + { + // Shade constants + int light = 256 - (args.Light() >> (FRACBITS - 8)); + __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); + __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - 
light, 256 - light, 256 - light); + + int count = args.DestX2() - args.DestX1() + 1; + int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); + uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); + + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); + + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) + { + int offset = index * 2; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + { + int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + { + int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); + unsigned int sampleout = source[sample_index]; + ifgcolor[1] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + + // Blend + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; + + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); + + fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); + bgcolor = 
_mm_mullo_epi16(bgcolor, bgalpha); + + __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); + __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); + + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + _mm_storel_epi64((__m128i*)(dest + offset), outcolor); + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; + int offset = index; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + + // Blend + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; + + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, 
fgalpha0, fgalpha0, fgalpha0, fgalpha0); + + fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); + bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); + + __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); + __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); + + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + dest[offset] = _mm_cvtsi128_si32(outcolor); + } + } + else + { + // Shade constants + int light = 256 - (args.Light() >> (FRACBITS - 8)); + __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); + __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); + + int count = args.DestX2() - args.DestX1() + 1; + int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); + uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); + + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); + + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) + { + int offset = index * 2; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + { + int sample_index = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + { + int sample_index = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + unsigned int sampleout = 
source[sample_index]; + ifgcolor[1] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + + // Blend + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; + + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); + + fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); + bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); + + __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); + __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); + + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + _mm_storel_epi64((__m128i*)(dest + offset), outcolor); + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; + int offset = index; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); + + // Sample + unsigned int 
ifgcolor[2]; + int sample_index = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + + // Blend + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; + + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); + + fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); + bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); + + __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); + __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); + + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + dest[offset] = _mm_cvtsi128_si32(outcolor); + } + } + } + else + { + bool is_64x64 = xbits == 6 && ybits == 6; + if (is_64x64) + { + // Shade constants + int light = 256 - 
(args.Light() >> (FRACBITS - 8)); + __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); + __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); + + int count = args.DestX2() - args.DestX1() + 1; + int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); + uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); + + xfrac -= 1 << (31 - xbits); + yfrac -= 1 << (31 - ybits); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); + + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) + { + int offset = index * 2; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + { + uint32_t xxbits = 26; + uint32_t yybits = 26; + uint32_t xxshift = (32 - xxbits); + uint32_t yyshift = (32 - yybits); + uint32_t xxmask = (1 << xxshift) - 1; + uint32_t yymask = (1 << yyshift) - 1; + uint32_t x = xfrac >> xxbits; + uint32_t y = yfrac >> yybits; + + uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; + uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; + + uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; + uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + 
BPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + { + uint32_t xxbits = 26; + uint32_t yybits = 26; + uint32_t xxshift = (32 - xxbits); + uint32_t yyshift = (32 - yybits); + uint32_t xxmask = (1 << xxshift) - 1; + uint32_t yymask = (1 << yyshift) - 1; + uint32_t x = xfrac >> xxbits; + uint32_t y = yfrac >> yybits; + + uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; + uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; + + uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; + uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[1] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + + // Blend + uint32_t alpha0 = APART(ifgcolor[0]); + 
uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; + + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); + + fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); + bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); + + __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); + __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); + + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + _mm_storel_epi64((__m128i*)(dest + offset), outcolor); + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; + int offset = index; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + uint32_t xxbits = 26; + uint32_t yybits = 26; + uint32_t xxshift = (32 - xxbits); + uint32_t yyshift = (32 - yybits); + uint32_t xxmask = (1 << xxshift) - 1; + uint32_t yymask = (1 << yyshift) - 1; + uint32_t x = xfrac >> xxbits; + uint32_t y = yfrac >> yybits; + + uint32_t p00 = source[((y & yymask) + ((x 
& xxmask) << yyshift))]; + uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; + uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; + + uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; + uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + + // Blend + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; + + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, 
fgalpha0); + + fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); + bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); + + __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); + __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); + + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + dest[offset] = _mm_cvtsi128_si32(outcolor); + } + } + else + { + // Shade constants + int light = 256 - (args.Light() >> (FRACBITS - 8)); + __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); + __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); + + int count = args.DestX2() - args.DestX1() + 1; + int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); + uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); + + xfrac -= 1 << (31 - xbits); + yfrac -= 1 << (31 - ybits); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); + + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) + { + int offset = index * 2; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + { + uint32_t xxbits = 32 - xbits; + uint32_t yybits = 32 - ybits; + uint32_t xxshift = (32 - xxbits); + uint32_t yyshift = (32 - yybits); + uint32_t xxmask = (1 << xxshift) - 1; + uint32_t yymask = (1 << yyshift) - 1; + uint32_t x = xfrac >> xxbits; + uint32_t y 
= yfrac >> yybits; + + uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; + uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; + + uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; + uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + { + uint32_t xxbits = 32 - xbits; + uint32_t yybits = 32 - ybits; + uint32_t xxshift = (32 - xxbits); + uint32_t yyshift = (32 - yybits); + uint32_t xxmask = (1 << xxshift) - 1; + uint32_t yymask = (1 << yyshift) - 1; + uint32_t x = xfrac >> xxbits; + uint32_t y = yfrac >> yybits; + + uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; + uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; + + uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; + uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + 
RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[1] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + + // Blend + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; + + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); + + fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); + bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); + + __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); + __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = 
_mm_add_epi32(fg_hi, bg_hi); + + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + _mm_storel_epi64((__m128i*)(dest + offset), outcolor); + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; + int offset = index; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + uint32_t xxbits = 32 - xbits; + uint32_t yybits = 32 - ybits; + uint32_t xxshift = (32 - xxbits); + uint32_t yyshift = (32 - yybits); + uint32_t xxmask = (1 << xxshift) - 1; + uint32_t yymask = (1 << yyshift) - 1; + uint32_t x = xfrac >> xxbits; + uint32_t y = yfrac >> yybits; + + uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; + uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; + + uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; + uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + 
__m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + + // Blend + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; + + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); + + fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); + bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); + + __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); + __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); + + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + dest[offset] = _mm_cvtsi128_si32(outcolor); + } + } + } + } + else + { + if (is_nearest_filter) + { + bool is_64x64 = xbits == 6 && ybits == 6; + if (is_64x64) + { + // Shade constants + int light = 256 - (args.Light() >> (FRACBITS - 8)); + __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); + __m128i inv_light = 
_mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); + __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); + __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); + shade_fade = _mm_mullo_epi16(shade_fade, inv_light); + __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); + int desaturate = shade_constants.desaturate; + + int count = args.DestX2() - args.DestX1() + 1; + int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); + uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); + + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); + + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) + { + int offset = index * 2; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + { + int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + { + int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); + unsigned int sampleout = source[sample_index]; + ifgcolor[1] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + __m128i fgcolor = 
_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + + // Blend + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; + + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); + + fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); + bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); + + __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); + __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = 
_mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); + + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + _mm_storel_epi64((__m128i*)(dest + offset), outcolor); + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; + int offset = index; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + + // Blend + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; + + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + 
uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); + + fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); + bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); + + __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); + __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); + + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + dest[offset] = _mm_cvtsi128_si32(outcolor); + } + } + else + { + // Shade constants + int light = 256 - (args.Light() >> (FRACBITS - 8)); + __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); + __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); + __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); + __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, 
shade_constants.fade_blue); + shade_fade = _mm_mullo_epi16(shade_fade, inv_light); + __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); + int desaturate = shade_constants.desaturate; + + int count = args.DestX2() - args.DestX1() + 1; + int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); + uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); + + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); + + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) + { + int offset = index * 2; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + { + int sample_index = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + { + int sample_index = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + unsigned int sampleout = source[sample_index]; + ifgcolor[1] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = 
_mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + + // Blend + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; + + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); + + fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); + bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); + + __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); + __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); + + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + _mm_storel_epi64((__m128i*)(dest + offset), outcolor); + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; + int offset = index; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); + + // Sample + unsigned 
int ifgcolor[2]; + int sample_index = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + + // Blend + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; + + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); + + fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); + bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); + + __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_lo = 
_mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); + __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); + + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + dest[offset] = _mm_cvtsi128_si32(outcolor); + } + } + } + else + { + bool is_64x64 = xbits == 6 && ybits == 6; + if (is_64x64) + { + // Shade constants + int light = 256 - (args.Light() >> (FRACBITS - 8)); + __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); + __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); + __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); + __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); + shade_fade = _mm_mullo_epi16(shade_fade, inv_light); + __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); + int desaturate = shade_constants.desaturate; + + int count = args.DestX2() - args.DestX1() + 1; + int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); + uint32_t *dest = 
(uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); + + xfrac -= 1 << (31 - xbits); + yfrac -= 1 << (31 - ybits); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); + + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) + { + int offset = index * 2; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + { + uint32_t xxbits = 26; + uint32_t yybits = 26; + uint32_t xxshift = (32 - xxbits); + uint32_t yyshift = (32 - yybits); + uint32_t xxmask = (1 << xxshift) - 1; + uint32_t yymask = (1 << yyshift) - 1; + uint32_t x = xfrac >> xxbits; + uint32_t y = yfrac >> yybits; + + uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; + uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; + + uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; + uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + { + uint32_t xxbits = 26; + uint32_t yybits = 26; + 
uint32_t xxshift = (32 - xxbits); + uint32_t yyshift = (32 - yybits); + uint32_t xxmask = (1 << xxshift) - 1; + uint32_t yymask = (1 << yyshift) - 1; + uint32_t x = xfrac >> xxbits; + uint32_t y = yfrac >> yybits; + + uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; + uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; + + uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; + uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[1] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = 
_mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + + // Blend + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; + + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); + + fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); + bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); + + __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); + __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); + + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + _mm_storel_epi64((__m128i*)(dest + offset), outcolor); + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; + int offset = index; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); + + // Sample + unsigned 
int ifgcolor[2]; + uint32_t xxbits = 26; + uint32_t yybits = 26; + uint32_t xxshift = (32 - xxbits); + uint32_t yyshift = (32 - yybits); + uint32_t xxmask = (1 << xxshift) - 1; + uint32_t yymask = (1 << yyshift) - 1; + uint32_t x = xfrac >> xxbits; + uint32_t y = yfrac >> yybits; + + uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; + uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; + + uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; + uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, 
intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + + // Blend + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; + + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); + + fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); + bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); + + __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); + __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); + + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + dest[offset] = _mm_cvtsi128_si32(outcolor); + } + } + else + { + // Shade constants + int light = 256 - (args.Light() >> (FRACBITS - 8)); + __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); 
+ __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); + __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); + __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); + shade_fade = _mm_mullo_epi16(shade_fade, inv_light); + __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); + int desaturate = shade_constants.desaturate; + + int count = args.DestX2() - args.DestX1() + 1; + int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); + uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); + + xfrac -= 1 << (31 - xbits); + yfrac -= 1 << (31 - ybits); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); + + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) + { + int offset = index * 2; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + { + uint32_t xxbits = 32 - xbits; + uint32_t yybits = 32 - ybits; + uint32_t xxshift = (32 - xxbits); + uint32_t yyshift = (32 - yybits); + uint32_t xxmask = (1 << xxshift) - 1; + uint32_t yymask = (1 << yyshift) - 1; + uint32_t x = xfrac >> xxbits; + uint32_t y = yfrac >> yybits; + + uint32_t p00 = source[((y & yymask) + ((x & xxmask) << 
yyshift))]; + uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; + uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; + + uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; + uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + { + uint32_t xxbits = 32 - xbits; + uint32_t yybits = 32 - ybits; + uint32_t xxshift = (32 - xxbits); + uint32_t yyshift = (32 - yybits); + uint32_t xxmask = (1 << xxshift) - 1; + uint32_t yymask = (1 << yyshift) - 1; + uint32_t x = xfrac >> xxbits; + uint32_t y = yfrac >> yybits; + + uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; + uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; + + uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; + uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t 
sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[1] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + + // Blend + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; + + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, 
bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); + + fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); + bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); + + __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); + __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); + + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + _mm_storel_epi64((__m128i*)(dest + offset), outcolor); + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; + int offset = index; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + uint32_t xxbits = 32 - xbits; + uint32_t yybits = 32 - ybits; + uint32_t xxshift = (32 - xxbits); + uint32_t yyshift = (32 - yybits); + uint32_t xxmask = (1 << xxshift) - 1; + uint32_t yymask = (1 << yyshift) - 1; + uint32_t x = xfrac >> xxbits; + uint32_t y = yfrac >> yybits; + + uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; + uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; + + uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; + uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + 
RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + + // Blend + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; + + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + 
+ __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); + + fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); + bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); + + __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); + __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); + + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + dest[offset] = _mm_cvtsi128_si32(outcolor); + } + } + } + } + } + + FString DebugInfo() override { return "DrawSpanAddClamp32Command"; } + }; + +} diff --git a/src/swrenderer/drawers/r_draw_span32.php b/src/swrenderer/drawers/r_draw_span32.php new file mode 100644 index 000000000..6a8a0b32a --- /dev/null +++ b/src/swrenderer/drawers/r_draw_span32.php @@ -0,0 +1,388 @@ +#!/usr/bin/php +/* +** Drawer commands for spans +** Copyright (c) 2016 Magnus Norddahl +** +** This software is provided 'as-is', without any express or implied +** warranty. In no event will the authors be held liable for any damages +** arising from the use of this software. +** +** Permission is granted to anyone to use this software for any purpose, +** including commercial applications, and to alter it and redistribute it +** freely, subject to the following restrictions: +** +** 1. The origin of this software must not be misrepresented; you must not +** claim that you wrote the original software. 
If you use this software +** in a product, an acknowledgment in the product documentation would be +** appreciated but is not required. +** 2. Altered source versions must be plainly marked as such, and must not be +** misrepresented as being the original software. +** 3. This notice may not be removed or altered from any source distribution. +** +*/ + +/* + Warning: this C++ source file has been auto-generated. Please modify the original php script that generated it. +*/ + +#pragma once + +#include "swrenderer/drawers/r_draw_rgba.h" +#include "swrenderer/viewport/r_spandrawer.h" + +namespace swrenderer +{ + + class : public DrawerCommand + { + protected: + SpanDrawerArgs args; + + public: + (const SpanDrawerArgs &drawerargs) : args(drawerargs) { } + + void Execute(DrawerThread *thread) override /* Draws the span described by args; shade mode, filter and texture size pick the loop variant. */ + { + if (thread->line_skipped_by_thread(args.DestY())) return; /* nothing to draw when this thread skips the span's row */ + + uint32_t xbits = args.TextureWidthBits(); + uint32_t ybits = args.TextureHeightBits(); + uint32_t xstep = args.TextureUStep(); + uint32_t ystep = args.TextureVStep(); + uint32_t xfrac = args.TextureUPos(); + uint32_t yfrac = args.TextureVPos(); + + const uint32_t *source = (const uint32_t*)args.TexturePixels(); + + double lod = args.TextureLOD(); + bool mipmapped = args.MipmappedTexture(); + + bool magnifying = lod < 0.0; + if (r_mipmap && mipmapped) + { + int level = (int)lod; + while (level > 0) + { + if (xbits <= 2 || ybits <= 2) + break; + + source += (1 << (xbits)) * (1 << (ybits)); /* step over this level's texels to the next level */ + xbits -= 1; /* which is half as wide */ + ybits -= 1; /* and half as tall */ + level--; + } + } + + uint32_t yshift = 32 - ybits; /* derived only after mip selection: the nearest sampler must index with the chosen level's size */ + uint32_t xshift = yshift - xbits; + uint32_t xmask = ((1 << xbits) - 1) << ybits; + bool is_nearest_filter = !((magnifying && r_magfilter) || (!magnifying && r_minfilter)); + + auto shade_constants = args.ColormapConstants(); + if (shade_constants.simple_shade) + { + + } + else + { + + } + } + + FString DebugInfo() override { return ""; } + }; + + + if (is_nearest_filter) + { + + } + else + { + + } + + bool is_64x64 = xbits == 6 && ybits ==
6; + if (is_64x64) + { + + } + else + { + + } + + // Shade constants + int light = 256 - (args.Light() >> (FRACBITS - 8)); + __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); /* per pixel the 16-bit lanes are B,G,R,A from low to high; the alpha lanes get 256 */ + __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); + + __m128i inv_desaturate = _mm_set_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); /* high lane first like the other constants, so 256 sits on alpha and the colour lanes are scaled; _mm_setr_epi16 put the 256 on blue */ + __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); + shade_fade = _mm_mullo_epi16(shade_fade, inv_light); + __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); + int desaturate = shade_constants.desaturate; + + + int count = args.DestX2() - args.DestX1() + 1; /* DestX2 is inclusive */ + int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); + uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); + + + xfrac -= 1 << (31 - xbits); /* back up half a texel (a texel spans 1 << (32 - xbits)) */ + yfrac -= 1 << (31 - ybits); + + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); + + int ssecount = count / 2; /* two pixels per SSE iteration */ + for (int index = 0; index < ssecount; index++) + { + int offset = index * 2; + + __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); + + + // Sample + unsigned int ifgcolor[2]; + { + + ifgcolor[0] = sampleout; + xfrac += xstep; + yfrac += ystep; + } + { + + ifgcolor[1] = sampleout; + xfrac += xstep; +
yfrac += ystep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); /* widen both sampled pixels to 16-bit lanes */ + + // Shade + + + // Blend + + + _mm_storel_epi64((__m128i*)(dest + offset), outcolor); + } + + if (ssecount * 2 != count) /* odd count: one pixel is left over */ + { + int index = ssecount * 2; + int offset = index; + + __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); + + + // Sample + unsigned int ifgcolor[2]; + + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; /* second slot is padding for the single leftover pixel */ + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + + + // Blend + + + dest[offset] = _mm_cvtsi128_si32(outcolor); + } + + int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); /* nearest texel with the 64x64 size folded into constants */ + unsigned int sampleout = source[sample_index]; + + int sample_index = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); /* nearest texel for any size: x selects the column, y the entry within it */ + unsigned int sampleout = source[sample_index]; + + uint32_t xxbits = 26; /* 32 - 6; NOTE(review): looks like the 64x64 alternative to the pair of declarations that follows - confirm against the PHP source */ + uint32_t yybits = 26; + + uint32_t xxbits = 32 - xbits; + uint32_t yybits = 32 - ybits; + + uint32_t xxshift = (32 - xxbits); + uint32_t yyshift = (32 - yybits); + uint32_t xxmask = (1 << xxshift) - 1; + uint32_t yymask = (1 << yyshift) - 1; + uint32_t x = xfrac >> xxbits; + uint32_t y = yfrac >> yybits; + + uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; /* the four neighbouring texels; the masks wrap x+1 and y+1 at the texture edge */ + uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; + uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; + uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; + + uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; /* 4-bit sub-texel fraction in x */ + uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; /* and in y */ + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; /* the four weights sum to 16 * 16 = 256, hence >> 8 */ + uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t
sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; /* repack the filtered channels as 0xAARRGGBB */ + + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); /* light-only shade: scale each lane by mlight/256 */ + + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; /* weighted grey level of pixel 0, pre-scaled by desaturate */ + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); /* zero in the alpha lanes */ + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + + __m128i outcolor = fgcolor; /* no blending: the shaded texel is the result */ + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + + __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); /* lanes 0-3 <- pixel 0 alpha (lane 3) */ + alpha = _mm_shufflehi_epi16(alpha, _MM_SHUFFLE(3,3,3,3)); /* lanes 4-7 <- pixel 1 alpha (lane 7); must read alpha, not fgcolor, or the low-half broadcast above is thrown away */ + alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 + __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); + + fgcolor = _mm_mullo_epi16(fgcolor, alpha); + bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); + __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + __m128i fgalpha = _mm_set1_epi16(srcalpha); + __m128i bgalpha = _mm_set1_epi16(destalpha); + + fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); +
bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); + + __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); + __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); + + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; + + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); + + fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); + bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); + + __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); + __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); + + __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); + + __m128i out_lo = 
_mm_sub_epi32(bg_lo, fg_lo); + __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); + + + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + +}