From e697746e7d526f8ab4164872f9414b561b5ace4b Mon Sep 17 00:00:00 2001 From: Magnus Norddahl Date: Fri, 24 Feb 2017 16:59:45 +0100 Subject: [PATCH] Remove the php drawers and their generated output --- src/swrenderer/drawers/r_draw_rgba.cpp | 2 +- .../{r_draw_sky32.h => r_draw_sky32_sse2.h} | 0 src/swrenderer/drawers/r_draw_span32.h | 8054 ----------------- src/swrenderer/drawers/r_draw_span32.php | 443 - src/swrenderer/drawers/r_draw_sprite32.h | 6898 -------------- src/swrenderer/drawers/r_draw_sprite32.php | 385 - src/swrenderer/drawers/r_draw_wall32.h | 5416 ----------- src/swrenderer/drawers/r_draw_wall32.php | 370 - 8 files changed, 1 insertion(+), 21567 deletions(-) rename src/swrenderer/drawers/{r_draw_sky32.h => r_draw_sky32_sse2.h} (100%) delete mode 100644 src/swrenderer/drawers/r_draw_span32.h delete mode 100644 src/swrenderer/drawers/r_draw_span32.php delete mode 100644 src/swrenderer/drawers/r_draw_sprite32.h delete mode 100644 src/swrenderer/drawers/r_draw_sprite32.php delete mode 100644 src/swrenderer/drawers/r_draw_wall32.h delete mode 100644 src/swrenderer/drawers/r_draw_wall32.php diff --git a/src/swrenderer/drawers/r_draw_rgba.cpp b/src/swrenderer/drawers/r_draw_rgba.cpp index aa0855a102..48bbc2c38b 100644 --- a/src/swrenderer/drawers/r_draw_rgba.cpp +++ b/src/swrenderer/drawers/r_draw_rgba.cpp @@ -42,7 +42,7 @@ #include "r_draw_wall32_sse2.h" #include "r_draw_sprite32_sse2.h" #include "r_draw_span32_sse2.h" -#include "r_draw_sky32.h" +#include "r_draw_sky32_sse2.h" #include "gi.h" #include "stats.h" diff --git a/src/swrenderer/drawers/r_draw_sky32.h b/src/swrenderer/drawers/r_draw_sky32_sse2.h similarity index 100% rename from src/swrenderer/drawers/r_draw_sky32.h rename to src/swrenderer/drawers/r_draw_sky32_sse2.h diff --git a/src/swrenderer/drawers/r_draw_span32.h b/src/swrenderer/drawers/r_draw_span32.h deleted file mode 100644 index af2970f986..0000000000 --- a/src/swrenderer/drawers/r_draw_span32.h +++ /dev/null @@ -1,8054 +0,0 @@ -/* -** Drawer commands for spans -** Copyright (c) 2016 Magnus Norddahl -** -** This software is provided 'as-is', without any express or implied -** warranty. In no event will the authors be held liable for any damages -** arising from the use of this software. -** -** Permission is granted to anyone to use this software for any purpose, -** including commercial applications, and to alter it and redistribute it -** freely, subject to the following restrictions: -** -** 1. The origin of this software must not be misrepresented; you must not -** claim that you wrote the original software. If you use this software -** in a product, an acknowledgment in the product documentation would be -** appreciated but is not required. -** 2. Altered source versions must be plainly marked as such, and must not be -** misrepresented as being the original software. -** 3. This notice may not be removed or altered from any source distribution. -** -*/ - -/* - Warning: this C++ source file has been auto-generated. Please modify the original php script that generated it. -*/ - -#pragma once - -#include "swrenderer/drawers/r_draw_rgba.h" -#include "swrenderer/viewport/r_spandrawer.h" - -namespace swrenderer -{ - class DrawSpan32Command : public DrawerCommand - { - protected: - SpanDrawerArgs args; - - public: - DrawSpan32Command(const SpanDrawerArgs &drawerargs) : args(drawerargs) { } - - void Execute(DrawerThread *thread) override - { - if (thread->line_skipped_by_thread(args.DestY())) return; - - uint32_t xbits = args.TextureWidthBits(); - uint32_t ybits = args.TextureHeightBits(); - uint32_t xstep = args.TextureUStep(); - uint32_t ystep = args.TextureVStep(); - uint32_t xfrac = args.TextureUPos(); - uint32_t yfrac = args.TextureVPos(); - uint32_t yshift = 32 - ybits; - uint32_t xshift = yshift - xbits; - uint32_t xmask = ((1 << xbits) - 1) << ybits; - - const uint32_t *source = (const uint32_t*)args.TexturePixels(); - - double lod = args.TextureLOD(); - bool mipmapped = args.MipmappedTexture(); - - bool magnifying = lod < 0.0; - if (r_mipmap && mipmapped) - { - int level = (int)lod; - while (level > 0) - { - if (xbits <= 2 || ybits <= 2) - break; - - source += (1 << (xbits)) * (1 << (ybits)); - xbits -= 1; - ybits -= 1; - level--; - } - } - - bool is_nearest_filter = !((magnifying && r_magfilter) || (!magnifying && r_minfilter)); - - auto shade_constants = args.ColormapConstants(); - if (shade_constants.simple_shade) - { - if (is_nearest_filter) - { - bool is_64x64 = xbits == 6 && ybits == 6; - if (is_64x64) - { - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpx = args.dc_viewpos.X; - float stepvpx = args.dc_viewpos_step.X; - __m128 viewpos_x = _mm_setr_ps(vpx, vpx + stepvpx, 0.0f, 0.0f); - __m128 step_viewpos_x = _mm_set1_ps(stepvpx * 2.0f); - - int count = args.DestX2() - args.DestX1() + 1; - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); - - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * 2; - - // Sample - unsigned int ifgcolor[2]; - { - int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - { - int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); - unsigned int sampleout = source[sample_index]; - ifgcolor[1] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - _mm_storel_epi64((__m128i*)(dest + offset), outcolor); - viewpos_x = _mm_add_ps(viewpos_x, step_viewpos_x); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index; - - // Sample - unsigned int ifgcolor[2]; - int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - else - { - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpx = args.dc_viewpos.X; - float stepvpx = args.dc_viewpos_step.X; - __m128 viewpos_x = _mm_setr_ps(vpx, vpx + stepvpx, 0.0f, 0.0f); - __m128 step_viewpos_x = _mm_set1_ps(stepvpx * 2.0f); - - int count = args.DestX2() - args.DestX1() + 1; - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); - - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * 2; - - // Sample - unsigned int ifgcolor[2]; - { - int sample_index = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - { - int sample_index = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); - unsigned int sampleout = source[sample_index]; - ifgcolor[1] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - _mm_storel_epi64((__m128i*)(dest + offset), outcolor); - viewpos_x = _mm_add_ps(viewpos_x, step_viewpos_x); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index; - - // Sample - unsigned int ifgcolor[2]; - int sample_index = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - } - else - { - bool is_64x64 = xbits == 6 && ybits == 6; - if (is_64x64) - { - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpx = args.dc_viewpos.X; - float stepvpx = args.dc_viewpos_step.X; - __m128 viewpos_x = _mm_setr_ps(vpx, vpx + stepvpx, 0.0f, 0.0f); - __m128 step_viewpos_x = _mm_set1_ps(stepvpx * 2.0f); - - int count = args.DestX2() - args.DestX1() + 1; - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); - - xfrac -= 1 << (31 - xbits); - yfrac -= 1 << (31 - ybits); - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * 2; - - // Sample - unsigned int ifgcolor[2]; - { - uint32_t xxbits = 26; - uint32_t yybits = 26; - uint32_t xxshift = (32 - xxbits); - uint32_t yyshift = (32 - yybits); - uint32_t xxmask = (1 << xxshift) - 1; - uint32_t yymask = (1 << yyshift) - 1; - uint32_t x = xfrac >> xxbits; - uint32_t y = yfrac >> yybits; - - uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; - uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; - - uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; - uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; - uint32_t a = 16 - inv_a; - uint32_t b = 16 - inv_b; - - uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - { - uint32_t xxbits = 26; - uint32_t yybits = 26; - uint32_t xxshift = (32 - xxbits); - uint32_t yyshift = (32 - yybits); - uint32_t xxmask = (1 << xxshift) - 1; - uint32_t yymask = (1 << yyshift) - 1; - uint32_t x = xfrac >> xxbits; - uint32_t y = yfrac >> yybits; - - uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; - uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; - - uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; - uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; - uint32_t a = 16 - inv_a; - uint32_t b = 16 - inv_b; - - uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[1] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - _mm_storel_epi64((__m128i*)(dest + offset), outcolor); - viewpos_x = _mm_add_ps(viewpos_x, step_viewpos_x); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index; - - // Sample - unsigned int ifgcolor[2]; - uint32_t xxbits = 26; - uint32_t yybits = 26; - uint32_t xxshift = (32 - xxbits); - uint32_t yyshift = (32 - yybits); - uint32_t xxmask = (1 << xxshift) - 1; - uint32_t yymask = (1 << yyshift) - 1; - uint32_t x = xfrac >> xxbits; - uint32_t y = yfrac >> yybits; - - uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; - uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; - - uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; - uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; - uint32_t a = 16 - inv_a; - uint32_t b = 16 - inv_b; - - uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - else - { - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpx = args.dc_viewpos.X; - float stepvpx = args.dc_viewpos_step.X; - __m128 viewpos_x = _mm_setr_ps(vpx, vpx + stepvpx, 0.0f, 0.0f); - __m128 step_viewpos_x = _mm_set1_ps(stepvpx * 2.0f); - - int count = args.DestX2() - args.DestX1() + 1; - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); - - xfrac -= 1 << (31 - xbits); - yfrac -= 1 << (31 - ybits); - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * 2; - - // Sample - unsigned int ifgcolor[2]; - { - uint32_t xxbits = 32 - xbits; - uint32_t yybits = 32 - ybits; - uint32_t xxshift = (32 - xxbits); - uint32_t yyshift = (32 - yybits); - uint32_t xxmask = (1 << xxshift) - 1; - uint32_t yymask = (1 << yyshift) - 1; - uint32_t x = xfrac >> xxbits; - uint32_t y = yfrac >> yybits; - - uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; - uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; - - uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; - uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; - uint32_t a = 16 - inv_a; - uint32_t b = 16 - inv_b; - - uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - { - uint32_t xxbits = 32 - xbits; - uint32_t yybits = 32 - ybits; - uint32_t xxshift = (32 - xxbits); - uint32_t yyshift = (32 - yybits); - uint32_t xxmask = (1 << xxshift) - 1; - uint32_t yymask = (1 << yyshift) - 1; - uint32_t x = xfrac >> xxbits; - uint32_t y = yfrac >> yybits; - - uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; - uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; - - uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; - uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; - uint32_t a = 16 - inv_a; - uint32_t b = 16 - inv_b; - - uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[1] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - _mm_storel_epi64((__m128i*)(dest + offset), outcolor); - viewpos_x = _mm_add_ps(viewpos_x, step_viewpos_x); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index; - - // Sample - unsigned int ifgcolor[2]; - uint32_t xxbits = 32 - xbits; - uint32_t yybits = 32 - ybits; - uint32_t xxshift = (32 - xxbits); - uint32_t yyshift = (32 - yybits); - uint32_t xxmask = (1 << xxshift) - 1; - uint32_t yymask = (1 << yyshift) - 1; - uint32_t x = xfrac >> xxbits; - uint32_t y = yfrac >> yybits; - - uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; - uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; - - uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; - uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; - uint32_t a = 16 - inv_a; - uint32_t b = 16 - inv_b; - - uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - } - } - else - { - if (is_nearest_filter) - { - bool is_64x64 = xbits == 6 && ybits == 6; - if (is_64x64) - { - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); - __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); - shade_fade = _mm_mullo_epi16(shade_fade, inv_light); - __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); - int desaturate = shade_constants.desaturate; - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpx = args.dc_viewpos.X; - float stepvpx = args.dc_viewpos_step.X; - __m128 viewpos_x = _mm_setr_ps(vpx, vpx + stepvpx, 0.0f, 0.0f); - __m128 step_viewpos_x = _mm_set1_ps(stepvpx * 2.0f); - - int count = args.DestX2() - args.DestX1() + 1; - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); - - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * 2; - - // Sample - unsigned int ifgcolor[2]; - { - int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - { - int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); - unsigned int sampleout = source[sample_index]; - ifgcolor[1] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - _mm_storel_epi64((__m128i*)(dest + offset), outcolor); - viewpos_x = _mm_add_ps(viewpos_x, step_viewpos_x); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index; - - // Sample - unsigned int ifgcolor[2]; - int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - else - { - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); - __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); - shade_fade = _mm_mullo_epi16(shade_fade, inv_light); - __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); - int desaturate = shade_constants.desaturate; - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpx = args.dc_viewpos.X; - float stepvpx = args.dc_viewpos_step.X; - __m128 viewpos_x = _mm_setr_ps(vpx, vpx + stepvpx, 0.0f, 0.0f); - __m128 step_viewpos_x = _mm_set1_ps(stepvpx * 2.0f); - - int count = args.DestX2() - args.DestX1() + 1; - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); - - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * 2; - - // Sample - unsigned int ifgcolor[2]; - { - int sample_index = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - { - int sample_index = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); - unsigned int sampleout = source[sample_index]; - ifgcolor[1] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - _mm_storel_epi64((__m128i*)(dest + offset), outcolor); - viewpos_x = _mm_add_ps(viewpos_x, step_viewpos_x); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index; - - // Sample - unsigned int ifgcolor[2]; - int sample_index = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - } - else - { - bool is_64x64 = xbits == 6 && ybits == 6; - if (is_64x64) - { - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); - __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); - shade_fade = _mm_mullo_epi16(shade_fade, inv_light); - __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); - int desaturate = shade_constants.desaturate; - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpx = args.dc_viewpos.X; - float stepvpx = args.dc_viewpos_step.X; - __m128 viewpos_x = _mm_setr_ps(vpx, vpx + stepvpx, 0.0f, 0.0f); - __m128 step_viewpos_x = _mm_set1_ps(stepvpx * 2.0f); - - int count = args.DestX2() - args.DestX1() + 1; - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); - - xfrac -= 1 << (31 - xbits); - yfrac -= 1 << (31 - ybits); - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * 2; - - // Sample - unsigned int ifgcolor[2]; - { - uint32_t xxbits = 26; - uint32_t yybits = 26; - uint32_t xxshift = (32 - xxbits); - uint32_t yyshift = (32 - yybits); - uint32_t xxmask = (1 << xxshift) - 1; - uint32_t yymask = (1 << yyshift) - 1; - uint32_t x = xfrac >> xxbits; - uint32_t y = yfrac >> yybits; - - uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; - uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; - - uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; - uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; - uint32_t a = 16 - inv_a; - uint32_t b = 16 - inv_b; - - uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - { - uint32_t xxbits = 26; - uint32_t yybits = 26; - uint32_t xxshift = (32 - xxbits); - uint32_t yyshift = (32 - yybits); - uint32_t xxmask = (1 << xxshift) - 1; - uint32_t yymask = (1 << yyshift) - 1; - uint32_t x = xfrac >> xxbits; - uint32_t y = yfrac >> yybits; - - uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; - uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; - - uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; - uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; - uint32_t a = 16 - inv_a; - uint32_t b = 16 - inv_b; - - uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[1] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - _mm_storel_epi64((__m128i*)(dest + offset), outcolor); - viewpos_x = _mm_add_ps(viewpos_x, step_viewpos_x); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index; - - // Sample - unsigned int ifgcolor[2]; - uint32_t xxbits = 26; - uint32_t yybits = 26; - uint32_t xxshift = (32 - xxbits); - uint32_t yyshift = (32 - yybits); - uint32_t xxmask = (1 << xxshift) - 1; - uint32_t yymask = (1 << yyshift) - 1; - uint32_t x = xfrac >> xxbits; - uint32_t y = yfrac >> yybits; - - uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; - uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; - - uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; - uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; - uint32_t a = 16 - inv_a; - uint32_t b = 16 - inv_b; - - uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - else - { - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); - __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); - shade_fade = _mm_mullo_epi16(shade_fade, inv_light); - __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); - int desaturate = shade_constants.desaturate; - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpx = args.dc_viewpos.X; - float stepvpx = args.dc_viewpos_step.X; - __m128 viewpos_x = _mm_setr_ps(vpx, vpx + stepvpx, 0.0f, 0.0f); - __m128 step_viewpos_x = _mm_set1_ps(stepvpx * 2.0f); - - int count = args.DestX2() - args.DestX1() + 1; - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); - - xfrac -= 1 << (31 - xbits); - yfrac -= 1 << (31 - ybits); - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * 2; - - // Sample - unsigned int ifgcolor[2]; - { - uint32_t xxbits = 32 - xbits; - uint32_t yybits = 32 - ybits; - uint32_t xxshift = (32 - xxbits); - uint32_t yyshift = (32 - yybits); - uint32_t xxmask = (1 << xxshift) - 1; - uint32_t yymask = (1 << yyshift) - 1; - uint32_t x = xfrac >> xxbits; - uint32_t y = yfrac >> yybits; - - uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; - uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; - - uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; - uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; - uint32_t a = 16 - inv_a; - uint32_t b = 16 - inv_b; - - uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - { - uint32_t xxbits = 32 - xbits; - uint32_t yybits = 32 - ybits; - uint32_t xxshift = (32 - xxbits); - uint32_t yyshift = (32 - yybits); - uint32_t xxmask = (1 << xxshift) - 1; - uint32_t yymask = (1 << yyshift) - 1; - uint32_t x = xfrac >> xxbits; - uint32_t y = yfrac >> yybits; - - uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; - uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; - - uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; - uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; - uint32_t a = 16 - inv_a; - uint32_t b = 16 - inv_b; - - uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[1] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - _mm_storel_epi64((__m128i*)(dest + offset), outcolor); - viewpos_x = _mm_add_ps(viewpos_x, step_viewpos_x); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index; - - // Sample - unsigned int ifgcolor[2]; - uint32_t xxbits = 32 - xbits; - uint32_t yybits = 32 - ybits; - uint32_t xxshift = (32 - xxbits); - uint32_t yyshift = (32 - yybits); - uint32_t xxmask = (1 << xxshift) - 1; - uint32_t yymask = (1 << yyshift) - 1; - uint32_t x = xfrac >> xxbits; - uint32_t y = yfrac >> yybits; - - uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; - uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; - - uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; - uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; - uint32_t a = 16 - inv_a; - uint32_t b = 16 - inv_b; - - uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - } - } - } - - FString DebugInfo() override { return "DrawSpan32Command"; } - }; - - class DrawSpanMasked32Command : public DrawerCommand - { - protected: - SpanDrawerArgs args; - - public: - DrawSpanMasked32Command(const SpanDrawerArgs &drawerargs) : args(drawerargs) { } - - void Execute(DrawerThread *thread) override - { - if (thread->line_skipped_by_thread(args.DestY())) return; - - uint32_t xbits = args.TextureWidthBits(); - uint32_t ybits = args.TextureHeightBits(); - uint32_t xstep = args.TextureUStep(); - uint32_t ystep = args.TextureVStep(); - uint32_t xfrac = args.TextureUPos(); - uint32_t yfrac = args.TextureVPos(); - uint32_t yshift = 32 - ybits; - uint32_t xshift = yshift - xbits; - uint32_t xmask = ((1 << xbits) - 1) << ybits; - - const uint32_t *source = (const uint32_t*)args.TexturePixels(); - - double lod = args.TextureLOD(); - bool mipmapped = args.MipmappedTexture(); - - bool magnifying = lod < 0.0; - if (r_mipmap && mipmapped) - { - int level = (int)lod; - while (level > 0) - { - if (xbits <= 2 || ybits <= 2) - break; - - source += (1 << (xbits)) * (1 << (ybits)); - xbits -= 1; - ybits -= 1; - level--; - } - } - - bool is_nearest_filter = !((magnifying && r_magfilter) || (!magnifying && r_minfilter)); - - auto shade_constants = args.ColormapConstants(); - if (shade_constants.simple_shade) - { - if (is_nearest_filter) - { - bool is_64x64 = xbits == 6 && ybits == 6; - if (is_64x64) - { - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpx = args.dc_viewpos.X; - float stepvpx = args.dc_viewpos_step.X; - __m128 viewpos_x = _mm_setr_ps(vpx, vpx + stepvpx, 0.0f, 0.0f); - __m128 step_viewpos_x = _mm_set1_ps(stepvpx * 2.0f); - - int count = args.DestX2() - args.DestX1() + 1; - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); - - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * 2; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - { - int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - { - int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); - unsigned int sampleout = source[sample_index]; - ifgcolor[1] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - fgcolor = _mm_mullo_epi16(fgcolor, alpha); - bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); - __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)(dest + offset), outcolor); - viewpos_x = _mm_add_ps(viewpos_x, step_viewpos_x); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - fgcolor = _mm_mullo_epi16(fgcolor, alpha); - bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); - __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - else - { - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpx = args.dc_viewpos.X; - float stepvpx = args.dc_viewpos_step.X; - __m128 viewpos_x = _mm_setr_ps(vpx, vpx + stepvpx, 0.0f, 0.0f); - __m128 step_viewpos_x = _mm_set1_ps(stepvpx * 2.0f); - - int count = args.DestX2() - args.DestX1() + 1; - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); - - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * 2; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - { - int sample_index = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - { - int sample_index = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); - unsigned int sampleout = source[sample_index]; - ifgcolor[1] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - fgcolor = _mm_mullo_epi16(fgcolor, alpha); - bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); - __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)(dest + offset), outcolor); - viewpos_x = _mm_add_ps(viewpos_x, step_viewpos_x); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - int sample_index = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - fgcolor = _mm_mullo_epi16(fgcolor, alpha); - bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); - __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - } - else - { - bool is_64x64 = xbits == 6 && ybits == 6; - if (is_64x64) - { - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpx = args.dc_viewpos.X; - float stepvpx = args.dc_viewpos_step.X; - __m128 viewpos_x = _mm_setr_ps(vpx, vpx + stepvpx, 0.0f, 0.0f); - __m128 step_viewpos_x = _mm_set1_ps(stepvpx * 2.0f); - - int count = args.DestX2() - args.DestX1() + 1; - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); - - xfrac -= 1 << (31 - xbits); - yfrac -= 1 << (31 - ybits); - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * 2; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - { - uint32_t xxbits = 26; - uint32_t yybits = 26; - uint32_t xxshift = (32 - xxbits); - uint32_t yyshift = (32 - yybits); - uint32_t xxmask = (1 << xxshift) - 1; - uint32_t yymask = (1 << yyshift) - 1; - uint32_t x = xfrac >> xxbits; - uint32_t y = yfrac >> yybits; - - uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; - uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; - - uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; - uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; - uint32_t a = 16 - inv_a; - uint32_t b = 16 - inv_b; - - uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - { - uint32_t xxbits = 26; - uint32_t yybits = 26; - uint32_t xxshift = (32 - xxbits); - uint32_t yyshift = (32 - yybits); - uint32_t xxmask = (1 << xxshift) - 1; - uint32_t yymask = (1 << yyshift) - 1; - uint32_t x = xfrac >> xxbits; - uint32_t y = yfrac >> yybits; - - uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; - uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; - - uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; - uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; - uint32_t a = 16 - inv_a; - uint32_t b = 16 - inv_b; - - uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[1] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - fgcolor = _mm_mullo_epi16(fgcolor, alpha); - bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); - __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)(dest + offset), outcolor); - viewpos_x = _mm_add_ps(viewpos_x, step_viewpos_x); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - uint32_t xxbits = 26; - uint32_t yybits = 26; - uint32_t xxshift = (32 - xxbits); - uint32_t yyshift = (32 - yybits); - uint32_t xxmask = (1 << xxshift) - 1; - uint32_t yymask = (1 << yyshift) - 1; - uint32_t x = xfrac >> xxbits; - uint32_t y = yfrac >> yybits; - - uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; - uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; - - uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; - uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; - uint32_t a = 16 - inv_a; - uint32_t b = 16 - inv_b; - - uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - fgcolor = _mm_mullo_epi16(fgcolor, alpha); - bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); - __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - else - { - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpx = args.dc_viewpos.X; - float stepvpx = args.dc_viewpos_step.X; - __m128 viewpos_x = _mm_setr_ps(vpx, vpx + stepvpx, 0.0f, 0.0f); - __m128 step_viewpos_x = _mm_set1_ps(stepvpx * 2.0f); - - int count = args.DestX2() - args.DestX1() + 1; - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); - - xfrac -= 1 << (31 - xbits); - yfrac -= 1 << (31 - ybits); - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * 2; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - { - uint32_t xxbits = 32 - xbits; - uint32_t yybits = 32 - ybits; - uint32_t xxshift = (32 - xxbits); - uint32_t yyshift = (32 - yybits); - uint32_t xxmask = (1 << xxshift) - 1; - uint32_t yymask = (1 << yyshift) - 1; - uint32_t x = xfrac >> xxbits; - uint32_t y = yfrac >> yybits; - - uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; - uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; - - uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; - uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; - uint32_t a = 16 - inv_a; - uint32_t b = 16 - inv_b; - - uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - { - uint32_t xxbits = 32 - xbits; - uint32_t yybits = 32 - ybits; - uint32_t xxshift = (32 - xxbits); - uint32_t yyshift = (32 - yybits); - uint32_t xxmask = (1 << xxshift) - 1; - uint32_t yymask = (1 << yyshift) - 1; - uint32_t x = xfrac >> xxbits; - uint32_t y = yfrac >> yybits; - - uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; - uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; - - uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; - uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; - uint32_t a = 16 - inv_a; - uint32_t b = 16 - inv_b; - - uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[1] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - fgcolor = _mm_mullo_epi16(fgcolor, alpha); - bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); - __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)(dest + offset), outcolor); - viewpos_x = _mm_add_ps(viewpos_x, step_viewpos_x); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - uint32_t xxbits = 32 - xbits; - uint32_t yybits = 32 - ybits; - uint32_t xxshift = (32 - xxbits); - uint32_t yyshift = (32 - yybits); - uint32_t xxmask = (1 << xxshift) - 1; - uint32_t yymask = (1 << yyshift) - 1; - uint32_t x = xfrac >> xxbits; - uint32_t y = yfrac >> yybits; - - uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; - uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; - - uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; - uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; - uint32_t a = 16 - inv_a; - uint32_t b = 16 - inv_b; - - uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - fgcolor = _mm_mullo_epi16(fgcolor, alpha); - bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); - __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - } - } - else - { - if (is_nearest_filter) - { - bool is_64x64 = xbits == 6 && ybits == 6; - if (is_64x64) - { - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); - __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); - shade_fade = _mm_mullo_epi16(shade_fade, inv_light); - __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); - int desaturate = shade_constants.desaturate; - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpx = args.dc_viewpos.X; - float stepvpx = args.dc_viewpos_step.X; - __m128 viewpos_x = _mm_setr_ps(vpx, vpx + stepvpx, 0.0f, 0.0f); - __m128 step_viewpos_x = _mm_set1_ps(stepvpx * 2.0f); - - int count = args.DestX2() - args.DestX1() + 1; - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); - - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * 2; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - { - int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - { - int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); - unsigned int sampleout = source[sample_index]; - ifgcolor[1] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - fgcolor = _mm_mullo_epi16(fgcolor, alpha); - bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); - __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)(dest + offset), outcolor); - viewpos_x = _mm_add_ps(viewpos_x, step_viewpos_x); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - fgcolor = _mm_mullo_epi16(fgcolor, alpha); - bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); - __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - else - { - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); - __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); - shade_fade = _mm_mullo_epi16(shade_fade, inv_light); - __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); - int desaturate = shade_constants.desaturate; - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpx = args.dc_viewpos.X; - float stepvpx = args.dc_viewpos_step.X; - __m128 viewpos_x = _mm_setr_ps(vpx, vpx + stepvpx, 0.0f, 0.0f); - __m128 step_viewpos_x = _mm_set1_ps(stepvpx * 2.0f); - - int count = args.DestX2() - args.DestX1() + 1; - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); - - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * 2; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - { - int sample_index = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - { - int sample_index = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); - unsigned int sampleout = source[sample_index]; - ifgcolor[1] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - fgcolor = _mm_mullo_epi16(fgcolor, alpha); - bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); - __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)(dest + offset), outcolor); - viewpos_x = _mm_add_ps(viewpos_x, step_viewpos_x); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - int sample_index = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - fgcolor = _mm_mullo_epi16(fgcolor, alpha); - bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); - __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - } - else - { - bool is_64x64 = xbits == 6 && ybits == 6; - if (is_64x64) - { - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); - __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); - shade_fade = _mm_mullo_epi16(shade_fade, inv_light); - __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); - int desaturate = shade_constants.desaturate; - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpx = args.dc_viewpos.X; - float stepvpx = args.dc_viewpos_step.X; - __m128 viewpos_x = _mm_setr_ps(vpx, vpx + stepvpx, 0.0f, 0.0f); - __m128 step_viewpos_x = _mm_set1_ps(stepvpx * 2.0f); - - int count = args.DestX2() - args.DestX1() + 1; - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); - - xfrac -= 1 << (31 - xbits); - yfrac -= 1 << (31 - ybits); - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * 2; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - { - uint32_t xxbits = 26; - uint32_t yybits = 26; - uint32_t xxshift = (32 - xxbits); - uint32_t yyshift = (32 - yybits); - uint32_t xxmask = (1 << xxshift) - 1; - uint32_t yymask = (1 << yyshift) - 1; - uint32_t x = xfrac >> xxbits; - uint32_t y = yfrac >> yybits; - - uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; - uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; - - uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; - uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; - uint32_t a = 16 - inv_a; - uint32_t b = 16 - inv_b; - - uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - { - uint32_t xxbits = 26; - uint32_t yybits = 26; - uint32_t xxshift = (32 - xxbits); - uint32_t yyshift = (32 - yybits); - uint32_t xxmask = (1 << xxshift) - 1; - uint32_t yymask = (1 << yyshift) - 1; - uint32_t x = xfrac >> xxbits; - uint32_t y = yfrac >> yybits; - - uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; - uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; - - uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; - uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; - uint32_t a = 16 - inv_a; - uint32_t b = 16 - inv_b; - - uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[1] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - fgcolor = _mm_mullo_epi16(fgcolor, alpha); - bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); - __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)(dest + offset), outcolor); - viewpos_x = _mm_add_ps(viewpos_x, step_viewpos_x); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - uint32_t xxbits = 26; - uint32_t yybits = 26; - uint32_t xxshift = (32 - xxbits); - uint32_t yyshift = (32 - yybits); - uint32_t xxmask = (1 << xxshift) - 1; - uint32_t yymask = (1 << yyshift) - 1; - uint32_t x = xfrac >> xxbits; - uint32_t y = yfrac >> yybits; - - uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; - uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; - - uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; - uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; - uint32_t a = 16 - inv_a; - uint32_t b = 16 - inv_b; - - uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - fgcolor = _mm_mullo_epi16(fgcolor, alpha); - bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); - __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - else - { - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); - __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); - shade_fade = _mm_mullo_epi16(shade_fade, inv_light); - __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); - int desaturate = shade_constants.desaturate; - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpx = args.dc_viewpos.X; - float stepvpx = args.dc_viewpos_step.X; - __m128 viewpos_x = _mm_setr_ps(vpx, vpx + stepvpx, 0.0f, 0.0f); - __m128 step_viewpos_x = _mm_set1_ps(stepvpx * 2.0f); - - int count = args.DestX2() - args.DestX1() + 1; - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); - - xfrac -= 1 << (31 - xbits); - yfrac -= 1 << (31 - ybits); - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * 2; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - { - uint32_t xxbits = 32 - xbits; - uint32_t yybits = 32 - ybits; - uint32_t xxshift = (32 - xxbits); - uint32_t yyshift = (32 - yybits); - uint32_t xxmask = (1 << xxshift) - 1; - uint32_t yymask = (1 << yyshift) - 1; - uint32_t x = xfrac >> xxbits; - uint32_t y = yfrac >> yybits; - - uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; - uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; - - uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; - uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; - uint32_t a = 16 - inv_a; - uint32_t b = 16 - inv_b; - - uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - { - uint32_t xxbits = 32 - xbits; - uint32_t yybits = 32 - ybits; - uint32_t xxshift = (32 - xxbits); - uint32_t yyshift = (32 - yybits); - uint32_t xxmask = (1 << xxshift) - 1; - uint32_t yymask = (1 << yyshift) - 1; - uint32_t x = xfrac >> xxbits; - uint32_t y = yfrac >> yybits; - - uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; - uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; - - uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; - uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; - uint32_t a = 16 - inv_a; - uint32_t b = 16 - inv_b; - - uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[1] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - fgcolor = _mm_mullo_epi16(fgcolor, alpha); - bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); - __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)(dest + offset), outcolor); - viewpos_x = _mm_add_ps(viewpos_x, step_viewpos_x); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - uint32_t xxbits = 32 - xbits; - uint32_t yybits = 32 - ybits; - uint32_t xxshift = (32 - xxbits); - uint32_t yyshift = (32 - yybits); - uint32_t xxmask = (1 << xxshift) - 1; - uint32_t yymask = (1 << yyshift) - 1; - uint32_t x = xfrac >> xxbits; - uint32_t y = yfrac >> yybits; - - uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; - uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; - - uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; - uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; - uint32_t a = 16 - inv_a; - uint32_t b = 16 - inv_b; - - uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - fgcolor = _mm_mullo_epi16(fgcolor, alpha); - bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); - __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - } - } - } - - FString DebugInfo() override { return "DrawSpanMasked32Command"; } - }; - - class DrawSpanTranslucent32Command : public DrawerCommand - { - protected: - SpanDrawerArgs args; - - public: - DrawSpanTranslucent32Command(const SpanDrawerArgs &drawerargs) : args(drawerargs) { } - - void Execute(DrawerThread *thread) override - { - if (thread->line_skipped_by_thread(args.DestY())) return; - - uint32_t xbits = args.TextureWidthBits(); - uint32_t ybits = args.TextureHeightBits(); - uint32_t xstep = args.TextureUStep(); - uint32_t ystep = args.TextureVStep(); - uint32_t xfrac = args.TextureUPos(); - uint32_t yfrac = args.TextureVPos(); - uint32_t yshift = 32 - ybits; - uint32_t xshift = yshift - xbits; - uint32_t xmask = ((1 << xbits) - 1) << ybits; - - const uint32_t *source = (const uint32_t*)args.TexturePixels(); - - double lod = args.TextureLOD(); - bool mipmapped = args.MipmappedTexture(); - - bool magnifying = lod < 0.0; - if (r_mipmap && mipmapped) - { - int level = (int)lod; - while (level > 0) - { - if (xbits <= 2 || ybits <= 2) - break; - - source += (1 << (xbits)) * (1 << (ybits)); - xbits -= 1; - ybits -= 1; - level--; - } - } - - bool is_nearest_filter = !((magnifying && r_magfilter) || (!magnifying && r_minfilter)); - - auto shade_constants = args.ColormapConstants(); - if (shade_constants.simple_shade) - { - if (is_nearest_filter) - { - bool is_64x64 = xbits == 6 && ybits == 6; - if (is_64x64) - { - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpx = args.dc_viewpos.X; - float stepvpx = args.dc_viewpos_step.X; - __m128 viewpos_x = _mm_setr_ps(vpx, vpx + stepvpx, 0.0f, 0.0f); - __m128 step_viewpos_x = _mm_set1_ps(stepvpx * 2.0f); - - int count = args.DestX2() - args.DestX1() + 1; - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); - - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * 2; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - { - int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - { - int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); - unsigned int sampleout = source[sample_index]; - ifgcolor[1] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i fgalpha = _mm_set1_epi16(srcalpha); - __m128i bgalpha = _mm_set1_epi16(destalpha); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)(dest + offset), outcolor); - viewpos_x = _mm_add_ps(viewpos_x, step_viewpos_x); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i fgalpha = _mm_set1_epi16(srcalpha); - __m128i bgalpha = _mm_set1_epi16(destalpha); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - else - { - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpx = args.dc_viewpos.X; - float stepvpx = args.dc_viewpos_step.X; - __m128 viewpos_x = _mm_setr_ps(vpx, vpx + stepvpx, 0.0f, 0.0f); - __m128 step_viewpos_x = _mm_set1_ps(stepvpx * 2.0f); - - int count = args.DestX2() - args.DestX1() + 1; - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); - - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * 2; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - { - int sample_index = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - { - int sample_index = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); - unsigned int sampleout = source[sample_index]; - ifgcolor[1] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i fgalpha = _mm_set1_epi16(srcalpha); - __m128i bgalpha = _mm_set1_epi16(destalpha); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)(dest + offset), outcolor); - viewpos_x = _mm_add_ps(viewpos_x, step_viewpos_x); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - int sample_index = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i fgalpha = _mm_set1_epi16(srcalpha); - __m128i bgalpha = _mm_set1_epi16(destalpha); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - } - else - { - bool is_64x64 = xbits == 6 && ybits == 6; - if (is_64x64) - { - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpx = args.dc_viewpos.X; - float stepvpx = args.dc_viewpos_step.X; - __m128 viewpos_x = _mm_setr_ps(vpx, vpx + stepvpx, 0.0f, 0.0f); - __m128 step_viewpos_x = _mm_set1_ps(stepvpx * 2.0f); - - int count = args.DestX2() - args.DestX1() + 1; - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); - - xfrac -= 1 << (31 - xbits); - yfrac -= 1 << (31 - ybits); - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * 2; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - { - uint32_t xxbits = 26; - uint32_t yybits = 26; - uint32_t xxshift = (32 - xxbits); - uint32_t yyshift = (32 - yybits); - uint32_t xxmask = (1 << xxshift) - 1; - uint32_t yymask = (1 << yyshift) - 1; - uint32_t x = xfrac >> xxbits; - uint32_t y = yfrac >> yybits; - - uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; - uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; - - uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; - uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; - uint32_t a = 16 - inv_a; - uint32_t b = 16 - inv_b; - - uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - { - uint32_t xxbits = 26; - uint32_t yybits = 26; - uint32_t xxshift = (32 - xxbits); - uint32_t yyshift = (32 - yybits); - uint32_t xxmask = (1 << xxshift) - 1; - uint32_t yymask = (1 << yyshift) - 1; - uint32_t x = xfrac >> xxbits; - uint32_t y = yfrac >> yybits; - - uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; - uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; - - uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; - uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; - uint32_t a = 16 - inv_a; - uint32_t b = 16 - inv_b; - - uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[1] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i fgalpha = _mm_set1_epi16(srcalpha); - __m128i bgalpha = _mm_set1_epi16(destalpha); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)(dest + offset), outcolor); - viewpos_x = _mm_add_ps(viewpos_x, step_viewpos_x); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - uint32_t xxbits = 26; - uint32_t yybits = 26; - uint32_t xxshift = (32 - xxbits); - uint32_t yyshift = (32 - yybits); - uint32_t xxmask = (1 << xxshift) - 1; - uint32_t yymask = (1 << yyshift) - 1; - uint32_t x = xfrac >> xxbits; - uint32_t y = yfrac >> yybits; - - uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; - uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; - - uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; - uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; - uint32_t a = 16 - inv_a; - uint32_t b = 16 - inv_b; - - uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i fgalpha = _mm_set1_epi16(srcalpha); - __m128i bgalpha = _mm_set1_epi16(destalpha); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - else - { - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpx = args.dc_viewpos.X; - float stepvpx = args.dc_viewpos_step.X; - __m128 viewpos_x = _mm_setr_ps(vpx, vpx + stepvpx, 0.0f, 0.0f); - __m128 step_viewpos_x = _mm_set1_ps(stepvpx * 2.0f); - - int count = args.DestX2() - args.DestX1() + 1; - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); - - xfrac -= 1 << (31 - xbits); - yfrac -= 1 << (31 - ybits); - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * 2; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - { - uint32_t xxbits = 32 - xbits; - uint32_t yybits = 32 - ybits; - uint32_t xxshift = (32 - xxbits); - uint32_t yyshift = (32 - yybits); - uint32_t xxmask = (1 << xxshift) - 1; - uint32_t yymask = (1 << yyshift) - 1; - uint32_t x = xfrac >> xxbits; - uint32_t y = yfrac >> yybits; - - uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; - uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; - - uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; - uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; - uint32_t a = 16 - inv_a; - uint32_t b = 16 - inv_b; - - uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - { - uint32_t xxbits = 32 - xbits; - uint32_t yybits = 32 - ybits; - uint32_t xxshift = (32 - xxbits); - uint32_t yyshift = (32 - yybits); - uint32_t xxmask = (1 << xxshift) - 1; - uint32_t yymask = (1 << yyshift) - 1; - uint32_t x = xfrac >> xxbits; - uint32_t y = yfrac >> yybits; - - uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; - uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; - - uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; - uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; - uint32_t a = 16 - inv_a; - uint32_t b = 16 - inv_b; - - uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[1] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i fgalpha = _mm_set1_epi16(srcalpha); - __m128i bgalpha = _mm_set1_epi16(destalpha); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)(dest + offset), outcolor); - viewpos_x = _mm_add_ps(viewpos_x, step_viewpos_x); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - uint32_t xxbits = 32 - xbits; - uint32_t yybits = 32 - ybits; - uint32_t xxshift = (32 - xxbits); - uint32_t yyshift = (32 - yybits); - uint32_t xxmask = (1 << xxshift) - 1; - uint32_t yymask = (1 << yyshift) - 1; - uint32_t x = xfrac >> xxbits; - uint32_t y = yfrac >> yybits; - - uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; - uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; - - uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; - uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; - uint32_t a = 16 - inv_a; - uint32_t b = 16 - inv_b; - - uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i fgalpha = _mm_set1_epi16(srcalpha); - __m128i bgalpha = _mm_set1_epi16(destalpha); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - } - } - else - { - if (is_nearest_filter) - { - bool is_64x64 = xbits == 6 && ybits == 6; - if (is_64x64) - { - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); - __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); - shade_fade = _mm_mullo_epi16(shade_fade, inv_light); - __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); - int desaturate = shade_constants.desaturate; - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpx = args.dc_viewpos.X; - float stepvpx = args.dc_viewpos_step.X; - __m128 viewpos_x = _mm_setr_ps(vpx, vpx + stepvpx, 0.0f, 0.0f); - __m128 step_viewpos_x = _mm_set1_ps(stepvpx * 2.0f); - - int count = args.DestX2() - args.DestX1() + 1; - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); - - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * 2; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - { - int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - { - int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); - unsigned int sampleout = source[sample_index]; - ifgcolor[1] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i fgalpha = _mm_set1_epi16(srcalpha); - __m128i bgalpha = _mm_set1_epi16(destalpha); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)(dest + offset), outcolor); - viewpos_x = _mm_add_ps(viewpos_x, step_viewpos_x); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i fgalpha = _mm_set1_epi16(srcalpha); - __m128i bgalpha = _mm_set1_epi16(destalpha); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - else - { - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); - __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); - shade_fade = _mm_mullo_epi16(shade_fade, inv_light); - __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); - int desaturate = shade_constants.desaturate; - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpx = args.dc_viewpos.X; - float stepvpx = args.dc_viewpos_step.X; - __m128 viewpos_x = _mm_setr_ps(vpx, vpx + stepvpx, 0.0f, 0.0f); - __m128 step_viewpos_x = _mm_set1_ps(stepvpx * 2.0f); - - int count = args.DestX2() - args.DestX1() + 1; - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); - - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * 2; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - { - int sample_index = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - { - int sample_index = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); - unsigned int sampleout = source[sample_index]; - ifgcolor[1] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i fgalpha = _mm_set1_epi16(srcalpha); - __m128i bgalpha = _mm_set1_epi16(destalpha); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)(dest + offset), outcolor); - viewpos_x = _mm_add_ps(viewpos_x, step_viewpos_x); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - int sample_index = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i fgalpha = _mm_set1_epi16(srcalpha); - __m128i bgalpha = _mm_set1_epi16(destalpha); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - } - else - { - bool is_64x64 = xbits == 6 && ybits == 6; - if (is_64x64) - { - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); - __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); - shade_fade = _mm_mullo_epi16(shade_fade, inv_light); - __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); - int desaturate = shade_constants.desaturate; - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpx = args.dc_viewpos.X; - float stepvpx = args.dc_viewpos_step.X; - __m128 viewpos_x = _mm_setr_ps(vpx, vpx + stepvpx, 0.0f, 0.0f); - __m128 step_viewpos_x = _mm_set1_ps(stepvpx * 2.0f); - - int count = args.DestX2() - args.DestX1() + 1; - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); - - xfrac -= 1 << (31 - xbits); - yfrac -= 1 << (31 - ybits); - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * 2; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - { - uint32_t xxbits = 26; - uint32_t yybits = 26; - uint32_t xxshift = (32 - xxbits); - uint32_t yyshift = (32 - yybits); - uint32_t xxmask = (1 << xxshift) - 1; - uint32_t yymask = (1 << yyshift) - 1; - uint32_t x = xfrac >> xxbits; - uint32_t y = yfrac >> yybits; - - uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; - uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; - - uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; - uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; - uint32_t a = 16 - inv_a; - uint32_t b = 16 - inv_b; - - uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - { - uint32_t xxbits = 26; - uint32_t yybits = 26; - uint32_t xxshift = (32 - xxbits); - uint32_t yyshift = (32 - yybits); - uint32_t xxmask = (1 << xxshift) - 1; - uint32_t yymask = (1 << yyshift) - 1; - uint32_t x = xfrac >> xxbits; - uint32_t y = yfrac >> yybits; - - uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; - uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; - - uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; - uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; - uint32_t a = 16 - inv_a; - uint32_t b = 16 - inv_b; - - uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[1] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i fgalpha = _mm_set1_epi16(srcalpha); - __m128i bgalpha = _mm_set1_epi16(destalpha); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)(dest + offset), outcolor); - viewpos_x = _mm_add_ps(viewpos_x, step_viewpos_x); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - uint32_t xxbits = 26; - uint32_t yybits = 26; - uint32_t xxshift = (32 - xxbits); - uint32_t yyshift = (32 - yybits); - uint32_t xxmask = (1 << xxshift) - 1; - uint32_t yymask = (1 << yyshift) - 1; - uint32_t x = xfrac >> xxbits; - uint32_t y = yfrac >> yybits; - - uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; - uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; - - uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; - uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; - uint32_t a = 16 - inv_a; - uint32_t b = 16 - inv_b; - - uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i fgalpha = _mm_set1_epi16(srcalpha); - __m128i bgalpha = _mm_set1_epi16(destalpha); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - else - { - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); - __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); - shade_fade = _mm_mullo_epi16(shade_fade, inv_light); - __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); - int desaturate = shade_constants.desaturate; - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpx = args.dc_viewpos.X; - float stepvpx = args.dc_viewpos_step.X; - __m128 viewpos_x = _mm_setr_ps(vpx, vpx + stepvpx, 0.0f, 0.0f); - __m128 step_viewpos_x = _mm_set1_ps(stepvpx * 2.0f); - - int count = args.DestX2() - args.DestX1() + 1; - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); - - xfrac -= 1 << (31 - xbits); - yfrac -= 1 << (31 - ybits); - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * 2; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - { - uint32_t xxbits = 32 - xbits; - uint32_t yybits = 32 - ybits; - uint32_t xxshift = (32 - xxbits); - uint32_t yyshift = (32 - yybits); - uint32_t xxmask = (1 << xxshift) - 1; - uint32_t yymask = (1 << yyshift) - 1; - uint32_t x = xfrac >> xxbits; - uint32_t y = yfrac >> yybits; - - uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; - uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; - - uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; - uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; - uint32_t a = 16 - inv_a; - uint32_t b = 16 - inv_b; - - uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - { - uint32_t xxbits = 32 - xbits; - uint32_t yybits = 32 - ybits; - uint32_t xxshift = (32 - xxbits); - uint32_t yyshift = (32 - yybits); - uint32_t xxmask = (1 << xxshift) - 1; - uint32_t yymask = (1 << yyshift) - 1; - uint32_t x = xfrac >> xxbits; - uint32_t y = yfrac >> yybits; - - uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; - uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; - - uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; - uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; - uint32_t a = 16 - inv_a; - uint32_t b = 16 - inv_b; - - uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[1] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i fgalpha = _mm_set1_epi16(srcalpha); - __m128i bgalpha = _mm_set1_epi16(destalpha); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)(dest + offset), outcolor); - viewpos_x = _mm_add_ps(viewpos_x, step_viewpos_x); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - uint32_t xxbits = 32 - xbits; - uint32_t yybits = 32 - ybits; - uint32_t xxshift = (32 - xxbits); - uint32_t yyshift = (32 - yybits); - uint32_t xxmask = (1 << xxshift) - 1; - uint32_t yymask = (1 << yyshift) - 1; - uint32_t x = xfrac >> xxbits; - uint32_t y = yfrac >> yybits; - - uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; - uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; - - uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; - uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; - uint32_t a = 16 - inv_a; - uint32_t b = 16 - inv_b; - - uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i fgalpha = _mm_set1_epi16(srcalpha); - __m128i bgalpha = _mm_set1_epi16(destalpha); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - } - } - } - - FString DebugInfo() override { return "DrawSpanTranslucent32Command"; } - }; - - class DrawSpanAddClamp32Command : public DrawerCommand - { - protected: - SpanDrawerArgs args; - - public: - DrawSpanAddClamp32Command(const SpanDrawerArgs &drawerargs) : args(drawerargs) { } - - void Execute(DrawerThread *thread) override - { - if (thread->line_skipped_by_thread(args.DestY())) return; - - uint32_t xbits = args.TextureWidthBits(); - uint32_t ybits = args.TextureHeightBits(); - uint32_t xstep = args.TextureUStep(); - uint32_t ystep = args.TextureVStep(); - uint32_t xfrac = args.TextureUPos(); - uint32_t yfrac = args.TextureVPos(); - uint32_t yshift = 32 - ybits; - uint32_t xshift = yshift - xbits; - uint32_t xmask = ((1 << xbits) - 1) << ybits; - - const uint32_t *source = (const uint32_t*)args.TexturePixels(); - - double lod = args.TextureLOD(); - bool mipmapped = args.MipmappedTexture(); - - bool magnifying = lod < 0.0; - if (r_mipmap && mipmapped) - { - int level = (int)lod; - while (level > 0) - { - if (xbits <= 2 || ybits <= 2) - break; - - source += (1 << (xbits)) * (1 << (ybits)); - xbits -= 1; - ybits -= 1; - level--; - } - } - - bool is_nearest_filter = !((magnifying && r_magfilter) || (!magnifying && r_minfilter)); - - auto shade_constants = args.ColormapConstants(); - if (shade_constants.simple_shade) - { - if (is_nearest_filter) - { - bool is_64x64 = xbits == 6 && ybits == 6; - if (is_64x64) - { - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpx = args.dc_viewpos.X; - float stepvpx = args.dc_viewpos_step.X; - __m128 viewpos_x = _mm_setr_ps(vpx, vpx + stepvpx, 0.0f, 0.0f); - __m128 step_viewpos_x = _mm_set1_ps(stepvpx * 2.0f); - - int count = args.DestX2() - args.DestX1() + 1; - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); - - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * 2; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - { - int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - { - int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); - unsigned int sampleout = source[sample_index]; - ifgcolor[1] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)(dest + offset), outcolor); - viewpos_x = _mm_add_ps(viewpos_x, step_viewpos_x); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - else - { - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpx = args.dc_viewpos.X; - float stepvpx = args.dc_viewpos_step.X; - __m128 viewpos_x = _mm_setr_ps(vpx, vpx + stepvpx, 0.0f, 0.0f); - __m128 step_viewpos_x = _mm_set1_ps(stepvpx * 2.0f); - - int count = args.DestX2() - args.DestX1() + 1; - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); - - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * 2; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - { - int sample_index = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - { - int sample_index = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); - unsigned int sampleout = source[sample_index]; - ifgcolor[1] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)(dest + offset), outcolor); - viewpos_x = _mm_add_ps(viewpos_x, step_viewpos_x); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - int sample_index = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - } - else - { - bool is_64x64 = xbits == 6 && ybits == 6; - if (is_64x64) - { - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpx = args.dc_viewpos.X; - float stepvpx = args.dc_viewpos_step.X; - __m128 viewpos_x = _mm_setr_ps(vpx, vpx + stepvpx, 0.0f, 0.0f); - __m128 step_viewpos_x = _mm_set1_ps(stepvpx * 2.0f); - - int count = args.DestX2() - args.DestX1() + 1; - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); - - xfrac -= 1 << (31 - xbits); - yfrac -= 1 << (31 - ybits); - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * 2; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - { - uint32_t xxbits = 26; - uint32_t yybits = 26; - uint32_t xxshift = (32 - xxbits); - uint32_t yyshift = (32 - yybits); - uint32_t xxmask = (1 << xxshift) - 1; - uint32_t yymask = (1 << yyshift) - 1; - uint32_t x = xfrac >> xxbits; - uint32_t y = yfrac >> yybits; - - uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; - uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; - - uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; - uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; - uint32_t a = 16 - inv_a; - uint32_t b = 16 - inv_b; - - uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - { - uint32_t xxbits = 26; - uint32_t yybits = 26; - uint32_t xxshift = (32 - xxbits); - uint32_t yyshift = (32 - yybits); - uint32_t xxmask = (1 << xxshift) - 1; - uint32_t yymask = (1 << yyshift) - 1; - uint32_t x = xfrac >> xxbits; - uint32_t y = yfrac >> yybits; - - uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; - uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; - - uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; - uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; - uint32_t a = 16 - inv_a; - uint32_t b = 16 - inv_b; - - uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[1] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)(dest + offset), outcolor); - viewpos_x = _mm_add_ps(viewpos_x, step_viewpos_x); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - uint32_t xxbits = 26; - uint32_t yybits = 26; - uint32_t xxshift = (32 - xxbits); - uint32_t yyshift = (32 - yybits); - uint32_t xxmask = (1 << xxshift) - 1; - uint32_t yymask = (1 << yyshift) - 1; - uint32_t x = xfrac >> xxbits; - uint32_t y = yfrac >> yybits; - - uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; - uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; - - uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; - uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; - uint32_t a = 16 - inv_a; - uint32_t b = 16 - inv_b; - - uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - else - { - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpx = args.dc_viewpos.X; - float stepvpx = args.dc_viewpos_step.X; - __m128 viewpos_x = _mm_setr_ps(vpx, vpx + stepvpx, 0.0f, 0.0f); - __m128 step_viewpos_x = _mm_set1_ps(stepvpx * 2.0f); - - int count = args.DestX2() - args.DestX1() + 1; - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); - - xfrac -= 1 << (31 - xbits); - yfrac -= 1 << (31 - ybits); - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * 2; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - { - uint32_t xxbits = 32 - xbits; - uint32_t yybits = 32 - ybits; - uint32_t xxshift = (32 - xxbits); - uint32_t yyshift = (32 - yybits); - uint32_t xxmask = (1 << xxshift) - 1; - uint32_t yymask = (1 << yyshift) - 1; - uint32_t x = xfrac >> xxbits; - uint32_t y = yfrac >> yybits; - - uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; - uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; - - uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; - uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; - uint32_t a = 16 - inv_a; - uint32_t b = 16 - inv_b; - - uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - { - uint32_t xxbits = 32 - xbits; - uint32_t yybits = 32 - ybits; - uint32_t xxshift = (32 - xxbits); - uint32_t yyshift = (32 - yybits); - uint32_t xxmask = (1 << xxshift) - 1; - uint32_t yymask = (1 << yyshift) - 1; - uint32_t x = xfrac >> xxbits; - uint32_t y = yfrac >> yybits; - - uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; - uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; - - uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; - uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; - uint32_t a = 16 - inv_a; - uint32_t b = 16 - inv_b; - - uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[1] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)(dest + offset), outcolor); - viewpos_x = _mm_add_ps(viewpos_x, step_viewpos_x); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - uint32_t xxbits = 32 - xbits; - uint32_t yybits = 32 - ybits; - uint32_t xxshift = (32 - xxbits); - uint32_t yyshift = (32 - yybits); - uint32_t xxmask = (1 << xxshift) - 1; - uint32_t yymask = (1 << yyshift) - 1; - uint32_t x = xfrac >> xxbits; - uint32_t y = yfrac >> yybits; - - uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; - uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; - - uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; - uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; - uint32_t a = 16 - inv_a; - uint32_t b = 16 - inv_b; - - uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - } - } - else - { - if (is_nearest_filter) - { - bool is_64x64 = xbits == 6 && ybits == 6; - if (is_64x64) - { - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); - __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); - shade_fade = _mm_mullo_epi16(shade_fade, inv_light); - __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); - int desaturate = shade_constants.desaturate; - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpx = args.dc_viewpos.X; - float stepvpx = args.dc_viewpos_step.X; - __m128 viewpos_x = _mm_setr_ps(vpx, vpx + stepvpx, 0.0f, 0.0f); - __m128 step_viewpos_x = _mm_set1_ps(stepvpx * 2.0f); - - int count = args.DestX2() - args.DestX1() + 1; - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); - - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * 2; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - { - int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - { - int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); - unsigned int sampleout = source[sample_index]; - ifgcolor[1] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)(dest + offset), outcolor); - viewpos_x = _mm_add_ps(viewpos_x, step_viewpos_x); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - else - { - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); - __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); - shade_fade = _mm_mullo_epi16(shade_fade, inv_light); - __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); - int desaturate = shade_constants.desaturate; - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpx = args.dc_viewpos.X; - float stepvpx = args.dc_viewpos_step.X; - __m128 viewpos_x = _mm_setr_ps(vpx, vpx + stepvpx, 0.0f, 0.0f); - __m128 step_viewpos_x = _mm_set1_ps(stepvpx * 2.0f); - - int count = args.DestX2() - args.DestX1() + 1; - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); - - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * 2; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - { - int sample_index = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - { - int sample_index = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); - unsigned int sampleout = source[sample_index]; - ifgcolor[1] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)(dest + offset), outcolor); - viewpos_x = _mm_add_ps(viewpos_x, step_viewpos_x); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - int sample_index = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - } - else - { - bool is_64x64 = xbits == 6 && ybits == 6; - if (is_64x64) - { - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); - __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); - shade_fade = _mm_mullo_epi16(shade_fade, inv_light); - __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); - int desaturate = shade_constants.desaturate; - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpx = args.dc_viewpos.X; - float stepvpx = args.dc_viewpos_step.X; - __m128 viewpos_x = _mm_setr_ps(vpx, vpx + stepvpx, 0.0f, 0.0f); - __m128 step_viewpos_x = _mm_set1_ps(stepvpx * 2.0f); - - int count = args.DestX2() - args.DestX1() + 1; - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); - - xfrac -= 1 << (31 - xbits); - yfrac -= 1 << (31 - ybits); - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * 2; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - { - uint32_t xxbits = 26; - uint32_t yybits = 26; - uint32_t xxshift = (32 - xxbits); - uint32_t yyshift = (32 - yybits); - uint32_t xxmask = (1 << xxshift) - 1; - uint32_t yymask = (1 << yyshift) - 1; - uint32_t x = xfrac >> xxbits; - uint32_t y = yfrac >> yybits; - - uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; - uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; - - uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; - uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; - uint32_t a = 16 - inv_a; - uint32_t b = 16 - inv_b; - - uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - { - uint32_t xxbits = 26; - uint32_t yybits = 26; - uint32_t xxshift = (32 - xxbits); - uint32_t yyshift = (32 - yybits); - uint32_t xxmask = (1 << xxshift) - 1; - uint32_t yymask = (1 << yyshift) - 1; - uint32_t x = xfrac >> xxbits; - uint32_t y = yfrac >> yybits; - - uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; - uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; - - uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; - uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; - uint32_t a = 16 - inv_a; - uint32_t b = 16 - inv_b; - - uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[1] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)(dest + offset), outcolor); - viewpos_x = _mm_add_ps(viewpos_x, step_viewpos_x); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - uint32_t xxbits = 26; - uint32_t yybits = 26; - uint32_t xxshift = (32 - xxbits); - uint32_t yyshift = (32 - yybits); - uint32_t xxmask = (1 << xxshift) - 1; - uint32_t yymask = (1 << yyshift) - 1; - uint32_t x = xfrac >> xxbits; - uint32_t y = yfrac >> yybits; - - uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; - uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; - - uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; - uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; - uint32_t a = 16 - inv_a; - uint32_t b = 16 - inv_b; - - uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - else - { - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); - __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); - shade_fade = _mm_mullo_epi16(shade_fade, inv_light); - __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); - int desaturate = shade_constants.desaturate; - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpx = args.dc_viewpos.X; - float stepvpx = args.dc_viewpos_step.X; - __m128 viewpos_x = _mm_setr_ps(vpx, vpx + stepvpx, 0.0f, 0.0f); - __m128 step_viewpos_x = _mm_set1_ps(stepvpx * 2.0f); - - int count = args.DestX2() - args.DestX1() + 1; - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); - - xfrac -= 1 << (31 - xbits); - yfrac -= 1 << (31 - ybits); - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * 2; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - { - uint32_t xxbits = 32 - xbits; - uint32_t yybits = 32 - ybits; - uint32_t xxshift = (32 - xxbits); - uint32_t yyshift = (32 - yybits); - uint32_t xxmask = (1 << xxshift) - 1; - uint32_t yymask = (1 << yyshift) - 1; - uint32_t x = xfrac >> xxbits; - uint32_t y = yfrac >> yybits; - - uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; - uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; - - uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; - uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; - uint32_t a = 16 - inv_a; - uint32_t b = 16 - inv_b; - - uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - { - uint32_t xxbits = 32 - xbits; - uint32_t yybits = 32 - ybits; - uint32_t xxshift = (32 - xxbits); - uint32_t yyshift = (32 - yybits); - uint32_t xxmask = (1 << xxshift) - 1; - uint32_t yymask = (1 << yyshift) - 1; - uint32_t x = xfrac >> xxbits; - uint32_t y = yfrac >> yybits; - - uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; - uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; - - uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; - uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; - uint32_t a = 16 - inv_a; - uint32_t b = 16 - inv_b; - - uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[1] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)(dest + offset), outcolor); - viewpos_x = _mm_add_ps(viewpos_x, step_viewpos_x); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - uint32_t xxbits = 32 - xbits; - uint32_t yybits = 32 - ybits; - uint32_t xxshift = (32 - xxbits); - uint32_t yyshift = (32 - yybits); - uint32_t xxmask = (1 << xxshift) - 1; - uint32_t yymask = (1 << yyshift) - 1; - uint32_t x = xfrac >> xxbits; - uint32_t y = yfrac >> yybits; - - uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; - uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; - - uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; - uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; - uint32_t a = 16 - inv_a; - uint32_t b = 16 - inv_b; - - uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - } - } - } - - FString DebugInfo() override { return "DrawSpanAddClamp32Command"; } - }; - -} diff --git a/src/swrenderer/drawers/r_draw_span32.php b/src/swrenderer/drawers/r_draw_span32.php deleted file mode 100644 index 400c981dc7..0000000000 --- a/src/swrenderer/drawers/r_draw_span32.php +++ /dev/null @@ -1,443 +0,0 @@ -#!/usr/bin/php -/* -** Drawer commands for spans -** Copyright (c) 2016 Magnus Norddahl -** -** This software is provided 'as-is', without any express or implied -** warranty. In no event will the authors be held liable for any damages -** arising from the use of this software. -** -** Permission is granted to anyone to use this software for any purpose, -** including commercial applications, and to alter it and redistribute it -** freely, subject to the following restrictions: -** -** 1. The origin of this software must not be misrepresented; you must not -** claim that you wrote the original software. If you use this software -** in a product, an acknowledgment in the product documentation would be -** appreciated but is not required. -** 2. Altered source versions must be plainly marked as such, and must not be -** misrepresented as being the original software. -** 3. This notice may not be removed or altered from any source distribution. -** -*/ - -/* - Warning: this C++ source file has been auto-generated. Please modify the original php script that generated it. -*/ - -#pragma once - -#include "swrenderer/drawers/r_draw_rgba.h" -#include "swrenderer/viewport/r_spandrawer.h" - -namespace swrenderer -{ - - class : public DrawerCommand - { - protected: - SpanDrawerArgs args; - - public: - (const SpanDrawerArgs &drawerargs) : args(drawerargs) { } - - void Execute(DrawerThread *thread) override - { - if (thread->line_skipped_by_thread(args.DestY())) return; - - uint32_t xbits = args.TextureWidthBits(); - uint32_t ybits = args.TextureHeightBits(); - uint32_t xstep = args.TextureUStep(); - uint32_t ystep = args.TextureVStep(); - uint32_t xfrac = args.TextureUPos(); - uint32_t yfrac = args.TextureVPos(); - uint32_t yshift = 32 - ybits; - uint32_t xshift = yshift - xbits; - uint32_t xmask = ((1 << xbits) - 1) << ybits; - - const uint32_t *source = (const uint32_t*)args.TexturePixels(); - - double lod = args.TextureLOD(); - bool mipmapped = args.MipmappedTexture(); - - bool magnifying = lod < 0.0; - if (r_mipmap && mipmapped) - { - int level = (int)lod; - while (level > 0) - { - if (xbits <= 2 || ybits <= 2) - break; - - source += (1 << (xbits)) * (1 << (ybits)); - xbits -= 1; - ybits -= 1; - level--; - } - } - - bool is_nearest_filter = !((magnifying && r_magfilter) || (!magnifying && r_minfilter)); - - auto shade_constants = args.ColormapConstants(); - if (shade_constants.simple_shade) - { - - } - else - { - - } - } - - FString DebugInfo() override { return ""; } - }; - - - if (is_nearest_filter) - { - - } - else - { - - } - - bool is_64x64 = xbits == 6 && ybits == 6; - if (is_64x64) - { - - } - else - { - - } - - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - - __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); - __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); - shade_fade = _mm_mullo_epi16(shade_fade, inv_light); - __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); - int desaturate = shade_constants.desaturate; - - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpx = args.dc_viewpos.X; - float stepvpx = args.dc_viewpos_step.X; - __m128 viewpos_x = _mm_setr_ps(vpx, vpx + stepvpx, 0.0f, 0.0f); - __m128 step_viewpos_x = _mm_set1_ps(stepvpx * 2.0f); - - int count = args.DestX2() - args.DestX1() + 1; - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t *dest = (uint32_t*)RenderViewport::Instance()->GetDest(args.DestX1(), args.DestY()); - - - xfrac -= 1 << (31 - xbits); - yfrac -= 1 << (31 - ybits); - - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * 2; - - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + offset)), _mm_setzero_si128()); - - - // Sample - unsigned int ifgcolor[2]; - { - - ifgcolor[0] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - { - - ifgcolor[1] = sampleout; - xfrac += xstep; - yfrac += ystep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - - - // Blend - - - _mm_storel_epi64((__m128i*)(dest + offset), outcolor); - viewpos_x = _mm_add_ps(viewpos_x, step_viewpos_x); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index; - - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - - // Sample - unsigned int ifgcolor[2]; - - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - - - // Blend - - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - - int sample_index = ((xfrac >> (32 - 6 - 6)) & (63 * 64)) + (yfrac >> (32 - 6)); - unsigned int sampleout = source[sample_index]; - - int sample_index = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); - unsigned int sampleout = source[sample_index]; - - uint32_t xxbits = 26; - uint32_t yybits = 26; - - uint32_t xxbits = 32 - xbits; - uint32_t yybits = 32 - ybits; - - uint32_t xxshift = (32 - xxbits); - uint32_t yyshift = (32 - yybits); - uint32_t xxmask = (1 << xxshift) - 1; - uint32_t yymask = (1 << yyshift) - 1; - uint32_t x = xfrac >> xxbits; - uint32_t y = yfrac >> yybits; - - uint32_t p00 = source[((y & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p01 = source[(((y + 1) & yymask) + ((x & xxmask) << yyshift))]; - uint32_t p10 = source[((y & yymask) + (((x + 1) & xxmask) << yyshift))]; - uint32_t p11 = source[(((y + 1) & yymask) + (((x + 1) & xxmask) << yyshift))]; - - uint32_t inv_b = (xfrac >> (xxbits - 4)) & 15; - uint32_t inv_a = (yfrac >> (yybits - 4)) & 15; - uint32_t a = 16 - inv_a; - uint32_t b = 16 - inv_b; - - uint32_t sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - uint32_t salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - - __m128i material = fgcolor; - - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - fgcolor = _mm_mullo_epi16(fgcolor, alpha); - bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); - __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - __m128i fgalpha = _mm_set1_epi16(srcalpha); - __m128i bgalpha = _mm_set1_epi16(destalpha); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - - __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); - __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lyz2 = light_y; // L.y*L.y + L.z*L.z - __m128 Lx = _mm_sub_ps(light_x, viewpos_x); - __m128 dist2 = _mm_add_ps(Lyz2, _mm_mul_ps(Lx, Lx)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_z, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_z, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - -} diff --git a/src/swrenderer/drawers/r_draw_sprite32.h b/src/swrenderer/drawers/r_draw_sprite32.h deleted file mode 100644 index d54aff4e9e..0000000000 --- a/src/swrenderer/drawers/r_draw_sprite32.h +++ /dev/null @@ -1,6898 +0,0 @@ -/* -** Drawer commands for sprites -** Copyright (c) 2016 Magnus Norddahl -** -** This software is provided 'as-is', without any express or implied -** warranty. In no event will the authors be held liable for any damages -** arising from the use of this software. -** -** Permission is granted to anyone to use this software for any purpose, -** including commercial applications, and to alter it and redistribute it -** freely, subject to the following restrictions: -** -** 1. The origin of this software must not be misrepresented; you must not -** claim that you wrote the original software. If you use this software -** in a product, an acknowledgment in the product documentation would be -** appreciated but is not required. -** 2. Altered source versions must be plainly marked as such, and must not be -** misrepresented as being the original software. -** 3. This notice may not be removed or altered from any source distribution. -** -*/ - -/* - Warning: this C++ source file has been auto-generated. Please modify the original php script that generated it. -*/ - -#pragma once - -#include "swrenderer/drawers/r_draw_rgba.h" -#include "swrenderer/viewport/r_walldrawer.h" - -namespace swrenderer -{ - class DrawSpriteCopy32Command : public DrawerCommand - { - protected: - SpriteDrawerArgs args; - - public: - DrawSpriteCopy32Command(const SpriteDrawerArgs &drawerargs) : args(drawerargs) { } - - void Execute(DrawerThread *thread) override - { - auto shade_constants = args.ColormapConstants(); - if (shade_constants.simple_shade) - { - const uint32_t *source = (const uint32_t*)args.TexturePixels(); - const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); - bool is_nearest_filter = (source2 == nullptr); - if (is_nearest_filter) - { - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - __m128i dynlight = _mm_cvtsi32_si128(args.DynamicLight()); - dynlight = _mm_unpacklo_epi8(dynlight, _mm_setzero_si128()); - dynlight = _mm_shuffle_epi32(dynlight, _MM_SHUFFLE(1,0,1,0)); - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - mlight = _mm_min_epi16(_mm_add_epi16(mlight, dynlight), _mm_set1_epi16(256)); - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - { - int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgshade[0] = sampleshadeout; - frac += fracstep; - } - { - int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - unsigned int sampleshadeout = 0; - ifgcolor[1] = sampleout; - ifgshade[1] = sampleshadeout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - - // Blend - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - ifgshade[0] = sampleshadeout; - ifgshade[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - - // Blend - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - else - { - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - __m128i dynlight = _mm_cvtsi32_si128(args.DynamicLight()); - dynlight = _mm_unpacklo_epi8(dynlight, _mm_setzero_si128()); - dynlight = _mm_shuffle_epi32(dynlight, _MM_SHUFFLE(1,0,1,0)); - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - mlight = _mm_min_epi16(_mm_add_epi16(mlight, dynlight), _mm_set1_epi16(256)); - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - frac -= one / 2; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - { - // Clamp to edge - unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int frac_y1 = (clamp(frac + one, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgshade[0] = sampleshadeout; - frac += fracstep; - } - { - // Clamp to edge - unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int frac_y1 = (clamp(frac + one, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - unsigned int sampleshadeout = 0; - ifgcolor[1] = sampleout; - ifgshade[1] = sampleshadeout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - - // Blend - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - // Clamp to edge - unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int frac_y1 = (clamp(frac + one, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - ifgshade[0] = sampleshadeout; - ifgshade[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - - // Blend - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - } - else - { - const uint32_t *source = (const uint32_t*)args.TexturePixels(); - const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); - bool is_nearest_filter = (source2 == nullptr); - if (is_nearest_filter) - { - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - __m128i dynlight = _mm_cvtsi32_si128(args.DynamicLight()); - dynlight = _mm_unpacklo_epi8(dynlight, _mm_setzero_si128()); - dynlight = _mm_shuffle_epi32(dynlight, _MM_SHUFFLE(1,0,1,0)); - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); - __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); - shade_fade = _mm_mullo_epi16(shade_fade, inv_light); - __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); - int desaturate = shade_constants.desaturate; - - __m128i lightcontrib = _mm_min_epi16(_mm_add_epi16(mlight, dynlight), _mm_set1_epi16(256)); - lightcontrib = _mm_sub_epi16(lightcontrib, mlight); - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - { - int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgshade[0] = sampleshadeout; - frac += fracstep; - } - { - int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - unsigned int sampleshadeout = 0; - ifgcolor[1] = sampleout; - ifgshade[1] = sampleshadeout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - - // Blend - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - ifgshade[0] = sampleshadeout; - ifgshade[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - - // Blend - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - else - { - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - __m128i dynlight = _mm_cvtsi32_si128(args.DynamicLight()); - dynlight = _mm_unpacklo_epi8(dynlight, _mm_setzero_si128()); - dynlight = _mm_shuffle_epi32(dynlight, _MM_SHUFFLE(1,0,1,0)); - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); - __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); - shade_fade = _mm_mullo_epi16(shade_fade, inv_light); - __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); - int desaturate = shade_constants.desaturate; - - __m128i lightcontrib = _mm_min_epi16(_mm_add_epi16(mlight, dynlight), _mm_set1_epi16(256)); - lightcontrib = _mm_sub_epi16(lightcontrib, mlight); - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - frac -= one / 2; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - { - // Clamp to edge - unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int frac_y1 = (clamp(frac + one, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgshade[0] = sampleshadeout; - frac += fracstep; - } - { - // Clamp to edge - unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int frac_y1 = (clamp(frac + one, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - unsigned int sampleshadeout = 0; - ifgcolor[1] = sampleout; - ifgshade[1] = sampleshadeout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - - // Blend - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - // Clamp to edge - unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int frac_y1 = (clamp(frac + one, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - ifgshade[0] = sampleshadeout; - ifgshade[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - - // Blend - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - } - } - - FString DebugInfo() override { return "DrawSpriteCopy32Command"; } - }; - - class DrawSprite32Command : public DrawerCommand - { - protected: - SpriteDrawerArgs args; - - public: - DrawSprite32Command(const SpriteDrawerArgs &drawerargs) : args(drawerargs) { } - - void Execute(DrawerThread *thread) override - { - auto shade_constants = args.ColormapConstants(); - if (shade_constants.simple_shade) - { - const uint32_t *source = (const uint32_t*)args.TexturePixels(); - const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); - bool is_nearest_filter = (source2 == nullptr); - if (is_nearest_filter) - { - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - __m128i dynlight = _mm_cvtsi32_si128(args.DynamicLight()); - dynlight = _mm_unpacklo_epi8(dynlight, _mm_setzero_si128()); - dynlight = _mm_shuffle_epi32(dynlight, _MM_SHUFFLE(1,0,1,0)); - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - mlight = _mm_min_epi16(_mm_add_epi16(mlight, dynlight), _mm_set1_epi16(256)); - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - { - int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgshade[0] = sampleshadeout; - frac += fracstep; - } - { - int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - unsigned int sampleshadeout = 0; - ifgcolor[1] = sampleout; - ifgshade[1] = sampleshadeout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - - // Blend - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - ifgshade[0] = sampleshadeout; - ifgshade[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - - // Blend - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - else - { - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - __m128i dynlight = _mm_cvtsi32_si128(args.DynamicLight()); - dynlight = _mm_unpacklo_epi8(dynlight, _mm_setzero_si128()); - dynlight = _mm_shuffle_epi32(dynlight, _MM_SHUFFLE(1,0,1,0)); - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - mlight = _mm_min_epi16(_mm_add_epi16(mlight, dynlight), _mm_set1_epi16(256)); - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - frac -= one / 2; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - { - // Clamp to edge - unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int frac_y1 = (clamp(frac + one, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgshade[0] = sampleshadeout; - frac += fracstep; - } - { - // Clamp to edge - unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int frac_y1 = (clamp(frac + one, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - unsigned int sampleshadeout = 0; - ifgcolor[1] = sampleout; - ifgshade[1] = sampleshadeout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - - // Blend - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - // Clamp to edge - unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int frac_y1 = (clamp(frac + one, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - ifgshade[0] = sampleshadeout; - ifgshade[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - - // Blend - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - } - else - { - const uint32_t *source = (const uint32_t*)args.TexturePixels(); - const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); - bool is_nearest_filter = (source2 == nullptr); - if (is_nearest_filter) - { - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - __m128i dynlight = _mm_cvtsi32_si128(args.DynamicLight()); - dynlight = _mm_unpacklo_epi8(dynlight, _mm_setzero_si128()); - dynlight = _mm_shuffle_epi32(dynlight, _MM_SHUFFLE(1,0,1,0)); - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); - __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); - shade_fade = _mm_mullo_epi16(shade_fade, inv_light); - __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); - int desaturate = shade_constants.desaturate; - - __m128i lightcontrib = _mm_min_epi16(_mm_add_epi16(mlight, dynlight), _mm_set1_epi16(256)); - lightcontrib = _mm_sub_epi16(lightcontrib, mlight); - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - { - int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgshade[0] = sampleshadeout; - frac += fracstep; - } - { - int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - unsigned int sampleshadeout = 0; - ifgcolor[1] = sampleout; - ifgshade[1] = sampleshadeout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i lit_dynlight = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, lightcontrib), 8); - - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - - fgcolor = _mm_add_epi16(fgcolor, lit_dynlight); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(256)); - - // Blend - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - ifgshade[0] = sampleshadeout; - ifgshade[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i lit_dynlight = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, lightcontrib), 8); - - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - - fgcolor = _mm_add_epi16(fgcolor, lit_dynlight); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(256)); - - // Blend - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - else - { - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - __m128i dynlight = _mm_cvtsi32_si128(args.DynamicLight()); - dynlight = _mm_unpacklo_epi8(dynlight, _mm_setzero_si128()); - dynlight = _mm_shuffle_epi32(dynlight, _MM_SHUFFLE(1,0,1,0)); - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); - __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); - shade_fade = _mm_mullo_epi16(shade_fade, inv_light); - __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); - int desaturate = shade_constants.desaturate; - - __m128i lightcontrib = _mm_min_epi16(_mm_add_epi16(mlight, dynlight), _mm_set1_epi16(256)); - lightcontrib = _mm_sub_epi16(lightcontrib, mlight); - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - frac -= one / 2; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - { - // Clamp to edge - unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int frac_y1 = (clamp(frac + one, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgshade[0] = sampleshadeout; - frac += fracstep; - } - { - // Clamp to edge - unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int frac_y1 = (clamp(frac + one, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - unsigned int sampleshadeout = 0; - ifgcolor[1] = sampleout; - ifgshade[1] = sampleshadeout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i lit_dynlight = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, lightcontrib), 8); - - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - - fgcolor = _mm_add_epi16(fgcolor, lit_dynlight); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(256)); - - // Blend - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - // Clamp to edge - unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int frac_y1 = (clamp(frac + one, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - ifgshade[0] = sampleshadeout; - ifgshade[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i lit_dynlight = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, lightcontrib), 8); - - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - - fgcolor = _mm_add_epi16(fgcolor, lit_dynlight); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(256)); - - // Blend - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - } - } - - FString DebugInfo() override { return "DrawSprite32Command"; } - }; - - class DrawSpriteAddClamp32Command : public DrawerCommand - { - protected: - SpriteDrawerArgs args; - - public: - DrawSpriteAddClamp32Command(const SpriteDrawerArgs &drawerargs) : args(drawerargs) { } - - void Execute(DrawerThread *thread) override - { - auto shade_constants = args.ColormapConstants(); - if (shade_constants.simple_shade) - { - const uint32_t *source = (const uint32_t*)args.TexturePixels(); - const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); - bool is_nearest_filter = (source2 == nullptr); - if (is_nearest_filter) - { - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - __m128i dynlight = _mm_cvtsi32_si128(args.DynamicLight()); - dynlight = _mm_unpacklo_epi8(dynlight, _mm_setzero_si128()); - dynlight = _mm_shuffle_epi32(dynlight, _MM_SHUFFLE(1,0,1,0)); - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - mlight = _mm_min_epi16(_mm_add_epi16(mlight, dynlight), _mm_set1_epi16(256)); - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - { - int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgshade[0] = sampleshadeout; - frac += fracstep; - } - { - int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - unsigned int sampleshadeout = 0; - ifgcolor[1] = sampleout; - ifgshade[1] = sampleshadeout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - ifgshade[0] = sampleshadeout; - ifgshade[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - else - { - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - __m128i dynlight = _mm_cvtsi32_si128(args.DynamicLight()); - dynlight = _mm_unpacklo_epi8(dynlight, _mm_setzero_si128()); - dynlight = _mm_shuffle_epi32(dynlight, _MM_SHUFFLE(1,0,1,0)); - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - mlight = _mm_min_epi16(_mm_add_epi16(mlight, dynlight), _mm_set1_epi16(256)); - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - frac -= one / 2; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - { - // Clamp to edge - unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int frac_y1 = (clamp(frac + one, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgshade[0] = sampleshadeout; - frac += fracstep; - } - { - // Clamp to edge - unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int frac_y1 = (clamp(frac + one, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - unsigned int sampleshadeout = 0; - ifgcolor[1] = sampleout; - ifgshade[1] = sampleshadeout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - // Clamp to edge - unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int frac_y1 = (clamp(frac + one, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - ifgshade[0] = sampleshadeout; - ifgshade[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - } - else - { - const uint32_t *source = (const uint32_t*)args.TexturePixels(); - const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); - bool is_nearest_filter = (source2 == nullptr); - if (is_nearest_filter) - { - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - __m128i dynlight = _mm_cvtsi32_si128(args.DynamicLight()); - dynlight = _mm_unpacklo_epi8(dynlight, _mm_setzero_si128()); - dynlight = _mm_shuffle_epi32(dynlight, _MM_SHUFFLE(1,0,1,0)); - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); - __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); - shade_fade = _mm_mullo_epi16(shade_fade, inv_light); - __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); - int desaturate = shade_constants.desaturate; - - __m128i lightcontrib = _mm_min_epi16(_mm_add_epi16(mlight, dynlight), _mm_set1_epi16(256)); - lightcontrib = _mm_sub_epi16(lightcontrib, mlight); - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - { - int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgshade[0] = sampleshadeout; - frac += fracstep; - } - { - int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - unsigned int sampleshadeout = 0; - ifgcolor[1] = sampleout; - ifgshade[1] = sampleshadeout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i lit_dynlight = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, lightcontrib), 8); - - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - - fgcolor = _mm_add_epi16(fgcolor, lit_dynlight); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(256)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - ifgshade[0] = sampleshadeout; - ifgshade[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i lit_dynlight = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, lightcontrib), 8); - - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - - fgcolor = _mm_add_epi16(fgcolor, lit_dynlight); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(256)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - else - { - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - __m128i dynlight = _mm_cvtsi32_si128(args.DynamicLight()); - dynlight = _mm_unpacklo_epi8(dynlight, _mm_setzero_si128()); - dynlight = _mm_shuffle_epi32(dynlight, _MM_SHUFFLE(1,0,1,0)); - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); - __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); - shade_fade = _mm_mullo_epi16(shade_fade, inv_light); - __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); - int desaturate = shade_constants.desaturate; - - __m128i lightcontrib = _mm_min_epi16(_mm_add_epi16(mlight, dynlight), _mm_set1_epi16(256)); - lightcontrib = _mm_sub_epi16(lightcontrib, mlight); - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - frac -= one / 2; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - { - // Clamp to edge - unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int frac_y1 = (clamp(frac + one, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgshade[0] = sampleshadeout; - frac += fracstep; - } - { - // Clamp to edge - unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int frac_y1 = (clamp(frac + one, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - unsigned int sampleshadeout = 0; - ifgcolor[1] = sampleout; - ifgshade[1] = sampleshadeout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i lit_dynlight = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, lightcontrib), 8); - - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - - fgcolor = _mm_add_epi16(fgcolor, lit_dynlight); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(256)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - // Clamp to edge - unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int frac_y1 = (clamp(frac + one, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - ifgshade[0] = sampleshadeout; - ifgshade[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i lit_dynlight = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, lightcontrib), 8); - - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - - fgcolor = _mm_add_epi16(fgcolor, lit_dynlight); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(256)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - } - } - - FString DebugInfo() override { return "DrawSpriteAddClamp32Command"; } - }; - - class DrawSpriteSubClamp32Command : public DrawerCommand - { - protected: - SpriteDrawerArgs args; - - public: - DrawSpriteSubClamp32Command(const SpriteDrawerArgs &drawerargs) : args(drawerargs) { } - - void Execute(DrawerThread *thread) override - { - auto shade_constants = args.ColormapConstants(); - if (shade_constants.simple_shade) - { - const uint32_t *source = (const uint32_t*)args.TexturePixels(); - const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); - bool is_nearest_filter = (source2 == nullptr); - if (is_nearest_filter) - { - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - __m128i dynlight = _mm_cvtsi32_si128(args.DynamicLight()); - dynlight = _mm_unpacklo_epi8(dynlight, _mm_setzero_si128()); - dynlight = _mm_shuffle_epi32(dynlight, _MM_SHUFFLE(1,0,1,0)); - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - mlight = _mm_min_epi16(_mm_add_epi16(mlight, dynlight), _mm_set1_epi16(256)); - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - { - int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgshade[0] = sampleshadeout; - frac += fracstep; - } - { - int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - unsigned int sampleshadeout = 0; - ifgcolor[1] = sampleout; - ifgshade[1] = sampleshadeout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - ifgshade[0] = sampleshadeout; - ifgshade[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - else - { - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - __m128i dynlight = _mm_cvtsi32_si128(args.DynamicLight()); - dynlight = _mm_unpacklo_epi8(dynlight, _mm_setzero_si128()); - dynlight = _mm_shuffle_epi32(dynlight, _MM_SHUFFLE(1,0,1,0)); - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - mlight = _mm_min_epi16(_mm_add_epi16(mlight, dynlight), _mm_set1_epi16(256)); - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - frac -= one / 2; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - { - // Clamp to edge - unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int frac_y1 = (clamp(frac + one, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgshade[0] = sampleshadeout; - frac += fracstep; - } - { - // Clamp to edge - unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int frac_y1 = (clamp(frac + one, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - unsigned int sampleshadeout = 0; - ifgcolor[1] = sampleout; - ifgshade[1] = sampleshadeout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - // Clamp to edge - unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int frac_y1 = (clamp(frac + one, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - ifgshade[0] = sampleshadeout; - ifgshade[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - } - else - { - const uint32_t *source = (const uint32_t*)args.TexturePixels(); - const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); - bool is_nearest_filter = (source2 == nullptr); - if (is_nearest_filter) - { - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - __m128i dynlight = _mm_cvtsi32_si128(args.DynamicLight()); - dynlight = _mm_unpacklo_epi8(dynlight, _mm_setzero_si128()); - dynlight = _mm_shuffle_epi32(dynlight, _MM_SHUFFLE(1,0,1,0)); - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); - __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); - shade_fade = _mm_mullo_epi16(shade_fade, inv_light); - __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); - int desaturate = shade_constants.desaturate; - - __m128i lightcontrib = _mm_min_epi16(_mm_add_epi16(mlight, dynlight), _mm_set1_epi16(256)); - lightcontrib = _mm_sub_epi16(lightcontrib, mlight); - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - { - int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgshade[0] = sampleshadeout; - frac += fracstep; - } - { - int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - unsigned int sampleshadeout = 0; - ifgcolor[1] = sampleout; - ifgshade[1] = sampleshadeout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i lit_dynlight = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, lightcontrib), 8); - - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - - fgcolor = _mm_add_epi16(fgcolor, lit_dynlight); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(256)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - ifgshade[0] = sampleshadeout; - ifgshade[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i lit_dynlight = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, lightcontrib), 8); - - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - - fgcolor = _mm_add_epi16(fgcolor, lit_dynlight); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(256)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - else - { - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - __m128i dynlight = _mm_cvtsi32_si128(args.DynamicLight()); - dynlight = _mm_unpacklo_epi8(dynlight, _mm_setzero_si128()); - dynlight = _mm_shuffle_epi32(dynlight, _MM_SHUFFLE(1,0,1,0)); - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); - __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); - shade_fade = _mm_mullo_epi16(shade_fade, inv_light); - __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); - int desaturate = shade_constants.desaturate; - - __m128i lightcontrib = _mm_min_epi16(_mm_add_epi16(mlight, dynlight), _mm_set1_epi16(256)); - lightcontrib = _mm_sub_epi16(lightcontrib, mlight); - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - frac -= one / 2; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - { - // Clamp to edge - unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int frac_y1 = (clamp(frac + one, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgshade[0] = sampleshadeout; - frac += fracstep; - } - { - // Clamp to edge - unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int frac_y1 = (clamp(frac + one, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - unsigned int sampleshadeout = 0; - ifgcolor[1] = sampleout; - ifgshade[1] = sampleshadeout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i lit_dynlight = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, lightcontrib), 8); - - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - - fgcolor = _mm_add_epi16(fgcolor, lit_dynlight); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(256)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - // Clamp to edge - unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int frac_y1 = (clamp(frac + one, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - ifgshade[0] = sampleshadeout; - ifgshade[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i lit_dynlight = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, lightcontrib), 8); - - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - - fgcolor = _mm_add_epi16(fgcolor, lit_dynlight); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(256)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - } - } - - FString DebugInfo() override { return "DrawSpriteSubClamp32Command"; } - }; - - class DrawSpriteRevSubClamp32Command : public DrawerCommand - { - protected: - SpriteDrawerArgs args; - - public: - DrawSpriteRevSubClamp32Command(const SpriteDrawerArgs &drawerargs) : args(drawerargs) { } - - void Execute(DrawerThread *thread) override - { - auto shade_constants = args.ColormapConstants(); - if (shade_constants.simple_shade) - { - const uint32_t *source = (const uint32_t*)args.TexturePixels(); - const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); - bool is_nearest_filter = (source2 == nullptr); - if (is_nearest_filter) - { - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - __m128i dynlight = _mm_cvtsi32_si128(args.DynamicLight()); - dynlight = _mm_unpacklo_epi8(dynlight, _mm_setzero_si128()); - dynlight = _mm_shuffle_epi32(dynlight, _MM_SHUFFLE(1,0,1,0)); - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - mlight = _mm_min_epi16(_mm_add_epi16(mlight, dynlight), _mm_set1_epi16(256)); - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - { - int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgshade[0] = sampleshadeout; - frac += fracstep; - } - { - int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - unsigned int sampleshadeout = 0; - ifgcolor[1] = sampleout; - ifgshade[1] = sampleshadeout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); - __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - ifgshade[0] = sampleshadeout; - ifgshade[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); - __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - else - { - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - __m128i dynlight = _mm_cvtsi32_si128(args.DynamicLight()); - dynlight = _mm_unpacklo_epi8(dynlight, _mm_setzero_si128()); - dynlight = _mm_shuffle_epi32(dynlight, _MM_SHUFFLE(1,0,1,0)); - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - mlight = _mm_min_epi16(_mm_add_epi16(mlight, dynlight), _mm_set1_epi16(256)); - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - frac -= one / 2; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - { - // Clamp to edge - unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int frac_y1 = (clamp(frac + one, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgshade[0] = sampleshadeout; - frac += fracstep; - } - { - // Clamp to edge - unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int frac_y1 = (clamp(frac + one, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - unsigned int sampleshadeout = 0; - ifgcolor[1] = sampleout; - ifgshade[1] = sampleshadeout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); - __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - // Clamp to edge - unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int frac_y1 = (clamp(frac + one, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - ifgshade[0] = sampleshadeout; - ifgshade[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); - __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - } - else - { - const uint32_t *source = (const uint32_t*)args.TexturePixels(); - const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); - bool is_nearest_filter = (source2 == nullptr); - if (is_nearest_filter) - { - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - __m128i dynlight = _mm_cvtsi32_si128(args.DynamicLight()); - dynlight = _mm_unpacklo_epi8(dynlight, _mm_setzero_si128()); - dynlight = _mm_shuffle_epi32(dynlight, _MM_SHUFFLE(1,0,1,0)); - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); - __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); - shade_fade = _mm_mullo_epi16(shade_fade, inv_light); - __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); - int desaturate = shade_constants.desaturate; - - __m128i lightcontrib = _mm_min_epi16(_mm_add_epi16(mlight, dynlight), _mm_set1_epi16(256)); - lightcontrib = _mm_sub_epi16(lightcontrib, mlight); - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - { - int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgshade[0] = sampleshadeout; - frac += fracstep; - } - { - int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - unsigned int sampleshadeout = 0; - ifgcolor[1] = sampleout; - ifgshade[1] = sampleshadeout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i lit_dynlight = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, lightcontrib), 8); - - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - - fgcolor = _mm_add_epi16(fgcolor, lit_dynlight); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(256)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); - __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - ifgshade[0] = sampleshadeout; - ifgshade[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i lit_dynlight = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, lightcontrib), 8); - - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - - fgcolor = _mm_add_epi16(fgcolor, lit_dynlight); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(256)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); - __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - else - { - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - __m128i dynlight = _mm_cvtsi32_si128(args.DynamicLight()); - dynlight = _mm_unpacklo_epi8(dynlight, _mm_setzero_si128()); - dynlight = _mm_shuffle_epi32(dynlight, _MM_SHUFFLE(1,0,1,0)); - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); - __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); - shade_fade = _mm_mullo_epi16(shade_fade, inv_light); - __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); - int desaturate = shade_constants.desaturate; - - __m128i lightcontrib = _mm_min_epi16(_mm_add_epi16(mlight, dynlight), _mm_set1_epi16(256)); - lightcontrib = _mm_sub_epi16(lightcontrib, mlight); - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - frac -= one / 2; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - { - // Clamp to edge - unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int frac_y1 = (clamp(frac + one, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgshade[0] = sampleshadeout; - frac += fracstep; - } - { - // Clamp to edge - unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int frac_y1 = (clamp(frac + one, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - unsigned int sampleshadeout = 0; - ifgcolor[1] = sampleout; - ifgshade[1] = sampleshadeout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i lit_dynlight = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, lightcontrib), 8); - - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - - fgcolor = _mm_add_epi16(fgcolor, lit_dynlight); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(256)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); - __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - // Clamp to edge - unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int frac_y1 = (clamp(frac + one, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - ifgshade[0] = sampleshadeout; - ifgshade[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i lit_dynlight = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, lightcontrib), 8); - - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - - fgcolor = _mm_add_epi16(fgcolor, lit_dynlight); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(256)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); - __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - } - } - - FString DebugInfo() override { return "DrawSpriteRevSubClamp32Command"; } - }; - - class FillSprite32Command : public DrawerCommand - { - protected: - SpriteDrawerArgs args; - - public: - FillSprite32Command(const SpriteDrawerArgs &drawerargs) : args(drawerargs) { } - - void Execute(DrawerThread *thread) override - { - auto shade_constants = args.ColormapConstants(); - if (shade_constants.simple_shade) - { - const uint32_t *source = (const uint32_t*)args.TexturePixels(); - const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - __m128i dynlight = _mm_cvtsi32_si128(args.DynamicLight()); - dynlight = _mm_unpacklo_epi8(dynlight, _mm_setzero_si128()); - dynlight = _mm_shuffle_epi32(dynlight, _MM_SHUFFLE(1,0,1,0)); - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - mlight = _mm_min_epi16(_mm_add_epi16(mlight, dynlight), _mm_set1_epi16(256)); - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - { - unsigned int sampleout = srccolor; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgshade[0] = sampleshadeout; - frac += fracstep; - } - { - unsigned int sampleout = srccolor; - unsigned int sampleshadeout = 0; - ifgcolor[1] = sampleout; - ifgshade[1] = sampleshadeout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - - // Blend - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - unsigned int sampleout = srccolor; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - ifgshade[0] = sampleshadeout; - ifgshade[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - - // Blend - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - else - { - const uint32_t *source = (const uint32_t*)args.TexturePixels(); - const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - __m128i dynlight = _mm_cvtsi32_si128(args.DynamicLight()); - dynlight = _mm_unpacklo_epi8(dynlight, _mm_setzero_si128()); - dynlight = _mm_shuffle_epi32(dynlight, _MM_SHUFFLE(1,0,1,0)); - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); - __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); - shade_fade = _mm_mullo_epi16(shade_fade, inv_light); - __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); - int desaturate = shade_constants.desaturate; - - __m128i lightcontrib = _mm_min_epi16(_mm_add_epi16(mlight, dynlight), _mm_set1_epi16(256)); - lightcontrib = _mm_sub_epi16(lightcontrib, mlight); - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - { - unsigned int sampleout = srccolor; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgshade[0] = sampleshadeout; - frac += fracstep; - } - { - unsigned int sampleout = srccolor; - unsigned int sampleshadeout = 0; - ifgcolor[1] = sampleout; - ifgshade[1] = sampleshadeout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i lit_dynlight = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, lightcontrib), 8); - - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - - fgcolor = _mm_add_epi16(fgcolor, lit_dynlight); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(256)); - - // Blend - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - unsigned int sampleout = srccolor; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - ifgshade[0] = sampleshadeout; - ifgshade[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i lit_dynlight = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, lightcontrib), 8); - - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - - fgcolor = _mm_add_epi16(fgcolor, lit_dynlight); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(256)); - - // Blend - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - } - - FString DebugInfo() override { return "FillSprite32Command"; } - }; - - class FillSpriteAddClamp32Command : public DrawerCommand - { - protected: - SpriteDrawerArgs args; - - public: - FillSpriteAddClamp32Command(const SpriteDrawerArgs &drawerargs) : args(drawerargs) { } - - void Execute(DrawerThread *thread) override - { - auto shade_constants = args.ColormapConstants(); - if (shade_constants.simple_shade) - { - const uint32_t *source = (const uint32_t*)args.TexturePixels(); - const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - __m128i dynlight = _mm_cvtsi32_si128(args.DynamicLight()); - dynlight = _mm_unpacklo_epi8(dynlight, _mm_setzero_si128()); - dynlight = _mm_shuffle_epi32(dynlight, _MM_SHUFFLE(1,0,1,0)); - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - mlight = _mm_min_epi16(_mm_add_epi16(mlight, dynlight), _mm_set1_epi16(256)); - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - { - unsigned int sampleout = srccolor; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgshade[0] = sampleshadeout; - frac += fracstep; - } - { - unsigned int sampleout = srccolor; - unsigned int sampleshadeout = 0; - ifgcolor[1] = sampleout; - ifgshade[1] = sampleshadeout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - unsigned int sampleout = srccolor; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - ifgshade[0] = sampleshadeout; - ifgshade[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - else - { - const uint32_t *source = (const uint32_t*)args.TexturePixels(); - const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - __m128i dynlight = _mm_cvtsi32_si128(args.DynamicLight()); - dynlight = _mm_unpacklo_epi8(dynlight, _mm_setzero_si128()); - dynlight = _mm_shuffle_epi32(dynlight, _MM_SHUFFLE(1,0,1,0)); - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); - __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); - shade_fade = _mm_mullo_epi16(shade_fade, inv_light); - __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); - int desaturate = shade_constants.desaturate; - - __m128i lightcontrib = _mm_min_epi16(_mm_add_epi16(mlight, dynlight), _mm_set1_epi16(256)); - lightcontrib = _mm_sub_epi16(lightcontrib, mlight); - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - { - unsigned int sampleout = srccolor; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgshade[0] = sampleshadeout; - frac += fracstep; - } - { - unsigned int sampleout = srccolor; - unsigned int sampleshadeout = 0; - ifgcolor[1] = sampleout; - ifgshade[1] = sampleshadeout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i lit_dynlight = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, lightcontrib), 8); - - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - - fgcolor = _mm_add_epi16(fgcolor, lit_dynlight); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(256)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - unsigned int sampleout = srccolor; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - ifgshade[0] = sampleshadeout; - ifgshade[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i lit_dynlight = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, lightcontrib), 8); - - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - - fgcolor = _mm_add_epi16(fgcolor, lit_dynlight); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(256)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - } - - FString DebugInfo() override { return "FillSpriteAddClamp32Command"; } - }; - - class FillSpriteSubClamp32Command : public DrawerCommand - { - protected: - SpriteDrawerArgs args; - - public: - FillSpriteSubClamp32Command(const SpriteDrawerArgs &drawerargs) : args(drawerargs) { } - - void Execute(DrawerThread *thread) override - { - auto shade_constants = args.ColormapConstants(); - if (shade_constants.simple_shade) - { - const uint32_t *source = (const uint32_t*)args.TexturePixels(); - const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - __m128i dynlight = _mm_cvtsi32_si128(args.DynamicLight()); - dynlight = _mm_unpacklo_epi8(dynlight, _mm_setzero_si128()); - dynlight = _mm_shuffle_epi32(dynlight, _MM_SHUFFLE(1,0,1,0)); - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - mlight = _mm_min_epi16(_mm_add_epi16(mlight, dynlight), _mm_set1_epi16(256)); - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - { - unsigned int sampleout = srccolor; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgshade[0] = sampleshadeout; - frac += fracstep; - } - { - unsigned int sampleout = srccolor; - unsigned int sampleshadeout = 0; - ifgcolor[1] = sampleout; - ifgshade[1] = sampleshadeout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - unsigned int sampleout = srccolor; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - ifgshade[0] = sampleshadeout; - ifgshade[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - else - { - const uint32_t *source = (const uint32_t*)args.TexturePixels(); - const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - __m128i dynlight = _mm_cvtsi32_si128(args.DynamicLight()); - dynlight = _mm_unpacklo_epi8(dynlight, _mm_setzero_si128()); - dynlight = _mm_shuffle_epi32(dynlight, _MM_SHUFFLE(1,0,1,0)); - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); - __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); - shade_fade = _mm_mullo_epi16(shade_fade, inv_light); - __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); - int desaturate = shade_constants.desaturate; - - __m128i lightcontrib = _mm_min_epi16(_mm_add_epi16(mlight, dynlight), _mm_set1_epi16(256)); - lightcontrib = _mm_sub_epi16(lightcontrib, mlight); - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - { - unsigned int sampleout = srccolor; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgshade[0] = sampleshadeout; - frac += fracstep; - } - { - unsigned int sampleout = srccolor; - unsigned int sampleshadeout = 0; - ifgcolor[1] = sampleout; - ifgshade[1] = sampleshadeout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i lit_dynlight = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, lightcontrib), 8); - - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - - fgcolor = _mm_add_epi16(fgcolor, lit_dynlight); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(256)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - unsigned int sampleout = srccolor; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - ifgshade[0] = sampleshadeout; - ifgshade[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i lit_dynlight = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, lightcontrib), 8); - - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - - fgcolor = _mm_add_epi16(fgcolor, lit_dynlight); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(256)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - } - - FString DebugInfo() override { return "FillSpriteSubClamp32Command"; } - }; - - class FillSpriteRevSubClamp32Command : public DrawerCommand - { - protected: - SpriteDrawerArgs args; - - public: - FillSpriteRevSubClamp32Command(const SpriteDrawerArgs &drawerargs) : args(drawerargs) { } - - void Execute(DrawerThread *thread) override - { - auto shade_constants = args.ColormapConstants(); - if (shade_constants.simple_shade) - { - const uint32_t *source = (const uint32_t*)args.TexturePixels(); - const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - __m128i dynlight = _mm_cvtsi32_si128(args.DynamicLight()); - dynlight = _mm_unpacklo_epi8(dynlight, _mm_setzero_si128()); - dynlight = _mm_shuffle_epi32(dynlight, _MM_SHUFFLE(1,0,1,0)); - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - mlight = _mm_min_epi16(_mm_add_epi16(mlight, dynlight), _mm_set1_epi16(256)); - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - { - unsigned int sampleout = srccolor; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgshade[0] = sampleshadeout; - frac += fracstep; - } - { - unsigned int sampleout = srccolor; - unsigned int sampleshadeout = 0; - ifgcolor[1] = sampleout; - ifgshade[1] = sampleshadeout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); - __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - unsigned int sampleout = srccolor; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - ifgshade[0] = sampleshadeout; - ifgshade[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); - __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - else - { - const uint32_t *source = (const uint32_t*)args.TexturePixels(); - const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - __m128i dynlight = _mm_cvtsi32_si128(args.DynamicLight()); - dynlight = _mm_unpacklo_epi8(dynlight, _mm_setzero_si128()); - dynlight = _mm_shuffle_epi32(dynlight, _MM_SHUFFLE(1,0,1,0)); - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); - __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); - shade_fade = _mm_mullo_epi16(shade_fade, inv_light); - __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); - int desaturate = shade_constants.desaturate; - - __m128i lightcontrib = _mm_min_epi16(_mm_add_epi16(mlight, dynlight), _mm_set1_epi16(256)); - lightcontrib = _mm_sub_epi16(lightcontrib, mlight); - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - { - unsigned int sampleout = srccolor; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgshade[0] = sampleshadeout; - frac += fracstep; - } - { - unsigned int sampleout = srccolor; - unsigned int sampleshadeout = 0; - ifgcolor[1] = sampleout; - ifgshade[1] = sampleshadeout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i lit_dynlight = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, lightcontrib), 8); - - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - - fgcolor = _mm_add_epi16(fgcolor, lit_dynlight); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(256)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); - __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - unsigned int sampleout = srccolor; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - ifgshade[0] = sampleshadeout; - ifgshade[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i lit_dynlight = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, lightcontrib), 8); - - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - - fgcolor = _mm_add_epi16(fgcolor, lit_dynlight); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(256)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); - __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - } - - FString DebugInfo() override { return "FillSpriteRevSubClamp32Command"; } - }; - - class DrawSpriteShaded32Command : public DrawerCommand - { - protected: - SpriteDrawerArgs args; - - public: - DrawSpriteShaded32Command(const SpriteDrawerArgs &drawerargs) : args(drawerargs) { } - - void Execute(DrawerThread *thread) override - { - auto shade_constants = args.ColormapConstants(); - if (shade_constants.simple_shade) - { - const uint8_t *source = args.TexturePixels(); - const uint8_t *colormap = args.Colormap(); - const uint32_t *translation = (const uint32_t*)args.TranslationMap(); - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - __m128i dynlight = _mm_cvtsi32_si128(args.DynamicLight()); - dynlight = _mm_unpacklo_epi8(dynlight, _mm_setzero_si128()); - dynlight = _mm_shuffle_epi32(dynlight, _MM_SHUFFLE(1,0,1,0)); - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - mlight = _mm_min_epi16(_mm_add_epi16(mlight, dynlight), _mm_set1_epi16(256)); - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - { - unsigned int sampleout = color; - unsigned int sampleshadeout = colormap[source[frac >> FRACBITS]]; - sampleshadeout = clamp(sampleshadeout, 0, 64) * 4; - ifgcolor[0] = sampleout; - ifgshade[0] = sampleshadeout; - frac += fracstep; - } - { - unsigned int sampleout = color; - unsigned int sampleshadeout = colormap[source[frac >> FRACBITS]]; - sampleshadeout = clamp(sampleshadeout, 0, 64) * 4; - ifgcolor[1] = sampleout; - ifgshade[1] = sampleshadeout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - - // Blend - __m128i alpha = _mm_set_epi16(ifgshade[1], ifgshade[1], ifgshade[1], ifgshade[1], ifgshade[0], ifgshade[0], ifgshade[0], ifgshade[0]); - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - fgcolor = _mm_mullo_epi16(fgcolor, alpha); - bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); - __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - unsigned int sampleout = color; - unsigned int sampleshadeout = colormap[source[frac >> FRACBITS]]; - sampleshadeout = clamp(sampleshadeout, 0, 64) * 4; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - ifgshade[0] = sampleshadeout; - ifgshade[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - - // Blend - __m128i alpha = _mm_set_epi16(ifgshade[1], ifgshade[1], ifgshade[1], ifgshade[1], ifgshade[0], ifgshade[0], ifgshade[0], ifgshade[0]); - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - fgcolor = _mm_mullo_epi16(fgcolor, alpha); - bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); - __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - else - { - const uint8_t *source = args.TexturePixels(); - const uint8_t *colormap = args.Colormap(); - const uint32_t *translation = (const uint32_t*)args.TranslationMap(); - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - __m128i dynlight = _mm_cvtsi32_si128(args.DynamicLight()); - dynlight = _mm_unpacklo_epi8(dynlight, _mm_setzero_si128()); - dynlight = _mm_shuffle_epi32(dynlight, _MM_SHUFFLE(1,0,1,0)); - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); - __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); - shade_fade = _mm_mullo_epi16(shade_fade, inv_light); - __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); - int desaturate = shade_constants.desaturate; - - __m128i lightcontrib = _mm_min_epi16(_mm_add_epi16(mlight, dynlight), _mm_set1_epi16(256)); - lightcontrib = _mm_sub_epi16(lightcontrib, mlight); - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - { - unsigned int sampleout = color; - unsigned int sampleshadeout = colormap[source[frac >> FRACBITS]]; - sampleshadeout = clamp(sampleshadeout, 0, 64) * 4; - ifgcolor[0] = sampleout; - ifgshade[0] = sampleshadeout; - frac += fracstep; - } - { - unsigned int sampleout = color; - unsigned int sampleshadeout = colormap[source[frac >> FRACBITS]]; - sampleshadeout = clamp(sampleshadeout, 0, 64) * 4; - ifgcolor[1] = sampleout; - ifgshade[1] = sampleshadeout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - - // Blend - __m128i alpha = _mm_set_epi16(ifgshade[1], ifgshade[1], ifgshade[1], ifgshade[1], ifgshade[0], ifgshade[0], ifgshade[0], ifgshade[0]); - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - fgcolor = _mm_mullo_epi16(fgcolor, alpha); - bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); - __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - unsigned int sampleout = color; - unsigned int sampleshadeout = colormap[source[frac >> FRACBITS]]; - sampleshadeout = clamp(sampleshadeout, 0, 64) * 4; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - ifgshade[0] = sampleshadeout; - ifgshade[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - - // Blend - __m128i alpha = _mm_set_epi16(ifgshade[1], ifgshade[1], ifgshade[1], ifgshade[1], ifgshade[0], ifgshade[0], ifgshade[0], ifgshade[0]); - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - fgcolor = _mm_mullo_epi16(fgcolor, alpha); - bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); - __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - } - - FString DebugInfo() override { return "DrawSpriteShaded32Command"; } - }; - - class DrawSpriteTranslated32Command : public DrawerCommand - { - protected: - SpriteDrawerArgs args; - - public: - DrawSpriteTranslated32Command(const SpriteDrawerArgs &drawerargs) : args(drawerargs) { } - - void Execute(DrawerThread *thread) override - { - auto shade_constants = args.ColormapConstants(); - if (shade_constants.simple_shade) - { - const uint8_t *source = args.TexturePixels(); - const uint8_t *colormap = args.Colormap(); - const uint32_t *translation = (const uint32_t*)args.TranslationMap(); - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - __m128i dynlight = _mm_cvtsi32_si128(args.DynamicLight()); - dynlight = _mm_unpacklo_epi8(dynlight, _mm_setzero_si128()); - dynlight = _mm_shuffle_epi32(dynlight, _MM_SHUFFLE(1,0,1,0)); - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - mlight = _mm_min_epi16(_mm_add_epi16(mlight, dynlight), _mm_set1_epi16(256)); - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - { - unsigned int sampleout = translation[source[frac >> FRACBITS]]; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgshade[0] = sampleshadeout; - frac += fracstep; - } - { - unsigned int sampleout = translation[source[frac >> FRACBITS]]; - unsigned int sampleshadeout = 0; - ifgcolor[1] = sampleout; - ifgshade[1] = sampleshadeout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - - // Blend - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - unsigned int sampleout = translation[source[frac >> FRACBITS]]; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - ifgshade[0] = sampleshadeout; - ifgshade[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - - // Blend - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - else - { - const uint8_t *source = args.TexturePixels(); - const uint8_t *colormap = args.Colormap(); - const uint32_t *translation = (const uint32_t*)args.TranslationMap(); - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - __m128i dynlight = _mm_cvtsi32_si128(args.DynamicLight()); - dynlight = _mm_unpacklo_epi8(dynlight, _mm_setzero_si128()); - dynlight = _mm_shuffle_epi32(dynlight, _MM_SHUFFLE(1,0,1,0)); - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); - __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); - shade_fade = _mm_mullo_epi16(shade_fade, inv_light); - __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); - int desaturate = shade_constants.desaturate; - - __m128i lightcontrib = _mm_min_epi16(_mm_add_epi16(mlight, dynlight), _mm_set1_epi16(256)); - lightcontrib = _mm_sub_epi16(lightcontrib, mlight); - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - { - unsigned int sampleout = translation[source[frac >> FRACBITS]]; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgshade[0] = sampleshadeout; - frac += fracstep; - } - { - unsigned int sampleout = translation[source[frac >> FRACBITS]]; - unsigned int sampleshadeout = 0; - ifgcolor[1] = sampleout; - ifgshade[1] = sampleshadeout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i lit_dynlight = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, lightcontrib), 8); - - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - - fgcolor = _mm_add_epi16(fgcolor, lit_dynlight); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(256)); - - // Blend - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - unsigned int sampleout = translation[source[frac >> FRACBITS]]; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - ifgshade[0] = sampleshadeout; - ifgshade[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i lit_dynlight = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, lightcontrib), 8); - - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - - fgcolor = _mm_add_epi16(fgcolor, lit_dynlight); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(256)); - - // Blend - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - } - - FString DebugInfo() override { return "DrawSpriteTranslated32Command"; } - }; - - class DrawSpriteTranslatedAddClamp32Command : public DrawerCommand - { - protected: - SpriteDrawerArgs args; - - public: - DrawSpriteTranslatedAddClamp32Command(const SpriteDrawerArgs &drawerargs) : args(drawerargs) { } - - void Execute(DrawerThread *thread) override - { - auto shade_constants = args.ColormapConstants(); - if (shade_constants.simple_shade) - { - const uint8_t *source = args.TexturePixels(); - const uint8_t *colormap = args.Colormap(); - const uint32_t *translation = (const uint32_t*)args.TranslationMap(); - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - __m128i dynlight = _mm_cvtsi32_si128(args.DynamicLight()); - dynlight = _mm_unpacklo_epi8(dynlight, _mm_setzero_si128()); - dynlight = _mm_shuffle_epi32(dynlight, _MM_SHUFFLE(1,0,1,0)); - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - mlight = _mm_min_epi16(_mm_add_epi16(mlight, dynlight), _mm_set1_epi16(256)); - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - { - unsigned int sampleout = translation[source[frac >> FRACBITS]]; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgshade[0] = sampleshadeout; - frac += fracstep; - } - { - unsigned int sampleout = translation[source[frac >> FRACBITS]]; - unsigned int sampleshadeout = 0; - ifgcolor[1] = sampleout; - ifgshade[1] = sampleshadeout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - unsigned int sampleout = translation[source[frac >> FRACBITS]]; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - ifgshade[0] = sampleshadeout; - ifgshade[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - else - { - const uint8_t *source = args.TexturePixels(); - const uint8_t *colormap = args.Colormap(); - const uint32_t *translation = (const uint32_t*)args.TranslationMap(); - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - __m128i dynlight = _mm_cvtsi32_si128(args.DynamicLight()); - dynlight = _mm_unpacklo_epi8(dynlight, _mm_setzero_si128()); - dynlight = _mm_shuffle_epi32(dynlight, _MM_SHUFFLE(1,0,1,0)); - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); - __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); - shade_fade = _mm_mullo_epi16(shade_fade, inv_light); - __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); - int desaturate = shade_constants.desaturate; - - __m128i lightcontrib = _mm_min_epi16(_mm_add_epi16(mlight, dynlight), _mm_set1_epi16(256)); - lightcontrib = _mm_sub_epi16(lightcontrib, mlight); - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - { - unsigned int sampleout = translation[source[frac >> FRACBITS]]; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgshade[0] = sampleshadeout; - frac += fracstep; - } - { - unsigned int sampleout = translation[source[frac >> FRACBITS]]; - unsigned int sampleshadeout = 0; - ifgcolor[1] = sampleout; - ifgshade[1] = sampleshadeout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i lit_dynlight = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, lightcontrib), 8); - - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - - fgcolor = _mm_add_epi16(fgcolor, lit_dynlight); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(256)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - unsigned int sampleout = translation[source[frac >> FRACBITS]]; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - ifgshade[0] = sampleshadeout; - ifgshade[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i lit_dynlight = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, lightcontrib), 8); - - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - - fgcolor = _mm_add_epi16(fgcolor, lit_dynlight); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(256)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - } - - FString DebugInfo() override { return "DrawSpriteTranslatedAddClamp32Command"; } - }; - - class DrawSpriteTranslatedSubClamp32Command : public DrawerCommand - { - protected: - SpriteDrawerArgs args; - - public: - DrawSpriteTranslatedSubClamp32Command(const SpriteDrawerArgs &drawerargs) : args(drawerargs) { } - - void Execute(DrawerThread *thread) override - { - auto shade_constants = args.ColormapConstants(); - if (shade_constants.simple_shade) - { - const uint8_t *source = args.TexturePixels(); - const uint8_t *colormap = args.Colormap(); - const uint32_t *translation = (const uint32_t*)args.TranslationMap(); - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - __m128i dynlight = _mm_cvtsi32_si128(args.DynamicLight()); - dynlight = _mm_unpacklo_epi8(dynlight, _mm_setzero_si128()); - dynlight = _mm_shuffle_epi32(dynlight, _MM_SHUFFLE(1,0,1,0)); - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - mlight = _mm_min_epi16(_mm_add_epi16(mlight, dynlight), _mm_set1_epi16(256)); - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - { - unsigned int sampleout = translation[source[frac >> FRACBITS]]; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgshade[0] = sampleshadeout; - frac += fracstep; - } - { - unsigned int sampleout = translation[source[frac >> FRACBITS]]; - unsigned int sampleshadeout = 0; - ifgcolor[1] = sampleout; - ifgshade[1] = sampleshadeout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - unsigned int sampleout = translation[source[frac >> FRACBITS]]; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - ifgshade[0] = sampleshadeout; - ifgshade[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - else - { - const uint8_t *source = args.TexturePixels(); - const uint8_t *colormap = args.Colormap(); - const uint32_t *translation = (const uint32_t*)args.TranslationMap(); - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - __m128i dynlight = _mm_cvtsi32_si128(args.DynamicLight()); - dynlight = _mm_unpacklo_epi8(dynlight, _mm_setzero_si128()); - dynlight = _mm_shuffle_epi32(dynlight, _MM_SHUFFLE(1,0,1,0)); - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); - __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); - shade_fade = _mm_mullo_epi16(shade_fade, inv_light); - __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); - int desaturate = shade_constants.desaturate; - - __m128i lightcontrib = _mm_min_epi16(_mm_add_epi16(mlight, dynlight), _mm_set1_epi16(256)); - lightcontrib = _mm_sub_epi16(lightcontrib, mlight); - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - { - unsigned int sampleout = translation[source[frac >> FRACBITS]]; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgshade[0] = sampleshadeout; - frac += fracstep; - } - { - unsigned int sampleout = translation[source[frac >> FRACBITS]]; - unsigned int sampleshadeout = 0; - ifgcolor[1] = sampleout; - ifgshade[1] = sampleshadeout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i lit_dynlight = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, lightcontrib), 8); - - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - - fgcolor = _mm_add_epi16(fgcolor, lit_dynlight); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(256)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - unsigned int sampleout = translation[source[frac >> FRACBITS]]; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - ifgshade[0] = sampleshadeout; - ifgshade[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i lit_dynlight = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, lightcontrib), 8); - - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - - fgcolor = _mm_add_epi16(fgcolor, lit_dynlight); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(256)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - } - - FString DebugInfo() override { return "DrawSpriteTranslatedSubClamp32Command"; } - }; - - class DrawSpriteTranslatedRevSubClamp32Command : public DrawerCommand - { - protected: - SpriteDrawerArgs args; - - public: - DrawSpriteTranslatedRevSubClamp32Command(const SpriteDrawerArgs &drawerargs) : args(drawerargs) { } - - void Execute(DrawerThread *thread) override - { - auto shade_constants = args.ColormapConstants(); - if (shade_constants.simple_shade) - { - const uint8_t *source = args.TexturePixels(); - const uint8_t *colormap = args.Colormap(); - const uint32_t *translation = (const uint32_t*)args.TranslationMap(); - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - __m128i dynlight = _mm_cvtsi32_si128(args.DynamicLight()); - dynlight = _mm_unpacklo_epi8(dynlight, _mm_setzero_si128()); - dynlight = _mm_shuffle_epi32(dynlight, _MM_SHUFFLE(1,0,1,0)); - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - mlight = _mm_min_epi16(_mm_add_epi16(mlight, dynlight), _mm_set1_epi16(256)); - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - { - unsigned int sampleout = translation[source[frac >> FRACBITS]]; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgshade[0] = sampleshadeout; - frac += fracstep; - } - { - unsigned int sampleout = translation[source[frac >> FRACBITS]]; - unsigned int sampleshadeout = 0; - ifgcolor[1] = sampleout; - ifgshade[1] = sampleshadeout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); - __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - unsigned int sampleout = translation[source[frac >> FRACBITS]]; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - ifgshade[0] = sampleshadeout; - ifgshade[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); - __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - else - { - const uint8_t *source = args.TexturePixels(); - const uint8_t *colormap = args.Colormap(); - const uint32_t *translation = (const uint32_t*)args.TranslationMap(); - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - __m128i dynlight = _mm_cvtsi32_si128(args.DynamicLight()); - dynlight = _mm_unpacklo_epi8(dynlight, _mm_setzero_si128()); - dynlight = _mm_shuffle_epi32(dynlight, _MM_SHUFFLE(1,0,1,0)); - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); - __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); - shade_fade = _mm_mullo_epi16(shade_fade, inv_light); - __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); - int desaturate = shade_constants.desaturate; - - __m128i lightcontrib = _mm_min_epi16(_mm_add_epi16(mlight, dynlight), _mm_set1_epi16(256)); - lightcontrib = _mm_sub_epi16(lightcontrib, mlight); - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - { - unsigned int sampleout = translation[source[frac >> FRACBITS]]; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgshade[0] = sampleshadeout; - frac += fracstep; - } - { - unsigned int sampleout = translation[source[frac >> FRACBITS]]; - unsigned int sampleshadeout = 0; - ifgcolor[1] = sampleout; - ifgshade[1] = sampleshadeout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i lit_dynlight = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, lightcontrib), 8); - - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - - fgcolor = _mm_add_epi16(fgcolor, lit_dynlight); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(256)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); - __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - unsigned int sampleout = translation[source[frac >> FRACBITS]]; - unsigned int sampleshadeout = 0; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - ifgshade[0] = sampleshadeout; - ifgshade[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i lit_dynlight = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, lightcontrib), 8); - - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - - fgcolor = _mm_add_epi16(fgcolor, lit_dynlight); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(256)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); - __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - } - - FString DebugInfo() override { return "DrawSpriteTranslatedRevSubClamp32Command"; } - }; - -} diff --git a/src/swrenderer/drawers/r_draw_sprite32.php b/src/swrenderer/drawers/r_draw_sprite32.php deleted file mode 100644 index e08ba551fe..0000000000 --- a/src/swrenderer/drawers/r_draw_sprite32.php +++ /dev/null @@ -1,385 +0,0 @@ -#!/usr/bin/php -/* -** Drawer commands for sprites -** Copyright (c) 2016 Magnus Norddahl -** -** This software is provided 'as-is', without any express or implied -** warranty. In no event will the authors be held liable for any damages -** arising from the use of this software. -** -** Permission is granted to anyone to use this software for any purpose, -** including commercial applications, and to alter it and redistribute it -** freely, subject to the following restrictions: -** -** 1. The origin of this software must not be misrepresented; you must not -** claim that you wrote the original software. If you use this software -** in a product, an acknowledgment in the product documentation would be -** appreciated but is not required. -** 2. Altered source versions must be plainly marked as such, and must not be -** misrepresented as being the original software. -** 3. This notice may not be removed or altered from any source distribution. -** -*/ - -/* - Warning: this C++ source file has been auto-generated. Please modify the original php script that generated it. -*/ - -#pragma once - -#include "swrenderer/drawers/r_draw_rgba.h" -#include "swrenderer/viewport/r_walldrawer.h" - -namespace swrenderer -{ - - class : public DrawerCommand - { - protected: - SpriteDrawerArgs args; - - public: - (const SpriteDrawerArgs &drawerargs) : args(drawerargs) { } - - void Execute(DrawerThread *thread) override - { - auto shade_constants = args.ColormapConstants(); - if (shade_constants.simple_shade) - { - - } - else - { - - } - } - - FString DebugInfo() override { return ""; } - }; - - - const uint8_t *source = args.TexturePixels(); - const uint8_t *colormap = args.Colormap(); - const uint32_t *translation = (const uint32_t*)args.TranslationMap(); - - const uint32_t *source = (const uint32_t*)args.TexturePixels(); - const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); - - bool is_nearest_filter = (source2 == nullptr); - if (is_nearest_filter) - { - - } - else - { - - } - - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - __m128i dynlight = _mm_cvtsi32_si128(args.DynamicLight()); - dynlight = _mm_unpacklo_epi8(dynlight, _mm_setzero_si128()); - dynlight = _mm_shuffle_epi32(dynlight, _MM_SHUFFLE(1,0,1,0)); - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - - mlight = _mm_min_epi16(_mm_add_epi16(mlight, dynlight), _mm_set1_epi16(256)); - - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); - __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); - shade_fade = _mm_mullo_epi16(shade_fade, inv_light); - __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); - int desaturate = shade_constants.desaturate; - - __m128i lightcontrib = _mm_min_epi16(_mm_add_epi16(mlight, dynlight), _mm_set1_epi16(256)); - lightcontrib = _mm_sub_epi16(lightcontrib, mlight); - - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - - frac -= one / 2; - - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); - - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - { - - ifgcolor[0] = sampleout; - ifgshade[0] = sampleshadeout; - frac += fracstep; - } - { - - ifgcolor[1] = sampleout; - ifgshade[1] = sampleshadeout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - - - // Blend - - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - - // Sample - unsigned int ifgcolor[2], ifgshade[2]; - - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - ifgshade[0] = sampleshadeout; - ifgshade[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - - - // Blend - - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - - unsigned int sampleout = color; - unsigned int sampleshadeout = colormap[source[frac >> FRACBITS]]; - sampleshadeout = clamp(sampleshadeout, 0, 64) * 4; - - unsigned int sampleout = translation[source[frac >> FRACBITS]]; - unsigned int sampleshadeout = 0; - - unsigned int sampleout = srccolor; - unsigned int sampleshadeout = 0; - - int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - unsigned int sampleshadeout = 0; - - // Clamp to edge - unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int frac_y1 = (clamp(frac + one, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - unsigned int sampleshadeout = 0; - - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - - __m128i lit_dynlight = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, lightcontrib), 8); - - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - - fgcolor = _mm_add_epi16(fgcolor, lit_dynlight); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(256)); - - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - __m128i alpha = _mm_set_epi16(ifgshade[1], ifgshade[1], ifgshade[1], ifgshade[1], ifgshade[0], ifgshade[0], ifgshade[0], ifgshade[0]); - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - fgcolor = _mm_mullo_epi16(fgcolor, alpha); - bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); - __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - - __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); - __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - -} diff --git a/src/swrenderer/drawers/r_draw_wall32.h b/src/swrenderer/drawers/r_draw_wall32.h deleted file mode 100644 index 7e6dd931dd..0000000000 --- a/src/swrenderer/drawers/r_draw_wall32.h +++ /dev/null @@ -1,5416 +0,0 @@ -/* -** Drawer commands for walls -** Copyright (c) 2016 Magnus Norddahl -** -** This software is provided 'as-is', without any express or implied -** warranty. In no event will the authors be held liable for any damages -** arising from the use of this software. -** -** Permission is granted to anyone to use this software for any purpose, -** including commercial applications, and to alter it and redistribute it -** freely, subject to the following restrictions: -** -** 1. The origin of this software must not be misrepresented; you must not -** claim that you wrote the original software. If you use this software -** in a product, an acknowledgment in the product documentation would be -** appreciated but is not required. -** 2. Altered source versions must be plainly marked as such, and must not be -** misrepresented as being the original software. -** 3. This notice may not be removed or altered from any source distribution. -** -*/ - -/* - Warning: this C++ source file has been auto-generated. Please modify the original php script that generated it. -*/ - -#pragma once - -#include "swrenderer/drawers/r_draw_rgba.h" -#include "swrenderer/viewport/r_walldrawer.h" - -namespace swrenderer -{ - class DrawWall32Command : public DrawerCommand - { - protected: - WallDrawerArgs args; - - public: - DrawWall32Command(const WallDrawerArgs &drawerargs) : args(drawerargs) { } - - void Execute(DrawerThread *thread) override - { - auto shade_constants = args.ColormapConstants(); - if (shade_constants.simple_shade) - { - const uint32_t *source = (const uint32_t*)args.TexturePixels(); - const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); - bool is_nearest_filter = (source2 == nullptr); - if (is_nearest_filter) - { - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpz = args.dc_viewpos.Z + args.dc_viewpos_step.Z * thread->skipped_by_thread(dest_y); - float stepvpz = args.dc_viewpos_step.Z * thread->num_cores; - __m128 viewpos_z = _mm_setr_ps(vpz, vpz + stepvpz, 0.0f, 0.0f); - __m128 step_viewpos_z = _mm_set1_ps(stepvpz * 2.0f); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - - // Sample - unsigned int ifgcolor[2]; - { - int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - frac += fracstep; - } - { - int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - ifgcolor[1] = sampleout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y - __m128 Lz = _mm_sub_ps(light_z, viewpos_z); - __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - viewpos_z = _mm_add_ps(viewpos_z, step_viewpos_z); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - - // Sample - unsigned int ifgcolor[2]; - int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y - __m128 Lz = _mm_sub_ps(light_z, viewpos_z); - __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - else - { - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpz = args.dc_viewpos.Z + args.dc_viewpos_step.Z * thread->skipped_by_thread(dest_y); - float stepvpz = args.dc_viewpos_step.Z * thread->num_cores; - __m128 viewpos_z = _mm_setr_ps(vpz, vpz + stepvpz, 0.0f, 0.0f); - __m128 step_viewpos_z = _mm_set1_ps(stepvpz * 2.0f); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - frac -= one / 2; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - - // Sample - unsigned int ifgcolor[2]; - { - unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; - unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - frac += fracstep; - } - { - unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; - unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[1] = sampleout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y - __m128 Lz = _mm_sub_ps(light_z, viewpos_z); - __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - viewpos_z = _mm_add_ps(viewpos_z, step_viewpos_z); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - - // Sample - unsigned int ifgcolor[2]; - unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; - unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y - __m128 Lz = _mm_sub_ps(light_z, viewpos_z); - __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - } - else - { - const uint32_t *source = (const uint32_t*)args.TexturePixels(); - const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); - bool is_nearest_filter = (source2 == nullptr); - if (is_nearest_filter) - { - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); - __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); - shade_fade = _mm_mullo_epi16(shade_fade, inv_light); - __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); - int desaturate = shade_constants.desaturate; - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpz = args.dc_viewpos.Z + args.dc_viewpos_step.Z * thread->skipped_by_thread(dest_y); - float stepvpz = args.dc_viewpos_step.Z * thread->num_cores; - __m128 viewpos_z = _mm_setr_ps(vpz, vpz + stepvpz, 0.0f, 0.0f); - __m128 step_viewpos_z = _mm_set1_ps(stepvpz * 2.0f); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - - // Sample - unsigned int ifgcolor[2]; - { - int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - frac += fracstep; - } - { - int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - ifgcolor[1] = sampleout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y - __m128 Lz = _mm_sub_ps(light_z, viewpos_z); - __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - viewpos_z = _mm_add_ps(viewpos_z, step_viewpos_z); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - - // Sample - unsigned int ifgcolor[2]; - int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y - __m128 Lz = _mm_sub_ps(light_z, viewpos_z); - __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - else - { - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); - __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); - shade_fade = _mm_mullo_epi16(shade_fade, inv_light); - __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); - int desaturate = shade_constants.desaturate; - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpz = args.dc_viewpos.Z + args.dc_viewpos_step.Z * thread->skipped_by_thread(dest_y); - float stepvpz = args.dc_viewpos_step.Z * thread->num_cores; - __m128 viewpos_z = _mm_setr_ps(vpz, vpz + stepvpz, 0.0f, 0.0f); - __m128 step_viewpos_z = _mm_set1_ps(stepvpz * 2.0f); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - frac -= one / 2; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - - // Sample - unsigned int ifgcolor[2]; - { - unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; - unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - frac += fracstep; - } - { - unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; - unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[1] = sampleout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y - __m128 Lz = _mm_sub_ps(light_z, viewpos_z); - __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - viewpos_z = _mm_add_ps(viewpos_z, step_viewpos_z); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - - // Sample - unsigned int ifgcolor[2]; - unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; - unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y - __m128 Lz = _mm_sub_ps(light_z, viewpos_z); - __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - } - } - - FString DebugInfo() override { return "DrawWall32Command"; } - }; - - class DrawWallMasked32Command : public DrawerCommand - { - protected: - WallDrawerArgs args; - - public: - DrawWallMasked32Command(const WallDrawerArgs &drawerargs) : args(drawerargs) { } - - void Execute(DrawerThread *thread) override - { - auto shade_constants = args.ColormapConstants(); - if (shade_constants.simple_shade) - { - const uint32_t *source = (const uint32_t*)args.TexturePixels(); - const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); - bool is_nearest_filter = (source2 == nullptr); - if (is_nearest_filter) - { - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpz = args.dc_viewpos.Z + args.dc_viewpos_step.Z * thread->skipped_by_thread(dest_y); - float stepvpz = args.dc_viewpos_step.Z * thread->num_cores; - __m128 viewpos_z = _mm_setr_ps(vpz, vpz + stepvpz, 0.0f, 0.0f); - __m128 step_viewpos_z = _mm_set1_ps(stepvpz * 2.0f); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - { - int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - frac += fracstep; - } - { - int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - ifgcolor[1] = sampleout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y - __m128 Lz = _mm_sub_ps(light_z, viewpos_z); - __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - fgcolor = _mm_mullo_epi16(fgcolor, alpha); - bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); - __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - viewpos_z = _mm_add_ps(viewpos_z, step_viewpos_z); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y - __m128 Lz = _mm_sub_ps(light_z, viewpos_z); - __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - fgcolor = _mm_mullo_epi16(fgcolor, alpha); - bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); - __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - else - { - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpz = args.dc_viewpos.Z + args.dc_viewpos_step.Z * thread->skipped_by_thread(dest_y); - float stepvpz = args.dc_viewpos_step.Z * thread->num_cores; - __m128 viewpos_z = _mm_setr_ps(vpz, vpz + stepvpz, 0.0f, 0.0f); - __m128 step_viewpos_z = _mm_set1_ps(stepvpz * 2.0f); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - frac -= one / 2; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - { - unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; - unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - frac += fracstep; - } - { - unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; - unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[1] = sampleout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y - __m128 Lz = _mm_sub_ps(light_z, viewpos_z); - __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - fgcolor = _mm_mullo_epi16(fgcolor, alpha); - bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); - __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - viewpos_z = _mm_add_ps(viewpos_z, step_viewpos_z); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; - unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y - __m128 Lz = _mm_sub_ps(light_z, viewpos_z); - __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - fgcolor = _mm_mullo_epi16(fgcolor, alpha); - bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); - __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - } - else - { - const uint32_t *source = (const uint32_t*)args.TexturePixels(); - const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); - bool is_nearest_filter = (source2 == nullptr); - if (is_nearest_filter) - { - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); - __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); - shade_fade = _mm_mullo_epi16(shade_fade, inv_light); - __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); - int desaturate = shade_constants.desaturate; - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpz = args.dc_viewpos.Z + args.dc_viewpos_step.Z * thread->skipped_by_thread(dest_y); - float stepvpz = args.dc_viewpos_step.Z * thread->num_cores; - __m128 viewpos_z = _mm_setr_ps(vpz, vpz + stepvpz, 0.0f, 0.0f); - __m128 step_viewpos_z = _mm_set1_ps(stepvpz * 2.0f); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - { - int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - frac += fracstep; - } - { - int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - ifgcolor[1] = sampleout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y - __m128 Lz = _mm_sub_ps(light_z, viewpos_z); - __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - fgcolor = _mm_mullo_epi16(fgcolor, alpha); - bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); - __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - viewpos_z = _mm_add_ps(viewpos_z, step_viewpos_z); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y - __m128 Lz = _mm_sub_ps(light_z, viewpos_z); - __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - fgcolor = _mm_mullo_epi16(fgcolor, alpha); - bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); - __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - else - { - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); - __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); - shade_fade = _mm_mullo_epi16(shade_fade, inv_light); - __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); - int desaturate = shade_constants.desaturate; - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpz = args.dc_viewpos.Z + args.dc_viewpos_step.Z * thread->skipped_by_thread(dest_y); - float stepvpz = args.dc_viewpos_step.Z * thread->num_cores; - __m128 viewpos_z = _mm_setr_ps(vpz, vpz + stepvpz, 0.0f, 0.0f); - __m128 step_viewpos_z = _mm_set1_ps(stepvpz * 2.0f); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - frac -= one / 2; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - { - unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; - unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - frac += fracstep; - } - { - unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; - unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[1] = sampleout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y - __m128 Lz = _mm_sub_ps(light_z, viewpos_z); - __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - fgcolor = _mm_mullo_epi16(fgcolor, alpha); - bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); - __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - viewpos_z = _mm_add_ps(viewpos_z, step_viewpos_z); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; - unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y - __m128 Lz = _mm_sub_ps(light_z, viewpos_z); - __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - fgcolor = _mm_mullo_epi16(fgcolor, alpha); - bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); - __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - } - } - - FString DebugInfo() override { return "DrawWallMasked32Command"; } - }; - - class DrawWallAddClamp32Command : public DrawerCommand - { - protected: - WallDrawerArgs args; - - public: - DrawWallAddClamp32Command(const WallDrawerArgs &drawerargs) : args(drawerargs) { } - - void Execute(DrawerThread *thread) override - { - auto shade_constants = args.ColormapConstants(); - if (shade_constants.simple_shade) - { - const uint32_t *source = (const uint32_t*)args.TexturePixels(); - const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); - bool is_nearest_filter = (source2 == nullptr); - if (is_nearest_filter) - { - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpz = args.dc_viewpos.Z + args.dc_viewpos_step.Z * thread->skipped_by_thread(dest_y); - float stepvpz = args.dc_viewpos_step.Z * thread->num_cores; - __m128 viewpos_z = _mm_setr_ps(vpz, vpz + stepvpz, 0.0f, 0.0f); - __m128 step_viewpos_z = _mm_set1_ps(stepvpz * 2.0f); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - { - int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - frac += fracstep; - } - { - int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - ifgcolor[1] = sampleout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y - __m128 Lz = _mm_sub_ps(light_z, viewpos_z); - __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - viewpos_z = _mm_add_ps(viewpos_z, step_viewpos_z); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y - __m128 Lz = _mm_sub_ps(light_z, viewpos_z); - __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - else - { - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpz = args.dc_viewpos.Z + args.dc_viewpos_step.Z * thread->skipped_by_thread(dest_y); - float stepvpz = args.dc_viewpos_step.Z * thread->num_cores; - __m128 viewpos_z = _mm_setr_ps(vpz, vpz + stepvpz, 0.0f, 0.0f); - __m128 step_viewpos_z = _mm_set1_ps(stepvpz * 2.0f); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - frac -= one / 2; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - { - unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; - unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - frac += fracstep; - } - { - unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; - unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[1] = sampleout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y - __m128 Lz = _mm_sub_ps(light_z, viewpos_z); - __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - viewpos_z = _mm_add_ps(viewpos_z, step_viewpos_z); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; - unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y - __m128 Lz = _mm_sub_ps(light_z, viewpos_z); - __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - } - else - { - const uint32_t *source = (const uint32_t*)args.TexturePixels(); - const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); - bool is_nearest_filter = (source2 == nullptr); - if (is_nearest_filter) - { - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); - __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); - shade_fade = _mm_mullo_epi16(shade_fade, inv_light); - __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); - int desaturate = shade_constants.desaturate; - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpz = args.dc_viewpos.Z + args.dc_viewpos_step.Z * thread->skipped_by_thread(dest_y); - float stepvpz = args.dc_viewpos_step.Z * thread->num_cores; - __m128 viewpos_z = _mm_setr_ps(vpz, vpz + stepvpz, 0.0f, 0.0f); - __m128 step_viewpos_z = _mm_set1_ps(stepvpz * 2.0f); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - { - int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - frac += fracstep; - } - { - int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - ifgcolor[1] = sampleout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y - __m128 Lz = _mm_sub_ps(light_z, viewpos_z); - __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - viewpos_z = _mm_add_ps(viewpos_z, step_viewpos_z); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y - __m128 Lz = _mm_sub_ps(light_z, viewpos_z); - __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - else - { - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); - __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); - shade_fade = _mm_mullo_epi16(shade_fade, inv_light); - __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); - int desaturate = shade_constants.desaturate; - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpz = args.dc_viewpos.Z + args.dc_viewpos_step.Z * thread->skipped_by_thread(dest_y); - float stepvpz = args.dc_viewpos_step.Z * thread->num_cores; - __m128 viewpos_z = _mm_setr_ps(vpz, vpz + stepvpz, 0.0f, 0.0f); - __m128 step_viewpos_z = _mm_set1_ps(stepvpz * 2.0f); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - frac -= one / 2; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - { - unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; - unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - frac += fracstep; - } - { - unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; - unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[1] = sampleout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y - __m128 Lz = _mm_sub_ps(light_z, viewpos_z); - __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - viewpos_z = _mm_add_ps(viewpos_z, step_viewpos_z); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; - unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y - __m128 Lz = _mm_sub_ps(light_z, viewpos_z); - __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - } - } - - FString DebugInfo() override { return "DrawWallAddClamp32Command"; } - }; - - class DrawWallSubClamp32Command : public DrawerCommand - { - protected: - WallDrawerArgs args; - - public: - DrawWallSubClamp32Command(const WallDrawerArgs &drawerargs) : args(drawerargs) { } - - void Execute(DrawerThread *thread) override - { - auto shade_constants = args.ColormapConstants(); - if (shade_constants.simple_shade) - { - const uint32_t *source = (const uint32_t*)args.TexturePixels(); - const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); - bool is_nearest_filter = (source2 == nullptr); - if (is_nearest_filter) - { - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpz = args.dc_viewpos.Z + args.dc_viewpos_step.Z * thread->skipped_by_thread(dest_y); - float stepvpz = args.dc_viewpos_step.Z * thread->num_cores; - __m128 viewpos_z = _mm_setr_ps(vpz, vpz + stepvpz, 0.0f, 0.0f); - __m128 step_viewpos_z = _mm_set1_ps(stepvpz * 2.0f); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - { - int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - frac += fracstep; - } - { - int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - ifgcolor[1] = sampleout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y - __m128 Lz = _mm_sub_ps(light_z, viewpos_z); - __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - viewpos_z = _mm_add_ps(viewpos_z, step_viewpos_z); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y - __m128 Lz = _mm_sub_ps(light_z, viewpos_z); - __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - else - { - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpz = args.dc_viewpos.Z + args.dc_viewpos_step.Z * thread->skipped_by_thread(dest_y); - float stepvpz = args.dc_viewpos_step.Z * thread->num_cores; - __m128 viewpos_z = _mm_setr_ps(vpz, vpz + stepvpz, 0.0f, 0.0f); - __m128 step_viewpos_z = _mm_set1_ps(stepvpz * 2.0f); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - frac -= one / 2; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - { - unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; - unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - frac += fracstep; - } - { - unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; - unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[1] = sampleout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y - __m128 Lz = _mm_sub_ps(light_z, viewpos_z); - __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - viewpos_z = _mm_add_ps(viewpos_z, step_viewpos_z); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; - unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y - __m128 Lz = _mm_sub_ps(light_z, viewpos_z); - __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - } - else - { - const uint32_t *source = (const uint32_t*)args.TexturePixels(); - const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); - bool is_nearest_filter = (source2 == nullptr); - if (is_nearest_filter) - { - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); - __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); - shade_fade = _mm_mullo_epi16(shade_fade, inv_light); - __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); - int desaturate = shade_constants.desaturate; - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpz = args.dc_viewpos.Z + args.dc_viewpos_step.Z * thread->skipped_by_thread(dest_y); - float stepvpz = args.dc_viewpos_step.Z * thread->num_cores; - __m128 viewpos_z = _mm_setr_ps(vpz, vpz + stepvpz, 0.0f, 0.0f); - __m128 step_viewpos_z = _mm_set1_ps(stepvpz * 2.0f); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - { - int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - frac += fracstep; - } - { - int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - ifgcolor[1] = sampleout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y - __m128 Lz = _mm_sub_ps(light_z, viewpos_z); - __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - viewpos_z = _mm_add_ps(viewpos_z, step_viewpos_z); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y - __m128 Lz = _mm_sub_ps(light_z, viewpos_z); - __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - else - { - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); - __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); - shade_fade = _mm_mullo_epi16(shade_fade, inv_light); - __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); - int desaturate = shade_constants.desaturate; - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpz = args.dc_viewpos.Z + args.dc_viewpos_step.Z * thread->skipped_by_thread(dest_y); - float stepvpz = args.dc_viewpos_step.Z * thread->num_cores; - __m128 viewpos_z = _mm_setr_ps(vpz, vpz + stepvpz, 0.0f, 0.0f); - __m128 step_viewpos_z = _mm_set1_ps(stepvpz * 2.0f); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - frac -= one / 2; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - { - unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; - unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - frac += fracstep; - } - { - unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; - unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[1] = sampleout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y - __m128 Lz = _mm_sub_ps(light_z, viewpos_z); - __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - viewpos_z = _mm_add_ps(viewpos_z, step_viewpos_z); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; - unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y - __m128 Lz = _mm_sub_ps(light_z, viewpos_z); - __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - } - } - - FString DebugInfo() override { return "DrawWallSubClamp32Command"; } - }; - - class DrawWallRevSubClamp32Command : public DrawerCommand - { - protected: - WallDrawerArgs args; - - public: - DrawWallRevSubClamp32Command(const WallDrawerArgs &drawerargs) : args(drawerargs) { } - - void Execute(DrawerThread *thread) override - { - auto shade_constants = args.ColormapConstants(); - if (shade_constants.simple_shade) - { - const uint32_t *source = (const uint32_t*)args.TexturePixels(); - const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); - bool is_nearest_filter = (source2 == nullptr); - if (is_nearest_filter) - { - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpz = args.dc_viewpos.Z + args.dc_viewpos_step.Z * thread->skipped_by_thread(dest_y); - float stepvpz = args.dc_viewpos_step.Z * thread->num_cores; - __m128 viewpos_z = _mm_setr_ps(vpz, vpz + stepvpz, 0.0f, 0.0f); - __m128 step_viewpos_z = _mm_set1_ps(stepvpz * 2.0f); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - { - int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - frac += fracstep; - } - { - int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - ifgcolor[1] = sampleout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y - __m128 Lz = _mm_sub_ps(light_z, viewpos_z); - __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); - __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - viewpos_z = _mm_add_ps(viewpos_z, step_viewpos_z); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y - __m128 Lz = _mm_sub_ps(light_z, viewpos_z); - __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); - __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - else - { - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpz = args.dc_viewpos.Z + args.dc_viewpos_step.Z * thread->skipped_by_thread(dest_y); - float stepvpz = args.dc_viewpos_step.Z * thread->num_cores; - __m128 viewpos_z = _mm_setr_ps(vpz, vpz + stepvpz, 0.0f, 0.0f); - __m128 step_viewpos_z = _mm_set1_ps(stepvpz * 2.0f); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - frac -= one / 2; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - { - unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; - unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - frac += fracstep; - } - { - unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; - unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[1] = sampleout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y - __m128 Lz = _mm_sub_ps(light_z, viewpos_z); - __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); - __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - viewpos_z = _mm_add_ps(viewpos_z, step_viewpos_z); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; - unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y - __m128 Lz = _mm_sub_ps(light_z, viewpos_z); - __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); - __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - } - else - { - const uint32_t *source = (const uint32_t*)args.TexturePixels(); - const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); - bool is_nearest_filter = (source2 == nullptr); - if (is_nearest_filter) - { - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); - __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); - shade_fade = _mm_mullo_epi16(shade_fade, inv_light); - __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); - int desaturate = shade_constants.desaturate; - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpz = args.dc_viewpos.Z + args.dc_viewpos_step.Z * thread->skipped_by_thread(dest_y); - float stepvpz = args.dc_viewpos_step.Z * thread->num_cores; - __m128 viewpos_z = _mm_setr_ps(vpz, vpz + stepvpz, 0.0f, 0.0f); - __m128 step_viewpos_z = _mm_set1_ps(stepvpz * 2.0f); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - { - int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - frac += fracstep; - } - { - int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - ifgcolor[1] = sampleout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y - __m128 Lz = _mm_sub_ps(light_z, viewpos_z); - __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); - __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - viewpos_z = _mm_add_ps(viewpos_z, step_viewpos_z); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y - __m128 Lz = _mm_sub_ps(light_z, viewpos_z); - __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); - __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - else - { - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); - __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); - shade_fade = _mm_mullo_epi16(shade_fade, inv_light); - __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); - int desaturate = shade_constants.desaturate; - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpz = args.dc_viewpos.Z + args.dc_viewpos_step.Z * thread->skipped_by_thread(dest_y); - float stepvpz = args.dc_viewpos_step.Z * thread->num_cores; - __m128 viewpos_z = _mm_setr_ps(vpz, vpz + stepvpz, 0.0f, 0.0f); - __m128 step_viewpos_z = _mm_set1_ps(stepvpz * 2.0f); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - frac -= one / 2; - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - { - unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; - unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - frac += fracstep; - } - { - unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; - unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[1] = sampleout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y - __m128 Lz = _mm_sub_ps(light_z, viewpos_z); - __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); - __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - viewpos_z = _mm_add_ps(viewpos_z, step_viewpos_z); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - // Sample - unsigned int ifgcolor[2]; - unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; - unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - __m128i material = fgcolor; - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y - __m128 Lz = _mm_sub_ps(light_z, viewpos_z); - __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - - // Blend - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); - __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - } - } - } - - FString DebugInfo() override { return "DrawWallRevSubClamp32Command"; } - }; - -} diff --git a/src/swrenderer/drawers/r_draw_wall32.php b/src/swrenderer/drawers/r_draw_wall32.php deleted file mode 100644 index bc003c9a5f..0000000000 --- a/src/swrenderer/drawers/r_draw_wall32.php +++ /dev/null @@ -1,370 +0,0 @@ -#!/usr/bin/php -/* -** Drawer commands for walls -** Copyright (c) 2016 Magnus Norddahl -** -** This software is provided 'as-is', without any express or implied -** warranty. In no event will the authors be held liable for any damages -** arising from the use of this software. -** -** Permission is granted to anyone to use this software for any purpose, -** including commercial applications, and to alter it and redistribute it -** freely, subject to the following restrictions: -** -** 1. The origin of this software must not be misrepresented; you must not -** claim that you wrote the original software. If you use this software -** in a product, an acknowledgment in the product documentation would be -** appreciated but is not required. -** 2. Altered source versions must be plainly marked as such, and must not be -** misrepresented as being the original software. -** 3. This notice may not be removed or altered from any source distribution. -** -*/ - -/* - Warning: this C++ source file has been auto-generated. Please modify the original php script that generated it. -*/ - -#pragma once - -#include "swrenderer/drawers/r_draw_rgba.h" -#include "swrenderer/viewport/r_walldrawer.h" - -namespace swrenderer -{ - - class : public DrawerCommand - { - protected: - WallDrawerArgs args; - - public: - (const WallDrawerArgs &drawerargs) : args(drawerargs) { } - - void Execute(DrawerThread *thread) override - { - auto shade_constants = args.ColormapConstants(); - if (shade_constants.simple_shade) - { - - } - else - { - - } - } - - FString DebugInfo() override { return ""; } - }; - - - const uint32_t *source = (const uint32_t*)args.TexturePixels(); - const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); - bool is_nearest_filter = (source2 == nullptr); - if (is_nearest_filter) - { - - } - else - { - - } - - int textureheight = args.TextureHeight(); - uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; - - // Shade constants - int light = 256 - (args.Light() >> (FRACBITS - 8)); - __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); - __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - - __m128i inv_desaturate = _mm_setr_epi16(256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate, 256 - shade_constants.desaturate); - __m128i shade_fade = _mm_set_epi16(shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, shade_constants.fade_alpha, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); - shade_fade = _mm_mullo_epi16(shade_fade, inv_light); - __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); - int desaturate = shade_constants.desaturate; - - - int count = args.Count(); - int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); - uint32_t fracstep = args.TextureVStep(); - uint32_t frac = args.TextureVPos(); - uint32_t texturefracx = args.TextureUPos(); - uint32_t *dest = (uint32_t*)args.Dest(); - int dest_y = args.DestY(); - - auto lights = args.dc_lights; - auto num_lights = args.dc_num_lights; - float vpz = args.dc_viewpos.Z + args.dc_viewpos_step.Z * thread->skipped_by_thread(dest_y); - float stepvpz = args.dc_viewpos_step.Z * thread->num_cores; - __m128 viewpos_z = _mm_setr_ps(vpz, vpz + stepvpz, 0.0f, 0.0f); - __m128 step_viewpos_z = _mm_set1_ps(stepvpz * 2.0f); - - count = thread->count_for_thread(dest_y, count); - if (count <= 0) return; - frac += thread->skipped_by_thread(dest_y) * fracstep; - dest = thread->dest_for_thread(dest_y, pitch, dest); - fracstep *= thread->num_cores; - pitch *= thread->num_cores; - - frac -= one / 2; - - uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); - uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); - - int ssecount = count / 2; - for (int index = 0; index < ssecount; index++) - { - int offset = index * pitch * 2; - uint32_t desttmp[2]; - desttmp[0] = dest[offset]; - desttmp[1] = dest[offset + pitch]; - - __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); - - - // Sample - unsigned int ifgcolor[2]; - { - - ifgcolor[0] = sampleout; - frac += fracstep; - } - { - - ifgcolor[1] = sampleout; - frac += fracstep; - } - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - - - // Blend - - - _mm_storel_epi64((__m128i*)desttmp, outcolor); - dest[offset] = desttmp[0]; - dest[offset + pitch] = desttmp[1]; - viewpos_z = _mm_add_ps(viewpos_z, step_viewpos_z); - } - - if (ssecount * 2 != count) - { - int index = ssecount * 2; - int offset = index * pitch; - - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); - - - // Sample - unsigned int ifgcolor[2]; - - ifgcolor[0] = sampleout; - ifgcolor[1] = 0; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - - // Shade - - - // Blend - - - dest[offset] = _mm_cvtsi128_si32(outcolor); - } - - int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int sampleout = source[sample_index]; - - unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; - unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; - unsigned int y0 = frac_y0 >> FRACBITS; - unsigned int y1 = frac_y1 >> FRACBITS; - - unsigned int p00 = source[y0]; - unsigned int p01 = source[y1]; - unsigned int p10 = source2[y0]; - unsigned int p11 = source2[y1]; - - unsigned int inv_b = texturefracx; - unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; - unsigned int a = 16 - inv_a; - unsigned int b = 16 - inv_b; - - unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - - unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - - __m128i material = fgcolor; - - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); - - int blue0 = BPART(ifgcolor[0]); - int green0 = GPART(ifgcolor[0]); - int red0 = RPART(ifgcolor[0]); - int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - - int blue1 = BPART(ifgcolor[1]); - int green1 = GPART(ifgcolor[1]); - int red1 = RPART(ifgcolor[1]); - int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; - - __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); - - fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); - fgcolor = _mm_mullo_epi16(fgcolor, mlight); - fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); - - __m128i outcolor = fgcolor; - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - fgcolor = _mm_mullo_epi16(fgcolor, alpha); - bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); - __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - uint32_t alpha0 = APART(ifgcolor[0]); - uint32_t alpha1 = APART(ifgcolor[1]); - alpha0 += alpha0 >> 7; // 255->256 - alpha1 += alpha1 >> 7; // 255->256 - uint32_t inv_alpha0 = 256 - alpha0; - uint32_t inv_alpha1 = 256 - alpha1; - - uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; - uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; - uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; - uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; - - __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); - __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); - - fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); - bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); - - __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); - __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - - - __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - - __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); - __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - - __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); - __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - - - out_lo = _mm_srai_epi32(out_lo, 8); - out_hi = _mm_srai_epi32(out_hi, 8); - __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - __m128i lit = _mm_setzero_si128(); - - for (int i = 0; i != num_lights; i++) - { - __m128 light_x = _mm_set1_ps(lights[i].x); - __m128 light_y = _mm_set1_ps(lights[i].y); - __m128 light_z = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128 m256 = _mm_set1_ps(256.0f); - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y - __m128 Lz = _mm_sub_ps(light_z, viewpos_z); - __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = dot(N,L) * attenuation - __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); - - __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); - attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); - - __m128i light_color = _mm_cvtsi32_si128(lights[i].color); - light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); - light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); - - lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); - } - - fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); - fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); - -}