diff --git a/src/d_dehacked.cpp b/src/d_dehacked.cpp index dcb42a2bb..1065836c1 100644 --- a/src/d_dehacked.cpp +++ b/src/d_dehacked.cpp @@ -2844,7 +2844,7 @@ static bool LoadDehSupp () sc.MustGetStringName(","); sc.MustGetNumber(); - if (s.State == NULL || !actortype->OwnsState(s.State + sc.Number)) + if (s.State == NULL || sc.Number < 1 || !actortype->OwnsState(s.State + sc.Number - 1)) { sc.ScriptError("Invalid state range in '%s'", type->TypeName.GetChars()); } diff --git a/src/events.cpp b/src/events.cpp index b07211777..2649bf162 100755 --- a/src/events.cpp +++ b/src/events.cpp @@ -582,9 +582,9 @@ DEFINE_ACTION_FUNCTION(DEventHandler, SendNetworkEvent) { PARAM_PROLOGUE; PARAM_STRING(name); - PARAM_INT(arg1); - PARAM_INT(arg2); - PARAM_INT(arg3); + PARAM_INT_DEF(arg1); + PARAM_INT_DEF(arg2); + PARAM_INT_DEF(arg3); // ACTION_RETURN_BOOL(E_SendNetworkEvent(name, arg1, arg2, arg3, false)); diff --git a/src/intermission/intermission.cpp b/src/intermission/intermission.cpp index 650f03f87..bc052b5dd 100644 --- a/src/intermission/intermission.cpp +++ b/src/intermission/intermission.cpp @@ -732,10 +732,6 @@ DIntermissionController::DIntermissionController(FIntermissionDescriptor *Desc, mScreen = NULL; mFirst = true; mGameState = state; - - // If the intermission finishes straight away then cancel the wipe. - if(!NextPage()) - wipegamestate = GS_FINALE; } bool DIntermissionController::NextPage () @@ -898,6 +894,13 @@ void F_StartIntermission(FIntermissionDescriptor *desc, bool deleteme, uint8_t s viewactive = false; automapactive = false; DIntermissionController::CurrentIntermission = Create(desc, deleteme, state); + + // If the intermission finishes straight away then cancel the wipe. + if (!DIntermissionController::CurrentIntermission->NextPage()) + { + wipegamestate = GS_FINALE; + } + GC::WriteBarrier(DIntermissionController::CurrentIntermission); } diff --git a/src/intermission/intermission.h b/src/intermission/intermission.h index 565e3a164..9111ebe99 100644 --- a/src/intermission/intermission.h +++ b/src/intermission/intermission.h @@ -304,6 +304,7 @@ public: void OnDestroy() override; friend void F_AdvanceIntermission(); + friend void F_StartIntermission(FIntermissionDescriptor *, bool, uint8_t); }; diff --git a/src/p_mobj.cpp b/src/p_mobj.cpp index ac07b54f1..912d30066 100644 --- a/src/p_mobj.cpp +++ b/src/p_mobj.cpp @@ -436,6 +436,7 @@ void AActor::Serialize(FSerializer &arc) A("stamina", stamina) ("goal", goal) A("waterlevel", waterlevel) + A("boomwaterlevel", boomwaterlevel) A("minmissilechance", MinMissileChance) A("spawnflags", SpawnFlags) ("inventory", Inventory) diff --git a/src/polyrenderer/drawers/poly_drawer32.h b/src/polyrenderer/drawers/poly_drawer32.h new file mode 100644 index 000000000..1d83eb1ba --- /dev/null +++ b/src/polyrenderer/drawers/poly_drawer32.h @@ -0,0 +1,782 @@ +/* +** Projected triangle drawer +** Copyright (c) 2016 Magnus Norddahl +** +** This software is provided 'as-is', without any express or implied +** warranty. In no event will the authors be held liable for any damages +** arising from the use of this software. +** +** Permission is granted to anyone to use this software for any purpose, +** including commercial applications, and to alter it and redistribute it +** freely, subject to the following restrictions: +** +** 1. The origin of this software must not be misrepresented; you must not +** claim that you wrote the original software. If you use this software +** in a product, an acknowledgment in the product documentation would be +** appreciated but is not required. +** 2. Altered source versions must be plainly marked as such, and must not be +** misrepresented as being the original software. +** 3. This notice may not be removed or altered from any source distribution. +** +*/ + +#pragma once + +#include "screen_triangle.h" + +namespace TriScreenDrawerModes +{ + namespace + { + struct BgraColor + { + uint32_t b, g, r, a; + BgraColor() { } + BgraColor(uint32_t c) : b(BPART(c)), g(GPART(c)), r(RPART(c)), a(APART(c)) { } + BgraColor &operator=(uint32_t c) { b = BPART(c); g = GPART(c); r = RPART(c); a = APART(c); return *this; } + operator uint32_t() const { return MAKEARGB(a, r, g, b); } + }; + } + + template + FORCEINLINE unsigned int Sample32(int32_t u, int32_t v, const uint32_t *texPixels, int texWidth, int texHeight, uint32_t oneU, uint32_t oneV, uint32_t color, const uint32_t *translation) + { + uint32_t texel; + if (SamplerT::Mode == (int)Samplers::Shaded || SamplerT::Mode == (int)Samplers::Stencil || SamplerT::Mode == (int)Samplers::Fill || SamplerT::Mode == (int)Samplers::Fuzz) + { + return color; + } + else if (SamplerT::Mode == (int)Samplers::Translated) + { + const uint8_t *texpal = (const uint8_t *)texPixels; + uint32_t texelX = ((((uint32_t)u << 8) >> 16) * texWidth) >> 16; + uint32_t texelY = ((((uint32_t)v << 8) >> 16) * texHeight) >> 16; + return translation[texpal[texelX * texHeight + texelY]]; + } + else if (FilterModeT::Mode == (int)FilterModes::Nearest) + { + uint32_t texelX = ((((uint32_t)u << 8) >> 16) * texWidth) >> 16; + uint32_t texelY = ((((uint32_t)v << 8) >> 16) * texHeight) >> 16; + texel = texPixels[texelX * texHeight + texelY]; + } + else + { + u -= oneU >> 1; + v -= oneV >> 1; + + unsigned int frac_x0 = (((uint32_t)u << 8) >> FRACBITS) * texWidth; + unsigned int frac_x1 = ((((uint32_t)u << 8) + oneU) >> FRACBITS) * texWidth; + unsigned int frac_y0 = (((uint32_t)v << 8) >> FRACBITS) * texHeight; + unsigned int frac_y1 = ((((uint32_t)v << 8) + oneV) >> FRACBITS) * texHeight; + unsigned int x0 = frac_x0 >> FRACBITS; + unsigned int x1 = frac_x1 >> FRACBITS; + unsigned int y0 = frac_y0 >> FRACBITS; + unsigned int y1 = frac_y1 >> FRACBITS; + + unsigned int p00 = texPixels[x0 * texHeight + y0]; + unsigned int p01 = texPixels[x0 * texHeight + y1]; + unsigned int p10 = texPixels[x1 * texHeight + y0]; + unsigned int p11 = texPixels[x1 * texHeight + y1]; + + unsigned int inv_a = (frac_x1 >> (FRACBITS - 4)) & 15; + unsigned int inv_b = (frac_y1 >> (FRACBITS - 4)) & 15; + unsigned int a = 16 - inv_a; + unsigned int b = 16 - inv_b; + + unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + texel = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + } + + if (SamplerT::Mode == (int)Samplers::Skycap) + { + int start_fade = 2; // How fast it should fade out + + int alpha_top = clamp(v >> (16 - start_fade), 0, 256); + int alpha_bottom = clamp(((2 << 24) - v) >> (16 - start_fade), 0, 256); + int a = MIN(alpha_top, alpha_bottom); + int inv_a = 256 - a; + + uint32_t r = RPART(texel); + uint32_t g = GPART(texel); + uint32_t b = BPART(texel); + uint32_t fg_a = APART(texel); + uint32_t bg_red = RPART(color); + uint32_t bg_green = GPART(color); + uint32_t bg_blue = BPART(color); + r = (r * a + bg_red * inv_a + 127) >> 8; + g = (g * a + bg_green * inv_a + 127) >> 8; + b = (b * a + bg_blue * inv_a + 127) >> 8; + return MAKEARGB(fg_a, r, g, b); + } + else + { + return texel; + } + } + + template + FORCEINLINE unsigned int SampleShade32(int32_t u, int32_t v, const uint32_t *texPixels, int texWidth, int texHeight, int &fuzzpos) + { + if (SamplerT::Mode == (int)Samplers::Shaded) + { + const uint8_t *texpal = (const uint8_t *)texPixels; + uint32_t texelX = ((((uint32_t)u << 8) >> 16) * texWidth) >> 16; + uint32_t texelY = ((((uint32_t)v << 8) >> 16) * texHeight) >> 16; + unsigned int sampleshadeout = texpal[texelX * texHeight + texelY]; + sampleshadeout += sampleshadeout >> 7; // 255 -> 256 + return sampleshadeout; + } + else if (SamplerT::Mode == (int)Samplers::Stencil) + { + uint32_t texelX = ((((uint32_t)u << 8) >> 16) * texWidth) >> 16; + uint32_t texelY = ((((uint32_t)v << 8) >> 16) * texHeight) >> 16; + unsigned int sampleshadeout = APART(texPixels[texelX * texHeight + texelY]); + sampleshadeout += sampleshadeout >> 7; // 255 -> 256 + return sampleshadeout; + } + else if (SamplerT::Mode == (int)Samplers::Fuzz) + { + uint32_t texelX = ((((uint32_t)u << 8) >> 16) * texWidth) >> 16; + uint32_t texelY = ((((uint32_t)v << 8) >> 16) * texHeight) >> 16; + unsigned int sampleshadeout = APART(texPixels[texelX * texHeight + texelY]); + sampleshadeout += sampleshadeout >> 7; // 255 -> 256 + sampleshadeout = (sampleshadeout * fuzzcolormap[fuzzpos++]) >> 5; + if (fuzzpos >= FUZZTABLE) fuzzpos = 0; + return sampleshadeout; + } + else + { + return 0; + } + } + + template + FORCEINLINE BgraColor Shade32(BgraColor fgcolor, BgraColor mlight, uint32_t desaturate, uint32_t inv_desaturate, BgraColor shade_fade, BgraColor shade_light) + { + if (ShadeModeT::Mode == (int)ShadeMode::Simple) + { + fgcolor.r = (fgcolor.r * mlight.r) >> 8; + fgcolor.g = (fgcolor.g * mlight.g) >> 8; + fgcolor.b = (fgcolor.b * mlight.b) >> 8; + } + else if (ShadeModeT::Mode == (int)ShadeMode::Advanced) + { + uint32_t intensity = ((fgcolor.r * 77 + fgcolor.g * 143 + fgcolor.b * 37) >> 8) * desaturate; + fgcolor.r = (((shade_fade.r + ((fgcolor.r * inv_desaturate + intensity) >> 8) * mlight.r) >> 8) * shade_light.r) >> 8; + fgcolor.g = (((shade_fade.g + ((fgcolor.g * inv_desaturate + intensity) >> 8) * mlight.g) >> 8) * shade_light.g) >> 8; + fgcolor.b = (((shade_fade.b + ((fgcolor.b * inv_desaturate + intensity) >> 8) * mlight.b) >> 8) * shade_light.b) >> 8; + } + return fgcolor; + } + + template + FORCEINLINE BgraColor Blend32(BgraColor fgcolor, BgraColor bgcolor, uint32_t ifgcolor, uint32_t ifgshade, uint32_t srcalpha, uint32_t destalpha) + { + if (BlendT::Mode == (int)BlendModes::Opaque) + { + return fgcolor; + } + else if (BlendT::Mode == (int)BlendModes::Masked) + { + return (ifgcolor == 0) ? bgcolor : fgcolor; + } + else if (BlendT::Mode == (int)BlendModes::AddSrcColorOneMinusSrcColor) + { + uint32_t srcred = fgcolor.r + (fgcolor.r >> 7); + uint32_t srcgreen = fgcolor.g + (fgcolor.g >> 7); + uint32_t srcblue = fgcolor.b + (fgcolor.b >> 7); + uint32_t inv_srcred = 256 - srcred; + uint32_t inv_srcgreen = 256 - srcgreen; + uint32_t inv_srcblue = 256 - srcblue; + + BgraColor outcolor; + outcolor.r = (fgcolor.r * srcred + bgcolor.r * inv_srcred) >> 8; + outcolor.g = (fgcolor.g * srcgreen + bgcolor.g * inv_srcgreen) >> 8; + outcolor.b = (fgcolor.b * srcblue + bgcolor.b * inv_srcblue) >> 8; + outcolor.a = 255; + return outcolor; + } + else if (BlendT::Mode == (int)BlendModes::Shaded) + { + uint32_t alpha = ifgshade; + uint32_t inv_alpha = 256 - alpha; + + BgraColor outcolor; + outcolor.r = (fgcolor.r * alpha + bgcolor.r * inv_alpha) >> 8; + outcolor.g = (fgcolor.g * alpha + bgcolor.g * inv_alpha) >> 8; + outcolor.b = (fgcolor.b * alpha + bgcolor.b * inv_alpha) >> 8; + outcolor.a = 255; + return outcolor; + } + else if (BlendT::Mode == (int)BlendModes::AddClampShaded) + { + uint32_t alpha = ifgshade; + BgraColor outcolor; + outcolor.r = ((fgcolor.r * alpha) >> 8) + bgcolor.r; + outcolor.g = ((fgcolor.g * alpha) >> 8) + bgcolor.g; + outcolor.b = ((fgcolor.b * alpha) >> 8) + bgcolor.b; + outcolor.a = 255; + return outcolor; + } + else + { + uint32_t alpha = APART(ifgcolor); + alpha += alpha >> 7; // 255->256 + uint32_t inv_alpha = 256 - alpha; + + uint32_t bgalpha = (destalpha * alpha + (inv_alpha << 8) + 128) >> 8; + uint32_t fgalpha = (srcalpha * alpha + 128) >> 8; + + fgcolor.r *= fgalpha; + fgcolor.g *= fgalpha; + fgcolor.b *= fgalpha; + bgcolor.r *= bgalpha; + bgcolor.g *= bgalpha; + bgcolor.b *= bgalpha; + + BgraColor outcolor; + if (BlendT::Mode == (int)BlendModes::AddClamp) + { + outcolor.r = MIN((fgcolor.r + bgcolor.r) >> 8, 255); + outcolor.g = MIN((fgcolor.g + bgcolor.g) >> 8, 255); + outcolor.b = MIN((fgcolor.b + bgcolor.b) >> 8, 255); + } + else if (BlendT::Mode == (int)BlendModes::SubClamp) + { + outcolor.r = MAX(int32_t(fgcolor.r - bgcolor.r) >> 8, 0); + outcolor.g = MAX(int32_t(fgcolor.g - bgcolor.g) >> 8, 0); + outcolor.b = MAX(int32_t(fgcolor.b - bgcolor.b) >> 8, 0); + } + else if (BlendT::Mode == (int)BlendModes::RevSubClamp) + { + outcolor.r = MAX(int32_t(bgcolor.r - fgcolor.r) >> 8, 0); + outcolor.g = MAX(int32_t(bgcolor.g - fgcolor.g) >> 8, 0); + outcolor.b = MAX(int32_t(bgcolor.b - fgcolor.b) >> 8, 0); + } + outcolor.a = 255; + return outcolor; + } + } +} + +template +class TriScreenDrawer32 +{ +public: + static void Execute(int x, int y, uint32_t mask0, uint32_t mask1, const TriDrawTriangleArgs *args) + { + using namespace TriScreenDrawerModes; + + bool is_simple_shade = args->uniforms->SimpleShade(); + + if (SamplerT::Mode == (int)Samplers::Texture) + { + bool is_nearest_filter = args->uniforms->NearestFilter(); + + if (is_simple_shade) + { + if (is_nearest_filter) + DrawBlock(x, y, mask0, mask1, args); + else + DrawBlock(x, y, mask0, mask1, args); + } + else + { + if (is_nearest_filter) + DrawBlock(x, y, mask0, mask1, args); + else + DrawBlock(x, y, mask0, mask1, args); + } + } + else if (SamplerT::Mode == (int)Samplers::Fuzz) + { + DrawBlock(x, y, mask0, mask1, args); + } + else // no linear filtering for translated, shaded, stencil, fill or skycap + { + if (is_simple_shade) + { + DrawBlock(x, y, mask0, mask1, args); + } + else + { + DrawBlock(x, y, mask0, mask1, args); + } + } + } + +private: + template + FORCEINLINE static void DrawBlock(int destX, int destY, uint32_t mask0, uint32_t mask1, const TriDrawTriangleArgs *args) + { + using namespace TriScreenDrawerModes; + + bool is_fixed_light = args->uniforms->FixedLight(); + uint32_t lightmask = is_fixed_light ? 0 : 0xffffffff; + uint32_t srcalpha = args->uniforms->SrcAlpha(); + uint32_t destalpha = args->uniforms->DestAlpha(); + + int fuzzpos = (ScreenTriangle::FuzzStart + destX * 123 + destY) % FUZZTABLE; + + // Calculate gradients + const TriVertex &v1 = *args->v1; + ScreenTriangleStepVariables gradientX = args->gradientX; + ScreenTriangleStepVariables gradientY = args->gradientY; + ScreenTriangleStepVariables blockPosY; + blockPosY.W = v1.w + gradientX.W * (destX - v1.x) + gradientY.W * (destY - v1.y); + blockPosY.U = v1.u * v1.w + gradientX.U * (destX - v1.x) + gradientY.U * (destY - v1.y); + blockPosY.V = v1.v * v1.w + gradientX.V * (destX - v1.x) + gradientY.V * (destY - v1.y); + gradientX.W *= 8.0f; + gradientX.U *= 8.0f; + gradientX.V *= 8.0f; + + // Output + uint32_t * RESTRICT destOrg = (uint32_t*)args->dest; + int pitch = args->pitch; + uint32_t *dest = destOrg + destX + destY * pitch; + + // Light + uint32_t light = args->uniforms->Light(); + float shade = 2.0f - (light + 12.0f) / 128.0f; + float globVis = args->uniforms->GlobVis() * (1.0f / 32.0f); + light += (light >> 7); // 255 -> 256 + + // Sampling stuff + uint32_t color = args->uniforms->Color(); + const uint32_t * RESTRICT translation = (const uint32_t *)args->uniforms->Translation(); + const uint32_t * RESTRICT texPixels = (const uint32_t *)args->uniforms->TexturePixels(); + uint32_t texWidth = args->uniforms->TextureWidth(); + uint32_t texHeight = args->uniforms->TextureHeight(); + uint32_t oneU, oneV; + if (SamplerT::Mode != (int)Samplers::Fill) + { + oneU = ((0x800000 + texWidth - 1) / texWidth) * 2 + 1; + oneV = ((0x800000 + texHeight - 1) / texHeight) * 2 + 1; + } + else + { + oneU = 0; + oneV = 0; + } + + // Shade constants + int inv_desaturate; + BgraColor shade_fade, shade_light; + int desaturate; + if (ShadeModeT::Mode == (int)ShadeMode::Advanced) + { + shade_fade.r = args->uniforms->ShadeFadeRed(); + shade_fade.g = args->uniforms->ShadeFadeGreen(); + shade_fade.b = args->uniforms->ShadeFadeBlue(); + shade_light.r = args->uniforms->ShadeLightRed(); + shade_light.g = args->uniforms->ShadeLightGreen(); + shade_light.b = args->uniforms->ShadeLightBlue(); + desaturate = args->uniforms->ShadeDesaturate(); + inv_desaturate = 256 - desaturate; + } + else + { + inv_desaturate = 0; + shade_fade.r = 0; + shade_fade.g = 0; + shade_fade.b = 0; + shade_light.r = 0; + shade_light.g = 0; + shade_light.b = 0; + desaturate = 0; + } + + if (mask0 == 0xffffffff && mask1 == 0xffffffff) + { + for (int y = 0; y < 8; y++) + { + float rcpW = 0x01000000 / blockPosY.W; + int32_t posU = (int32_t)(blockPosY.U * rcpW); + int32_t posV = (int32_t)(blockPosY.V * rcpW); + + fixed_t lightpos = FRACUNIT - (int)(clamp(shade - MIN(24.0f / 32.0f, globVis * blockPosY.W), 0.0f, 31.0f / 32.0f) * (float)FRACUNIT); + lightpos = (lightpos & lightmask) | ((light << 8) & ~lightmask); + + ScreenTriangleStepVariables blockPosX = blockPosY; + blockPosX.W += gradientX.W; + blockPosX.U += gradientX.U; + blockPosX.V += gradientX.V; + + rcpW = 0x01000000 / blockPosX.W; + int32_t nextU = (int32_t)(blockPosX.U * rcpW); + int32_t nextV = (int32_t)(blockPosX.V * rcpW); + int32_t stepU = (nextU - posU) / 8; + int32_t stepV = (nextV - posV) / 8; + + fixed_t lightnext = FRACUNIT - (fixed_t)(clamp(shade - MIN(24.0f / 32.0f, globVis * blockPosX.W), 0.0f, 31.0f / 32.0f) * (float)FRACUNIT); + fixed_t lightstep = (lightnext - lightpos) / 8; + lightstep = lightstep & lightmask; + + for (int ix = 0; ix < 8; ix++) + { + // Load bgcolor + BgraColor bgcolor; + if (BlendT::Mode != (int)BlendModes::Opaque) + bgcolor = dest[ix]; + else + bgcolor = 0; + + // Sample fgcolor + unsigned int ifgcolor = Sample32(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); + unsigned int ifgshade = SampleShade32(posU, posV, texPixels, texWidth, texHeight, fuzzpos); + posU += stepU; + posV += stepV; + + // Setup light + int lightpos0 = lightpos >> 8; + lightpos += lightstep; + BgraColor mlight; + mlight.r = lightpos0; + mlight.g = lightpos0; + mlight.b = lightpos0; + + BgraColor shade_fade_lit; + if (ShadeModeT::Mode == (int)ShadeMode::Advanced) + { + uint32_t inv_light = 256 - lightpos0; + shade_fade_lit.r = shade_fade.r * inv_light; + shade_fade_lit.g = shade_fade.g * inv_light; + shade_fade_lit.b = shade_fade.b * inv_light; + } + else + { + shade_fade_lit.r = 0; + shade_fade_lit.g = 0; + shade_fade_lit.b = 0; + } + + // Shade and blend + BgraColor fgcolor = Shade32(ifgcolor, mlight, desaturate, inv_desaturate, shade_fade_lit, shade_light); + BgraColor outcolor = Blend32(fgcolor, bgcolor, ifgcolor, ifgshade, srcalpha, destalpha); + + // Store result + dest[ix] = outcolor; + } + + blockPosY.W += gradientY.W; + blockPosY.U += gradientY.U; + blockPosY.V += gradientY.V; + + dest += pitch; + } + } + else + { + // mask0 loop: + for (int y = 0; y < 4; y++) + { + float rcpW = 0x01000000 / blockPosY.W; + int32_t posU = (int32_t)(blockPosY.U * rcpW); + int32_t posV = (int32_t)(blockPosY.V * rcpW); + + fixed_t lightpos = FRACUNIT - (fixed_t)(clamp(shade - MIN(24.0f / 32.0f, globVis * blockPosY.W), 0.0f, 31.0f / 32.0f) * (float)FRACUNIT); + lightpos = (lightpos & lightmask) | ((light << 8) & ~lightmask); + + ScreenTriangleStepVariables blockPosX = blockPosY; + blockPosX.W += gradientX.W; + blockPosX.U += gradientX.U; + blockPosX.V += gradientX.V; + + rcpW = 0x01000000 / blockPosX.W; + int32_t nextU = (int32_t)(blockPosX.U * rcpW); + int32_t nextV = (int32_t)(blockPosX.V * rcpW); + int32_t stepU = (nextU - posU) / 8; + int32_t stepV = (nextV - posV) / 8; + + fixed_t lightnext = FRACUNIT - (fixed_t)(clamp(shade - MIN(24.0f / 32.0f, globVis * blockPosX.W), 0.0f, 31.0f / 32.0f) * (float)FRACUNIT); + fixed_t lightstep = (lightnext - lightpos) / 8; + lightstep = lightstep & lightmask; + + for (int x = 0; x < 8; x++) + { + // Load bgcolor + BgraColor bgcolor; + if (BlendT::Mode != (int)BlendModes::Opaque) + { + if (mask0 & (1 << 31)) bgcolor = dest[x]; + } + else + bgcolor = 0; + + // Sample fgcolor + unsigned int ifgcolor = Sample32(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); + unsigned int ifgshade = SampleShade32(posU, posV, texPixels, texWidth, texHeight, fuzzpos); + posU += stepU; + posV += stepV; + + // Setup light + int lightpos0 = lightpos >> 8; + lightpos += lightstep; + BgraColor mlight; + mlight.r = lightpos0; + mlight.g = lightpos0; + mlight.b = lightpos0; + + BgraColor shade_fade_lit; + if (ShadeModeT::Mode == (int)ShadeMode::Advanced) + { + uint32_t inv_light = 256 - lightpos0; + shade_fade_lit.r = shade_fade.r * inv_light; + shade_fade_lit.g = shade_fade.g * inv_light; + shade_fade_lit.b = shade_fade.b * inv_light; + } + else + { + shade_fade_lit.r = 0; + shade_fade_lit.g = 0; + shade_fade_lit.b = 0; + } + + // Shade and blend + BgraColor fgcolor = Shade32(ifgcolor, mlight, desaturate, inv_desaturate, shade_fade_lit, shade_light); + BgraColor outcolor = Blend32(fgcolor, bgcolor, ifgcolor, ifgshade, srcalpha, destalpha); + + // Store result + if (mask0 & (1 << 31)) dest[x] = outcolor; + + mask0 <<= 1; + } + + blockPosY.W += gradientY.W; + blockPosY.U += gradientY.U; + blockPosY.V += gradientY.V; + + dest += pitch; + } + + // mask1 loop: + for (int y = 0; y < 4; y++) + { + float rcpW = 0x01000000 / blockPosY.W; + int32_t posU = (int32_t)(blockPosY.U * rcpW); + int32_t posV = (int32_t)(blockPosY.V * rcpW); + + fixed_t lightpos = FRACUNIT - (fixed_t)(clamp(shade - MIN(24.0f / 32.0f, globVis * blockPosY.W), 0.0f, 31.0f / 32.0f) * (float)FRACUNIT); + lightpos = (lightpos & lightmask) | ((light << 8) & ~lightmask); + + ScreenTriangleStepVariables blockPosX = blockPosY; + blockPosX.W += gradientX.W; + blockPosX.U += gradientX.U; + blockPosX.V += gradientX.V; + + rcpW = 0x01000000 / blockPosX.W; + int32_t nextU = (int32_t)(blockPosX.U * rcpW); + int32_t nextV = (int32_t)(blockPosX.V * rcpW); + int32_t stepU = (nextU - posU) / 8; + int32_t stepV = (nextV - posV) / 8; + + fixed_t lightnext = FRACUNIT - (fixed_t)(clamp(shade - MIN(24.0f / 32.0f, globVis * blockPosX.W), 0.0f, 31.0f / 32.0f) * (float)FRACUNIT); + fixed_t lightstep = (lightnext - lightpos) / 8; + lightstep = lightstep & lightmask; + + for (int x = 0; x < 8; x++) + { + // Load bgcolor + BgraColor bgcolor; + if (BlendT::Mode != (int)BlendModes::Opaque) + { + if (mask1 & (1 << 31)) bgcolor = dest[x]; + } + else + bgcolor = 0; + + // Sample fgcolor + unsigned int ifgcolor = Sample32(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); + unsigned int ifgshade = SampleShade32(posU, posV, texPixels, texWidth, texHeight, fuzzpos); + posU += stepU; + posV += stepV; + + // Setup light + int lightpos0 = lightpos >> 8; + lightpos += lightstep; + BgraColor mlight; + mlight.r = lightpos0; + mlight.g = lightpos0; + mlight.b = lightpos0; + + BgraColor shade_fade_lit; + if (ShadeModeT::Mode == (int)ShadeMode::Advanced) + { + uint32_t inv_light = 256 - lightpos0; + shade_fade_lit.r = shade_fade.r * inv_light; + shade_fade_lit.g = shade_fade.g * inv_light; + shade_fade_lit.b = shade_fade.b * inv_light; + } + else + { + shade_fade_lit.r = 0; + shade_fade_lit.g = 0; + shade_fade_lit.b = 0; + } + + // Shade and blend + BgraColor fgcolor = Shade32(ifgcolor, mlight, desaturate, inv_desaturate, shade_fade_lit, shade_light); + BgraColor outcolor = Blend32(fgcolor, bgcolor, ifgcolor, ifgshade, srcalpha, destalpha); + + // Store result + if (mask1 & (1 << 31)) dest[x] = outcolor; + + mask1 <<= 1; + } + + blockPosY.W += gradientY.W; + blockPosY.U += gradientY.U; + blockPosY.V += gradientY.V; + + dest += pitch; + } + } + } +}; + +template +class RectScreenDrawer32 +{ +public: + static void Execute(const void *destOrg, int destWidth, int destHeight, int destPitch, const RectDrawArgs *args, WorkerThreadData *thread) + { + using namespace TriScreenDrawerModes; + + if (args->SimpleShade()) + { + Loop(destOrg, destWidth, destHeight, destPitch, args, thread); + } + else + { + Loop(destOrg, destWidth, destHeight, destPitch, args, thread); + } + } + +private: + template + FORCEINLINE static void Loop(const void *destOrg, int destWidth, int destHeight, int destPitch, const RectDrawArgs *args, WorkerThreadData *thread) + { + using namespace TriScreenDrawerModes; + + int x0 = clamp((int)(args->X0() + 0.5f), 0, destWidth); + int x1 = clamp((int)(args->X1() + 0.5f), 0, destWidth); + int y0 = clamp((int)(args->Y0() + 0.5f), 0, destHeight); + int y1 = clamp((int)(args->Y1() + 0.5f), 0, destHeight); + + if (x1 <= x0 || y1 <= y0) + return; + + uint32_t srcalpha = args->SrcAlpha(); + uint32_t destalpha = args->DestAlpha(); + + // Setup step variables + float fstepU = (args->U1() - args->U0()) / (args->X1() - args->X0()); + float fstepV = (args->V1() - args->V0()) / (args->Y1() - args->Y0()); + uint32_t startU = (int32_t)((args->U0() + (x0 + 0.5f - args->X0()) * fstepU) * 0x1000000); + uint32_t startV = (int32_t)((args->V0() + (y0 + 0.5f - args->Y0()) * fstepV) * 0x1000000); + uint32_t stepU = (int32_t)(fstepU * 0x1000000); + uint32_t stepV = (int32_t)(fstepV * 0x1000000); + + // Sampling stuff + uint32_t color = args->Color(); + const uint32_t * RESTRICT translation = (const uint32_t *)args->Translation(); + const uint32_t * RESTRICT texPixels = (const uint32_t *)args->TexturePixels(); + uint32_t texWidth = args->TextureWidth(); + uint32_t texHeight = args->TextureHeight(); + uint32_t oneU, oneV; + if (SamplerT::Mode != (int)Samplers::Fill) + { + oneU = ((0x800000 + texWidth - 1) / texWidth) * 2 + 1; + oneV = ((0x800000 + texHeight - 1) / texHeight) * 2 + 1; + } + else + { + oneU = 0; + oneV = 0; + } + + // Setup light + uint32_t lightpos = args->Light(); + lightpos += lightpos >> 7; // 255 -> 256 + BgraColor mlight; + + // Shade constants + int inv_desaturate; + BgraColor shade_fade_lit, shade_light; + int desaturate; + if (ShadeModeT::Mode == (int)ShadeMode::Advanced) + { + uint32_t inv_light = 256 - lightpos; + shade_fade_lit.r = args->ShadeFadeRed() * inv_light; + shade_fade_lit.g = args->ShadeFadeGreen() * inv_light; + shade_fade_lit.b = args->ShadeFadeBlue() * inv_light; + shade_light.r = args->ShadeLightRed(); + shade_light.g = args->ShadeLightGreen(); + shade_light.b = args->ShadeLightBlue(); + desaturate = args->ShadeDesaturate(); + inv_desaturate = 256 - desaturate; + mlight.r = lightpos; + mlight.g = lightpos; + mlight.b = lightpos; + } + else + { + inv_desaturate = 0; + shade_fade_lit.r = 0; + shade_fade_lit.g = 0; + shade_fade_lit.b = 0; + shade_light.r = 0; + shade_light.g = 0; + shade_light.b = 0; + desaturate = 0; + mlight.r = lightpos; + mlight.g = lightpos; + mlight.b = lightpos; + } + + int count = x1 - x0; + + int fuzzpos = (ScreenTriangle::FuzzStart + x0 * 123 + y0) % FUZZTABLE; + + uint32_t posV = startV; + for (int y = y0; y < y1; y++, posV += stepV) + { + int coreBlock = y / 8; + if (coreBlock % thread->num_cores != thread->core) + { + fuzzpos = (fuzzpos + count) % FUZZTABLE; + continue; + } + + uint32_t *dest = ((uint32_t*)destOrg) + y * destPitch + x0; + + uint32_t posU = startU; + for (int i = 0; i < count; i++) + { + // Load bgcolor + BgraColor bgcolor; + if (BlendT::Mode != (int)BlendModes::Opaque) + bgcolor = *dest; + else + bgcolor = 0; + + // Sample fgcolor + unsigned int ifgcolor = Sample32(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); + unsigned int ifgshade = SampleShade32(posU, posV, texPixels, texWidth, texHeight, fuzzpos); + posU += stepU; + + // Shade and blend + BgraColor fgcolor = Shade32(ifgcolor, mlight, desaturate, inv_desaturate, shade_fade_lit, shade_light); + BgraColor outcolor = Blend32(fgcolor, bgcolor, ifgcolor, ifgshade, srcalpha, destalpha); + + // Store result + *dest = outcolor; + dest++; + } + } + } +}; diff --git a/src/polyrenderer/drawers/poly_drawer32_avx2.h b/src/polyrenderer/drawers/poly_drawer32_avx2.h new file mode 100644 index 000000000..9091ae21a --- /dev/null +++ b/src/polyrenderer/drawers/poly_drawer32_avx2.h @@ -0,0 +1,739 @@ +/* +** Projected triangle drawer +** Copyright (c) 2016 Magnus Norddahl +** +** This software is provided 'as-is', without any express or implied +** warranty. In no event will the authors be held liable for any damages +** arising from the use of this software. +** +** Permission is granted to anyone to use this software for any purpose, +** including commercial applications, and to alter it and redistribute it +** freely, subject to the following restrictions: +** +** 1. The origin of this software must not be misrepresented; you must not +** claim that you wrote the original software. If you use this software +** in a product, an acknowledgment in the product documentation would be +** appreciated but is not required. +** 2. Altered source versions must be plainly marked as such, and must not be +** misrepresented as being the original software. +** 3. This notice may not be removed or altered from any source distribution. +** +*/ + +#pragma once + +#include "screen_triangle.h" + +#ifdef _MSC_VER +#pragma warning(disable: 4752) // warning C4752 : found Intel(R) Advanced Vector Extensions; consider using /arch:AVX +#endif + +namespace TriScreenDrawerModes +{ + template + FORCEINLINE unsigned int VECTORCALL Sample32_AVX2(int32_t u, int32_t v, const uint32_t *texPixels, int texWidth, int texHeight, uint32_t oneU, uint32_t oneV, uint32_t color, const uint32_t *translation) + { + uint32_t texel; + if (SamplerT::Mode == (int)Samplers::Shaded || SamplerT::Mode == (int)Samplers::Stencil || SamplerT::Mode == (int)Samplers::Fill || SamplerT::Mode == (int)Samplers::Fuzz) + { + return color; + } + else if (SamplerT::Mode == (int)Samplers::Translated) + { + const uint8_t *texpal = (const uint8_t *)texPixels; + uint32_t texelX = ((((uint32_t)u << 8) >> 16) * texWidth) >> 16; + uint32_t texelY = ((((uint32_t)v << 8) >> 16) * texHeight) >> 16; + return translation[texpal[texelX * texHeight + texelY]]; + } + else if (FilterModeT::Mode == (int)FilterModes::Nearest) + { + uint32_t texelX = ((((uint32_t)u << 8) >> 16) * texWidth) >> 16; + uint32_t texelY = ((((uint32_t)v << 8) >> 16) * texHeight) >> 16; + texel = texPixels[texelX * texHeight + texelY]; + } + else + { + u -= oneU >> 1; + v -= oneV >> 1; + + unsigned int frac_x0 = (((uint32_t)u << 8) >> FRACBITS) * texWidth; + unsigned int frac_x1 = ((((uint32_t)u << 8) + oneU) >> FRACBITS) * texWidth; + unsigned int frac_y0 = (((uint32_t)v << 8) >> FRACBITS) * texHeight; + unsigned int frac_y1 = ((((uint32_t)v << 8) + oneV) >> FRACBITS) * texHeight; + unsigned int x0 = frac_x0 >> FRACBITS; + unsigned int x1 = frac_x1 >> FRACBITS; + unsigned int y0 = frac_y0 >> FRACBITS; + unsigned int y1 = frac_y1 >> FRACBITS; + + unsigned int p00 = texPixels[x0 * texHeight + y0]; + unsigned int p01 = texPixels[x0 * texHeight + y1]; + unsigned int p10 = texPixels[x1 * texHeight + y0]; + unsigned int p11 = texPixels[x1 * texHeight + y1]; + + unsigned int inv_a = (frac_x1 >> (FRACBITS - 4)) & 15; + unsigned int inv_b = (frac_y1 >> (FRACBITS - 4)) & 15; + unsigned int a = 16 - inv_a; + unsigned int b = 16 - inv_b; + + unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + texel = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + } + + if (SamplerT::Mode == (int)Samplers::Skycap) + { + int start_fade = 2; // How fast it should fade out + + int alpha_top = clamp(v >> (16 - start_fade), 0, 256); + int alpha_bottom = clamp(((2 << 24) - v) >> (16 - start_fade), 0, 256); + int a = MIN(alpha_top, alpha_bottom); + int inv_a = 256 - a; + + uint32_t r = RPART(texel); + uint32_t g = GPART(texel); + uint32_t b = BPART(texel); + uint32_t fg_a = APART(texel); + uint32_t bg_red = RPART(color); + uint32_t bg_green = GPART(color); + uint32_t bg_blue = BPART(color); + r = (r * a + bg_red * inv_a + 127) >> 8; + g = (g * a + bg_green * inv_a + 127) >> 8; + b = (b * a + bg_blue * inv_a + 127) >> 8; + return MAKEARGB(fg_a, r, g, b); + } + else + { + return texel; + } + } + + template + FORCEINLINE unsigned int VECTORCALL SampleShade32_AVX2(int32_t u, int32_t v, const uint32_t *texPixels, int texWidth, int texHeight, int &fuzzpos) + { + if (SamplerT::Mode == (int)Samplers::Shaded) + { + const uint8_t *texpal = (const uint8_t *)texPixels; + uint32_t texelX = ((((uint32_t)u << 8) >> 16) * texWidth) >> 16; + uint32_t texelY = ((((uint32_t)v << 8) >> 16) * texHeight) >> 16; + unsigned int sampleshadeout = texpal[texelX * texHeight + texelY]; + sampleshadeout += sampleshadeout >> 7; // 255 -> 256 + return sampleshadeout; + } + else if (SamplerT::Mode == (int)Samplers::Stencil) + { + uint32_t texelX = ((((uint32_t)u << 8) >> 16) * texWidth) >> 16; + uint32_t texelY = ((((uint32_t)v << 8) >> 16) * texHeight) >> 16; + unsigned int sampleshadeout = APART(texPixels[texelX * texHeight + texelY]); + sampleshadeout += sampleshadeout >> 7; // 255 -> 256 + return sampleshadeout; + } + else if (SamplerT::Mode == (int)Samplers::Fuzz) + { + uint32_t texelX = ((((uint32_t)u << 8) >> 16) * texWidth) >> 16; + uint32_t texelY = ((((uint32_t)v << 8) >> 16) * texHeight) >> 16; + unsigned int sampleshadeout = APART(texPixels[texelX * texHeight + texelY]); + sampleshadeout += sampleshadeout >> 7; // 255 -> 256 + sampleshadeout = (sampleshadeout * fuzzcolormap[fuzzpos++]) >> 5; + if (fuzzpos >= FUZZTABLE) fuzzpos = 0; + return sampleshadeout; + } + else + { + return 0; + } + } + + template + FORCEINLINE __m256i VECTORCALL Shade32_AVX2(__m256i fgcolor, __m256i mlight, __m256i desaturate, __m256i inv_desaturate, __m256i shade_fade, __m256i shade_light) + { + if (ShadeModeT::Mode == (int)ShadeMode::Simple) + { + fgcolor = _mm256_srli_epi16(_mm256_mullo_epi16(fgcolor, mlight), 8); + } + else if (ShadeModeT::Mode == (int)ShadeMode::Advanced) + { + __m256i intensity = _mm256_mullo_epi16(fgcolor, _mm256_set_epi16(0, 77, 143, 37, 0, 77, 143, 37, 0, 77, 143, 37, 0, 77, 143, 37)); + intensity = _mm256_add_epi16(intensity, _mm256_srli_epi64(intensity, 32)); + intensity = _mm256_add_epi16(intensity, _mm256_srli_epi64(intensity, 16)); + intensity = _mm256_srli_epi16(intensity, 8); + intensity = _mm256_mullo_epi16(intensity, desaturate); + intensity = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(intensity, _MM_SHUFFLE(3, 0, 0, 0)), _MM_SHUFFLE(3, 0, 0, 0)); + + fgcolor = _mm256_srli_epi16(_mm256_add_epi16(_mm256_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm256_mullo_epi16(fgcolor, mlight); + fgcolor = _mm256_srli_epi16(_mm256_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm256_srli_epi16(_mm256_mullo_epi16(fgcolor, shade_light), 8); + } + return fgcolor; + } + + template + FORCEINLINE __m256i VECTORCALL Blend32_AVX2(__m256i fgcolor, __m256i bgcolor, __m256i ifgcolor, __m256i ifgshade, __m256i srcalpha, __m256i destalpha) + { + if (BlendT::Mode == (int)BlendModes::Opaque) + { + __m256i outcolor = fgcolor; + outcolor = _mm256_packus_epi16(outcolor, _mm256_setzero_si256()); + return outcolor; + } + else if (BlendT::Mode == (int)BlendModes::Masked) + { + __m256i mask = _mm256_cmpeq_epi32(_mm256_packus_epi16(fgcolor, _mm256_setzero_si256()), _mm256_setzero_si256()); + mask = _mm256_unpacklo_epi8(mask, _mm256_setzero_si256()); + __m256i outcolor = _mm256_or_si256(_mm256_and_si256(mask, bgcolor), _mm256_andnot_si256(mask, fgcolor)); + outcolor = _mm256_packus_epi16(outcolor, _mm256_setzero_si256()); + outcolor = _mm256_or_si256(outcolor, _mm256_set1_epi32(0xff000000)); + return outcolor; + } + else if (BlendT::Mode == (int)BlendModes::AddSrcColorOneMinusSrcColor) + { + __m256i inv_srccolor = _mm256_sub_epi16(_mm256_set1_epi16(256), _mm256_add_epi16(fgcolor, _mm256_srli_epi16(fgcolor, 7))); + __m256i outcolor = _mm256_add_epi16(fgcolor, _mm256_srli_epi16(_mm256_mullo_epi16(bgcolor, inv_srccolor), 8)); + outcolor = _mm256_packus_epi16(outcolor, _mm256_setzero_si256()); + return outcolor; + } + else if (BlendT::Mode == (int)BlendModes::Shaded) + { + ifgshade = _mm256_srli_epi32(_mm256_add_epi32(_mm256_mul_epu32(ifgshade, srcalpha), _mm256_set1_epi32(128)), 8); + __m256i alpha = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(ifgshade, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0)); + __m256i inv_alpha = _mm256_sub_epi16(_mm256_set1_epi16(256), alpha); + + fgcolor = _mm256_mullo_epi16(fgcolor, alpha); + bgcolor = _mm256_mullo_epi16(bgcolor, inv_alpha); + __m256i outcolor = _mm256_srli_epi16(_mm256_add_epi16(fgcolor, bgcolor), 8); + outcolor = _mm256_packus_epi16(outcolor, _mm256_setzero_si256()); + outcolor = _mm256_or_si256(outcolor, _mm256_set1_epi32(0xff000000)); + return outcolor; + } + else if (BlendT::Mode == (int)BlendModes::AddClampShaded) + { + ifgshade = _mm256_srli_epi32(_mm256_add_epi32(_mm256_mul_epu32(ifgshade, srcalpha), _mm256_set1_epi32(128)), 8); + __m256i alpha = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(ifgshade, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0)); + __m256i inv_alpha = _mm256_sub_epi16(_mm256_set1_epi16(256), alpha); + + fgcolor = _mm256_srli_epi16(_mm256_mullo_epi16(fgcolor, alpha), 8); + __m256i outcolor = _mm256_add_epi16(fgcolor, bgcolor); + outcolor = _mm256_packus_epi16(outcolor, _mm256_setzero_si256()); + outcolor = _mm256_or_si256(outcolor, _mm256_set1_epi32(0xff000000)); + return outcolor; + } + else + { + __m256i alpha = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(ifgcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)); + alpha = _mm256_srli_epi16(_mm256_add_epi16(alpha, _mm256_srli_epi16(alpha, 7)), 1); // 255->128 + __m256i inv_alpha = _mm256_sub_epi16(_mm256_set1_epi16(128), alpha); + + __m256i bgalpha = _mm256_srli_epi16(_mm256_add_epi16(_mm256_add_epi16(_mm256_mullo_epi16(destalpha, alpha), _mm256_slli_epi16(inv_alpha, 8)), _mm256_set1_epi32(64)), 7); + __m256i fgalpha = _mm256_srli_epi16(_mm256_add_epi16(_mm256_mullo_epi16(srcalpha, alpha), _mm256_set1_epi32(64)), 7); + + fgcolor = _mm256_mullo_epi16(fgcolor, fgalpha); + bgcolor = _mm256_mullo_epi16(bgcolor, bgalpha); + + __m256i fg_lo = _mm256_unpacklo_epi16(fgcolor, _mm256_setzero_si256()); + __m256i bg_lo = _mm256_unpacklo_epi16(bgcolor, _mm256_setzero_si256()); + __m256i fg_hi = _mm256_unpackhi_epi16(fgcolor, _mm256_setzero_si256()); + __m256i bg_hi = _mm256_unpackhi_epi16(bgcolor, _mm256_setzero_si256()); + + __m256i out_lo, out_hi; + if (BlendT::Mode == (int)BlendModes::AddClamp) + { + out_lo = _mm256_add_epi32(fg_lo, bg_lo); + out_hi = _mm256_add_epi32(fg_hi, bg_hi); + } + else if (BlendT::Mode == (int)BlendModes::SubClamp) + { + out_lo = _mm256_sub_epi32(fg_lo, bg_lo); + out_hi = _mm256_sub_epi32(fg_hi, bg_hi); + } + else if (BlendT::Mode == (int)BlendModes::RevSubClamp) + { + out_lo = _mm256_sub_epi32(bg_lo, fg_lo); + out_hi = _mm256_sub_epi32(bg_hi, fg_hi); + } + + out_lo = _mm256_srai_epi32(out_lo, 8); + out_hi = _mm256_srai_epi32(out_hi, 8); + __m256i outcolor = _mm256_packs_epi32(out_lo, out_hi); + outcolor = _mm256_packus_epi16(outcolor, _mm256_setzero_si256()); + outcolor = _mm256_or_si256(outcolor, _mm256_set1_epi32(0xff000000)); + return outcolor; + } + } +} + +template +class TriScreenDrawer32_AVX2 +{ +public: + static void Execute(int x, int y, uint32_t mask0, uint32_t mask1, const TriDrawTriangleArgs *args) + { + using namespace TriScreenDrawerModes; + + bool is_simple_shade = args->uniforms->SimpleShade(); + + if (SamplerT::Mode == (int)Samplers::Texture) + { + bool is_nearest_filter = args->uniforms->NearestFilter(); + + if (is_simple_shade) + { + if (is_nearest_filter) + DrawBlock(x, y, mask0, mask1, args); + else + DrawBlock(x, y, mask0, mask1, args); + } + else + { + if (is_nearest_filter) + DrawBlock(x, y, mask0, mask1, args); + else + DrawBlock(x, y, mask0, mask1, args); + } + } + else if (SamplerT::Mode == (int)Samplers::Fuzz) + { + DrawBlock(x, y, mask0, mask1, args); + } + else // no linear filtering for translated, shaded, stencil, fill or skycap + { + if (is_simple_shade) + { + DrawBlock(x, y, mask0, mask1, args); + } + else + { + DrawBlock(x, y, mask0, mask1, args); + } + } + } + +private: + template + FORCEINLINE static void VECTORCALL DrawBlock(int destX, int destY, uint32_t mask0, uint32_t mask1, const TriDrawTriangleArgs *args) + { + using namespace TriScreenDrawerModes; + + bool is_fixed_light = args->uniforms->FixedLight(); + __m128i lightmask = _mm_set1_epi32(is_fixed_light ? 0 : 0xffffffff); + __m256i srcalpha = _mm256_set1_epi16(args->uniforms->SrcAlpha()); + __m256i destalpha = _mm256_set1_epi16(args->uniforms->DestAlpha()); + + int fuzzpos = (ScreenTriangle::FuzzStart + destX * 123 + destY) % FUZZTABLE; + + // Light + uint32_t light = args->uniforms->Light(); + float shade = MIN(2.0f - (light + 12.0f) / 128.0f, 31.0f / 32.0f); + float globVis = args->uniforms->GlobVis() * (1.0f / 32.0f); + light += (light >> 7); // 255 -> 256 + light <<= 8; + __m128i fixedlight = _mm_set1_epi32(light); + + // Calculate gradients + const TriVertex &v1 = *args->v1; + __m128 gradientX = _mm_setr_ps(args->gradientX.W, args->gradientX.U, args->gradientX.V, 0.0f); + __m128 gradientY = _mm_setr_ps(args->gradientY.W, args->gradientY.U, args->gradientY.V, 0.0f); + __m128 blockPosY = _mm_add_ps(_mm_add_ps( + _mm_setr_ps(v1.w, v1.u * v1.w, v1.v * v1.w, globVis), + _mm_mul_ps(gradientX, _mm_set1_ps(destX - v1.x))), + _mm_mul_ps(gradientY, _mm_set1_ps(destY - v1.y))); + gradientX = _mm_mul_ps(gradientX, _mm_set1_ps(8.0f)); + + // Output + uint32_t * RESTRICT destOrg = (uint32_t*)args->dest; + int pitch = args->pitch; + uint32_t *dest = destOrg + destX + destY * pitch; + int offset_next_line = pitch - 8; + + // Sampling stuff + uint32_t color = args->uniforms->Color(); + const uint32_t * RESTRICT translation = (const uint32_t *)args->uniforms->Translation(); + const uint32_t * RESTRICT texPixels = (const uint32_t *)args->uniforms->TexturePixels(); + uint32_t texWidth = args->uniforms->TextureWidth(); + uint32_t texHeight = args->uniforms->TextureHeight(); + uint32_t oneU, oneV; + if (SamplerT::Mode != (int)Samplers::Fill) + { + oneU = ((0x800000 + texWidth - 1) / texWidth) * 2 + 1; + oneV = ((0x800000 + texHeight - 1) / texHeight) * 2 + 1; + } + else + { + oneU = 0; + oneV = 0; + } + + // Shade constants + __m256i inv_desaturate, shade_fade, shade_light; + __m256i desaturate; + if (ShadeModeT::Mode == (int)ShadeMode::Advanced) + { + inv_desaturate = _mm256_setr_epi16( + 256, 256 - args->uniforms->ShadeDesaturate(), 256 - args->uniforms->ShadeDesaturate(), 256 - args->uniforms->ShadeDesaturate(), + 256, 256 - args->uniforms->ShadeDesaturate(), 256 - args->uniforms->ShadeDesaturate(), 256 - args->uniforms->ShadeDesaturate(), + 256, 256 - args->uniforms->ShadeDesaturate(), 256 - args->uniforms->ShadeDesaturate(), 256 - args->uniforms->ShadeDesaturate(), + 256, 256 - args->uniforms->ShadeDesaturate(), 256 - args->uniforms->ShadeDesaturate(), 256 - args->uniforms->ShadeDesaturate()); + shade_fade = _mm256_set_epi16( + args->uniforms->ShadeFadeAlpha(), args->uniforms->ShadeFadeRed(), args->uniforms->ShadeFadeGreen(), args->uniforms->ShadeFadeBlue(), + args->uniforms->ShadeFadeAlpha(), args->uniforms->ShadeFadeRed(), args->uniforms->ShadeFadeGreen(), args->uniforms->ShadeFadeBlue(), + args->uniforms->ShadeFadeAlpha(), args->uniforms->ShadeFadeRed(), args->uniforms->ShadeFadeGreen(), args->uniforms->ShadeFadeBlue(), + args->uniforms->ShadeFadeAlpha(), args->uniforms->ShadeFadeRed(), args->uniforms->ShadeFadeGreen(), args->uniforms->ShadeFadeBlue()); + shade_light = _mm256_set_epi16( + args->uniforms->ShadeLightAlpha(), args->uniforms->ShadeLightRed(), args->uniforms->ShadeLightGreen(), args->uniforms->ShadeLightBlue(), + args->uniforms->ShadeLightAlpha(), args->uniforms->ShadeLightRed(), args->uniforms->ShadeLightGreen(), args->uniforms->ShadeLightBlue(), + args->uniforms->ShadeLightAlpha(), args->uniforms->ShadeLightRed(), args->uniforms->ShadeLightGreen(), args->uniforms->ShadeLightBlue(), + args->uniforms->ShadeLightAlpha(), args->uniforms->ShadeLightRed(), args->uniforms->ShadeLightGreen(), args->uniforms->ShadeLightBlue()); + desaturate = _mm256_sub_epi16(_mm256_set1_epi16(256), inv_desaturate); + } + else + { + inv_desaturate = _mm256_setzero_si256(); + shade_fade = _mm256_setzero_si256(); + shade_fade = _mm256_setzero_si256(); + shade_light = _mm256_setzero_si256(); + desaturate = _mm256_setzero_si256(); + } + + if (mask0 == 0xffffffff && mask1 == 0xffffffff) + { + for (int y = 0; y < 8; y++) + { + __m128 blockPosX = _mm_add_ps(blockPosY, gradientX); + __m128 W = _mm_shuffle_ps(blockPosY, blockPosX, _MM_SHUFFLE(0, 0, 0, 0)); + __m128 rcpW = _mm_div_ps(_mm_set1_ps((float)0x01000000), W); + __m128i posUV = _mm_cvtps_epi32(_mm_mul_ps(_mm_shuffle_ps(blockPosY, blockPosX, _MM_SHUFFLE(2, 1, 2, 1)), rcpW)); + + __m128 vis = _mm_mul_ps(_mm_shuffle_ps(blockPosY, blockPosX, _MM_SHUFFLE(3, 3, 3, 3)), W); + __m128i lightpospair = _mm_sub_epi32( + _mm_set1_epi32(FRACUNIT), + _mm_cvtps_epi32(_mm_mul_ps( + _mm_max_ps(_mm_sub_ps(_mm_set1_ps(shade), _mm_min_ps(_mm_set1_ps(24.0f / 32.0f), vis)), _mm_setzero_ps()), + _mm_set1_ps((float)FRACUNIT)))); + lightpospair = _mm_or_si128(_mm_and_si128(lightmask, lightpospair), _mm_andnot_si128(lightmask, fixedlight)); + + int32_t posU = _mm_cvtsi128_si32(posUV); + int32_t posV = _mm_cvtsi128_si32(_mm_srli_si128(posUV, 4)); + int32_t nextU = _mm_cvtsi128_si32(_mm_srli_si128(posUV, 8)); + int32_t nextV = _mm_cvtsi128_si32(_mm_srli_si128(posUV, 12)); + int32_t lightpos = _mm_cvtsi128_si32(lightpospair); + int32_t lightnext = _mm_cvtsi128_si32(_mm_srli_si128(lightpospair, 8)); + int32_t stepU = (nextU - posU) >> 3; + int32_t stepV = (nextV - posV) >> 3; + fixed_t lightstep = (lightnext - lightpos) >> 3; + + for (int ix = 0; ix < 2; ix++) + { + // Load bgcolor + __m256i bgcolor; + if (BlendT::Mode != (int)BlendModes::Opaque) + { + __m128i bgpacked = _mm_loadu_si128((__m128i*)dest); + bgcolor = _mm256_set_m128i(_mm_unpackhi_epi8(bgpacked, _mm_setzero_si128()), _mm_unpacklo_epi8(bgpacked, _mm_setzero_si128())); + } + else + bgcolor = _mm256_setzero_si256(); + + // Sample fgcolor + unsigned int ifgcolor0 = Sample32_AVX2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); + unsigned int ifgshade0 = SampleShade32_AVX2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); + posU += stepU; + posV += stepV; + + unsigned int ifgcolor1 = Sample32_AVX2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); + unsigned int ifgshade1 = SampleShade32_AVX2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); + posU += stepU; + posV += stepV; + + unsigned int ifgcolor2 = Sample32_AVX2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); + unsigned int ifgshade2 = SampleShade32_AVX2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); + posU += stepU; + posV += stepV; + + unsigned int ifgcolor3 = Sample32_AVX2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); + unsigned int ifgshade3 = SampleShade32_AVX2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); + posU += stepU; + posV += stepV; + + // Setup light + int lightpos0 = lightpos >> 8; + lightpos += lightstep; + int lightpos1 = lightpos >> 8; + lightpos += lightstep; + int lightpos2 = lightpos >> 8; + lightpos += lightstep; + int lightpos3 = lightpos >> 8; + lightpos += lightstep; + __m256i mlight = _mm256_set_epi16( + 256, lightpos3, lightpos3, lightpos3, + 256, lightpos2, lightpos2, lightpos2, + 256, lightpos1, lightpos1, lightpos1, + 256, lightpos0, lightpos0, lightpos0); + + __m256i shade_fade_lit; + if (ShadeModeT::Mode == (int)ShadeMode::Advanced) + { + __m256i inv_light = _mm256_sub_epi16(_mm256_set_epi16(0, 256, 256, 256, 0, 256, 256, 256, 0, 256, 256, 256, 0, 256, 256, 256), mlight); + shade_fade_lit = _mm256_mullo_epi16(shade_fade, inv_light); + } + else + { + shade_fade_lit = _mm256_setzero_si256(); + } + + // Shade and blend + __m128i fgpacked = _mm_set_epi32(ifgcolor3, ifgcolor2, ifgcolor1, ifgcolor0); + __m128i shadepacked = _mm_set_epi32(ifgshade3, ifgshade2, ifgshade1, ifgshade0); + __m256i mifgcolor = _mm256_set_m128i(_mm_unpackhi_epi8(fgpacked, _mm_setzero_si128()), _mm_unpacklo_epi8(fgpacked, _mm_setzero_si128())); + __m256i mifgshade = _mm256_set_m128i(_mm_unpackhi_epi32(shadepacked, shadepacked), _mm_unpacklo_epi32(shadepacked, shadepacked)); + __m256i fgcolor = mifgcolor; + fgcolor = Shade32_AVX2(fgcolor, mlight, desaturate, inv_desaturate, shade_fade_lit, shade_light); + __m256i outcolor = Blend32_AVX2(fgcolor, bgcolor, mifgcolor, mifgshade, srcalpha, destalpha); + + // Store result + _mm_storeu_si128((__m128i*)dest, _mm_or_si128(_mm256_extracti128_si256(outcolor, 0), _mm_slli_si128(_mm256_extracti128_si256(outcolor, 1), 8))); + dest += 4; + } + + blockPosY = _mm_add_ps(blockPosY, gradientY); + + dest += offset_next_line; + } + } + else + { + // mask0 loop: + for (int y = 0; y < 4; y++) + { + __m128 blockPosX = _mm_add_ps(blockPosY, gradientX); + __m128 W = _mm_shuffle_ps(blockPosY, blockPosX, _MM_SHUFFLE(0, 0, 0, 0)); + __m128 rcpW = _mm_div_ps(_mm_set1_ps((float)0x01000000), W); + __m128i posUV = _mm_cvtps_epi32(_mm_mul_ps(_mm_shuffle_ps(blockPosY, blockPosX, _MM_SHUFFLE(2, 1, 2, 1)), rcpW)); + + __m128 vis = _mm_mul_ps(_mm_shuffle_ps(blockPosY, blockPosX, _MM_SHUFFLE(3, 3, 3, 3)), W); + __m128i lightpospair = _mm_sub_epi32( + _mm_set1_epi32(FRACUNIT), + _mm_cvtps_epi32(_mm_mul_ps( + _mm_max_ps(_mm_sub_ps(_mm_set1_ps(shade), _mm_min_ps(_mm_set1_ps(24.0f / 32.0f), vis)), _mm_setzero_ps()), + _mm_set1_ps((float)FRACUNIT)))); + lightpospair = _mm_or_si128(_mm_and_si128(lightmask, lightpospair), _mm_andnot_si128(lightmask, fixedlight)); + + int32_t posU = _mm_cvtsi128_si32(posUV); + int32_t posV = _mm_cvtsi128_si32(_mm_srli_si128(posUV, 4)); + int32_t nextU = _mm_cvtsi128_si32(_mm_srli_si128(posUV, 8)); + int32_t nextV = _mm_cvtsi128_si32(_mm_srli_si128(posUV, 12)); + int32_t lightpos = _mm_cvtsi128_si32(lightpospair); + int32_t lightnext = _mm_cvtsi128_si32(_mm_srli_si128(lightpospair, 8)); + int32_t stepU = (nextU - posU) >> 3; + int32_t stepV = (nextV - posV) >> 3; + fixed_t lightstep = (lightnext - lightpos) >> 3; + + for (int x = 0; x < 2; x++) + { + // Load bgcolor + uint32_t desttmp[4]; + __m256i bgcolor; + if (BlendT::Mode != (int)BlendModes::Opaque) + { + if (mask0 & (1 << 31)) desttmp[0] = dest[0]; + if (mask0 & (1 << 30)) desttmp[1] = dest[1]; + if (mask0 & (1 << 29)) desttmp[2] = dest[2]; + if (mask0 & (1 << 28)) desttmp[3] = dest[3]; + + __m128i bgpacked = _mm_loadu_si128((__m128i*)(desttmp)); + bgcolor = _mm256_set_m128i(_mm_unpackhi_epi8(bgpacked, _mm_setzero_si128()), _mm_unpacklo_epi8(bgpacked, _mm_setzero_si128())); + } + else + bgcolor = _mm256_setzero_si256(); + + // Sample fgcolor + unsigned int ifgcolor0 = Sample32_AVX2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); + unsigned int ifgshade0 = SampleShade32_AVX2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); + posU += stepU; + posV += stepV; + + unsigned int ifgcolor1 = Sample32_AVX2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); + unsigned int ifgshade1 = SampleShade32_AVX2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); + posU += stepU; + posV += stepV; + + unsigned int ifgcolor2 = Sample32_AVX2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); + unsigned int ifgshade2 = SampleShade32_AVX2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); + posU += stepU; + posV += stepV; + + unsigned int ifgcolor3 = Sample32_AVX2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); + unsigned int ifgshade3 = SampleShade32_AVX2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); + posU += stepU; + posV += stepV; + + // Setup light + int lightpos0 = lightpos >> 8; + lightpos += lightstep; + int lightpos1 = lightpos >> 8; + lightpos += lightstep; + int lightpos2 = lightpos >> 8; + lightpos += lightstep; + int lightpos3 = lightpos >> 8; + lightpos += lightstep; + __m256i mlight = _mm256_set_epi16( + 256, lightpos3, lightpos3, lightpos3, + 256, lightpos2, lightpos2, lightpos2, + 256, lightpos1, lightpos1, lightpos1, + 256, lightpos0, lightpos0, lightpos0); + + __m256i shade_fade_lit; + if (ShadeModeT::Mode == (int)ShadeMode::Advanced) + { + __m256i inv_light = _mm256_sub_epi16(_mm256_set_epi16(0, 256, 256, 256, 0, 256, 256, 256, 0, 256, 256, 256, 0, 256, 256, 256), mlight); + shade_fade_lit = _mm256_mullo_epi16(shade_fade, inv_light); + } + else + { + shade_fade_lit = _mm256_setzero_si256(); + } + + // Shade and blend + __m128i fgpacked = _mm_set_epi32(ifgcolor3, ifgcolor2, ifgcolor1, ifgcolor0); + __m128i shadepacked = _mm_set_epi32(ifgshade3, ifgshade2, ifgshade1, ifgshade0); + __m256i mifgcolor = _mm256_set_m128i(_mm_unpackhi_epi8(fgpacked, _mm_setzero_si128()), _mm_unpacklo_epi8(fgpacked, _mm_setzero_si128())); + __m256i mifgshade = _mm256_set_m128i(_mm_unpackhi_epi32(shadepacked, shadepacked), _mm_unpacklo_epi32(shadepacked, shadepacked)); + __m256i fgcolor = mifgcolor; + fgcolor = Shade32_AVX2(fgcolor, mlight, desaturate, inv_desaturate, shade_fade_lit, shade_light); + __m256i outcolor = Blend32_AVX2(fgcolor, bgcolor, mifgcolor, mifgshade, srcalpha, destalpha); + + // Store result + _mm_storeu_si128((__m128i*)desttmp, _mm_or_si128(_mm256_extracti128_si256(outcolor, 0), _mm_slli_si128(_mm256_extracti128_si256(outcolor, 1), 8))); + if (mask0 & (1 << 31)) dest[0] = desttmp[0]; + if (mask0 & (1 << 30)) dest[1] = desttmp[1]; + if (mask0 & (1 << 29)) dest[2] = desttmp[2]; + if (mask0 & (1 << 28)) dest[3] = desttmp[3]; + + mask0 <<= 4; + dest += 4; + } + + blockPosY = _mm_add_ps(blockPosY, gradientY); + + dest += offset_next_line; + } + + // mask1 loop: + for (int y = 0; y < 4; y++) + { + __m128 blockPosX = _mm_add_ps(blockPosY, gradientX); + __m128 W = _mm_shuffle_ps(blockPosY, blockPosX, _MM_SHUFFLE(0, 0, 0, 0)); + __m128 rcpW = _mm_div_ps(_mm_set1_ps((float)0x01000000), W); + __m128i posUV = _mm_cvtps_epi32(_mm_mul_ps(_mm_shuffle_ps(blockPosY, blockPosX, _MM_SHUFFLE(2, 1, 2, 1)), rcpW)); + + __m128 vis = _mm_mul_ps(_mm_shuffle_ps(blockPosY, blockPosX, _MM_SHUFFLE(3, 3, 3, 3)), W); + __m128i lightpospair = _mm_sub_epi32( + _mm_set1_epi32(FRACUNIT), + _mm_cvtps_epi32(_mm_mul_ps( + _mm_max_ps(_mm_sub_ps(_mm_set1_ps(shade), _mm_min_ps(_mm_set1_ps(24.0f / 32.0f), vis)), _mm_setzero_ps()), + _mm_set1_ps((float)FRACUNIT)))); + lightpospair = _mm_or_si128(_mm_and_si128(lightmask, lightpospair), _mm_andnot_si128(lightmask, fixedlight)); + + int32_t posU = _mm_cvtsi128_si32(posUV); + int32_t posV = _mm_cvtsi128_si32(_mm_srli_si128(posUV, 4)); + int32_t nextU = _mm_cvtsi128_si32(_mm_srli_si128(posUV, 8)); + int32_t nextV = _mm_cvtsi128_si32(_mm_srli_si128(posUV, 12)); + int32_t lightpos = _mm_cvtsi128_si32(lightpospair); + int32_t lightnext = _mm_cvtsi128_si32(_mm_srli_si128(lightpospair, 8)); + int32_t stepU = (nextU - posU) >> 3; + int32_t stepV = (nextV - posV) >> 3; + fixed_t lightstep = (lightnext - lightpos) >> 3; + + for (int x = 0; x < 2; x++) + { + // Load bgcolor + uint32_t desttmp[4]; + __m256i bgcolor; + if (BlendT::Mode != (int)BlendModes::Opaque) + { + if (mask1 & (1 << 31)) desttmp[0] = dest[0]; + if (mask1 & (1 << 30)) desttmp[1] = dest[1]; + if (mask1 & (1 << 29)) desttmp[2] = dest[2]; + if (mask1 & (1 << 28)) desttmp[3] = dest[3]; + + __m128i bgpacked = _mm_loadu_si128((__m128i*)(desttmp)); + bgcolor = _mm256_set_m128i(_mm_unpackhi_epi8(bgpacked, _mm_setzero_si128()), _mm_unpacklo_epi8(bgpacked, _mm_setzero_si128())); + } + else + bgcolor = _mm256_setzero_si256(); + + // Sample fgcolor + unsigned int ifgcolor0 = Sample32_AVX2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); + unsigned int ifgshade0 = SampleShade32_AVX2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); + posU += stepU; + posV += stepV; + + unsigned int ifgcolor1 = Sample32_AVX2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); + unsigned int ifgshade1 = SampleShade32_AVX2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); + posU += stepU; + posV += stepV; + + unsigned int ifgcolor2 = Sample32_AVX2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); + unsigned int ifgshade2 = SampleShade32_AVX2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); + posU += stepU; + posV += stepV; + + unsigned int ifgcolor3 = Sample32_AVX2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); + unsigned int ifgshade3 = SampleShade32_AVX2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); + posU += stepU; + posV += stepV; + + // Setup light + int lightpos0 = lightpos >> 8; + lightpos += lightstep; + int lightpos1 = lightpos >> 8; + lightpos += lightstep; + int lightpos2 = lightpos >> 8; + lightpos += lightstep; + int lightpos3 = lightpos >> 8; + lightpos += lightstep; + __m256i mlight = _mm256_set_epi16( + 256, lightpos3, lightpos3, lightpos3, + 256, lightpos2, lightpos2, lightpos2, + 256, lightpos1, lightpos1, lightpos1, + 256, lightpos0, lightpos0, lightpos0); + + __m256i shade_fade_lit; + if (ShadeModeT::Mode == (int)ShadeMode::Advanced) + { + __m256i inv_light = _mm256_sub_epi16(_mm256_set_epi16(0, 256, 256, 256, 0, 256, 256, 256, 0, 256, 256, 256, 0, 256, 256, 256), mlight); + shade_fade_lit = _mm256_mullo_epi16(shade_fade, inv_light); + } + else + { + shade_fade_lit = _mm256_setzero_si256(); + } + + // Shade and blend + __m128i fgpacked = _mm_set_epi32(ifgcolor3, ifgcolor2, ifgcolor1, ifgcolor0); + __m128i shadepacked = _mm_set_epi32(ifgshade3, ifgshade2, ifgshade1, ifgshade0); + __m256i mifgcolor = _mm256_set_m128i(_mm_unpackhi_epi8(fgpacked, _mm_setzero_si128()), _mm_unpacklo_epi8(fgpacked, _mm_setzero_si128())); + __m256i mifgshade = _mm256_set_m128i(_mm_unpackhi_epi32(shadepacked, shadepacked), _mm_unpacklo_epi32(shadepacked, shadepacked)); + __m256i fgcolor = mifgcolor; + fgcolor = Shade32_AVX2(fgcolor, mlight, desaturate, inv_desaturate, shade_fade_lit, shade_light); + __m256i outcolor = Blend32_AVX2(fgcolor, bgcolor, mifgcolor, mifgshade, srcalpha, destalpha); + + // Store result + _mm_storeu_si128((__m128i*)desttmp, _mm_or_si128(_mm256_extracti128_si256(outcolor, 0), _mm_slli_si128(_mm256_extracti128_si256(outcolor, 1), 8))); + if (mask1 & (1 << 31)) dest[0] = desttmp[0]; + if (mask1 & (1 << 30)) dest[1] = desttmp[1]; + if (mask1 & (1 << 29)) dest[2] = desttmp[2]; + if (mask1 & (1 << 28)) dest[3] = desttmp[3]; + + mask1 <<= 4; + dest += 4; + } + + blockPosY = _mm_add_ps(blockPosY, gradientY); + + dest += offset_next_line; + } + } + } +}; diff --git a/src/polyrenderer/drawers/poly_drawer32_sse2.h b/src/polyrenderer/drawers/poly_drawer32_sse2.h index 5125c93c7..2f690f7e8 100644 --- a/src/polyrenderer/drawers/poly_drawer32_sse2.h +++ b/src/polyrenderer/drawers/poly_drawer32_sse2.h @@ -27,7 +27,7 @@ namespace TriScreenDrawerModes { template - FORCEINLINE unsigned int VECTORCALL Sample32(int32_t u, int32_t v, const uint32_t *texPixels, int texWidth, int texHeight, uint32_t oneU, uint32_t oneV, uint32_t color, const uint32_t *translation) + FORCEINLINE unsigned int VECTORCALL Sample32_SSE2(int32_t u, int32_t v, const uint32_t *texPixels, int texWidth, int texHeight, uint32_t oneU, uint32_t oneV, uint32_t color, const uint32_t *translation) { uint32_t texel; if (SamplerT::Mode == (int)Samplers::Shaded || SamplerT::Mode == (int)Samplers::Stencil || SamplerT::Mode == (int)Samplers::Fill || SamplerT::Mode == (int)Samplers::Fuzz) @@ -107,7 +107,7 @@ namespace TriScreenDrawerModes } template - FORCEINLINE unsigned int VECTORCALL SampleShade32(int32_t u, int32_t v, const uint32_t *texPixels, int texWidth, int texHeight, int &fuzzpos) + FORCEINLINE unsigned int VECTORCALL SampleShade32_SSE2(int32_t u, int32_t v, const uint32_t *texPixels, int texWidth, int texHeight, int &fuzzpos) { if (SamplerT::Mode == (int)Samplers::Shaded) { @@ -143,7 +143,7 @@ namespace TriScreenDrawerModes } template - FORCEINLINE __m128i VECTORCALL Shade32(__m128i fgcolor, __m128i mlight, unsigned int ifgcolor0, unsigned int ifgcolor1, int desaturate, __m128i inv_desaturate, __m128i shade_fade, __m128i shade_light) + FORCEINLINE __m128i VECTORCALL Shade32_SSE2(__m128i fgcolor, __m128i mlight, unsigned int ifgcolor0, unsigned int ifgcolor1, int desaturate, __m128i inv_desaturate, __m128i shade_fade, __m128i shade_light) { if (ShadeModeT::Mode == (int)ShadeMode::Simple) { @@ -172,7 +172,7 @@ namespace TriScreenDrawerModes } template - FORCEINLINE __m128i VECTORCALL Blend32(__m128i fgcolor, __m128i bgcolor, unsigned int ifgcolor0, unsigned int ifgcolor1, unsigned int ifgshade0, unsigned int ifgshade1, uint32_t srcalpha, uint32_t destalpha) + FORCEINLINE __m128i VECTORCALL Blend32_SSE2(__m128i fgcolor, __m128i bgcolor, unsigned int ifgcolor0, unsigned int ifgcolor1, unsigned int ifgshade0, unsigned int ifgshade1, uint32_t srcalpha, uint32_t destalpha) { if (BlendT::Mode == (int)BlendModes::Opaque) { @@ -275,7 +275,7 @@ namespace TriScreenDrawerModes } template -class TriScreenDrawer32 +class TriScreenDrawer32_SSE2 { public: static void Execute(int x, int y, uint32_t mask0, uint32_t mask1, const TriDrawTriangleArgs *args) @@ -430,13 +430,13 @@ private: // Sample fgcolor unsigned int ifgcolor[2], ifgshade[2]; - ifgcolor[0] = Sample32(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); - ifgshade[0] = SampleShade32(posU, posV, texPixels, texWidth, texHeight, fuzzpos); + ifgcolor[0] = Sample32_SSE2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); + ifgshade[0] = SampleShade32_SSE2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); posU += stepU; posV += stepV; - ifgcolor[1] = Sample32(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); - ifgshade[1] = SampleShade32(posU, posV, texPixels, texWidth, texHeight, fuzzpos); + ifgcolor[1] = Sample32_SSE2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); + ifgshade[1] = SampleShade32_SSE2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); posU += stepU; posV += stepV; @@ -460,8 +460,8 @@ private: // Shade and blend __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - fgcolor = Shade32(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light); - __m128i outcolor = Blend32(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha); + fgcolor = Shade32_SSE2(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light); + __m128i outcolor = Blend32_SSE2(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha); // Store result _mm_storel_epi64((__m128i*)(dest + ix * 2), outcolor); @@ -517,13 +517,13 @@ private: // Sample fgcolor unsigned int ifgcolor[2], ifgshade[2]; - ifgcolor[0] = Sample32(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); - ifgshade[0] = SampleShade32(posU, posV, texPixels, texWidth, texHeight, fuzzpos); + ifgcolor[0] = Sample32_SSE2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); + ifgshade[0] = SampleShade32_SSE2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); posU += stepU; posV += stepV; - ifgcolor[1] = Sample32(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); - ifgshade[1] = SampleShade32(posU, posV, texPixels, texWidth, texHeight, fuzzpos); + ifgcolor[1] = Sample32_SSE2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); + ifgshade[1] = SampleShade32_SSE2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); posU += stepU; posV += stepV; @@ -547,8 +547,8 @@ private: // Shade and blend __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - fgcolor = Shade32(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light); - __m128i outcolor = Blend32(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha); + fgcolor = Shade32_SSE2(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light); + __m128i outcolor = Blend32_SSE2(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha); // Store result _mm_storel_epi64((__m128i*)desttmp, outcolor); @@ -606,13 +606,13 @@ private: // Sample fgcolor unsigned int ifgcolor[2], ifgshade[2]; - ifgcolor[0] = Sample32(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); - ifgshade[0] = SampleShade32(posU, posV, texPixels, texWidth, texHeight, fuzzpos); + ifgcolor[0] = Sample32_SSE2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); + ifgshade[0] = SampleShade32_SSE2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); posU += stepU; posV += stepV; - ifgcolor[1] = Sample32(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); - ifgshade[1] = SampleShade32(posU, posV, texPixels, texWidth, texHeight, fuzzpos); + ifgcolor[1] = Sample32_SSE2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); + ifgshade[1] = SampleShade32_SSE2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); posU += stepU; posV += stepV; @@ -636,8 +636,8 @@ private: // Shade and blend __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - fgcolor = Shade32(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light); - __m128i outcolor = Blend32(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha); + fgcolor = Shade32_SSE2(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light); + __m128i outcolor = Blend32_SSE2(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha); // Store result _mm_storel_epi64((__m128i*)desttmp, outcolor); @@ -658,7 +658,7 @@ private: }; template -class RectScreenDrawer32 +class RectScreenDrawer32_SSE2 { public: static void Execute(const void *destOrg, int destWidth, int destHeight, int destPitch, const RectDrawArgs *args, WorkerThreadData *thread) @@ -780,18 +780,18 @@ private: // Sample fgcolor unsigned int ifgcolor[2], ifgshade[2]; - ifgcolor[0] = Sample32(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); - ifgshade[0] = SampleShade32(posU, posV, texPixels, texWidth, texHeight, fuzzpos); + ifgcolor[0] = Sample32_SSE2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); + ifgshade[0] = SampleShade32_SSE2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); posU += stepU; - ifgcolor[1] = Sample32(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); - ifgshade[1] = SampleShade32(posU, posV, texPixels, texWidth, texHeight, fuzzpos); + ifgcolor[1] = Sample32_SSE2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); + ifgshade[1] = SampleShade32_SSE2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); posU += stepU; // Shade and blend __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - fgcolor = Shade32(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light); - __m128i outcolor = Blend32(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha); + fgcolor = Shade32_SSE2(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light); + __m128i outcolor = Blend32_SSE2(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha); // Store result _mm_storel_epi64((__m128i*)dest, outcolor); @@ -809,16 +809,16 @@ private: // Sample fgcolor unsigned int ifgcolor[2], ifgshade[2]; - ifgcolor[0] = Sample32(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); - ifgshade[0] = SampleShade32(posU, posV, texPixels, texWidth, texHeight, fuzzpos); + ifgcolor[0] = Sample32_SSE2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); + ifgshade[0] = SampleShade32_SSE2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); ifgcolor[1] = 0; ifgshade[1] = 0; posU += stepU; // Shade and blend __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - fgcolor = Shade32(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light); - __m128i outcolor = Blend32(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha); + fgcolor = Shade32_SSE2(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light); + __m128i outcolor = Blend32_SSE2(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha); // Store result *dest = _mm_cvtsi128_si32(outcolor); diff --git a/src/polyrenderer/drawers/poly_triangle.cpp b/src/polyrenderer/drawers/poly_triangle.cpp index 3e7caec90..e4d91a65e 100644 --- a/src/polyrenderer/drawers/poly_triangle.cpp +++ b/src/polyrenderer/drawers/poly_triangle.cpp @@ -37,6 +37,7 @@ #include "polyrenderer/poly_renderer.h" #include "swrenderer/drawers/r_draw_rgba.h" #include "screen_triangle.h" +#include "x86.h" int PolyTriangleDrawer::viewport_x; int PolyTriangleDrawer::viewport_y; @@ -151,14 +152,8 @@ ShadedTriVertex PolyTriangleDrawer::shade_vertex(const TriMatrix &objectToClip, return sv; } -void PolyTriangleDrawer::draw_shaded_triangle(const ShadedTriVertex *vert, bool ccw, TriDrawTriangleArgs *args, WorkerThreadData *thread) +void PolyTriangleDrawer::clip_to_viewport(TriVertex *clippedvert, int numclipvert) { - // Cull, clip and generate additional vertices as needed - TriVertex clippedvert[max_additional_vertices]; - int numclipvert = clipedge(vert, clippedvert); - -#ifdef NO_SSE - // Map to 2D viewport: for (int j = 0; j < numclipvert; j++) { auto &v = clippedvert[j]; @@ -173,8 +168,11 @@ void PolyTriangleDrawer::draw_shaded_triangle(const ShadedTriVertex *vert, bool v.x = viewport_x + viewport_width * (1.0f + v.x) * 0.5f; v.y = viewport_y + viewport_height * (1.0f - v.y) * 0.5f; } -#else - // Map to 2D viewport: +} + +#ifndef NO_SSE +void PolyTriangleDrawer::clip_to_viewport_sse2(TriVertex *clippedvert, int numclipvert) +{ __m128 mviewport_x = _mm_set1_ps((float)viewport_x); __m128 mviewport_y = _mm_set1_ps((float)viewport_y); __m128 mviewport_halfwidth = _mm_set1_ps(viewport_width * 0.5f); @@ -205,8 +203,21 @@ void PolyTriangleDrawer::draw_shaded_triangle(const ShadedTriVertex *vert, bool _mm_storeu_ps(&clippedvert[j + 2].x, vz); _mm_storeu_ps(&clippedvert[j + 3].x, vw); } +} #endif +void PolyTriangleDrawer::draw_shaded_triangle(const ShadedTriVertex *vert, bool ccw, TriDrawTriangleArgs *args, WorkerThreadData *thread) +{ + // Cull, clip and generate additional vertices as needed + TriVertex clippedvert[max_additional_vertices]; + int numclipvert = CPU.bSSE2 ? clipedge_sse2(vert, clippedvert) : clipedge(vert, clippedvert); + + // Map to 2D viewport: + if (CPU.bSSE2) + clip_to_viewport_sse2(clippedvert, numclipvert); + else + clip_to_viewport(clippedvert, numclipvert); + // Keep varyings in -128 to 128 range if possible if (numclipvert > 0) { @@ -255,7 +266,6 @@ int PolyTriangleDrawer::clipedge(const ShadedTriVertex *verts, TriVertex *clippe // halfspace clip distances static const int numclipdistances = 7; -#ifdef NO_SSE float clipdistance[numclipdistances * 3]; bool needsclipping = false; float *clipd = clipdistance; @@ -282,43 +292,6 @@ int PolyTriangleDrawer::clipedge(const ShadedTriVertex *verts, TriVertex *clippe } return 3; } -#else - __m128 mx = _mm_loadu_ps(&verts[0].x); - __m128 my = _mm_loadu_ps(&verts[1].x); - __m128 mz = _mm_loadu_ps(&verts[2].x); - __m128 mw = _mm_setzero_ps(); - _MM_TRANSPOSE4_PS(mx, my, mz, mw); - __m128 clipd0 = _mm_add_ps(mx, mw); - __m128 clipd1 = _mm_sub_ps(mw, mx); - __m128 clipd2 = _mm_add_ps(my, mw); - __m128 clipd3 = _mm_sub_ps(mw, my); - __m128 clipd4 = _mm_add_ps(mz, mw); - __m128 clipd5 = _mm_sub_ps(mw, mz); - __m128 clipd6 = _mm_setr_ps(verts[0].clipDistance0, verts[1].clipDistance0, verts[2].clipDistance0, 0.0f); - __m128 mneedsclipping = _mm_cmplt_ps(clipd0, _mm_setzero_ps()); - mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd1, _mm_setzero_ps())); - mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd2, _mm_setzero_ps())); - mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd3, _mm_setzero_ps())); - mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd4, _mm_setzero_ps())); - mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd5, _mm_setzero_ps())); - mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd6, _mm_setzero_ps())); - if (_mm_movemask_ps(mneedsclipping) == 0) - { - for (int i = 0; i < 3; i++) - { - memcpy(clippedvert + i, verts + i, sizeof(TriVertex)); - } - return 3; - } - float clipdistance[numclipdistances * 4]; - _mm_storeu_ps(clipdistance, clipd0); - _mm_storeu_ps(clipdistance + 4, clipd1); - _mm_storeu_ps(clipdistance + 8, clipd2); - _mm_storeu_ps(clipdistance + 12, clipd3); - _mm_storeu_ps(clipdistance + 16, clipd4); - _mm_storeu_ps(clipdistance + 20, clipd5); - _mm_storeu_ps(clipdistance + 24, clipd6); -#endif // use barycentric weights while clipping vertices float weights[max_additional_vertices * 3 * 2]; @@ -341,7 +314,6 @@ int PolyTriangleDrawer::clipedge(const ShadedTriVertex *verts, TriVertex *clippe for (int i = 0; i < inputverts; i++) { int j = (i + 1) % inputverts; -#ifdef NO_SSE float clipdistance1 = clipdistance[0 * numclipdistances + p] * input[i * 3 + 0] + clipdistance[1 * numclipdistances + p] * input[i * 3 + 1] + @@ -351,17 +323,6 @@ int PolyTriangleDrawer::clipedge(const ShadedTriVertex *verts, TriVertex *clippe clipdistance[0 * numclipdistances + p] * input[j * 3 + 0] + clipdistance[1 * numclipdistances + p] * input[j * 3 + 1] + clipdistance[2 * numclipdistances + p] * input[j * 3 + 2]; -#else - float clipdistance1 = - clipdistance[0 + p * 4] * input[i * 3 + 0] + - clipdistance[1 + p * 4] * input[i * 3 + 1] + - clipdistance[2 + p * 4] * input[i * 3 + 2]; - - float clipdistance2 = - clipdistance[0 + p * 4] * input[j * 3 + 0] + - clipdistance[1 + p * 4] * input[j * 3 + 1] + - clipdistance[2 + p * 4] * input[j * 3 + 2]; -#endif // Clip halfspace if ((clipdistance1 >= 0.0f || clipdistance2 >= 0.0f) && outputverts + 1 < max_additional_vertices) @@ -408,6 +369,129 @@ int PolyTriangleDrawer::clipedge(const ShadedTriVertex *verts, TriVertex *clippe return inputverts; } +#ifndef NO_SSE +int PolyTriangleDrawer::clipedge_sse2(const ShadedTriVertex *verts, TriVertex *clippedvert) +{ + // Clip and cull so that the following is true for all vertices: + // -v.w <= v.x <= v.w + // -v.w <= v.y <= v.w + // -v.w <= v.z <= v.w + + // halfspace clip distances + static const int numclipdistances = 7; + __m128 mx = _mm_loadu_ps(&verts[0].x); + __m128 my = _mm_loadu_ps(&verts[1].x); + __m128 mz = _mm_loadu_ps(&verts[2].x); + __m128 mw = _mm_setzero_ps(); + _MM_TRANSPOSE4_PS(mx, my, mz, mw); + __m128 clipd0 = _mm_add_ps(mx, mw); + __m128 clipd1 = _mm_sub_ps(mw, mx); + __m128 clipd2 = _mm_add_ps(my, mw); + __m128 clipd3 = _mm_sub_ps(mw, my); + __m128 clipd4 = _mm_add_ps(mz, mw); + __m128 clipd5 = _mm_sub_ps(mw, mz); + __m128 clipd6 = _mm_setr_ps(verts[0].clipDistance0, verts[1].clipDistance0, verts[2].clipDistance0, 0.0f); + __m128 mneedsclipping = _mm_cmplt_ps(clipd0, _mm_setzero_ps()); + mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd1, _mm_setzero_ps())); + mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd2, _mm_setzero_ps())); + mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd3, _mm_setzero_ps())); + mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd4, _mm_setzero_ps())); + mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd5, _mm_setzero_ps())); + mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd6, _mm_setzero_ps())); + if (_mm_movemask_ps(mneedsclipping) == 0) + { + for (int i = 0; i < 3; i++) + { + memcpy(clippedvert + i, verts + i, sizeof(TriVertex)); + } + return 3; + } + float clipdistance[numclipdistances * 4]; + _mm_storeu_ps(clipdistance, clipd0); + _mm_storeu_ps(clipdistance + 4, clipd1); + _mm_storeu_ps(clipdistance + 8, clipd2); + _mm_storeu_ps(clipdistance + 12, clipd3); + _mm_storeu_ps(clipdistance + 16, clipd4); + _mm_storeu_ps(clipdistance + 20, clipd5); + _mm_storeu_ps(clipdistance + 24, clipd6); + + // use barycentric weights while clipping vertices + float weights[max_additional_vertices * 3 * 2]; + for (int i = 0; i < 3; i++) + { + weights[i * 3 + 0] = 0.0f; + weights[i * 3 + 1] = 0.0f; + weights[i * 3 + 2] = 0.0f; + weights[i * 3 + i] = 1.0f; + } + + // Clip against each halfspace + float *input = weights; + float *output = weights + max_additional_vertices * 3; + int inputverts = 3; + for (int p = 0; p < numclipdistances; p++) + { + // Clip each edge + int outputverts = 0; + for (int i = 0; i < inputverts; i++) + { + int j = (i + 1) % inputverts; + float clipdistance1 = + clipdistance[0 + p * 4] * input[i * 3 + 0] + + clipdistance[1 + p * 4] * input[i * 3 + 1] + + clipdistance[2 + p * 4] * input[i * 3 + 2]; + + float clipdistance2 = + clipdistance[0 + p * 4] * input[j * 3 + 0] + + clipdistance[1 + p * 4] * input[j * 3 + 1] + + clipdistance[2 + p * 4] * input[j * 3 + 2]; + + // Clip halfspace + if ((clipdistance1 >= 0.0f || clipdistance2 >= 0.0f) && outputverts + 1 < max_additional_vertices) + { + float t1 = (clipdistance1 < 0.0f) ? MAX(-clipdistance1 / (clipdistance2 - clipdistance1), 0.0f) : 0.0f; + float t2 = (clipdistance2 < 0.0f) ? MIN(1.0f + clipdistance2 / (clipdistance1 - clipdistance2), 1.0f) : 1.0f; + + // add t1 vertex + for (int k = 0; k < 3; k++) + output[outputverts * 3 + k] = input[i * 3 + k] * (1.0f - t1) + input[j * 3 + k] * t1; + outputverts++; + + if (t2 != 1.0f && t2 > t1) + { + // add t2 vertex + for (int k = 0; k < 3; k++) + output[outputverts * 3 + k] = input[i * 3 + k] * (1.0f - t2) + input[j * 3 + k] * t2; + outputverts++; + } + } + } + std::swap(input, output); + inputverts = outputverts; + if (inputverts == 0) + break; + } + + // Convert barycentric weights to actual vertices + for (int i = 0; i < inputverts; i++) + { + auto &v = clippedvert[i]; + memset(&v, 0, sizeof(TriVertex)); + for (int w = 0; w < 3; w++) + { + float weight = input[i * 3 + w]; + v.x += verts[w].x * weight; + v.y += verts[w].y * weight; + v.z += verts[w].z * weight; + v.w += verts[w].w * weight; + v.u += verts[w].u * weight; + v.v += verts[w].v * weight; + } + } + return inputverts; +} +#endif + ///////////////////////////////////////////////////////////////////////////// DrawPolyTrianglesCommand::DrawPolyTrianglesCommand(const PolyDrawArgs &args, bool mirror) diff --git a/src/polyrenderer/drawers/poly_triangle.h b/src/polyrenderer/drawers/poly_triangle.h index c939149d3..ca92ac735 100644 --- a/src/polyrenderer/drawers/poly_triangle.h +++ b/src/polyrenderer/drawers/poly_triangle.h @@ -47,8 +47,12 @@ private: static ShadedTriVertex shade_vertex(const TriMatrix &objectToClip, const float *clipPlane, const TriVertex &v); static void draw_arrays(const PolyDrawArgs &args, WorkerThreadData *thread); static void draw_shaded_triangle(const ShadedTriVertex *vertices, bool ccw, TriDrawTriangleArgs *args, WorkerThreadData *thread); - + static void clip_to_viewport(TriVertex *clippedvert, int numclipvert); static int clipedge(const ShadedTriVertex *verts, TriVertex *clippedvert); +#ifndef NO_SSE + static void clip_to_viewport_sse2(TriVertex *clippedvert, int numclipvert); + static int clipedge_sse2(const ShadedTriVertex *verts, TriVertex *clippedvert); +#endif static int viewport_x, viewport_y, viewport_width, viewport_height, dest_pitch, dest_width, dest_height; static bool dest_bgra; diff --git a/src/polyrenderer/drawers/screen_triangle.cpp b/src/polyrenderer/drawers/screen_triangle.cpp index 8547335ed..bbfb0361f 100644 --- a/src/polyrenderer/drawers/screen_triangle.cpp +++ b/src/polyrenderer/drawers/screen_triangle.cpp @@ -36,11 +36,20 @@ #include "poly_triangle.h" #include "swrenderer/drawers/r_draw_rgba.h" #include "screen_triangle.h" +#include "poly_drawer32.h" +#include "poly_drawer8.h" #ifndef NO_SSE #include "poly_drawer32_sse2.h" #endif -#include "poly_drawer8.h" +#include "x86.h" +namespace +{ + class SSE2CPU { public: static const int HasSSE2 = 1; }; + class GenericCPU { public: static const int HasSSE2 = 0; }; +} + +template class TriangleBlock { public: @@ -114,9 +123,17 @@ private: void ClipTest(); void StencilWrite(); void SubsectorWrite(); + +#ifndef NO_SSE + void CoverageTestSSE2(); + void StencilEqualTestSSE2(); + void SubsectorTestSSE2(); + void SubsectorWriteSSE2(); +#endif }; -TriangleBlock::TriangleBlock(const TriDrawTriangleArgs *args) +template +TriangleBlock::TriangleBlock(const TriDrawTriangleArgs *args) { const TriVertex &v1 = *args->v1; const TriVertex &v2 = *args->v2; @@ -145,19 +162,32 @@ TriangleBlock::TriangleBlock(const TriDrawTriangleArgs *args) const int X2 = (int)round(16.0f * v2.x); const int X3 = (int)round(16.0f * v3.x); #else - int tempround[4 * 3]; - __m128 m16 = _mm_set1_ps(16.0f); - __m128 mhalf = _mm_set1_ps(65536.5f); - __m128i m65536 = _mm_set1_epi32(65536); - _mm_storeu_si128((__m128i*)tempround, _mm_sub_epi32(_mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_loadu_ps((const float*)&v1), m16), mhalf)), m65536)); - _mm_storeu_si128((__m128i*)(tempround + 4), _mm_sub_epi32(_mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_loadu_ps((const float*)&v2), m16), mhalf)), m65536)); - _mm_storeu_si128((__m128i*)(tempround + 8), _mm_sub_epi32(_mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_loadu_ps((const float*)&v3), m16), mhalf)), m65536)); - const int X1 = tempround[0]; - const int X2 = tempround[4]; - const int X3 = tempround[8]; - const int Y1 = tempround[1]; - const int Y2 = tempround[5]; - const int Y3 = tempround[9]; + int Y1, Y2, Y3, X1, X2, X3; + if (CPUType::HasSSE2 == 1) + { + int tempround[4 * 3]; + __m128 m16 = _mm_set1_ps(16.0f); + __m128 mhalf = _mm_set1_ps(65536.5f); + __m128i m65536 = _mm_set1_epi32(65536); + _mm_storeu_si128((__m128i*)tempround, _mm_sub_epi32(_mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_loadu_ps((const float*)&v1), m16), mhalf)), m65536)); + _mm_storeu_si128((__m128i*)(tempround + 4), _mm_sub_epi32(_mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_loadu_ps((const float*)&v2), m16), mhalf)), m65536)); + _mm_storeu_si128((__m128i*)(tempround + 8), _mm_sub_epi32(_mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_loadu_ps((const float*)&v3), m16), mhalf)), m65536)); + X1 = tempround[0]; + X2 = tempround[4]; + X3 = tempround[8]; + Y1 = tempround[1]; + Y2 = tempround[5]; + Y3 = tempround[9]; + } + else + { + Y1 = (int)round(16.0f * v1.y); + Y2 = (int)round(16.0f * v2.y); + Y3 = (int)round(16.0f * v3.y); + X1 = (int)round(16.0f * v1.x); + X2 = (int)round(16.0f * v2.x); + X3 = (int)round(16.0f * v3.x); + } #endif // Deltas @@ -203,28 +233,32 @@ TriangleBlock::TriangleBlock(const TriDrawTriangleArgs *args) if (DY31 < 0 || (DY31 == 0 && DX31 > 0)) C3++; #ifndef NO_SSE - mFDY12Offset = _mm_setr_epi32(0, FDY12, FDY12 * 2, FDY12 * 3); - mFDY23Offset = _mm_setr_epi32(0, FDY23, FDY23 * 2, FDY23 * 3); - mFDY31Offset = _mm_setr_epi32(0, FDY31, FDY31 * 2, FDY31 * 3); - mFDY12x4 = _mm_set1_epi32(FDY12 * 4); - mFDY23x4 = _mm_set1_epi32(FDY23 * 4); - mFDY31x4 = _mm_set1_epi32(FDY31 * 4); - mFDX12 = _mm_set1_epi32(FDX12); - mFDX23 = _mm_set1_epi32(FDX23); - mFDX31 = _mm_set1_epi32(FDX31); - mC1 = _mm_set1_epi32(C1); - mC2 = _mm_set1_epi32(C2); - mC3 = _mm_set1_epi32(C3); - mDX12 = _mm_set1_epi32(DX12); - mDY12 = _mm_set1_epi32(DY12); - mDX23 = _mm_set1_epi32(DX23); - mDY23 = _mm_set1_epi32(DY23); - mDX31 = _mm_set1_epi32(DX31); - mDY31 = _mm_set1_epi32(DY31); + if (CPUType::HasSSE2 == 1) + { + mFDY12Offset = _mm_setr_epi32(0, FDY12, FDY12 * 2, FDY12 * 3); + mFDY23Offset = _mm_setr_epi32(0, FDY23, FDY23 * 2, FDY23 * 3); + mFDY31Offset = _mm_setr_epi32(0, FDY31, FDY31 * 2, FDY31 * 3); + mFDY12x4 = _mm_set1_epi32(FDY12 * 4); + mFDY23x4 = _mm_set1_epi32(FDY23 * 4); + mFDY31x4 = _mm_set1_epi32(FDY31 * 4); + mFDX12 = _mm_set1_epi32(FDX12); + mFDX23 = _mm_set1_epi32(FDX23); + mFDX31 = _mm_set1_epi32(FDX31); + mC1 = _mm_set1_epi32(C1); + mC2 = _mm_set1_epi32(C2); + mC3 = _mm_set1_epi32(C3); + mDX12 = _mm_set1_epi32(DX12); + mDY12 = _mm_set1_epi32(DY12); + mDX23 = _mm_set1_epi32(DX23); + mDY23 = _mm_set1_epi32(DY23); + mDX31 = _mm_set1_epi32(DX31); + mDY31 = _mm_set1_epi32(DY31); + } #endif } -void TriangleBlock::Loop(const TriDrawTriangleArgs *args, WorkerThreadData *thread) +template +void TriangleBlock::Loop(const TriDrawTriangleArgs *args, WorkerThreadData *thread) { // First block line for this thread int core = thread->core; @@ -236,9 +270,18 @@ void TriangleBlock::Loop(const TriDrawTriangleArgs *args, WorkerThreadData *thre bool writeColor = args->uniforms->WriteColor(); bool writeStencil = args->uniforms->WriteStencil(); bool writeSubsector = args->uniforms->WriteSubsector(); - int bmode = (int)args->uniforms->BlendMode(); + + // Find the drawer function for the given blend mode +#ifndef NO_SSE + void(*drawFunc)(int, int, uint32_t, uint32_t, const TriDrawTriangleArgs *); + if (CPUType::HasSSE2 == 1) + drawFunc = args->destBgra ? ScreenTriangle::TriDrawers32_SSE2[bmode] : ScreenTriangle::TriDrawers8[bmode]; + else + drawFunc = args->destBgra ? ScreenTriangle::TriDrawers32[bmode] : ScreenTriangle::TriDrawers8[bmode]; +#else auto drawFunc = args->destBgra ? ScreenTriangle::TriDrawers32[bmode] : ScreenTriangle::TriDrawers8[bmode]; +#endif // Loop through blocks for (int y = start_miny; y < maxy; y += q * num_cores) @@ -248,7 +291,11 @@ void TriangleBlock::Loop(const TriDrawTriangleArgs *args, WorkerThreadData *thre X = x; Y = y; - CoverageTest(); + if (CPUType::HasSSE2 == 1) + CoverageTestSSE2(); + else + CoverageTest(); + if (Mask0 == 0 && Mask1 == 0) continue; @@ -259,7 +306,11 @@ void TriangleBlock::Loop(const TriDrawTriangleArgs *args, WorkerThreadData *thre // To do: make the stencil test use its own flag for comparison mode instead of abusing the subsector test.. if (!subsectorTest) { - StencilEqualTest(); + if (CPUType::HasSSE2 == 1) + StencilEqualTestSSE2(); + else + StencilEqualTest(); + if (Mask0 == 0 && Mask1 == 0) continue; } @@ -269,7 +320,11 @@ void TriangleBlock::Loop(const TriDrawTriangleArgs *args, WorkerThreadData *thre if (Mask0 == 0 && Mask1 == 0) continue; - SubsectorTest(); + if (CPUType::HasSSE2 == 1) + SubsectorTestSSE2(); + else + SubsectorTest(); + if (Mask0 == 0 && Mask1 == 0) continue; } @@ -279,14 +334,18 @@ void TriangleBlock::Loop(const TriDrawTriangleArgs *args, WorkerThreadData *thre if (writeStencil) StencilWrite(); if (writeSubsector) - SubsectorWrite(); + { + if (CPUType::HasSSE2 == 1) + SubsectorWriteSSE2(); + else + SubsectorWrite(); + } } } } -#ifdef NO_SSE - -void TriangleBlock::SubsectorTest() +template +void TriangleBlock::SubsectorTest() { int block = (X >> 3) + (Y >> 3) * subsectorPitch; uint32_t *subsector = subsectorGBuffer + block * 64; @@ -312,9 +371,10 @@ void TriangleBlock::SubsectorTest() Mask1 = Mask1 & mask1; } -#else +#ifndef NO_SSE -void TriangleBlock::SubsectorTest() +template +void TriangleBlock::SubsectorTestSSE2() { int block = (X >> 3) + (Y >> 3) * subsectorPitch; uint32_t *subsector = subsectorGBuffer + block * 64; @@ -342,7 +402,8 @@ void TriangleBlock::SubsectorTest() #endif -void TriangleBlock::ClipTest() +template +void TriangleBlock::ClipTest() { static const uint32_t clipxmask[8] = { @@ -376,9 +437,8 @@ void TriangleBlock::ClipTest() Mask1 = Mask1 & xmask & ymask1; } -#ifdef NO_SSE - -void TriangleBlock::StencilEqualTest() +template +void TriangleBlock::StencilEqualTest() { // Stencil test the whole block, if possible int block = (X >> 3) + (Y >> 3) * stencilPitch; @@ -421,9 +481,10 @@ void TriangleBlock::StencilEqualTest() } } -#else +#ifndef NO_SSE -void TriangleBlock::StencilEqualTest() +template +void TriangleBlock::StencilEqualTestSSE2() { // Stencil test the whole block, if possible int block = (X >> 3) + (Y >> 3) * stencilPitch; @@ -489,7 +550,8 @@ void TriangleBlock::StencilEqualTest() #endif -void TriangleBlock::StencilGreaterEqualTest() +template +void TriangleBlock::StencilGreaterEqualTest() { // Stencil test the whole block, if possible int block = (X >> 3) + (Y >> 3) * stencilPitch; @@ -532,9 +594,8 @@ void TriangleBlock::StencilGreaterEqualTest() } } -#ifdef NO_SSE - -void TriangleBlock::CoverageTest() +template +void TriangleBlock::CoverageTest() { // Corners of block int x0 = X << 4; @@ -631,9 +692,10 @@ void TriangleBlock::CoverageTest() } } -#else +#ifndef NO_SSE -void TriangleBlock::CoverageTest() +template +void TriangleBlock::CoverageTestSSE2() { // Corners of block int x0 = X << 4; @@ -743,7 +805,8 @@ void TriangleBlock::CoverageTest() #endif -void TriangleBlock::StencilWrite() +template +void TriangleBlock::StencilWrite() { int block = (X >> 3) + (Y >> 3) * stencilPitch; uint8_t *stencilBlock = &stencilValues[block * 64]; @@ -793,9 +856,8 @@ void TriangleBlock::StencilWrite() } } -#ifdef NO_SSE - -void TriangleBlock::SubsectorWrite() +template +void TriangleBlock::SubsectorWrite() { int block = (X >> 3) + (Y >> 3) * subsectorPitch; uint32_t *subsector = subsectorGBuffer + block * 64; @@ -828,9 +890,10 @@ void TriangleBlock::SubsectorWrite() } } -#else +#ifndef NO_SSE -void TriangleBlock::SubsectorWrite() +template +void TriangleBlock::SubsectorWriteSSE2() { int block = (X >> 3) + (Y >> 3) * subsectorPitch; uint32_t *subsector = subsectorGBuffer + block * 64; @@ -887,8 +950,21 @@ void TriangleBlock::SubsectorWrite() void ScreenTriangle::Draw(const TriDrawTriangleArgs *args, WorkerThreadData *thread) { - TriangleBlock block(args); +#ifdef NO_SSE + TriangleBlock block(args); block.Loop(args, thread); +#else + if (CPU.bSSE2) + { + TriangleBlock block(args); + block.Loop(args, thread); + } + else + { + TriangleBlock block(args); + block.Loop(args, thread); + } +#endif } void(*ScreenTriangle::TriDrawers8[])(int, int, uint32_t, uint32_t, const TriDrawTriangleArgs *) = @@ -918,15 +994,6 @@ void(*ScreenTriangle::TriDrawers8[])(int, int, uint32_t, uint32_t, const TriDraw &TriScreenDrawer8::Execute // Fuzz }; -#ifdef NO_SSE - -void(*ScreenTriangle::TriDrawers32[])(int, int, uint32_t, uint32_t, const TriDrawTriangleArgs *) = -{ - nullptr -}; - -#else - void(*ScreenTriangle::TriDrawers32[])(int, int, uint32_t, uint32_t, const TriDrawTriangleArgs *) = { &TriScreenDrawer32::Execute, // TextureOpaque @@ -954,6 +1021,35 @@ void(*ScreenTriangle::TriDrawers32[])(int, int, uint32_t, uint32_t, const TriDra &TriScreenDrawer32::Execute // Fuzz }; +#ifndef NO_SSE + +void(*ScreenTriangle::TriDrawers32_SSE2[])(int, int, uint32_t, uint32_t, const TriDrawTriangleArgs *) = +{ + &TriScreenDrawer32_SSE2::Execute, // TextureOpaque + &TriScreenDrawer32_SSE2::Execute, // TextureMasked + &TriScreenDrawer32_SSE2::Execute, // TextureAdd + &TriScreenDrawer32_SSE2::Execute, // TextureSub + &TriScreenDrawer32_SSE2::Execute, // TextureRevSub + &TriScreenDrawer32_SSE2::Execute, // TextureAddSrcColor + &TriScreenDrawer32_SSE2::Execute, // TranslatedOpaque + &TriScreenDrawer32_SSE2::Execute, // TranslatedMasked + &TriScreenDrawer32_SSE2::Execute, // TranslatedAdd + &TriScreenDrawer32_SSE2::Execute, // TranslatedSub + &TriScreenDrawer32_SSE2::Execute, // TranslatedRevSub + &TriScreenDrawer32_SSE2::Execute, // TranslatedAddSrcColor + &TriScreenDrawer32_SSE2::Execute, // Shaded + &TriScreenDrawer32_SSE2::Execute, // AddShaded + &TriScreenDrawer32_SSE2::Execute, // Stencil + &TriScreenDrawer32_SSE2::Execute, // AddStencil + &TriScreenDrawer32_SSE2::Execute, // FillOpaque + &TriScreenDrawer32_SSE2::Execute, // FillAdd + &TriScreenDrawer32_SSE2::Execute, // FillSub + &TriScreenDrawer32_SSE2::Execute, // FillRevSub + &TriScreenDrawer32_SSE2::Execute, // FillAddSrcColor + &TriScreenDrawer32_SSE2::Execute, // Skycap + &TriScreenDrawer32_SSE2::Execute // Fuzz +}; + #endif void(*ScreenTriangle::RectDrawers8[])(const void *, int, int, int, const RectDrawArgs *, WorkerThreadData *) = @@ -983,15 +1079,6 @@ void(*ScreenTriangle::RectDrawers8[])(const void *, int, int, int, const RectDra &RectScreenDrawer8::Execute // Fuzz }; -#ifdef NO_SSE - -void(*ScreenTriangle::RectDrawers32[])(const void *, int, int, int, const RectDrawArgs *, WorkerThreadData *) = -{ - nullptr -}; - -#else - void(*ScreenTriangle::RectDrawers32[])(const void *, int, int, int, const RectDrawArgs *, WorkerThreadData *) = { &RectScreenDrawer32::Execute, // TextureOpaque @@ -1019,6 +1106,35 @@ void(*ScreenTriangle::RectDrawers32[])(const void *, int, int, int, const RectDr &RectScreenDrawer32::Execute // Fuzz }; +#ifndef NO_SSE + +void(*ScreenTriangle::RectDrawers32_SSE2[])(const void *, int, int, int, const RectDrawArgs *, WorkerThreadData *) = +{ + &RectScreenDrawer32_SSE2::Execute, // TextureOpaque + &RectScreenDrawer32_SSE2::Execute, // TextureMasked + &RectScreenDrawer32_SSE2::Execute, // TextureAdd + &RectScreenDrawer32_SSE2::Execute, // TextureSub + &RectScreenDrawer32_SSE2::Execute, // TextureRevSub + &RectScreenDrawer32_SSE2::Execute, // TextureAddSrcColor + &RectScreenDrawer32_SSE2::Execute, // TranslatedOpaque + &RectScreenDrawer32_SSE2::Execute, // TranslatedMasked + &RectScreenDrawer32_SSE2::Execute, // TranslatedAdd + &RectScreenDrawer32_SSE2::Execute, // TranslatedSub + &RectScreenDrawer32_SSE2::Execute, // TranslatedRevSub + &RectScreenDrawer32_SSE2::Execute, // TranslatedAddSrcColor + &RectScreenDrawer32_SSE2::Execute, // Shaded + &RectScreenDrawer32_SSE2::Execute, // AddShaded + &RectScreenDrawer32_SSE2::Execute, // Stencil + &RectScreenDrawer32_SSE2::Execute, // AddStencil + &RectScreenDrawer32_SSE2::Execute, // FillOpaque + &RectScreenDrawer32_SSE2::Execute, // FillAdd + &RectScreenDrawer32_SSE2::Execute, // FillSub + &RectScreenDrawer32_SSE2::Execute, // FillRevSub + &RectScreenDrawer32_SSE2::Execute, // FillAddSrcColor + &RectScreenDrawer32_SSE2::Execute, // Skycap + &RectScreenDrawer32_SSE2::Execute // Fuzz +}; + #endif int ScreenTriangle::FuzzStart = 0; diff --git a/src/polyrenderer/drawers/screen_triangle.h b/src/polyrenderer/drawers/screen_triangle.h index 3dd4c24eb..615a0c631 100644 --- a/src/polyrenderer/drawers/screen_triangle.h +++ b/src/polyrenderer/drawers/screen_triangle.h @@ -131,6 +131,11 @@ public: static void(*RectDrawers8[])(const void *, int, int, int, const RectDrawArgs *, WorkerThreadData *); static void(*RectDrawers32[])(const void *, int, int, int, const RectDrawArgs *, WorkerThreadData *); +#ifndef NO_SSE + static void(*TriDrawers32_SSE2[])(int, int, uint32_t, uint32_t, const TriDrawTriangleArgs *); + static void(*RectDrawers32_SSE2[])(const void *, int, int, int, const RectDrawArgs *, WorkerThreadData *); +#endif + static int FuzzStart; }; diff --git a/src/polyrenderer/math/tri_matrix.cpp b/src/polyrenderer/math/tri_matrix.cpp index f6bb03a5e..5c88a96ec 100644 --- a/src/polyrenderer/math/tri_matrix.cpp +++ b/src/polyrenderer/math/tri_matrix.cpp @@ -185,21 +185,42 @@ ShadedTriVertex TriMatrix::operator*(TriVertex v) const sv.y = vy; sv.z = vz; sv.w = vw; -#else - __m128 m0 = _mm_loadu_ps(matrix); - __m128 m1 = _mm_loadu_ps(matrix + 4); - __m128 m2 = _mm_loadu_ps(matrix + 8); - __m128 m3 = _mm_loadu_ps(matrix + 12); - __m128 mv = _mm_loadu_ps(&v.x); - m0 = _mm_mul_ps(m0, _mm_shuffle_ps(mv, mv, _MM_SHUFFLE(0, 0, 0, 0))); - m1 = _mm_mul_ps(m1, _mm_shuffle_ps(mv, mv, _MM_SHUFFLE(1, 1, 1, 1))); - m2 = _mm_mul_ps(m2, _mm_shuffle_ps(mv, mv, _MM_SHUFFLE(2, 2, 2, 2))); - m3 = _mm_mul_ps(m3, _mm_shuffle_ps(mv, mv, _MM_SHUFFLE(3, 3, 3, 3))); - mv = _mm_add_ps(_mm_add_ps(_mm_add_ps(m0, m1), m2), m3); - ShadedTriVertex sv; - _mm_storeu_ps(&sv.x, mv); -#endif sv.u = v.u; sv.v = v.v; return sv; +#else + if (CPU.bSSE2) + { + __m128 m0 = _mm_loadu_ps(matrix); + __m128 m1 = _mm_loadu_ps(matrix + 4); + __m128 m2 = _mm_loadu_ps(matrix + 8); + __m128 m3 = _mm_loadu_ps(matrix + 12); + __m128 mv = _mm_loadu_ps(&v.x); + m0 = _mm_mul_ps(m0, _mm_shuffle_ps(mv, mv, _MM_SHUFFLE(0, 0, 0, 0))); + m1 = _mm_mul_ps(m1, _mm_shuffle_ps(mv, mv, _MM_SHUFFLE(1, 1, 1, 1))); + m2 = _mm_mul_ps(m2, _mm_shuffle_ps(mv, mv, _MM_SHUFFLE(2, 2, 2, 2))); + m3 = _mm_mul_ps(m3, _mm_shuffle_ps(mv, mv, _MM_SHUFFLE(3, 3, 3, 3))); + mv = _mm_add_ps(_mm_add_ps(_mm_add_ps(m0, m1), m2), m3); + ShadedTriVertex sv; + _mm_storeu_ps(&sv.x, mv); + sv.u = v.u; + sv.v = v.v; + return sv; + } + else + { + float vx = matrix[0 * 4 + 0] * v.x + matrix[1 * 4 + 0] * v.y + matrix[2 * 4 + 0] * v.z + matrix[3 * 4 + 0] * v.w; + float vy = matrix[0 * 4 + 1] * v.x + matrix[1 * 4 + 1] * v.y + matrix[2 * 4 + 1] * v.z + matrix[3 * 4 + 1] * v.w; + float vz = matrix[0 * 4 + 2] * v.x + matrix[1 * 4 + 2] * v.y + matrix[2 * 4 + 2] * v.z + matrix[3 * 4 + 2] * v.w; + float vw = matrix[0 * 4 + 3] * v.x + matrix[1 * 4 + 3] * v.y + matrix[2 * 4 + 3] * v.z + matrix[3 * 4 + 3] * v.w; + ShadedTriVertex sv; + sv.x = vx; + sv.y = vy; + sv.z = vz; + sv.w = vw; + sv.u = v.u; + sv.v = v.v; + return sv; + } +#endif } diff --git a/src/sound/musicformats/music_dumb.cpp b/src/sound/musicformats/music_dumb.cpp index d688d1c2d..7706ff947 100644 --- a/src/sound/musicformats/music_dumb.cpp +++ b/src/sound/musicformats/music_dumb.cpp @@ -135,7 +135,6 @@ typedef struct MODMIDICFG // PUBLIC DATA DEFINITIONS ------------------------------------------------- -CVAR(Bool, mod_dumb, true, CVAR_ARCHIVE|CVAR_GLOBALCONFIG); CVAR(Int, mod_samplerate, 0, CVAR_ARCHIVE|CVAR_GLOBALCONFIG); CVAR(Int, mod_volramp, 2, CVAR_ARCHIVE|CVAR_GLOBALCONFIG); CVAR(Int, mod_interp, DUMB_LQ_CUBIC, CVAR_ARCHIVE|CVAR_GLOBALCONFIG); @@ -780,11 +779,6 @@ MusInfo *MOD_OpenSong(FileReader &reader) long fpos = 0; input_mod *state = NULL; - if (!mod_dumb) - { - return NULL; - } - bool is_it = false; bool is_dos = true; diff --git a/src/sound/oalsound.cpp b/src/sound/oalsound.cpp index b697282bb..0f20ab36b 100644 --- a/src/sound/oalsound.cpp +++ b/src/sound/oalsound.cpp @@ -1218,15 +1218,16 @@ std::pair OpenALSoundRenderer::LoadSound(uint8_t *sfxdata, int if(!decoder) return std::make_pair(retval, true); decoder->getInfo(&srate, &chans, &type); + int samplesize = 1; if(chans == ChannelConfig_Mono || monoize) { - if(type == SampleType_UInt8) format = AL_FORMAT_MONO8; - if(type == SampleType_Int16) format = AL_FORMAT_MONO16; + if(type == SampleType_UInt8) format = AL_FORMAT_MONO8, samplesize = 1; + if(type == SampleType_Int16) format = AL_FORMAT_MONO16, samplesize = 2; } else if(chans == ChannelConfig_Stereo) { - if(type == SampleType_UInt8) format = AL_FORMAT_STEREO8; - if(type == SampleType_Int16) format = AL_FORMAT_STEREO16; + if(type == SampleType_UInt8) format = AL_FORMAT_STEREO8, samplesize = 2; + if(type == SampleType_Int16) format = AL_FORMAT_STEREO16, samplesize = 4; } if(format == AL_NONE) @@ -1282,13 +1283,14 @@ std::pair OpenALSoundRenderer::LoadSound(uint8_t *sfxdata, int if (!startass) loop_start = Scale(loop_start, srate, 1000); if (!endass) loop_end = Scale(loop_end, srate, 1000); if (loop_start < 0) loop_start = 0; + if (loop_end >= data.Size() / samplesize) loop_end = data.Size() / samplesize - 1; if ((loop_start > 0 || loop_end > 0) && loop_end > loop_start && AL.SOFT_loop_points) { ALint loops[2] = { static_cast(loop_start), static_cast(loop_end) }; DPrintf(DMSG_NOTIFY, "Setting loop points %d -> %d\n", loops[0], loops[1]); alBufferiv(buffer, AL_LOOP_POINTS_SOFT, loops); - getALError(); + // no console messages here, please! } diff --git a/wadsrc/static/menudef.txt b/wadsrc/static/menudef.txt index dde57d560..843883637 100644 --- a/wadsrc/static/menudef.txt +++ b/wadsrc/static/menudef.txt @@ -1723,8 +1723,6 @@ OptionValue ModVolumeRamps OptionMenu ModReplayerOptions { Title "$MODMNU_TITLE" - Option "$MODMNU_REPLAYERENGINE", "mod_dumb", "ModReplayers" - StaticText " " Slider "$MODMNU_MASTERVOLUME", "mod_dumb_mastervolume", 1, 16, 0.5, 1 Option "$ADVSNDMNU_SAMPLERATE", "mod_samplerate", "SampleRates", "mod_dumb" Option "$MODMNU_QUALITY", "mod_interp", "ModQuality", "mod_dumb"