This commit is contained in:
Rachael Alexanderson 2017-04-22 22:02:20 -04:00
commit c71d23f519
16 changed files with 1960 additions and 210 deletions

View file

@ -2844,7 +2844,7 @@ static bool LoadDehSupp ()
sc.MustGetStringName(","); sc.MustGetStringName(",");
sc.MustGetNumber(); sc.MustGetNumber();
if (s.State == NULL || !actortype->OwnsState(s.State + sc.Number)) if (s.State == NULL || sc.Number < 1 || !actortype->OwnsState(s.State + sc.Number - 1))
{ {
sc.ScriptError("Invalid state range in '%s'", type->TypeName.GetChars()); sc.ScriptError("Invalid state range in '%s'", type->TypeName.GetChars());
} }

View file

@ -582,9 +582,9 @@ DEFINE_ACTION_FUNCTION(DEventHandler, SendNetworkEvent)
{ {
PARAM_PROLOGUE; PARAM_PROLOGUE;
PARAM_STRING(name); PARAM_STRING(name);
PARAM_INT(arg1); PARAM_INT_DEF(arg1);
PARAM_INT(arg2); PARAM_INT_DEF(arg2);
PARAM_INT(arg3); PARAM_INT_DEF(arg3);
// //
ACTION_RETURN_BOOL(E_SendNetworkEvent(name, arg1, arg2, arg3, false)); ACTION_RETURN_BOOL(E_SendNetworkEvent(name, arg1, arg2, arg3, false));

View file

@ -732,10 +732,6 @@ DIntermissionController::DIntermissionController(FIntermissionDescriptor *Desc,
mScreen = NULL; mScreen = NULL;
mFirst = true; mFirst = true;
mGameState = state; mGameState = state;
// If the intermission finishes straight away then cancel the wipe.
if(!NextPage())
wipegamestate = GS_FINALE;
} }
bool DIntermissionController::NextPage () bool DIntermissionController::NextPage ()
@ -898,6 +894,13 @@ void F_StartIntermission(FIntermissionDescriptor *desc, bool deleteme, uint8_t s
viewactive = false; viewactive = false;
automapactive = false; automapactive = false;
DIntermissionController::CurrentIntermission = Create<DIntermissionController>(desc, deleteme, state); DIntermissionController::CurrentIntermission = Create<DIntermissionController>(desc, deleteme, state);
// If the intermission finishes straight away then cancel the wipe.
if (!DIntermissionController::CurrentIntermission->NextPage())
{
wipegamestate = GS_FINALE;
}
GC::WriteBarrier(DIntermissionController::CurrentIntermission); GC::WriteBarrier(DIntermissionController::CurrentIntermission);
} }

View file

@ -304,6 +304,7 @@ public:
void OnDestroy() override; void OnDestroy() override;
friend void F_AdvanceIntermission(); friend void F_AdvanceIntermission();
friend void F_StartIntermission(FIntermissionDescriptor *, bool, uint8_t);
}; };

View file

@ -436,6 +436,7 @@ void AActor::Serialize(FSerializer &arc)
A("stamina", stamina) A("stamina", stamina)
("goal", goal) ("goal", goal)
A("waterlevel", waterlevel) A("waterlevel", waterlevel)
A("boomwaterlevel", boomwaterlevel)
A("minmissilechance", MinMissileChance) A("minmissilechance", MinMissileChance)
A("spawnflags", SpawnFlags) A("spawnflags", SpawnFlags)
("inventory", Inventory) ("inventory", Inventory)

View file

@ -0,0 +1,782 @@
/*
** Projected triangle drawer
** Copyright (c) 2016 Magnus Norddahl
**
** This software is provided 'as-is', without any express or implied
** warranty. In no event will the authors be held liable for any damages
** arising from the use of this software.
**
** Permission is granted to anyone to use this software for any purpose,
** including commercial applications, and to alter it and redistribute it
** freely, subject to the following restrictions:
**
** 1. The origin of this software must not be misrepresented; you must not
** claim that you wrote the original software. If you use this software
** in a product, an acknowledgment in the product documentation would be
** appreciated but is not required.
** 2. Altered source versions must be plainly marked as such, and must not be
** misrepresented as being the original software.
** 3. This notice may not be removed or altered from any source distribution.
**
*/
#pragma once
#include "screen_triangle.h"
namespace TriScreenDrawerModes
{
namespace
{
	// A color kept as four separate 32-bit channels so that per-channel
	// arithmetic has headroom before the value is packed again.
	// Converts implicitly from and to a packed 32-bit color using the
	// BPART/GPART/RPART/APART and MAKEARGB macros.
	struct BgraColor
	{
		uint32_t b, g, r, a;

		// Leaves all four channels unset; callers assign them afterwards.
		BgraColor() { }

		// Splits a packed color into its channels.
		BgraColor(uint32_t packed)
		{
			b = BPART(packed);
			g = GPART(packed);
			r = RPART(packed);
			a = APART(packed);
		}

		// Replaces all four channels with those of a packed color.
		BgraColor &operator=(uint32_t packed)
		{
			b = BPART(packed);
			g = GPART(packed);
			r = RPART(packed);
			a = APART(packed);
			return *this;
		}

		// Packs the channels back into a single 32-bit value.
		operator uint32_t() const
		{
			return MAKEARGB(a, r, g, b);
		}
	};
}
// Returns the packed 32-bit color a sampler produces at texture coordinate (u, v).
//
// SamplerT::Mode selects the sampler, FilterModeT::Mode selects nearest or
// filtered lookup.
//
// u, v        - fixed point texture coordinate; it is shifted left by 8 before
//               use, so its top 8 bits do not take part in texel addressing
// texPixels   - texel data, addressed as x * texHeight + y (read as bytes by the
//               Translated sampler, as 32-bit values otherwise)
// texWidth, texHeight - texture size in texels
// oneU, oneV  - per-texel step used by the filtered path
// color       - flat color returned by the Shaded/Stencil/Fill/Fuzz samplers and
//               used as the fade target by the Skycap sampler
// translation - lookup table indexed by the byte texel (Translated sampler only)
template<typename SamplerT, typename FilterModeT>
FORCEINLINE unsigned int Sample32(int32_t u, int32_t v, const uint32_t *texPixels, int texWidth, int texHeight, uint32_t oneU, uint32_t oneV, uint32_t color, const uint32_t *translation)
{
	// These samplers never read a color from the texture.
	const bool flatColor =
		SamplerT::Mode == (int)Samplers::Shaded ||
		SamplerT::Mode == (int)Samplers::Stencil ||
		SamplerT::Mode == (int)Samplers::Fill ||
		SamplerT::Mode == (int)Samplers::Fuzz;
	if (flatColor)
	{
		return color;
	}

	if (SamplerT::Mode == (int)Samplers::Translated)
	{
		const uint8_t *indices = (const uint8_t *)texPixels;
		uint32_t column = ((((uint32_t)u << 8) >> 16) * texWidth) >> 16;
		uint32_t row = ((((uint32_t)v << 8) >> 16) * texHeight) >> 16;
		return translation[indices[column * texHeight + row]];
	}

	uint32_t texel;
	if (FilterModeT::Mode == (int)FilterModes::Nearest)
	{
		uint32_t column = ((((uint32_t)u << 8) >> 16) * texWidth) >> 16;
		uint32_t row = ((((uint32_t)v << 8) >> 16) * texHeight) >> 16;
		texel = texPixels[column * texHeight + row];
	}
	else
	{
		// Move back by half a texel step so the four taps surround the sample point.
		u -= oneU >> 1;
		v -= oneV >> 1;

		// NOTE(review): oneU/oneV are subtracted from the unshifted coordinate above
		// but added to the coordinate after it has been shifted left by 8 below.
		// Confirm both uses are meant to be at these scales.
		unsigned int fracLeft = (((uint32_t)u << 8) >> FRACBITS) * texWidth;
		unsigned int fracRight = ((((uint32_t)u << 8) + oneU) >> FRACBITS) * texWidth;
		unsigned int fracTop = (((uint32_t)v << 8) >> FRACBITS) * texHeight;
		unsigned int fracBottom = ((((uint32_t)v << 8) + oneV) >> FRACBITS) * texHeight;

		unsigned int left = fracLeft >> FRACBITS;
		unsigned int right = fracRight >> FRACBITS;
		unsigned int top = fracTop >> FRACBITS;
		unsigned int bottom = fracBottom >> FRACBITS;

		unsigned int p00 = texPixels[left * texHeight + top];
		unsigned int p01 = texPixels[left * texHeight + bottom];
		unsigned int p10 = texPixels[right * texHeight + top];
		unsigned int p11 = texPixels[right * texHeight + bottom];

		// 4 bit blend fractions; the four weights always add up to 256.
		unsigned int inv_a = (fracRight >> (FRACBITS - 4)) & 15;
		unsigned int inv_b = (fracBottom >> (FRACBITS - 4)) & 15;
		unsigned int a = 16 - inv_a;
		unsigned int b = 16 - inv_b;

		// NOTE(review): inv_a comes from the U position yet it weights p01, which
		// differs from p00 only in its row (and inv_b weights p10). Confirm the
		// weights are paired with the intended texels.
		unsigned int w00 = a * b;
		unsigned int w01 = inv_a * b;
		unsigned int w10 = a * inv_b;
		unsigned int w11 = inv_a * inv_b;

		unsigned int sred = (RPART(p00) * w00 + RPART(p01) * w01 + RPART(p10) * w10 + RPART(p11) * w11 + 127) >> 8;
		unsigned int sgreen = (GPART(p00) * w00 + GPART(p01) * w01 + GPART(p10) * w10 + GPART(p11) * w11 + 127) >> 8;
		unsigned int sblue = (BPART(p00) * w00 + BPART(p01) * w01 + BPART(p10) * w10 + BPART(p11) * w11 + 127) >> 8;
		unsigned int salpha = (APART(p00) * w00 + APART(p01) * w01 + APART(p10) * w10 + APART(p11) * w11 + 127) >> 8;

		texel = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue;
	}

	if (SamplerT::Mode != (int)Samplers::Skycap)
	{
		return texel;
	}

	// Skycap: fade the texel towards 'color' depending on v.
	const int fadeSpeed = 2; // How fast it should fade out
	const int fadeShift = 16 - fadeSpeed;
	int fadeFromTop = clamp(v >> fadeShift, 0, 256);
	int fadeFromBottom = clamp(((2 << 24) - v) >> fadeShift, 0, 256);
	int texelWeight = MIN(fadeFromTop, fadeFromBottom);
	int colorWeight = 256 - texelWeight;

	uint32_t r = RPART(texel);
	uint32_t g = GPART(texel);
	uint32_t b = BPART(texel);
	uint32_t fg_a = APART(texel);

	r = (r * texelWeight + RPART(color) * colorWeight + 127) >> 8;
	g = (g * texelWeight + GPART(color) * colorWeight + 127) >> 8;
	b = (b * texelWeight + BPART(color) * colorWeight + 127) >> 8;

	return MAKEARGB(fg_a, r, g, b);
}
// Returns the shade amount for the Shaded, Stencil and Fuzz samplers and 0 for
// every other sampler.
//
// The Shaded sampler reads texPixels as bytes; Stencil and Fuzz read the alpha
// channel of a 32-bit texel. The value is widened so that 255 becomes 256.
// The Fuzz sampler additionally scales the result by the next fuzzcolormap
// entry and advances fuzzpos, wrapping it at FUZZTABLE.
template<typename SamplerT>
FORCEINLINE unsigned int SampleShade32(int32_t u, int32_t v, const uint32_t *texPixels, int texWidth, int texHeight, int &fuzzpos)
{
	const bool producesShade =
		SamplerT::Mode == (int)Samplers::Shaded ||
		SamplerT::Mode == (int)Samplers::Stencil ||
		SamplerT::Mode == (int)Samplers::Fuzz;
	if (!producesShade)
	{
		return 0;
	}

	// Same nearest-texel addressing for all three samplers.
	uint32_t column = ((((uint32_t)u << 8) >> 16) * texWidth) >> 16;
	uint32_t row = ((((uint32_t)v << 8) >> 16) * texHeight) >> 16;
	uint32_t offset = column * texHeight + row;

	unsigned int shadeAmount;
	if (SamplerT::Mode == (int)Samplers::Shaded)
	{
		const uint8_t *bytes = (const uint8_t *)texPixels;
		shadeAmount = bytes[offset];
	}
	else
	{
		shadeAmount = APART(texPixels[offset]);
	}
	shadeAmount += shadeAmount >> 7; // 255 -> 256

	if (SamplerT::Mode == (int)Samplers::Fuzz)
	{
		shadeAmount = (shadeAmount * fuzzcolormap[fuzzpos++]) >> 5;
		if (fuzzpos >= FUZZTABLE) fuzzpos = 0;
	}

	return shadeAmount;
}
// Applies lighting to a foreground color.
//
// Simple mode multiplies each channel by the matching mlight channel.
// Advanced mode first mixes the color with its weighted intensity
// (desaturate / inv_desaturate), lights it, adds shade_fade and finally
// multiplies by shade_light. Any other mode returns fgcolor unchanged.
// The alpha channel is never modified.
template<typename ShadeModeT>
FORCEINLINE BgraColor Shade32(BgraColor fgcolor, BgraColor mlight, uint32_t desaturate, uint32_t inv_desaturate, BgraColor shade_fade, BgraColor shade_light)
{
	if (ShadeModeT::Mode == (int)ShadeMode::Advanced)
	{
		// Weighted intensity of the unshaded color, pre-multiplied by the desaturation amount.
		uint32_t intensity = ((fgcolor.r * 77 + fgcolor.g * 143 + fgcolor.b * 37) >> 8) * desaturate;

		uint32_t red = (fgcolor.r * inv_desaturate + intensity) >> 8;
		uint32_t green = (fgcolor.g * inv_desaturate + intensity) >> 8;
		uint32_t blue = (fgcolor.b * inv_desaturate + intensity) >> 8;

		red = (shade_fade.r + red * mlight.r) >> 8;
		green = (shade_fade.g + green * mlight.g) >> 8;
		blue = (shade_fade.b + blue * mlight.b) >> 8;

		fgcolor.r = (red * shade_light.r) >> 8;
		fgcolor.g = (green * shade_light.g) >> 8;
		fgcolor.b = (blue * shade_light.b) >> 8;
	}
	else if (ShadeModeT::Mode == (int)ShadeMode::Simple)
	{
		fgcolor.r = (fgcolor.r * mlight.r) >> 8;
		fgcolor.g = (fgcolor.g * mlight.g) >> 8;
		fgcolor.b = (fgcolor.b * mlight.b) >> 8;
	}
	return fgcolor;
}
// Combines a shaded foreground color with the destination color using the blend
// equation selected by BlendT::Mode.
//
// fgcolor   - foreground color after shading
// bgcolor   - color currently stored in the destination pixel
// ifgcolor  - packed sampler output; Masked treats 0 as "keep the background",
//             the add/subtract modes take their alpha from it
// ifgshade  - shade amount used as alpha by Shaded and AddClampShaded
//             (256 means fully opaque, see inv_alpha below)
// srcalpha, destalpha - blend factors used by the add/subtract modes
//
// Returns the color to store. Every mode except Opaque and Masked forces the
// alpha channel to 255.
template<typename BlendT>
FORCEINLINE BgraColor Blend32(BgraColor fgcolor, BgraColor bgcolor, uint32_t ifgcolor, uint32_t ifgshade, uint32_t srcalpha, uint32_t destalpha)
{
	if (BlendT::Mode == (int)BlendModes::Opaque)
	{
		return fgcolor;
	}
	else if (BlendT::Mode == (int)BlendModes::Masked)
	{
		return (ifgcolor == 0) ? bgcolor : fgcolor;
	}
	else if (BlendT::Mode == (int)BlendModes::AddSrcColorOneMinusSrcColor)
	{
		// Each channel of the foreground acts as its own blend factor.
		uint32_t srcred = fgcolor.r + (fgcolor.r >> 7); // 255 -> 256
		uint32_t srcgreen = fgcolor.g + (fgcolor.g >> 7);
		uint32_t srcblue = fgcolor.b + (fgcolor.b >> 7);
		uint32_t inv_srcred = 256 - srcred;
		uint32_t inv_srcgreen = 256 - srcgreen;
		uint32_t inv_srcblue = 256 - srcblue;

		BgraColor outcolor;
		outcolor.r = (fgcolor.r * srcred + bgcolor.r * inv_srcred) >> 8;
		outcolor.g = (fgcolor.g * srcgreen + bgcolor.g * inv_srcgreen) >> 8;
		outcolor.b = (fgcolor.b * srcblue + bgcolor.b * inv_srcblue) >> 8;
		outcolor.a = 255;
		return outcolor;
	}
	else if (BlendT::Mode == (int)BlendModes::Shaded)
	{
		uint32_t alpha = ifgshade;
		uint32_t inv_alpha = 256 - alpha;

		BgraColor outcolor;
		outcolor.r = (fgcolor.r * alpha + bgcolor.r * inv_alpha) >> 8;
		outcolor.g = (fgcolor.g * alpha + bgcolor.g * inv_alpha) >> 8;
		outcolor.b = (fgcolor.b * alpha + bgcolor.b * inv_alpha) >> 8;
		outcolor.a = 255;
		return outcolor;
	}
	else if (BlendT::Mode == (int)BlendModes::AddClampShaded)
	{
		uint32_t alpha = ifgshade;

		// The sum of foreground and background can exceed 255, so each channel
		// has to be clamped before the color is packed back into 8 bits per
		// channel; otherwise the excess would leak into the neighbouring channel.
		BgraColor outcolor;
		outcolor.r = MIN<uint32_t>(((fgcolor.r * alpha) >> 8) + bgcolor.r, 255);
		outcolor.g = MIN<uint32_t>(((fgcolor.g * alpha) >> 8) + bgcolor.g, 255);
		outcolor.b = MIN<uint32_t>(((fgcolor.b * alpha) >> 8) + bgcolor.b, 255);
		outcolor.a = 255;
		return outcolor;
	}
	else
	{
		uint32_t alpha = APART(ifgcolor);
		alpha += alpha >> 7; // 255->256
		uint32_t inv_alpha = 256 - alpha;

		// Scale the blend factors by the texel alpha; the background keeps full
		// weight wherever the texel is transparent.
		uint32_t bgalpha = (destalpha * alpha + (inv_alpha << 8) + 128) >> 8;
		uint32_t fgalpha = (srcalpha * alpha + 128) >> 8;

		fgcolor.r *= fgalpha;
		fgcolor.g *= fgalpha;
		fgcolor.b *= fgalpha;
		bgcolor.r *= bgalpha;
		bgcolor.g *= bgalpha;
		bgcolor.b *= bgalpha;

		// NOTE(review): r/g/b are only assigned for AddClamp, SubClamp and
		// RevSubClamp; presumably no other mode reaches this branch - confirm
		// against the BlendModes enum.
		BgraColor outcolor;
		if (BlendT::Mode == (int)BlendModes::AddClamp)
		{
			outcolor.r = MIN<uint32_t>((fgcolor.r + bgcolor.r) >> 8, 255);
			outcolor.g = MIN<uint32_t>((fgcolor.g + bgcolor.g) >> 8, 255);
			outcolor.b = MIN<uint32_t>((fgcolor.b + bgcolor.b) >> 8, 255);
		}
		else if (BlendT::Mode == (int)BlendModes::SubClamp)
		{
			outcolor.r = MAX(int32_t(fgcolor.r - bgcolor.r) >> 8, 0);
			outcolor.g = MAX(int32_t(fgcolor.g - bgcolor.g) >> 8, 0);
			outcolor.b = MAX(int32_t(fgcolor.b - bgcolor.b) >> 8, 0);
		}
		else if (BlendT::Mode == (int)BlendModes::RevSubClamp)
		{
			outcolor.r = MAX(int32_t(bgcolor.r - fgcolor.r) >> 8, 0);
			outcolor.g = MAX(int32_t(bgcolor.g - fgcolor.g) >> 8, 0);
			outcolor.b = MAX(int32_t(bgcolor.b - fgcolor.b) >> 8, 0);
		}
		outcolor.a = 255;
		return outcolor;
	}
}
}
template<typename BlendT, typename SamplerT>
class TriScreenDrawer32
{
public:
static void Execute(int x, int y, uint32_t mask0, uint32_t mask1, const TriDrawTriangleArgs *args)
{
using namespace TriScreenDrawerModes;
bool is_simple_shade = args->uniforms->SimpleShade();
if (SamplerT::Mode == (int)Samplers::Texture)
{
bool is_nearest_filter = args->uniforms->NearestFilter();
if (is_simple_shade)
{
if (is_nearest_filter)
DrawBlock<SimpleShade, NearestFilter>(x, y, mask0, mask1, args);
else
DrawBlock<SimpleShade, LinearFilter>(x, y, mask0, mask1, args);
}
else
{
if (is_nearest_filter)
DrawBlock<AdvancedShade, NearestFilter>(x, y, mask0, mask1, args);
else
DrawBlock<AdvancedShade, LinearFilter>(x, y, mask0, mask1, args);
}
}
else if (SamplerT::Mode == (int)Samplers::Fuzz)
{
DrawBlock<NoShade, NearestFilter>(x, y, mask0, mask1, args);
}
else // no linear filtering for translated, shaded, stencil, fill or skycap
{
if (is_simple_shade)
{
DrawBlock<SimpleShade, NearestFilter>(x, y, mask0, mask1, args);
}
else
{
DrawBlock<AdvancedShade, NearestFilter>(x, y, mask0, mask1, args);
}
}
}
private:
template<typename ShadeModeT, typename FilterModeT>
FORCEINLINE static void DrawBlock(int destX, int destY, uint32_t mask0, uint32_t mask1, const TriDrawTriangleArgs *args)
{
using namespace TriScreenDrawerModes;
bool is_fixed_light = args->uniforms->FixedLight();
uint32_t lightmask = is_fixed_light ? 0 : 0xffffffff;
uint32_t srcalpha = args->uniforms->SrcAlpha();
uint32_t destalpha = args->uniforms->DestAlpha();
int fuzzpos = (ScreenTriangle::FuzzStart + destX * 123 + destY) % FUZZTABLE;
// Calculate gradients
const TriVertex &v1 = *args->v1;
ScreenTriangleStepVariables gradientX = args->gradientX;
ScreenTriangleStepVariables gradientY = args->gradientY;
ScreenTriangleStepVariables blockPosY;
blockPosY.W = v1.w + gradientX.W * (destX - v1.x) + gradientY.W * (destY - v1.y);
blockPosY.U = v1.u * v1.w + gradientX.U * (destX - v1.x) + gradientY.U * (destY - v1.y);
blockPosY.V = v1.v * v1.w + gradientX.V * (destX - v1.x) + gradientY.V * (destY - v1.y);
gradientX.W *= 8.0f;
gradientX.U *= 8.0f;
gradientX.V *= 8.0f;
// Output
uint32_t * RESTRICT destOrg = (uint32_t*)args->dest;
int pitch = args->pitch;
uint32_t *dest = destOrg + destX + destY * pitch;
// Light
uint32_t light = args->uniforms->Light();
float shade = 2.0f - (light + 12.0f) / 128.0f;
float globVis = args->uniforms->GlobVis() * (1.0f / 32.0f);
light += (light >> 7); // 255 -> 256
// Sampling stuff
uint32_t color = args->uniforms->Color();
const uint32_t * RESTRICT translation = (const uint32_t *)args->uniforms->Translation();
const uint32_t * RESTRICT texPixels = (const uint32_t *)args->uniforms->TexturePixels();
uint32_t texWidth = args->uniforms->TextureWidth();
uint32_t texHeight = args->uniforms->TextureHeight();
uint32_t oneU, oneV;
if (SamplerT::Mode != (int)Samplers::Fill)
{
oneU = ((0x800000 + texWidth - 1) / texWidth) * 2 + 1;
oneV = ((0x800000 + texHeight - 1) / texHeight) * 2 + 1;
}
else
{
oneU = 0;
oneV = 0;
}
// Shade constants
int inv_desaturate;
BgraColor shade_fade, shade_light;
int desaturate;
if (ShadeModeT::Mode == (int)ShadeMode::Advanced)
{
shade_fade.r = args->uniforms->ShadeFadeRed();
shade_fade.g = args->uniforms->ShadeFadeGreen();
shade_fade.b = args->uniforms->ShadeFadeBlue();
shade_light.r = args->uniforms->ShadeLightRed();
shade_light.g = args->uniforms->ShadeLightGreen();
shade_light.b = args->uniforms->ShadeLightBlue();
desaturate = args->uniforms->ShadeDesaturate();
inv_desaturate = 256 - desaturate;
}
else
{
inv_desaturate = 0;
shade_fade.r = 0;
shade_fade.g = 0;
shade_fade.b = 0;
shade_light.r = 0;
shade_light.g = 0;
shade_light.b = 0;
desaturate = 0;
}
if (mask0 == 0xffffffff && mask1 == 0xffffffff)
{
for (int y = 0; y < 8; y++)
{
float rcpW = 0x01000000 / blockPosY.W;
int32_t posU = (int32_t)(blockPosY.U * rcpW);
int32_t posV = (int32_t)(blockPosY.V * rcpW);
fixed_t lightpos = FRACUNIT - (int)(clamp(shade - MIN(24.0f / 32.0f, globVis * blockPosY.W), 0.0f, 31.0f / 32.0f) * (float)FRACUNIT);
lightpos = (lightpos & lightmask) | ((light << 8) & ~lightmask);
ScreenTriangleStepVariables blockPosX = blockPosY;
blockPosX.W += gradientX.W;
blockPosX.U += gradientX.U;
blockPosX.V += gradientX.V;
rcpW = 0x01000000 / blockPosX.W;
int32_t nextU = (int32_t)(blockPosX.U * rcpW);
int32_t nextV = (int32_t)(blockPosX.V * rcpW);
int32_t stepU = (nextU - posU) / 8;
int32_t stepV = (nextV - posV) / 8;
fixed_t lightnext = FRACUNIT - (fixed_t)(clamp(shade - MIN(24.0f / 32.0f, globVis * blockPosX.W), 0.0f, 31.0f / 32.0f) * (float)FRACUNIT);
fixed_t lightstep = (lightnext - lightpos) / 8;
lightstep = lightstep & lightmask;
for (int ix = 0; ix < 8; ix++)
{
// Load bgcolor
BgraColor bgcolor;
if (BlendT::Mode != (int)BlendModes::Opaque)
bgcolor = dest[ix];
else
bgcolor = 0;
// Sample fgcolor
unsigned int ifgcolor = Sample32<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
unsigned int ifgshade = SampleShade32<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
posU += stepU;
posV += stepV;
// Setup light
int lightpos0 = lightpos >> 8;
lightpos += lightstep;
BgraColor mlight;
mlight.r = lightpos0;
mlight.g = lightpos0;
mlight.b = lightpos0;
BgraColor shade_fade_lit;
if (ShadeModeT::Mode == (int)ShadeMode::Advanced)
{
uint32_t inv_light = 256 - lightpos0;
shade_fade_lit.r = shade_fade.r * inv_light;
shade_fade_lit.g = shade_fade.g * inv_light;
shade_fade_lit.b = shade_fade.b * inv_light;
}
else
{
shade_fade_lit.r = 0;
shade_fade_lit.g = 0;
shade_fade_lit.b = 0;
}
// Shade and blend
BgraColor fgcolor = Shade32<ShadeModeT>(ifgcolor, mlight, desaturate, inv_desaturate, shade_fade_lit, shade_light);
BgraColor outcolor = Blend32<BlendT>(fgcolor, bgcolor, ifgcolor, ifgshade, srcalpha, destalpha);
// Store result
dest[ix] = outcolor;
}
blockPosY.W += gradientY.W;
blockPosY.U += gradientY.U;
blockPosY.V += gradientY.V;
dest += pitch;
}
}
else
{
// mask0 loop:
for (int y = 0; y < 4; y++)
{
float rcpW = 0x01000000 / blockPosY.W;
int32_t posU = (int32_t)(blockPosY.U * rcpW);
int32_t posV = (int32_t)(blockPosY.V * rcpW);
fixed_t lightpos = FRACUNIT - (fixed_t)(clamp(shade - MIN(24.0f / 32.0f, globVis * blockPosY.W), 0.0f, 31.0f / 32.0f) * (float)FRACUNIT);
lightpos = (lightpos & lightmask) | ((light << 8) & ~lightmask);
ScreenTriangleStepVariables blockPosX = blockPosY;
blockPosX.W += gradientX.W;
blockPosX.U += gradientX.U;
blockPosX.V += gradientX.V;
rcpW = 0x01000000 / blockPosX.W;
int32_t nextU = (int32_t)(blockPosX.U * rcpW);
int32_t nextV = (int32_t)(blockPosX.V * rcpW);
int32_t stepU = (nextU - posU) / 8;
int32_t stepV = (nextV - posV) / 8;
fixed_t lightnext = FRACUNIT - (fixed_t)(clamp(shade - MIN(24.0f / 32.0f, globVis * blockPosX.W), 0.0f, 31.0f / 32.0f) * (float)FRACUNIT);
fixed_t lightstep = (lightnext - lightpos) / 8;
lightstep = lightstep & lightmask;
for (int x = 0; x < 8; x++)
{
// Load bgcolor
BgraColor bgcolor;
if (BlendT::Mode != (int)BlendModes::Opaque)
{
if (mask0 & (1 << 31)) bgcolor = dest[x];
}
else
bgcolor = 0;
// Sample fgcolor
unsigned int ifgcolor = Sample32<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
unsigned int ifgshade = SampleShade32<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
posU += stepU;
posV += stepV;
// Setup light
int lightpos0 = lightpos >> 8;
lightpos += lightstep;
BgraColor mlight;
mlight.r = lightpos0;
mlight.g = lightpos0;
mlight.b = lightpos0;
BgraColor shade_fade_lit;
if (ShadeModeT::Mode == (int)ShadeMode::Advanced)
{
uint32_t inv_light = 256 - lightpos0;
shade_fade_lit.r = shade_fade.r * inv_light;
shade_fade_lit.g = shade_fade.g * inv_light;
shade_fade_lit.b = shade_fade.b * inv_light;
}
else
{
shade_fade_lit.r = 0;
shade_fade_lit.g = 0;
shade_fade_lit.b = 0;
}
// Shade and blend
BgraColor fgcolor = Shade32<ShadeModeT>(ifgcolor, mlight, desaturate, inv_desaturate, shade_fade_lit, shade_light);
BgraColor outcolor = Blend32<BlendT>(fgcolor, bgcolor, ifgcolor, ifgshade, srcalpha, destalpha);
// Store result
if (mask0 & (1 << 31)) dest[x] = outcolor;
mask0 <<= 1;
}
blockPosY.W += gradientY.W;
blockPosY.U += gradientY.U;
blockPosY.V += gradientY.V;
dest += pitch;
}
// mask1 loop:
for (int y = 0; y < 4; y++)
{
float rcpW = 0x01000000 / blockPosY.W;
int32_t posU = (int32_t)(blockPosY.U * rcpW);
int32_t posV = (int32_t)(blockPosY.V * rcpW);
fixed_t lightpos = FRACUNIT - (fixed_t)(clamp(shade - MIN(24.0f / 32.0f, globVis * blockPosY.W), 0.0f, 31.0f / 32.0f) * (float)FRACUNIT);
lightpos = (lightpos & lightmask) | ((light << 8) & ~lightmask);
ScreenTriangleStepVariables blockPosX = blockPosY;
blockPosX.W += gradientX.W;
blockPosX.U += gradientX.U;
blockPosX.V += gradientX.V;
rcpW = 0x01000000 / blockPosX.W;
int32_t nextU = (int32_t)(blockPosX.U * rcpW);
int32_t nextV = (int32_t)(blockPosX.V * rcpW);
int32_t stepU = (nextU - posU) / 8;
int32_t stepV = (nextV - posV) / 8;
fixed_t lightnext = FRACUNIT - (fixed_t)(clamp(shade - MIN(24.0f / 32.0f, globVis * blockPosX.W), 0.0f, 31.0f / 32.0f) * (float)FRACUNIT);
fixed_t lightstep = (lightnext - lightpos) / 8;
lightstep = lightstep & lightmask;
for (int x = 0; x < 8; x++)
{
// Load bgcolor
BgraColor bgcolor;
if (BlendT::Mode != (int)BlendModes::Opaque)
{
if (mask1 & (1 << 31)) bgcolor = dest[x];
}
else
bgcolor = 0;
// Sample fgcolor
unsigned int ifgcolor = Sample32<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
unsigned int ifgshade = SampleShade32<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
posU += stepU;
posV += stepV;
// Setup light
int lightpos0 = lightpos >> 8;
lightpos += lightstep;
BgraColor mlight;
mlight.r = lightpos0;
mlight.g = lightpos0;
mlight.b = lightpos0;
BgraColor shade_fade_lit;
if (ShadeModeT::Mode == (int)ShadeMode::Advanced)
{
uint32_t inv_light = 256 - lightpos0;
shade_fade_lit.r = shade_fade.r * inv_light;
shade_fade_lit.g = shade_fade.g * inv_light;
shade_fade_lit.b = shade_fade.b * inv_light;
}
else
{
shade_fade_lit.r = 0;
shade_fade_lit.g = 0;
shade_fade_lit.b = 0;
}
// Shade and blend
BgraColor fgcolor = Shade32<ShadeModeT>(ifgcolor, mlight, desaturate, inv_desaturate, shade_fade_lit, shade_light);
BgraColor outcolor = Blend32<BlendT>(fgcolor, bgcolor, ifgcolor, ifgshade, srcalpha, destalpha);
// Store result
if (mask1 & (1 << 31)) dest[x] = outcolor;
mask1 <<= 1;
}
blockPosY.W += gradientY.W;
blockPosY.U += gradientY.U;
blockPosY.V += gradientY.V;
dest += pitch;
}
}
}
};
// Draws an axis-aligned textured rectangle into a 32-bit destination buffer.
// BlendT selects the blend equation and SamplerT selects how the texture is
// read. Rectangles always use nearest filtering.
template<typename BlendT, typename SamplerT>
class RectScreenDrawer32
{
public:
	// Draws the rectangle described by 'args', choosing the simple or advanced
	// shade variant from args->SimpleShade().
	static void Execute(const void *destOrg, int destWidth, int destHeight, int destPitch, const RectDrawArgs *args, WorkerThreadData *thread)
	{
		using namespace TriScreenDrawerModes;

		if (!args->SimpleShade())
		{
			Loop<AdvancedShade, NearestFilter>(destOrg, destWidth, destHeight, destPitch, args, thread);
			return;
		}

		Loop<SimpleShade, NearestFilter>(destOrg, destWidth, destHeight, destPitch, args, thread);
	}

private:
	// Walks every pixel of the clipped rectangle and samples, shades and blends
	// it. Rows are grouped in bands of eight; only bands whose index modulo
	// thread->num_cores equals thread->core are drawn by this call.
	template<typename ShadeModeT, typename FilterModeT>
	FORCEINLINE static void Loop(const void *destOrg, int destWidth, int destHeight, int destPitch, const RectDrawArgs *args, WorkerThreadData *thread)
	{
		using namespace TriScreenDrawerModes;

		// Round the rectangle to whole pixels and clip it to the destination.
		const int left = clamp((int)(args->X0() + 0.5f), 0, destWidth);
		const int right = clamp((int)(args->X1() + 0.5f), 0, destWidth);
		const int top = clamp((int)(args->Y0() + 0.5f), 0, destHeight);
		const int bottom = clamp((int)(args->Y1() + 0.5f), 0, destHeight);

		if (right <= left || bottom <= top)
			return;

		const uint32_t srcalpha = args->SrcAlpha();
		const uint32_t destalpha = args->DestAlpha();

		// Setup step variables: texture coordinates are sampled at pixel centers.
		const float fstepU = (args->U1() - args->U0()) / (args->X1() - args->X0());
		const float fstepV = (args->V1() - args->V0()) / (args->Y1() - args->Y0());
		const uint32_t startU = (int32_t)((args->U0() + (left + 0.5f - args->X0()) * fstepU) * 0x1000000);
		const uint32_t startV = (int32_t)((args->V0() + (top + 0.5f - args->Y0()) * fstepV) * 0x1000000);
		const uint32_t stepU = (int32_t)(fstepU * 0x1000000);
		const uint32_t stepV = (int32_t)(fstepV * 0x1000000);

		// Sampling stuff
		const uint32_t color = args->Color();
		const uint32_t * RESTRICT translation = (const uint32_t *)args->Translation();
		const uint32_t * RESTRICT texPixels = (const uint32_t *)args->TexturePixels();
		const uint32_t texWidth = args->TextureWidth();
		const uint32_t texHeight = args->TextureHeight();
		uint32_t oneU = 0;
		uint32_t oneV = 0;
		if (SamplerT::Mode != (int)Samplers::Fill)
		{
			oneU = ((0x800000 + texWidth - 1) / texWidth) * 2 + 1;
			oneV = ((0x800000 + texHeight - 1) / texHeight) * 2 + 1;
		}

		// Setup light: one light level for the whole rectangle.
		uint32_t lightpos = args->Light();
		lightpos += lightpos >> 7; // 255 -> 256

		BgraColor mlight;
		mlight.r = lightpos;
		mlight.g = lightpos;
		mlight.b = lightpos;

		// Shade constants
		int desaturate = 0;
		int inv_desaturate = 0;
		BgraColor shade_fade_lit, shade_light;
		if (ShadeModeT::Mode == (int)ShadeMode::Advanced)
		{
			const uint32_t inv_light = 256 - lightpos;
			shade_fade_lit.r = args->ShadeFadeRed() * inv_light;
			shade_fade_lit.g = args->ShadeFadeGreen() * inv_light;
			shade_fade_lit.b = args->ShadeFadeBlue() * inv_light;
			shade_light.r = args->ShadeLightRed();
			shade_light.g = args->ShadeLightGreen();
			shade_light.b = args->ShadeLightBlue();
			desaturate = args->ShadeDesaturate();
			inv_desaturate = 256 - desaturate;
		}
		else
		{
			shade_fade_lit.r = 0;
			shade_fade_lit.g = 0;
			shade_fade_lit.b = 0;
			shade_light.r = 0;
			shade_light.g = 0;
			shade_light.b = 0;
		}

		const int count = right - left;

		int fuzzpos = (ScreenTriangle::FuzzStart + left * 123 + top) % FUZZTABLE;

		uint32_t posV = startV;
		for (int y = top; y < bottom; y++, posV += stepV)
		{
			const bool ownedByThisCore = (y / 8) % thread->num_cores == thread->core;
			if (!ownedByThisCore)
			{
				// Still move the fuzz position on by one row's worth of pixels.
				fuzzpos = (fuzzpos + count) % FUZZTABLE;
				continue;
			}

			uint32_t *dest = ((uint32_t*)destOrg) + y * destPitch + left;
			uint32_t * const rowEnd = dest + count;

			for (uint32_t posU = startU; dest != rowEnd; ++dest, posU += stepU)
			{
				// Load bgcolor
				BgraColor bgcolor;
				if (BlendT::Mode == (int)BlendModes::Opaque)
					bgcolor = 0;
				else
					bgcolor = *dest;

				// Sample fgcolor
				unsigned int ifgcolor = Sample32<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
				unsigned int ifgshade = SampleShade32<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);

				// Shade and blend
				BgraColor fgcolor = Shade32<ShadeModeT>(ifgcolor, mlight, desaturate, inv_desaturate, shade_fade_lit, shade_light);
				BgraColor outcolor = Blend32<BlendT>(fgcolor, bgcolor, ifgcolor, ifgshade, srcalpha, destalpha);

				// Store result
				*dest = outcolor;
			}
		}
	}
};

View file

@ -0,0 +1,739 @@
/*
** Projected triangle drawer
** Copyright (c) 2016 Magnus Norddahl
**
** This software is provided 'as-is', without any express or implied
** warranty. In no event will the authors be held liable for any damages
** arising from the use of this software.
**
** Permission is granted to anyone to use this software for any purpose,
** including commercial applications, and to alter it and redistribute it
** freely, subject to the following restrictions:
**
** 1. The origin of this software must not be misrepresented; you must not
** claim that you wrote the original software. If you use this software
** in a product, an acknowledgment in the product documentation would be
** appreciated but is not required.
** 2. Altered source versions must be plainly marked as such, and must not be
** misrepresented as being the original software.
** 3. This notice may not be removed or altered from any source distribution.
**
*/
#pragma once
#include "screen_triangle.h"
#ifdef _MSC_VER
#pragma warning(disable: 4752) // warning C4752 : found Intel(R) Advanced Vector Extensions; consider using /arch:AVX
#endif
namespace TriScreenDrawerModes
{
// Returns the packed 32-bit color a sampler produces at texture coordinate (u, v).
//
// SamplerT::Mode selects the sampler, FilterModeT::Mode selects nearest or
// filtered lookup.
//
// u, v        - fixed point texture coordinate; it is shifted left by 8 before
//               use, so its top 8 bits do not take part in texel addressing
// texPixels   - texel data, addressed as x * texHeight + y (read as bytes by the
//               Translated sampler, as 32-bit values otherwise)
// texWidth, texHeight - texture size in texels
// oneU, oneV  - per-texel step used by the filtered path
// color       - flat color returned by the Shaded/Stencil/Fill/Fuzz samplers and
//               used as the fade target by the Skycap sampler
// translation - lookup table indexed by the byte texel (Translated sampler only)
template<typename SamplerT, typename FilterModeT>
FORCEINLINE unsigned int VECTORCALL Sample32_AVX2(int32_t u, int32_t v, const uint32_t *texPixels, int texWidth, int texHeight, uint32_t oneU, uint32_t oneV, uint32_t color, const uint32_t *translation)
{
	// These samplers never read a color from the texture.
	const bool flatColor =
		SamplerT::Mode == (int)Samplers::Shaded ||
		SamplerT::Mode == (int)Samplers::Stencil ||
		SamplerT::Mode == (int)Samplers::Fill ||
		SamplerT::Mode == (int)Samplers::Fuzz;
	if (flatColor)
	{
		return color;
	}

	if (SamplerT::Mode == (int)Samplers::Translated)
	{
		const uint8_t *indices = (const uint8_t *)texPixels;
		uint32_t column = ((((uint32_t)u << 8) >> 16) * texWidth) >> 16;
		uint32_t row = ((((uint32_t)v << 8) >> 16) * texHeight) >> 16;
		return translation[indices[column * texHeight + row]];
	}

	uint32_t texel;
	if (FilterModeT::Mode == (int)FilterModes::Nearest)
	{
		uint32_t column = ((((uint32_t)u << 8) >> 16) * texWidth) >> 16;
		uint32_t row = ((((uint32_t)v << 8) >> 16) * texHeight) >> 16;
		texel = texPixels[column * texHeight + row];
	}
	else
	{
		// Move back by half a texel step so the four taps surround the sample point.
		u -= oneU >> 1;
		v -= oneV >> 1;

		// NOTE(review): oneU/oneV are subtracted from the unshifted coordinate above
		// but added to the coordinate after it has been shifted left by 8 below.
		// Confirm both uses are meant to be at these scales.
		unsigned int fracLeft = (((uint32_t)u << 8) >> FRACBITS) * texWidth;
		unsigned int fracRight = ((((uint32_t)u << 8) + oneU) >> FRACBITS) * texWidth;
		unsigned int fracTop = (((uint32_t)v << 8) >> FRACBITS) * texHeight;
		unsigned int fracBottom = ((((uint32_t)v << 8) + oneV) >> FRACBITS) * texHeight;

		unsigned int left = fracLeft >> FRACBITS;
		unsigned int right = fracRight >> FRACBITS;
		unsigned int top = fracTop >> FRACBITS;
		unsigned int bottom = fracBottom >> FRACBITS;

		unsigned int p00 = texPixels[left * texHeight + top];
		unsigned int p01 = texPixels[left * texHeight + bottom];
		unsigned int p10 = texPixels[right * texHeight + top];
		unsigned int p11 = texPixels[right * texHeight + bottom];

		// 4 bit blend fractions; the four weights always add up to 256.
		unsigned int inv_a = (fracRight >> (FRACBITS - 4)) & 15;
		unsigned int inv_b = (fracBottom >> (FRACBITS - 4)) & 15;
		unsigned int a = 16 - inv_a;
		unsigned int b = 16 - inv_b;

		// NOTE(review): inv_a comes from the U position yet it weights p01, which
		// differs from p00 only in its row (and inv_b weights p10). Confirm the
		// weights are paired with the intended texels.
		unsigned int w00 = a * b;
		unsigned int w01 = inv_a * b;
		unsigned int w10 = a * inv_b;
		unsigned int w11 = inv_a * inv_b;

		unsigned int sred = (RPART(p00) * w00 + RPART(p01) * w01 + RPART(p10) * w10 + RPART(p11) * w11 + 127) >> 8;
		unsigned int sgreen = (GPART(p00) * w00 + GPART(p01) * w01 + GPART(p10) * w10 + GPART(p11) * w11 + 127) >> 8;
		unsigned int sblue = (BPART(p00) * w00 + BPART(p01) * w01 + BPART(p10) * w10 + BPART(p11) * w11 + 127) >> 8;
		unsigned int salpha = (APART(p00) * w00 + APART(p01) * w01 + APART(p10) * w10 + APART(p11) * w11 + 127) >> 8;

		texel = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue;
	}

	if (SamplerT::Mode != (int)Samplers::Skycap)
	{
		return texel;
	}

	// Skycap: fade the texel towards 'color' depending on v.
	const int fadeSpeed = 2; // How fast it should fade out
	const int fadeShift = 16 - fadeSpeed;
	int fadeFromTop = clamp(v >> fadeShift, 0, 256);
	int fadeFromBottom = clamp(((2 << 24) - v) >> fadeShift, 0, 256);
	int texelWeight = MIN(fadeFromTop, fadeFromBottom);
	int colorWeight = 256 - texelWeight;

	uint32_t r = RPART(texel);
	uint32_t g = GPART(texel);
	uint32_t b = BPART(texel);
	uint32_t fg_a = APART(texel);

	r = (r * texelWeight + RPART(color) * colorWeight + 127) >> 8;
	g = (g * texelWeight + GPART(color) * colorWeight + 127) >> 8;
	b = (b * texelWeight + BPART(color) * colorWeight + 127) >> 8;

	return MAKEARGB(fg_a, r, g, b);
}
// Returns the shade amount for the Shaded, Stencil and Fuzz samplers and 0 for
// every other sampler.
//
// The Shaded sampler reads texPixels as bytes; Stencil and Fuzz read the alpha
// channel of a 32-bit texel. The value is widened so that 255 becomes 256.
// The Fuzz sampler additionally scales the result by the next fuzzcolormap
// entry and advances fuzzpos, wrapping it at FUZZTABLE.
template<typename SamplerT>
FORCEINLINE unsigned int VECTORCALL SampleShade32_AVX2(int32_t u, int32_t v, const uint32_t *texPixels, int texWidth, int texHeight, int &fuzzpos)
{
	const bool producesShade =
		SamplerT::Mode == (int)Samplers::Shaded ||
		SamplerT::Mode == (int)Samplers::Stencil ||
		SamplerT::Mode == (int)Samplers::Fuzz;
	if (!producesShade)
	{
		return 0;
	}

	// Same nearest-texel addressing for all three samplers.
	uint32_t column = ((((uint32_t)u << 8) >> 16) * texWidth) >> 16;
	uint32_t row = ((((uint32_t)v << 8) >> 16) * texHeight) >> 16;
	uint32_t offset = column * texHeight + row;

	unsigned int shadeAmount;
	if (SamplerT::Mode == (int)Samplers::Shaded)
	{
		const uint8_t *bytes = (const uint8_t *)texPixels;
		shadeAmount = bytes[offset];
	}
	else
	{
		shadeAmount = APART(texPixels[offset]);
	}
	shadeAmount += shadeAmount >> 7; // 255 -> 256

	if (SamplerT::Mode == (int)Samplers::Fuzz)
	{
		shadeAmount = (shadeAmount * fuzzcolormap[fuzzpos++]) >> 5;
		if (fuzzpos >= FUZZTABLE) fuzzpos = 0;
	}

	return shadeAmount;
}
// Applies lighting to four unpacked pixels (one 16-bit word per channel).
//
//   Simple   - every channel is multiplied by mlight and shifted down by 8.
//   Advanced - the colour is first blended towards its grey intensity using
//              desaturate / inv_desaturate, then lit with mlight, offset by
//              shade_fade and finally multiplied by shade_light.
//   other    - fgcolor is returned untouched.
template<typename ShadeModeT>
FORCEINLINE __m256i VECTORCALL Shade32_AVX2(__m256i fgcolor, __m256i mlight, __m256i desaturate, __m256i inv_desaturate, __m256i shade_fade, __m256i shade_light)
{
	if (ShadeModeT::Mode == (int)ShadeMode::Simple)
	{
		return _mm256_srli_epi16(_mm256_mullo_epi16(fgcolor, mlight), 8);
	}

	if (ShadeModeT::Mode != (int)ShadeMode::Advanced)
	{
		return fgcolor;
	}

	// Per channel weights; the highest word of each pixel gets weight 0
	const __m256i weights = _mm256_set_epi16(0, 77, 143, 37, 0, 77, 143, 37, 0, 77, 143, 37, 0, 77, 143, 37);

	// Sum the weighted channels of each pixel into its lowest word
	__m256i gray = _mm256_mullo_epi16(fgcolor, weights);
	gray = _mm256_add_epi16(gray, _mm256_srli_epi64(gray, 32));
	gray = _mm256_add_epi16(gray, _mm256_srli_epi64(gray, 16));
	gray = _mm256_srli_epi16(gray, 8);

	// Scale by the desaturation amount and copy word 0 into words 1 and 2
	gray = _mm256_mullo_epi16(gray, desaturate);
	gray = _mm256_shufflelo_epi16(gray, _MM_SHUFFLE(3, 0, 0, 0));
	gray = _mm256_shufflehi_epi16(gray, _MM_SHUFFLE(3, 0, 0, 0));

	// Mix the original colour with the grey term
	__m256i result = _mm256_mullo_epi16(fgcolor, inv_desaturate);
	result = _mm256_srli_epi16(_mm256_add_epi16(result, gray), 8);

	// Light, fade and tint
	result = _mm256_mullo_epi16(result, mlight);
	result = _mm256_srli_epi16(_mm256_add_epi16(shade_fade, result), 8);
	result = _mm256_srli_epi16(_mm256_mullo_epi16(result, shade_light), 8);
	return result;
}
// Blends four shaded foreground pixels with four background pixels and
// returns them packed to 8 bits per channel. Each 128-bit lane of the result
// holds two pixels in its low 64 bits.
//
//   fgcolor   - shaded foreground, one 16-bit word per channel
//   bgcolor   - background in the same layout
//   ifgcolor  - unshaded foreground; its alpha word drives the clamp blends
//   ifgshade  - per pixel shade value in the low 32 bits of each 64-bit slot
//   srcalpha  - source alpha factor, broadcast to every 16-bit word
//   destalpha - destination alpha factor, broadcast to every 16-bit word
//
// Every mode except Opaque and AddSrcColorOneMinusSrcColor forces the output
// alpha to 0xff.
template<typename BlendT>
FORCEINLINE __m256i VECTORCALL Blend32_AVX2(__m256i fgcolor, __m256i bgcolor, __m256i ifgcolor, __m256i ifgshade, __m256i srcalpha, __m256i destalpha)
{
	if (BlendT::Mode == (int)BlendModes::Opaque)
	{
		__m256i outcolor = fgcolor;
		outcolor = _mm256_packus_epi16(outcolor, _mm256_setzero_si256());
		return outcolor;
	}
	else if (BlendT::Mode == (int)BlendModes::Masked)
	{
		// Pixels whose packed foreground is exactly zero keep the background
		__m256i mask = _mm256_cmpeq_epi32(_mm256_packus_epi16(fgcolor, _mm256_setzero_si256()), _mm256_setzero_si256());
		mask = _mm256_unpacklo_epi8(mask, _mm256_setzero_si256());
		__m256i outcolor = _mm256_or_si256(_mm256_and_si256(mask, bgcolor), _mm256_andnot_si256(mask, fgcolor));
		outcolor = _mm256_packus_epi16(outcolor, _mm256_setzero_si256());
		outcolor = _mm256_or_si256(outcolor, _mm256_set1_epi32(0xff000000));
		return outcolor;
	}
	else if (BlendT::Mode == (int)BlendModes::AddSrcColorOneMinusSrcColor)
	{
		__m256i inv_srccolor = _mm256_sub_epi16(_mm256_set1_epi16(256), _mm256_add_epi16(fgcolor, _mm256_srli_epi16(fgcolor, 7)));
		__m256i outcolor = _mm256_add_epi16(fgcolor, _mm256_srli_epi16(_mm256_mullo_epi16(bgcolor, inv_srccolor), 8));
		outcolor = _mm256_packus_epi16(outcolor, _mm256_setzero_si256());
		return outcolor;
	}
	else if (BlendT::Mode == (int)BlendModes::Shaded || BlendT::Mode == (int)BlendModes::AddClampShaded)
	{
		// _mm256_mul_epu32 multiplies the full low 32 bits of each 64-bit slot.
		// srcalpha carries the factor in every 16-bit word, so its upper word
		// must be cleared first; otherwise the product is shade * alpha * 0x10001
		// and the alpha extracted below is wrong for most inputs.
		__m256i srcalpha32 = _mm256_and_si256(srcalpha, _mm256_set1_epi32(0xffff));
		ifgshade = _mm256_srli_epi32(_mm256_add_epi32(_mm256_mul_epu32(ifgshade, srcalpha32), _mm256_set1_epi32(128)), 8);

		// Spread each pixel's alpha over its four channel words
		__m256i alpha = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(ifgshade, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0));

		__m256i outcolor;
		if (BlendT::Mode == (int)BlendModes::Shaded)
		{
			// fg * alpha + bg * (256 - alpha)
			__m256i inv_alpha = _mm256_sub_epi16(_mm256_set1_epi16(256), alpha);
			fgcolor = _mm256_mullo_epi16(fgcolor, alpha);
			bgcolor = _mm256_mullo_epi16(bgcolor, inv_alpha);
			outcolor = _mm256_srli_epi16(_mm256_add_epi16(fgcolor, bgcolor), 8);
		}
		else
		{
			// fg * alpha + bg; the unsigned pack below clamps the sum
			fgcolor = _mm256_srli_epi16(_mm256_mullo_epi16(fgcolor, alpha), 8);
			outcolor = _mm256_add_epi16(fgcolor, bgcolor);
		}
		outcolor = _mm256_packus_epi16(outcolor, _mm256_setzero_si256());
		outcolor = _mm256_or_si256(outcolor, _mm256_set1_epi32(0xff000000));
		return outcolor;
	}
	else
	{
		// AddClamp, SubClamp and RevSubClamp
		__m256i alpha = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(ifgcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
		alpha = _mm256_srli_epi16(_mm256_add_epi16(alpha, _mm256_srli_epi16(alpha, 7)), 1); // 255->128
		__m256i inv_alpha = _mm256_sub_epi16(_mm256_set1_epi16(128), alpha);

		// The rounding term has to be a 16-bit broadcast: these are 16-bit adds,
		// and a 32-bit broadcast of 64 would only round every other channel.
		const __m256i round = _mm256_set1_epi16(64);
		__m256i bgalpha = _mm256_srli_epi16(_mm256_add_epi16(_mm256_add_epi16(_mm256_mullo_epi16(destalpha, alpha), _mm256_slli_epi16(inv_alpha, 8)), round), 7);
		__m256i fgalpha = _mm256_srli_epi16(_mm256_add_epi16(_mm256_mullo_epi16(srcalpha, alpha), round), 7);

		fgcolor = _mm256_mullo_epi16(fgcolor, fgalpha);
		bgcolor = _mm256_mullo_epi16(bgcolor, bgalpha);

		// Widen to 32 bits so the subtracting modes can go negative
		__m256i fg_lo = _mm256_unpacklo_epi16(fgcolor, _mm256_setzero_si256());
		__m256i bg_lo = _mm256_unpacklo_epi16(bgcolor, _mm256_setzero_si256());
		__m256i fg_hi = _mm256_unpackhi_epi16(fgcolor, _mm256_setzero_si256());
		__m256i bg_hi = _mm256_unpackhi_epi16(bgcolor, _mm256_setzero_si256());

		__m256i out_lo, out_hi;
		if (BlendT::Mode == (int)BlendModes::AddClamp)
		{
			out_lo = _mm256_add_epi32(fg_lo, bg_lo);
			out_hi = _mm256_add_epi32(fg_hi, bg_hi);
		}
		else if (BlendT::Mode == (int)BlendModes::SubClamp)
		{
			out_lo = _mm256_sub_epi32(fg_lo, bg_lo);
			out_hi = _mm256_sub_epi32(fg_hi, bg_hi);
		}
		else if (BlendT::Mode == (int)BlendModes::RevSubClamp)
		{
			out_lo = _mm256_sub_epi32(bg_lo, fg_lo);
			out_hi = _mm256_sub_epi32(bg_hi, fg_hi);
		}

		out_lo = _mm256_srai_epi32(out_lo, 8);
		out_hi = _mm256_srai_epi32(out_hi, 8);

		// Signed pack followed by unsigned pack clamps to 0..255
		__m256i outcolor = _mm256_packs_epi32(out_lo, out_hi);
		outcolor = _mm256_packus_epi16(outcolor, _mm256_setzero_si256());
		outcolor = _mm256_or_si256(outcolor, _mm256_set1_epi32(0xff000000));
		return outcolor;
	}
}
}
template<typename BlendT, typename SamplerT>
class TriScreenDrawer32_AVX2
{
public:
static void Execute(int x, int y, uint32_t mask0, uint32_t mask1, const TriDrawTriangleArgs *args)
{
using namespace TriScreenDrawerModes;
bool is_simple_shade = args->uniforms->SimpleShade();
if (SamplerT::Mode == (int)Samplers::Texture)
{
bool is_nearest_filter = args->uniforms->NearestFilter();
if (is_simple_shade)
{
if (is_nearest_filter)
DrawBlock<SimpleShade, NearestFilter>(x, y, mask0, mask1, args);
else
DrawBlock<SimpleShade, LinearFilter>(x, y, mask0, mask1, args);
}
else
{
if (is_nearest_filter)
DrawBlock<AdvancedShade, NearestFilter>(x, y, mask0, mask1, args);
else
DrawBlock<AdvancedShade, LinearFilter>(x, y, mask0, mask1, args);
}
}
else if (SamplerT::Mode == (int)Samplers::Fuzz)
{
DrawBlock<NoShade, NearestFilter>(x, y, mask0, mask1, args);
}
else // no linear filtering for translated, shaded, stencil, fill or skycap
{
if (is_simple_shade)
{
DrawBlock<SimpleShade, NearestFilter>(x, y, mask0, mask1, args);
}
else
{
DrawBlock<AdvancedShade, NearestFilter>(x, y, mask0, mask1, args);
}
}
}
private:
template<typename ShadeModeT, typename FilterModeT>
FORCEINLINE static void VECTORCALL DrawBlock(int destX, int destY, uint32_t mask0, uint32_t mask1, const TriDrawTriangleArgs *args)
{
using namespace TriScreenDrawerModes;
bool is_fixed_light = args->uniforms->FixedLight();
__m128i lightmask = _mm_set1_epi32(is_fixed_light ? 0 : 0xffffffff);
__m256i srcalpha = _mm256_set1_epi16(args->uniforms->SrcAlpha());
__m256i destalpha = _mm256_set1_epi16(args->uniforms->DestAlpha());
int fuzzpos = (ScreenTriangle::FuzzStart + destX * 123 + destY) % FUZZTABLE;
// Light
uint32_t light = args->uniforms->Light();
float shade = MIN(2.0f - (light + 12.0f) / 128.0f, 31.0f / 32.0f);
float globVis = args->uniforms->GlobVis() * (1.0f / 32.0f);
light += (light >> 7); // 255 -> 256
light <<= 8;
__m128i fixedlight = _mm_set1_epi32(light);
// Calculate gradients
const TriVertex &v1 = *args->v1;
__m128 gradientX = _mm_setr_ps(args->gradientX.W, args->gradientX.U, args->gradientX.V, 0.0f);
__m128 gradientY = _mm_setr_ps(args->gradientY.W, args->gradientY.U, args->gradientY.V, 0.0f);
__m128 blockPosY = _mm_add_ps(_mm_add_ps(
_mm_setr_ps(v1.w, v1.u * v1.w, v1.v * v1.w, globVis),
_mm_mul_ps(gradientX, _mm_set1_ps(destX - v1.x))),
_mm_mul_ps(gradientY, _mm_set1_ps(destY - v1.y)));
gradientX = _mm_mul_ps(gradientX, _mm_set1_ps(8.0f));
// Output
uint32_t * RESTRICT destOrg = (uint32_t*)args->dest;
int pitch = args->pitch;
uint32_t *dest = destOrg + destX + destY * pitch;
int offset_next_line = pitch - 8;
// Sampling stuff
uint32_t color = args->uniforms->Color();
const uint32_t * RESTRICT translation = (const uint32_t *)args->uniforms->Translation();
const uint32_t * RESTRICT texPixels = (const uint32_t *)args->uniforms->TexturePixels();
uint32_t texWidth = args->uniforms->TextureWidth();
uint32_t texHeight = args->uniforms->TextureHeight();
uint32_t oneU, oneV;
if (SamplerT::Mode != (int)Samplers::Fill)
{
oneU = ((0x800000 + texWidth - 1) / texWidth) * 2 + 1;
oneV = ((0x800000 + texHeight - 1) / texHeight) * 2 + 1;
}
else
{
oneU = 0;
oneV = 0;
}
// Shade constants
__m256i inv_desaturate, shade_fade, shade_light;
__m256i desaturate;
if (ShadeModeT::Mode == (int)ShadeMode::Advanced)
{
inv_desaturate = _mm256_setr_epi16(
256, 256 - args->uniforms->ShadeDesaturate(), 256 - args->uniforms->ShadeDesaturate(), 256 - args->uniforms->ShadeDesaturate(),
256, 256 - args->uniforms->ShadeDesaturate(), 256 - args->uniforms->ShadeDesaturate(), 256 - args->uniforms->ShadeDesaturate(),
256, 256 - args->uniforms->ShadeDesaturate(), 256 - args->uniforms->ShadeDesaturate(), 256 - args->uniforms->ShadeDesaturate(),
256, 256 - args->uniforms->ShadeDesaturate(), 256 - args->uniforms->ShadeDesaturate(), 256 - args->uniforms->ShadeDesaturate());
shade_fade = _mm256_set_epi16(
args->uniforms->ShadeFadeAlpha(), args->uniforms->ShadeFadeRed(), args->uniforms->ShadeFadeGreen(), args->uniforms->ShadeFadeBlue(),
args->uniforms->ShadeFadeAlpha(), args->uniforms->ShadeFadeRed(), args->uniforms->ShadeFadeGreen(), args->uniforms->ShadeFadeBlue(),
args->uniforms->ShadeFadeAlpha(), args->uniforms->ShadeFadeRed(), args->uniforms->ShadeFadeGreen(), args->uniforms->ShadeFadeBlue(),
args->uniforms->ShadeFadeAlpha(), args->uniforms->ShadeFadeRed(), args->uniforms->ShadeFadeGreen(), args->uniforms->ShadeFadeBlue());
shade_light = _mm256_set_epi16(
args->uniforms->ShadeLightAlpha(), args->uniforms->ShadeLightRed(), args->uniforms->ShadeLightGreen(), args->uniforms->ShadeLightBlue(),
args->uniforms->ShadeLightAlpha(), args->uniforms->ShadeLightRed(), args->uniforms->ShadeLightGreen(), args->uniforms->ShadeLightBlue(),
args->uniforms->ShadeLightAlpha(), args->uniforms->ShadeLightRed(), args->uniforms->ShadeLightGreen(), args->uniforms->ShadeLightBlue(),
args->uniforms->ShadeLightAlpha(), args->uniforms->ShadeLightRed(), args->uniforms->ShadeLightGreen(), args->uniforms->ShadeLightBlue());
desaturate = _mm256_sub_epi16(_mm256_set1_epi16(256), inv_desaturate);
}
else
{
inv_desaturate = _mm256_setzero_si256();
shade_fade = _mm256_setzero_si256();
shade_fade = _mm256_setzero_si256();
shade_light = _mm256_setzero_si256();
desaturate = _mm256_setzero_si256();
}
if (mask0 == 0xffffffff && mask1 == 0xffffffff)
{
for (int y = 0; y < 8; y++)
{
__m128 blockPosX = _mm_add_ps(blockPosY, gradientX);
__m128 W = _mm_shuffle_ps(blockPosY, blockPosX, _MM_SHUFFLE(0, 0, 0, 0));
__m128 rcpW = _mm_div_ps(_mm_set1_ps((float)0x01000000), W);
__m128i posUV = _mm_cvtps_epi32(_mm_mul_ps(_mm_shuffle_ps(blockPosY, blockPosX, _MM_SHUFFLE(2, 1, 2, 1)), rcpW));
__m128 vis = _mm_mul_ps(_mm_shuffle_ps(blockPosY, blockPosX, _MM_SHUFFLE(3, 3, 3, 3)), W);
__m128i lightpospair = _mm_sub_epi32(
_mm_set1_epi32(FRACUNIT),
_mm_cvtps_epi32(_mm_mul_ps(
_mm_max_ps(_mm_sub_ps(_mm_set1_ps(shade), _mm_min_ps(_mm_set1_ps(24.0f / 32.0f), vis)), _mm_setzero_ps()),
_mm_set1_ps((float)FRACUNIT))));
lightpospair = _mm_or_si128(_mm_and_si128(lightmask, lightpospair), _mm_andnot_si128(lightmask, fixedlight));
int32_t posU = _mm_cvtsi128_si32(posUV);
int32_t posV = _mm_cvtsi128_si32(_mm_srli_si128(posUV, 4));
int32_t nextU = _mm_cvtsi128_si32(_mm_srli_si128(posUV, 8));
int32_t nextV = _mm_cvtsi128_si32(_mm_srli_si128(posUV, 12));
int32_t lightpos = _mm_cvtsi128_si32(lightpospair);
int32_t lightnext = _mm_cvtsi128_si32(_mm_srli_si128(lightpospair, 8));
int32_t stepU = (nextU - posU) >> 3;
int32_t stepV = (nextV - posV) >> 3;
fixed_t lightstep = (lightnext - lightpos) >> 3;
for (int ix = 0; ix < 2; ix++)
{
// Load bgcolor
__m256i bgcolor;
if (BlendT::Mode != (int)BlendModes::Opaque)
{
__m128i bgpacked = _mm_loadu_si128((__m128i*)dest);
bgcolor = _mm256_set_m128i(_mm_unpackhi_epi8(bgpacked, _mm_setzero_si128()), _mm_unpacklo_epi8(bgpacked, _mm_setzero_si128()));
}
else
bgcolor = _mm256_setzero_si256();
// Sample fgcolor
unsigned int ifgcolor0 = Sample32_AVX2<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
unsigned int ifgshade0 = SampleShade32_AVX2<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
posU += stepU;
posV += stepV;
unsigned int ifgcolor1 = Sample32_AVX2<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
unsigned int ifgshade1 = SampleShade32_AVX2<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
posU += stepU;
posV += stepV;
unsigned int ifgcolor2 = Sample32_AVX2<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
unsigned int ifgshade2 = SampleShade32_AVX2<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
posU += stepU;
posV += stepV;
unsigned int ifgcolor3 = Sample32_AVX2<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
unsigned int ifgshade3 = SampleShade32_AVX2<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
posU += stepU;
posV += stepV;
// Setup light
int lightpos0 = lightpos >> 8;
lightpos += lightstep;
int lightpos1 = lightpos >> 8;
lightpos += lightstep;
int lightpos2 = lightpos >> 8;
lightpos += lightstep;
int lightpos3 = lightpos >> 8;
lightpos += lightstep;
__m256i mlight = _mm256_set_epi16(
256, lightpos3, lightpos3, lightpos3,
256, lightpos2, lightpos2, lightpos2,
256, lightpos1, lightpos1, lightpos1,
256, lightpos0, lightpos0, lightpos0);
__m256i shade_fade_lit;
if (ShadeModeT::Mode == (int)ShadeMode::Advanced)
{
__m256i inv_light = _mm256_sub_epi16(_mm256_set_epi16(0, 256, 256, 256, 0, 256, 256, 256, 0, 256, 256, 256, 0, 256, 256, 256), mlight);
shade_fade_lit = _mm256_mullo_epi16(shade_fade, inv_light);
}
else
{
shade_fade_lit = _mm256_setzero_si256();
}
// Shade and blend
__m128i fgpacked = _mm_set_epi32(ifgcolor3, ifgcolor2, ifgcolor1, ifgcolor0);
__m128i shadepacked = _mm_set_epi32(ifgshade3, ifgshade2, ifgshade1, ifgshade0);
__m256i mifgcolor = _mm256_set_m128i(_mm_unpackhi_epi8(fgpacked, _mm_setzero_si128()), _mm_unpacklo_epi8(fgpacked, _mm_setzero_si128()));
__m256i mifgshade = _mm256_set_m128i(_mm_unpackhi_epi32(shadepacked, shadepacked), _mm_unpacklo_epi32(shadepacked, shadepacked));
__m256i fgcolor = mifgcolor;
fgcolor = Shade32_AVX2<ShadeModeT>(fgcolor, mlight, desaturate, inv_desaturate, shade_fade_lit, shade_light);
__m256i outcolor = Blend32_AVX2<BlendT>(fgcolor, bgcolor, mifgcolor, mifgshade, srcalpha, destalpha);
// Store result
_mm_storeu_si128((__m128i*)dest, _mm_or_si128(_mm256_extracti128_si256(outcolor, 0), _mm_slli_si128(_mm256_extracti128_si256(outcolor, 1), 8)));
dest += 4;
}
blockPosY = _mm_add_ps(blockPosY, gradientY);
dest += offset_next_line;
}
}
else
{
// mask0 loop:
for (int y = 0; y < 4; y++)
{
__m128 blockPosX = _mm_add_ps(blockPosY, gradientX);
__m128 W = _mm_shuffle_ps(blockPosY, blockPosX, _MM_SHUFFLE(0, 0, 0, 0));
__m128 rcpW = _mm_div_ps(_mm_set1_ps((float)0x01000000), W);
__m128i posUV = _mm_cvtps_epi32(_mm_mul_ps(_mm_shuffle_ps(blockPosY, blockPosX, _MM_SHUFFLE(2, 1, 2, 1)), rcpW));
__m128 vis = _mm_mul_ps(_mm_shuffle_ps(blockPosY, blockPosX, _MM_SHUFFLE(3, 3, 3, 3)), W);
__m128i lightpospair = _mm_sub_epi32(
_mm_set1_epi32(FRACUNIT),
_mm_cvtps_epi32(_mm_mul_ps(
_mm_max_ps(_mm_sub_ps(_mm_set1_ps(shade), _mm_min_ps(_mm_set1_ps(24.0f / 32.0f), vis)), _mm_setzero_ps()),
_mm_set1_ps((float)FRACUNIT))));
lightpospair = _mm_or_si128(_mm_and_si128(lightmask, lightpospair), _mm_andnot_si128(lightmask, fixedlight));
int32_t posU = _mm_cvtsi128_si32(posUV);
int32_t posV = _mm_cvtsi128_si32(_mm_srli_si128(posUV, 4));
int32_t nextU = _mm_cvtsi128_si32(_mm_srli_si128(posUV, 8));
int32_t nextV = _mm_cvtsi128_si32(_mm_srli_si128(posUV, 12));
int32_t lightpos = _mm_cvtsi128_si32(lightpospair);
int32_t lightnext = _mm_cvtsi128_si32(_mm_srli_si128(lightpospair, 8));
int32_t stepU = (nextU - posU) >> 3;
int32_t stepV = (nextV - posV) >> 3;
fixed_t lightstep = (lightnext - lightpos) >> 3;
for (int x = 0; x < 2; x++)
{
// Load bgcolor
uint32_t desttmp[4];
__m256i bgcolor;
if (BlendT::Mode != (int)BlendModes::Opaque)
{
if (mask0 & (1 << 31)) desttmp[0] = dest[0];
if (mask0 & (1 << 30)) desttmp[1] = dest[1];
if (mask0 & (1 << 29)) desttmp[2] = dest[2];
if (mask0 & (1 << 28)) desttmp[3] = dest[3];
__m128i bgpacked = _mm_loadu_si128((__m128i*)(desttmp));
bgcolor = _mm256_set_m128i(_mm_unpackhi_epi8(bgpacked, _mm_setzero_si128()), _mm_unpacklo_epi8(bgpacked, _mm_setzero_si128()));
}
else
bgcolor = _mm256_setzero_si256();
// Sample fgcolor
unsigned int ifgcolor0 = Sample32_AVX2<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
unsigned int ifgshade0 = SampleShade32_AVX2<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
posU += stepU;
posV += stepV;
unsigned int ifgcolor1 = Sample32_AVX2<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
unsigned int ifgshade1 = SampleShade32_AVX2<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
posU += stepU;
posV += stepV;
unsigned int ifgcolor2 = Sample32_AVX2<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
unsigned int ifgshade2 = SampleShade32_AVX2<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
posU += stepU;
posV += stepV;
unsigned int ifgcolor3 = Sample32_AVX2<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
unsigned int ifgshade3 = SampleShade32_AVX2<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
posU += stepU;
posV += stepV;
// Setup light
int lightpos0 = lightpos >> 8;
lightpos += lightstep;
int lightpos1 = lightpos >> 8;
lightpos += lightstep;
int lightpos2 = lightpos >> 8;
lightpos += lightstep;
int lightpos3 = lightpos >> 8;
lightpos += lightstep;
__m256i mlight = _mm256_set_epi16(
256, lightpos3, lightpos3, lightpos3,
256, lightpos2, lightpos2, lightpos2,
256, lightpos1, lightpos1, lightpos1,
256, lightpos0, lightpos0, lightpos0);
__m256i shade_fade_lit;
if (ShadeModeT::Mode == (int)ShadeMode::Advanced)
{
__m256i inv_light = _mm256_sub_epi16(_mm256_set_epi16(0, 256, 256, 256, 0, 256, 256, 256, 0, 256, 256, 256, 0, 256, 256, 256), mlight);
shade_fade_lit = _mm256_mullo_epi16(shade_fade, inv_light);
}
else
{
shade_fade_lit = _mm256_setzero_si256();
}
// Shade and blend
__m128i fgpacked = _mm_set_epi32(ifgcolor3, ifgcolor2, ifgcolor1, ifgcolor0);
__m128i shadepacked = _mm_set_epi32(ifgshade3, ifgshade2, ifgshade1, ifgshade0);
__m256i mifgcolor = _mm256_set_m128i(_mm_unpackhi_epi8(fgpacked, _mm_setzero_si128()), _mm_unpacklo_epi8(fgpacked, _mm_setzero_si128()));
__m256i mifgshade = _mm256_set_m128i(_mm_unpackhi_epi32(shadepacked, shadepacked), _mm_unpacklo_epi32(shadepacked, shadepacked));
__m256i fgcolor = mifgcolor;
fgcolor = Shade32_AVX2<ShadeModeT>(fgcolor, mlight, desaturate, inv_desaturate, shade_fade_lit, shade_light);
__m256i outcolor = Blend32_AVX2<BlendT>(fgcolor, bgcolor, mifgcolor, mifgshade, srcalpha, destalpha);
// Store result
_mm_storeu_si128((__m128i*)desttmp, _mm_or_si128(_mm256_extracti128_si256(outcolor, 0), _mm_slli_si128(_mm256_extracti128_si256(outcolor, 1), 8)));
if (mask0 & (1 << 31)) dest[0] = desttmp[0];
if (mask0 & (1 << 30)) dest[1] = desttmp[1];
if (mask0 & (1 << 29)) dest[2] = desttmp[2];
if (mask0 & (1 << 28)) dest[3] = desttmp[3];
mask0 <<= 4;
dest += 4;
}
blockPosY = _mm_add_ps(blockPosY, gradientY);
dest += offset_next_line;
}
// mask1 loop:
for (int y = 0; y < 4; y++)
{
__m128 blockPosX = _mm_add_ps(blockPosY, gradientX);
__m128 W = _mm_shuffle_ps(blockPosY, blockPosX, _MM_SHUFFLE(0, 0, 0, 0));
__m128 rcpW = _mm_div_ps(_mm_set1_ps((float)0x01000000), W);
__m128i posUV = _mm_cvtps_epi32(_mm_mul_ps(_mm_shuffle_ps(blockPosY, blockPosX, _MM_SHUFFLE(2, 1, 2, 1)), rcpW));
__m128 vis = _mm_mul_ps(_mm_shuffle_ps(blockPosY, blockPosX, _MM_SHUFFLE(3, 3, 3, 3)), W);
__m128i lightpospair = _mm_sub_epi32(
_mm_set1_epi32(FRACUNIT),
_mm_cvtps_epi32(_mm_mul_ps(
_mm_max_ps(_mm_sub_ps(_mm_set1_ps(shade), _mm_min_ps(_mm_set1_ps(24.0f / 32.0f), vis)), _mm_setzero_ps()),
_mm_set1_ps((float)FRACUNIT))));
lightpospair = _mm_or_si128(_mm_and_si128(lightmask, lightpospair), _mm_andnot_si128(lightmask, fixedlight));
int32_t posU = _mm_cvtsi128_si32(posUV);
int32_t posV = _mm_cvtsi128_si32(_mm_srli_si128(posUV, 4));
int32_t nextU = _mm_cvtsi128_si32(_mm_srli_si128(posUV, 8));
int32_t nextV = _mm_cvtsi128_si32(_mm_srli_si128(posUV, 12));
int32_t lightpos = _mm_cvtsi128_si32(lightpospair);
int32_t lightnext = _mm_cvtsi128_si32(_mm_srli_si128(lightpospair, 8));
int32_t stepU = (nextU - posU) >> 3;
int32_t stepV = (nextV - posV) >> 3;
fixed_t lightstep = (lightnext - lightpos) >> 3;
for (int x = 0; x < 2; x++)
{
// Load bgcolor
uint32_t desttmp[4];
__m256i bgcolor;
if (BlendT::Mode != (int)BlendModes::Opaque)
{
if (mask1 & (1 << 31)) desttmp[0] = dest[0];
if (mask1 & (1 << 30)) desttmp[1] = dest[1];
if (mask1 & (1 << 29)) desttmp[2] = dest[2];
if (mask1 & (1 << 28)) desttmp[3] = dest[3];
__m128i bgpacked = _mm_loadu_si128((__m128i*)(desttmp));
bgcolor = _mm256_set_m128i(_mm_unpackhi_epi8(bgpacked, _mm_setzero_si128()), _mm_unpacklo_epi8(bgpacked, _mm_setzero_si128()));
}
else
bgcolor = _mm256_setzero_si256();
// Sample fgcolor
unsigned int ifgcolor0 = Sample32_AVX2<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
unsigned int ifgshade0 = SampleShade32_AVX2<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
posU += stepU;
posV += stepV;
unsigned int ifgcolor1 = Sample32_AVX2<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
unsigned int ifgshade1 = SampleShade32_AVX2<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
posU += stepU;
posV += stepV;
unsigned int ifgcolor2 = Sample32_AVX2<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
unsigned int ifgshade2 = SampleShade32_AVX2<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
posU += stepU;
posV += stepV;
unsigned int ifgcolor3 = Sample32_AVX2<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
unsigned int ifgshade3 = SampleShade32_AVX2<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
posU += stepU;
posV += stepV;
// Setup light
int lightpos0 = lightpos >> 8;
lightpos += lightstep;
int lightpos1 = lightpos >> 8;
lightpos += lightstep;
int lightpos2 = lightpos >> 8;
lightpos += lightstep;
int lightpos3 = lightpos >> 8;
lightpos += lightstep;
__m256i mlight = _mm256_set_epi16(
256, lightpos3, lightpos3, lightpos3,
256, lightpos2, lightpos2, lightpos2,
256, lightpos1, lightpos1, lightpos1,
256, lightpos0, lightpos0, lightpos0);
__m256i shade_fade_lit;
if (ShadeModeT::Mode == (int)ShadeMode::Advanced)
{
__m256i inv_light = _mm256_sub_epi16(_mm256_set_epi16(0, 256, 256, 256, 0, 256, 256, 256, 0, 256, 256, 256, 0, 256, 256, 256), mlight);
shade_fade_lit = _mm256_mullo_epi16(shade_fade, inv_light);
}
else
{
shade_fade_lit = _mm256_setzero_si256();
}
// Shade and blend
__m128i fgpacked = _mm_set_epi32(ifgcolor3, ifgcolor2, ifgcolor1, ifgcolor0);
__m128i shadepacked = _mm_set_epi32(ifgshade3, ifgshade2, ifgshade1, ifgshade0);
__m256i mifgcolor = _mm256_set_m128i(_mm_unpackhi_epi8(fgpacked, _mm_setzero_si128()), _mm_unpacklo_epi8(fgpacked, _mm_setzero_si128()));
__m256i mifgshade = _mm256_set_m128i(_mm_unpackhi_epi32(shadepacked, shadepacked), _mm_unpacklo_epi32(shadepacked, shadepacked));
__m256i fgcolor = mifgcolor;
fgcolor = Shade32_AVX2<ShadeModeT>(fgcolor, mlight, desaturate, inv_desaturate, shade_fade_lit, shade_light);
__m256i outcolor = Blend32_AVX2<BlendT>(fgcolor, bgcolor, mifgcolor, mifgshade, srcalpha, destalpha);
// Store result
_mm_storeu_si128((__m128i*)desttmp, _mm_or_si128(_mm256_extracti128_si256(outcolor, 0), _mm_slli_si128(_mm256_extracti128_si256(outcolor, 1), 8)));
if (mask1 & (1 << 31)) dest[0] = desttmp[0];
if (mask1 & (1 << 30)) dest[1] = desttmp[1];
if (mask1 & (1 << 29)) dest[2] = desttmp[2];
if (mask1 & (1 << 28)) dest[3] = desttmp[3];
mask1 <<= 4;
dest += 4;
}
blockPosY = _mm_add_ps(blockPosY, gradientY);
dest += offset_next_line;
}
}
}
};

View file

@ -27,7 +27,7 @@
namespace TriScreenDrawerModes namespace TriScreenDrawerModes
{ {
template<typename SamplerT, typename FilterModeT> template<typename SamplerT, typename FilterModeT>
FORCEINLINE unsigned int VECTORCALL Sample32(int32_t u, int32_t v, const uint32_t *texPixels, int texWidth, int texHeight, uint32_t oneU, uint32_t oneV, uint32_t color, const uint32_t *translation) FORCEINLINE unsigned int VECTORCALL Sample32_SSE2(int32_t u, int32_t v, const uint32_t *texPixels, int texWidth, int texHeight, uint32_t oneU, uint32_t oneV, uint32_t color, const uint32_t *translation)
{ {
uint32_t texel; uint32_t texel;
if (SamplerT::Mode == (int)Samplers::Shaded || SamplerT::Mode == (int)Samplers::Stencil || SamplerT::Mode == (int)Samplers::Fill || SamplerT::Mode == (int)Samplers::Fuzz) if (SamplerT::Mode == (int)Samplers::Shaded || SamplerT::Mode == (int)Samplers::Stencil || SamplerT::Mode == (int)Samplers::Fill || SamplerT::Mode == (int)Samplers::Fuzz)
@ -107,7 +107,7 @@ namespace TriScreenDrawerModes
} }
template<typename SamplerT> template<typename SamplerT>
FORCEINLINE unsigned int VECTORCALL SampleShade32(int32_t u, int32_t v, const uint32_t *texPixels, int texWidth, int texHeight, int &fuzzpos) FORCEINLINE unsigned int VECTORCALL SampleShade32_SSE2(int32_t u, int32_t v, const uint32_t *texPixels, int texWidth, int texHeight, int &fuzzpos)
{ {
if (SamplerT::Mode == (int)Samplers::Shaded) if (SamplerT::Mode == (int)Samplers::Shaded)
{ {
@ -143,7 +143,7 @@ namespace TriScreenDrawerModes
} }
template<typename ShadeModeT> template<typename ShadeModeT>
FORCEINLINE __m128i VECTORCALL Shade32(__m128i fgcolor, __m128i mlight, unsigned int ifgcolor0, unsigned int ifgcolor1, int desaturate, __m128i inv_desaturate, __m128i shade_fade, __m128i shade_light) FORCEINLINE __m128i VECTORCALL Shade32_SSE2(__m128i fgcolor, __m128i mlight, unsigned int ifgcolor0, unsigned int ifgcolor1, int desaturate, __m128i inv_desaturate, __m128i shade_fade, __m128i shade_light)
{ {
if (ShadeModeT::Mode == (int)ShadeMode::Simple) if (ShadeModeT::Mode == (int)ShadeMode::Simple)
{ {
@ -172,7 +172,7 @@ namespace TriScreenDrawerModes
} }
template<typename BlendT> template<typename BlendT>
FORCEINLINE __m128i VECTORCALL Blend32(__m128i fgcolor, __m128i bgcolor, unsigned int ifgcolor0, unsigned int ifgcolor1, unsigned int ifgshade0, unsigned int ifgshade1, uint32_t srcalpha, uint32_t destalpha) FORCEINLINE __m128i VECTORCALL Blend32_SSE2(__m128i fgcolor, __m128i bgcolor, unsigned int ifgcolor0, unsigned int ifgcolor1, unsigned int ifgshade0, unsigned int ifgshade1, uint32_t srcalpha, uint32_t destalpha)
{ {
if (BlendT::Mode == (int)BlendModes::Opaque) if (BlendT::Mode == (int)BlendModes::Opaque)
{ {
@ -275,7 +275,7 @@ namespace TriScreenDrawerModes
} }
template<typename BlendT, typename SamplerT> template<typename BlendT, typename SamplerT>
class TriScreenDrawer32 class TriScreenDrawer32_SSE2
{ {
public: public:
static void Execute(int x, int y, uint32_t mask0, uint32_t mask1, const TriDrawTriangleArgs *args) static void Execute(int x, int y, uint32_t mask0, uint32_t mask1, const TriDrawTriangleArgs *args)
@ -430,13 +430,13 @@ private:
// Sample fgcolor // Sample fgcolor
unsigned int ifgcolor[2], ifgshade[2]; unsigned int ifgcolor[2], ifgshade[2];
ifgcolor[0] = Sample32<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); ifgcolor[0] = Sample32_SSE2<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
ifgshade[0] = SampleShade32<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos); ifgshade[0] = SampleShade32_SSE2<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
posU += stepU; posU += stepU;
posV += stepV; posV += stepV;
ifgcolor[1] = Sample32<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); ifgcolor[1] = Sample32_SSE2<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
ifgshade[1] = SampleShade32<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos); ifgshade[1] = SampleShade32_SSE2<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
posU += stepU; posU += stepU;
posV += stepV; posV += stepV;
@ -460,8 +460,8 @@ private:
// Shade and blend // Shade and blend
__m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128());
fgcolor = Shade32<ShadeModeT>(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light); fgcolor = Shade32_SSE2<ShadeModeT>(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light);
__m128i outcolor = Blend32<BlendT>(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha); __m128i outcolor = Blend32_SSE2<BlendT>(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha);
// Store result // Store result
_mm_storel_epi64((__m128i*)(dest + ix * 2), outcolor); _mm_storel_epi64((__m128i*)(dest + ix * 2), outcolor);
@ -517,13 +517,13 @@ private:
// Sample fgcolor // Sample fgcolor
unsigned int ifgcolor[2], ifgshade[2]; unsigned int ifgcolor[2], ifgshade[2];
ifgcolor[0] = Sample32<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); ifgcolor[0] = Sample32_SSE2<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
ifgshade[0] = SampleShade32<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos); ifgshade[0] = SampleShade32_SSE2<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
posU += stepU; posU += stepU;
posV += stepV; posV += stepV;
ifgcolor[1] = Sample32<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); ifgcolor[1] = Sample32_SSE2<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
ifgshade[1] = SampleShade32<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos); ifgshade[1] = SampleShade32_SSE2<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
posU += stepU; posU += stepU;
posV += stepV; posV += stepV;
@ -547,8 +547,8 @@ private:
// Shade and blend // Shade and blend
__m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128());
fgcolor = Shade32<ShadeModeT>(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light); fgcolor = Shade32_SSE2<ShadeModeT>(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light);
__m128i outcolor = Blend32<BlendT>(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha); __m128i outcolor = Blend32_SSE2<BlendT>(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha);
// Store result // Store result
_mm_storel_epi64((__m128i*)desttmp, outcolor); _mm_storel_epi64((__m128i*)desttmp, outcolor);
@ -606,13 +606,13 @@ private:
// Sample fgcolor // Sample fgcolor
unsigned int ifgcolor[2], ifgshade[2]; unsigned int ifgcolor[2], ifgshade[2];
ifgcolor[0] = Sample32<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); ifgcolor[0] = Sample32_SSE2<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
ifgshade[0] = SampleShade32<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos); ifgshade[0] = SampleShade32_SSE2<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
posU += stepU; posU += stepU;
posV += stepV; posV += stepV;
ifgcolor[1] = Sample32<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); ifgcolor[1] = Sample32_SSE2<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
ifgshade[1] = SampleShade32<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos); ifgshade[1] = SampleShade32_SSE2<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
posU += stepU; posU += stepU;
posV += stepV; posV += stepV;
@ -636,8 +636,8 @@ private:
// Shade and blend // Shade and blend
__m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128());
fgcolor = Shade32<ShadeModeT>(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light); fgcolor = Shade32_SSE2<ShadeModeT>(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light);
__m128i outcolor = Blend32<BlendT>(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha); __m128i outcolor = Blend32_SSE2<BlendT>(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha);
// Store result // Store result
_mm_storel_epi64((__m128i*)desttmp, outcolor); _mm_storel_epi64((__m128i*)desttmp, outcolor);
@ -658,7 +658,7 @@ private:
}; };
template<typename BlendT, typename SamplerT> template<typename BlendT, typename SamplerT>
class RectScreenDrawer32 class RectScreenDrawer32_SSE2
{ {
public: public:
static void Execute(const void *destOrg, int destWidth, int destHeight, int destPitch, const RectDrawArgs *args, WorkerThreadData *thread) static void Execute(const void *destOrg, int destWidth, int destHeight, int destPitch, const RectDrawArgs *args, WorkerThreadData *thread)
@ -780,18 +780,18 @@ private:
// Sample fgcolor // Sample fgcolor
unsigned int ifgcolor[2], ifgshade[2]; unsigned int ifgcolor[2], ifgshade[2];
ifgcolor[0] = Sample32<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); ifgcolor[0] = Sample32_SSE2<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
ifgshade[0] = SampleShade32<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos); ifgshade[0] = SampleShade32_SSE2<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
posU += stepU; posU += stepU;
ifgcolor[1] = Sample32<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); ifgcolor[1] = Sample32_SSE2<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
ifgshade[1] = SampleShade32<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos); ifgshade[1] = SampleShade32_SSE2<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
posU += stepU; posU += stepU;
// Shade and blend // Shade and blend
__m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128());
fgcolor = Shade32<ShadeModeT>(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light); fgcolor = Shade32_SSE2<ShadeModeT>(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light);
__m128i outcolor = Blend32<BlendT>(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha); __m128i outcolor = Blend32_SSE2<BlendT>(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha);
// Store result // Store result
_mm_storel_epi64((__m128i*)dest, outcolor); _mm_storel_epi64((__m128i*)dest, outcolor);
@ -809,16 +809,16 @@ private:
// Sample fgcolor // Sample fgcolor
unsigned int ifgcolor[2], ifgshade[2]; unsigned int ifgcolor[2], ifgshade[2];
ifgcolor[0] = Sample32<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); ifgcolor[0] = Sample32_SSE2<SamplerT, FilterModeT>(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation);
ifgshade[0] = SampleShade32<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos); ifgshade[0] = SampleShade32_SSE2<SamplerT>(posU, posV, texPixels, texWidth, texHeight, fuzzpos);
ifgcolor[1] = 0; ifgcolor[1] = 0;
ifgshade[1] = 0; ifgshade[1] = 0;
posU += stepU; posU += stepU;
// Shade and blend // Shade and blend
__m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128());
fgcolor = Shade32<ShadeModeT>(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light); fgcolor = Shade32_SSE2<ShadeModeT>(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light);
__m128i outcolor = Blend32<BlendT>(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha); __m128i outcolor = Blend32_SSE2<BlendT>(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha);
// Store result // Store result
*dest = _mm_cvtsi128_si32(outcolor); *dest = _mm_cvtsi128_si32(outcolor);

View file

@ -37,6 +37,7 @@
#include "polyrenderer/poly_renderer.h" #include "polyrenderer/poly_renderer.h"
#include "swrenderer/drawers/r_draw_rgba.h" #include "swrenderer/drawers/r_draw_rgba.h"
#include "screen_triangle.h" #include "screen_triangle.h"
#include "x86.h"
int PolyTriangleDrawer::viewport_x; int PolyTriangleDrawer::viewport_x;
int PolyTriangleDrawer::viewport_y; int PolyTriangleDrawer::viewport_y;
@ -151,14 +152,8 @@ ShadedTriVertex PolyTriangleDrawer::shade_vertex(const TriMatrix &objectToClip,
return sv; return sv;
} }
void PolyTriangleDrawer::draw_shaded_triangle(const ShadedTriVertex *vert, bool ccw, TriDrawTriangleArgs *args, WorkerThreadData *thread) void PolyTriangleDrawer::clip_to_viewport(TriVertex *clippedvert, int numclipvert)
{ {
// Cull, clip and generate additional vertices as needed
TriVertex clippedvert[max_additional_vertices];
int numclipvert = clipedge(vert, clippedvert);
#ifdef NO_SSE
// Map to 2D viewport:
for (int j = 0; j < numclipvert; j++) for (int j = 0; j < numclipvert; j++)
{ {
auto &v = clippedvert[j]; auto &v = clippedvert[j];
@ -173,8 +168,11 @@ void PolyTriangleDrawer::draw_shaded_triangle(const ShadedTriVertex *vert, bool
v.x = viewport_x + viewport_width * (1.0f + v.x) * 0.5f; v.x = viewport_x + viewport_width * (1.0f + v.x) * 0.5f;
v.y = viewport_y + viewport_height * (1.0f - v.y) * 0.5f; v.y = viewport_y + viewport_height * (1.0f - v.y) * 0.5f;
} }
#else }
// Map to 2D viewport:
#ifndef NO_SSE
void PolyTriangleDrawer::clip_to_viewport_sse2(TriVertex *clippedvert, int numclipvert)
{
__m128 mviewport_x = _mm_set1_ps((float)viewport_x); __m128 mviewport_x = _mm_set1_ps((float)viewport_x);
__m128 mviewport_y = _mm_set1_ps((float)viewport_y); __m128 mviewport_y = _mm_set1_ps((float)viewport_y);
__m128 mviewport_halfwidth = _mm_set1_ps(viewport_width * 0.5f); __m128 mviewport_halfwidth = _mm_set1_ps(viewport_width * 0.5f);
@ -205,8 +203,21 @@ void PolyTriangleDrawer::draw_shaded_triangle(const ShadedTriVertex *vert, bool
_mm_storeu_ps(&clippedvert[j + 2].x, vz); _mm_storeu_ps(&clippedvert[j + 2].x, vz);
_mm_storeu_ps(&clippedvert[j + 3].x, vw); _mm_storeu_ps(&clippedvert[j + 3].x, vw);
} }
}
#endif #endif
void PolyTriangleDrawer::draw_shaded_triangle(const ShadedTriVertex *vert, bool ccw, TriDrawTriangleArgs *args, WorkerThreadData *thread)
{
// Cull, clip and generate additional vertices as needed
TriVertex clippedvert[max_additional_vertices];
int numclipvert = CPU.bSSE2 ? clipedge_sse2(vert, clippedvert) : clipedge(vert, clippedvert);
// Map to 2D viewport:
if (CPU.bSSE2)
clip_to_viewport_sse2(clippedvert, numclipvert);
else
clip_to_viewport(clippedvert, numclipvert);
// Keep varyings in -128 to 128 range if possible // Keep varyings in -128 to 128 range if possible
if (numclipvert > 0) if (numclipvert > 0)
{ {
@ -255,7 +266,6 @@ int PolyTriangleDrawer::clipedge(const ShadedTriVertex *verts, TriVertex *clippe
// halfspace clip distances // halfspace clip distances
static const int numclipdistances = 7; static const int numclipdistances = 7;
#ifdef NO_SSE
float clipdistance[numclipdistances * 3]; float clipdistance[numclipdistances * 3];
bool needsclipping = false; bool needsclipping = false;
float *clipd = clipdistance; float *clipd = clipdistance;
@ -282,43 +292,6 @@ int PolyTriangleDrawer::clipedge(const ShadedTriVertex *verts, TriVertex *clippe
} }
return 3; return 3;
} }
#else
__m128 mx = _mm_loadu_ps(&verts[0].x);
__m128 my = _mm_loadu_ps(&verts[1].x);
__m128 mz = _mm_loadu_ps(&verts[2].x);
__m128 mw = _mm_setzero_ps();
_MM_TRANSPOSE4_PS(mx, my, mz, mw);
__m128 clipd0 = _mm_add_ps(mx, mw);
__m128 clipd1 = _mm_sub_ps(mw, mx);
__m128 clipd2 = _mm_add_ps(my, mw);
__m128 clipd3 = _mm_sub_ps(mw, my);
__m128 clipd4 = _mm_add_ps(mz, mw);
__m128 clipd5 = _mm_sub_ps(mw, mz);
__m128 clipd6 = _mm_setr_ps(verts[0].clipDistance0, verts[1].clipDistance0, verts[2].clipDistance0, 0.0f);
__m128 mneedsclipping = _mm_cmplt_ps(clipd0, _mm_setzero_ps());
mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd1, _mm_setzero_ps()));
mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd2, _mm_setzero_ps()));
mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd3, _mm_setzero_ps()));
mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd4, _mm_setzero_ps()));
mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd5, _mm_setzero_ps()));
mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd6, _mm_setzero_ps()));
if (_mm_movemask_ps(mneedsclipping) == 0)
{
for (int i = 0; i < 3; i++)
{
memcpy(clippedvert + i, verts + i, sizeof(TriVertex));
}
return 3;
}
float clipdistance[numclipdistances * 4];
_mm_storeu_ps(clipdistance, clipd0);
_mm_storeu_ps(clipdistance + 4, clipd1);
_mm_storeu_ps(clipdistance + 8, clipd2);
_mm_storeu_ps(clipdistance + 12, clipd3);
_mm_storeu_ps(clipdistance + 16, clipd4);
_mm_storeu_ps(clipdistance + 20, clipd5);
_mm_storeu_ps(clipdistance + 24, clipd6);
#endif
// use barycentric weights while clipping vertices // use barycentric weights while clipping vertices
float weights[max_additional_vertices * 3 * 2]; float weights[max_additional_vertices * 3 * 2];
@ -341,7 +314,6 @@ int PolyTriangleDrawer::clipedge(const ShadedTriVertex *verts, TriVertex *clippe
for (int i = 0; i < inputverts; i++) for (int i = 0; i < inputverts; i++)
{ {
int j = (i + 1) % inputverts; int j = (i + 1) % inputverts;
#ifdef NO_SSE
float clipdistance1 = float clipdistance1 =
clipdistance[0 * numclipdistances + p] * input[i * 3 + 0] + clipdistance[0 * numclipdistances + p] * input[i * 3 + 0] +
clipdistance[1 * numclipdistances + p] * input[i * 3 + 1] + clipdistance[1 * numclipdistances + p] * input[i * 3 + 1] +
@ -351,17 +323,6 @@ int PolyTriangleDrawer::clipedge(const ShadedTriVertex *verts, TriVertex *clippe
clipdistance[0 * numclipdistances + p] * input[j * 3 + 0] + clipdistance[0 * numclipdistances + p] * input[j * 3 + 0] +
clipdistance[1 * numclipdistances + p] * input[j * 3 + 1] + clipdistance[1 * numclipdistances + p] * input[j * 3 + 1] +
clipdistance[2 * numclipdistances + p] * input[j * 3 + 2]; clipdistance[2 * numclipdistances + p] * input[j * 3 + 2];
#else
float clipdistance1 =
clipdistance[0 + p * 4] * input[i * 3 + 0] +
clipdistance[1 + p * 4] * input[i * 3 + 1] +
clipdistance[2 + p * 4] * input[i * 3 + 2];
float clipdistance2 =
clipdistance[0 + p * 4] * input[j * 3 + 0] +
clipdistance[1 + p * 4] * input[j * 3 + 1] +
clipdistance[2 + p * 4] * input[j * 3 + 2];
#endif
// Clip halfspace // Clip halfspace
if ((clipdistance1 >= 0.0f || clipdistance2 >= 0.0f) && outputverts + 1 < max_additional_vertices) if ((clipdistance1 >= 0.0f || clipdistance2 >= 0.0f) && outputverts + 1 < max_additional_vertices)
@ -408,6 +369,129 @@ int PolyTriangleDrawer::clipedge(const ShadedTriVertex *verts, TriVertex *clippe
return inputverts; return inputverts;
} }
#ifndef NO_SSE
int PolyTriangleDrawer::clipedge_sse2(const ShadedTriVertex *verts, TriVertex *clippedvert)
{
// Clip and cull so that the following is true for all vertices:
// -v.w <= v.x <= v.w
// -v.w <= v.y <= v.w
// -v.w <= v.z <= v.w
// halfspace clip distances
static const int numclipdistances = 7;
__m128 mx = _mm_loadu_ps(&verts[0].x);
__m128 my = _mm_loadu_ps(&verts[1].x);
__m128 mz = _mm_loadu_ps(&verts[2].x);
__m128 mw = _mm_setzero_ps();
_MM_TRANSPOSE4_PS(mx, my, mz, mw);
__m128 clipd0 = _mm_add_ps(mx, mw);
__m128 clipd1 = _mm_sub_ps(mw, mx);
__m128 clipd2 = _mm_add_ps(my, mw);
__m128 clipd3 = _mm_sub_ps(mw, my);
__m128 clipd4 = _mm_add_ps(mz, mw);
__m128 clipd5 = _mm_sub_ps(mw, mz);
__m128 clipd6 = _mm_setr_ps(verts[0].clipDistance0, verts[1].clipDistance0, verts[2].clipDistance0, 0.0f);
__m128 mneedsclipping = _mm_cmplt_ps(clipd0, _mm_setzero_ps());
mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd1, _mm_setzero_ps()));
mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd2, _mm_setzero_ps()));
mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd3, _mm_setzero_ps()));
mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd4, _mm_setzero_ps()));
mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd5, _mm_setzero_ps()));
mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd6, _mm_setzero_ps()));
if (_mm_movemask_ps(mneedsclipping) == 0)
{
for (int i = 0; i < 3; i++)
{
memcpy(clippedvert + i, verts + i, sizeof(TriVertex));
}
return 3;
}
float clipdistance[numclipdistances * 4];
_mm_storeu_ps(clipdistance, clipd0);
_mm_storeu_ps(clipdistance + 4, clipd1);
_mm_storeu_ps(clipdistance + 8, clipd2);
_mm_storeu_ps(clipdistance + 12, clipd3);
_mm_storeu_ps(clipdistance + 16, clipd4);
_mm_storeu_ps(clipdistance + 20, clipd5);
_mm_storeu_ps(clipdistance + 24, clipd6);
// use barycentric weights while clipping vertices
float weights[max_additional_vertices * 3 * 2];
for (int i = 0; i < 3; i++)
{
weights[i * 3 + 0] = 0.0f;
weights[i * 3 + 1] = 0.0f;
weights[i * 3 + 2] = 0.0f;
weights[i * 3 + i] = 1.0f;
}
// Clip against each halfspace
float *input = weights;
float *output = weights + max_additional_vertices * 3;
int inputverts = 3;
for (int p = 0; p < numclipdistances; p++)
{
// Clip each edge
int outputverts = 0;
for (int i = 0; i < inputverts; i++)
{
int j = (i + 1) % inputverts;
float clipdistance1 =
clipdistance[0 + p * 4] * input[i * 3 + 0] +
clipdistance[1 + p * 4] * input[i * 3 + 1] +
clipdistance[2 + p * 4] * input[i * 3 + 2];
float clipdistance2 =
clipdistance[0 + p * 4] * input[j * 3 + 0] +
clipdistance[1 + p * 4] * input[j * 3 + 1] +
clipdistance[2 + p * 4] * input[j * 3 + 2];
// Clip halfspace
if ((clipdistance1 >= 0.0f || clipdistance2 >= 0.0f) && outputverts + 1 < max_additional_vertices)
{
float t1 = (clipdistance1 < 0.0f) ? MAX(-clipdistance1 / (clipdistance2 - clipdistance1), 0.0f) : 0.0f;
float t2 = (clipdistance2 < 0.0f) ? MIN(1.0f + clipdistance2 / (clipdistance1 - clipdistance2), 1.0f) : 1.0f;
// add t1 vertex
for (int k = 0; k < 3; k++)
output[outputverts * 3 + k] = input[i * 3 + k] * (1.0f - t1) + input[j * 3 + k] * t1;
outputverts++;
if (t2 != 1.0f && t2 > t1)
{
// add t2 vertex
for (int k = 0; k < 3; k++)
output[outputverts * 3 + k] = input[i * 3 + k] * (1.0f - t2) + input[j * 3 + k] * t2;
outputverts++;
}
}
}
std::swap(input, output);
inputverts = outputverts;
if (inputverts == 0)
break;
}
// Convert barycentric weights to actual vertices
for (int i = 0; i < inputverts; i++)
{
auto &v = clippedvert[i];
memset(&v, 0, sizeof(TriVertex));
for (int w = 0; w < 3; w++)
{
float weight = input[i * 3 + w];
v.x += verts[w].x * weight;
v.y += verts[w].y * weight;
v.z += verts[w].z * weight;
v.w += verts[w].w * weight;
v.u += verts[w].u * weight;
v.v += verts[w].v * weight;
}
}
return inputverts;
}
#endif
///////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////
DrawPolyTrianglesCommand::DrawPolyTrianglesCommand(const PolyDrawArgs &args, bool mirror) DrawPolyTrianglesCommand::DrawPolyTrianglesCommand(const PolyDrawArgs &args, bool mirror)

View file

@ -47,8 +47,12 @@ private:
static ShadedTriVertex shade_vertex(const TriMatrix &objectToClip, const float *clipPlane, const TriVertex &v); static ShadedTriVertex shade_vertex(const TriMatrix &objectToClip, const float *clipPlane, const TriVertex &v);
static void draw_arrays(const PolyDrawArgs &args, WorkerThreadData *thread); static void draw_arrays(const PolyDrawArgs &args, WorkerThreadData *thread);
static void draw_shaded_triangle(const ShadedTriVertex *vertices, bool ccw, TriDrawTriangleArgs *args, WorkerThreadData *thread); static void draw_shaded_triangle(const ShadedTriVertex *vertices, bool ccw, TriDrawTriangleArgs *args, WorkerThreadData *thread);
static void clip_to_viewport(TriVertex *clippedvert, int numclipvert);
static int clipedge(const ShadedTriVertex *verts, TriVertex *clippedvert); static int clipedge(const ShadedTriVertex *verts, TriVertex *clippedvert);
#ifndef NO_SSE
static void clip_to_viewport_sse2(TriVertex *clippedvert, int numclipvert);
static int clipedge_sse2(const ShadedTriVertex *verts, TriVertex *clippedvert);
#endif
static int viewport_x, viewport_y, viewport_width, viewport_height, dest_pitch, dest_width, dest_height; static int viewport_x, viewport_y, viewport_width, viewport_height, dest_pitch, dest_width, dest_height;
static bool dest_bgra; static bool dest_bgra;

View file

@ -36,11 +36,20 @@
#include "poly_triangle.h" #include "poly_triangle.h"
#include "swrenderer/drawers/r_draw_rgba.h" #include "swrenderer/drawers/r_draw_rgba.h"
#include "screen_triangle.h" #include "screen_triangle.h"
#include "poly_drawer32.h"
#include "poly_drawer8.h"
#ifndef NO_SSE #ifndef NO_SSE
#include "poly_drawer32_sse2.h" #include "poly_drawer32_sse2.h"
#endif #endif
#include "poly_drawer8.h" #include "x86.h"
namespace
{
	// Tag types passed as the CPUType template argument of TriangleBlock.
	// HasSSE2 is tested against 1 to select the SSE2 code paths.
	struct SSE2CPU
	{
		static const int HasSSE2 = 1;
	};

	struct GenericCPU
	{
		static const int HasSSE2 = 0;
	};
}
template<typename CPUType>
class TriangleBlock class TriangleBlock
{ {
public: public:
@ -114,9 +123,17 @@ private:
void ClipTest(); void ClipTest();
void StencilWrite(); void StencilWrite();
void SubsectorWrite(); void SubsectorWrite();
#ifndef NO_SSE
void CoverageTestSSE2();
void StencilEqualTestSSE2();
void SubsectorTestSSE2();
void SubsectorWriteSSE2();
#endif
}; };
TriangleBlock::TriangleBlock(const TriDrawTriangleArgs *args) template<typename CPUType>
TriangleBlock<CPUType>::TriangleBlock(const TriDrawTriangleArgs *args)
{ {
const TriVertex &v1 = *args->v1; const TriVertex &v1 = *args->v1;
const TriVertex &v2 = *args->v2; const TriVertex &v2 = *args->v2;
@ -145,6 +162,9 @@ TriangleBlock::TriangleBlock(const TriDrawTriangleArgs *args)
const int X2 = (int)round(16.0f * v2.x); const int X2 = (int)round(16.0f * v2.x);
const int X3 = (int)round(16.0f * v3.x); const int X3 = (int)round(16.0f * v3.x);
#else #else
int Y1, Y2, Y3, X1, X2, X3;
if (CPUType::HasSSE2 == 1)
{
int tempround[4 * 3]; int tempround[4 * 3];
__m128 m16 = _mm_set1_ps(16.0f); __m128 m16 = _mm_set1_ps(16.0f);
__m128 mhalf = _mm_set1_ps(65536.5f); __m128 mhalf = _mm_set1_ps(65536.5f);
@ -152,12 +172,22 @@ TriangleBlock::TriangleBlock(const TriDrawTriangleArgs *args)
_mm_storeu_si128((__m128i*)tempround, _mm_sub_epi32(_mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_loadu_ps((const float*)&v1), m16), mhalf)), m65536)); _mm_storeu_si128((__m128i*)tempround, _mm_sub_epi32(_mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_loadu_ps((const float*)&v1), m16), mhalf)), m65536));
_mm_storeu_si128((__m128i*)(tempround + 4), _mm_sub_epi32(_mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_loadu_ps((const float*)&v2), m16), mhalf)), m65536)); _mm_storeu_si128((__m128i*)(tempround + 4), _mm_sub_epi32(_mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_loadu_ps((const float*)&v2), m16), mhalf)), m65536));
_mm_storeu_si128((__m128i*)(tempround + 8), _mm_sub_epi32(_mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_loadu_ps((const float*)&v3), m16), mhalf)), m65536)); _mm_storeu_si128((__m128i*)(tempround + 8), _mm_sub_epi32(_mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_loadu_ps((const float*)&v3), m16), mhalf)), m65536));
const int X1 = tempround[0]; X1 = tempround[0];
const int X2 = tempround[4]; X2 = tempround[4];
const int X3 = tempround[8]; X3 = tempround[8];
const int Y1 = tempround[1]; Y1 = tempround[1];
const int Y2 = tempround[5]; Y2 = tempround[5];
const int Y3 = tempround[9]; Y3 = tempround[9];
}
else
{
Y1 = (int)round(16.0f * v1.y);
Y2 = (int)round(16.0f * v2.y);
Y3 = (int)round(16.0f * v3.y);
X1 = (int)round(16.0f * v1.x);
X2 = (int)round(16.0f * v2.x);
X3 = (int)round(16.0f * v3.x);
}
#endif #endif
// Deltas // Deltas
@ -203,6 +233,8 @@ TriangleBlock::TriangleBlock(const TriDrawTriangleArgs *args)
if (DY31 < 0 || (DY31 == 0 && DX31 > 0)) C3++; if (DY31 < 0 || (DY31 == 0 && DX31 > 0)) C3++;
#ifndef NO_SSE #ifndef NO_SSE
if (CPUType::HasSSE2 == 1)
{
mFDY12Offset = _mm_setr_epi32(0, FDY12, FDY12 * 2, FDY12 * 3); mFDY12Offset = _mm_setr_epi32(0, FDY12, FDY12 * 2, FDY12 * 3);
mFDY23Offset = _mm_setr_epi32(0, FDY23, FDY23 * 2, FDY23 * 3); mFDY23Offset = _mm_setr_epi32(0, FDY23, FDY23 * 2, FDY23 * 3);
mFDY31Offset = _mm_setr_epi32(0, FDY31, FDY31 * 2, FDY31 * 3); mFDY31Offset = _mm_setr_epi32(0, FDY31, FDY31 * 2, FDY31 * 3);
@ -221,10 +253,12 @@ TriangleBlock::TriangleBlock(const TriDrawTriangleArgs *args)
mDY23 = _mm_set1_epi32(DY23); mDY23 = _mm_set1_epi32(DY23);
mDX31 = _mm_set1_epi32(DX31); mDX31 = _mm_set1_epi32(DX31);
mDY31 = _mm_set1_epi32(DY31); mDY31 = _mm_set1_epi32(DY31);
}
#endif #endif
} }
void TriangleBlock::Loop(const TriDrawTriangleArgs *args, WorkerThreadData *thread) template<typename CPUType>
void TriangleBlock<CPUType>::Loop(const TriDrawTriangleArgs *args, WorkerThreadData *thread)
{ {
// First block line for this thread // First block line for this thread
int core = thread->core; int core = thread->core;
@ -236,9 +270,18 @@ void TriangleBlock::Loop(const TriDrawTriangleArgs *args, WorkerThreadData *thre
bool writeColor = args->uniforms->WriteColor(); bool writeColor = args->uniforms->WriteColor();
bool writeStencil = args->uniforms->WriteStencil(); bool writeStencil = args->uniforms->WriteStencil();
bool writeSubsector = args->uniforms->WriteSubsector(); bool writeSubsector = args->uniforms->WriteSubsector();
int bmode = (int)args->uniforms->BlendMode(); int bmode = (int)args->uniforms->BlendMode();
// Find the drawer function for the given blend mode
#ifndef NO_SSE
void(*drawFunc)(int, int, uint32_t, uint32_t, const TriDrawTriangleArgs *);
if (CPUType::HasSSE2 == 1)
drawFunc = args->destBgra ? ScreenTriangle::TriDrawers32_SSE2[bmode] : ScreenTriangle::TriDrawers8[bmode];
else
drawFunc = args->destBgra ? ScreenTriangle::TriDrawers32[bmode] : ScreenTriangle::TriDrawers8[bmode];
#else
auto drawFunc = args->destBgra ? ScreenTriangle::TriDrawers32[bmode] : ScreenTriangle::TriDrawers8[bmode]; auto drawFunc = args->destBgra ? ScreenTriangle::TriDrawers32[bmode] : ScreenTriangle::TriDrawers8[bmode];
#endif
// Loop through blocks // Loop through blocks
for (int y = start_miny; y < maxy; y += q * num_cores) for (int y = start_miny; y < maxy; y += q * num_cores)
@ -248,7 +291,11 @@ void TriangleBlock::Loop(const TriDrawTriangleArgs *args, WorkerThreadData *thre
X = x; X = x;
Y = y; Y = y;
if (CPUType::HasSSE2 == 1)
CoverageTestSSE2();
else
CoverageTest(); CoverageTest();
if (Mask0 == 0 && Mask1 == 0) if (Mask0 == 0 && Mask1 == 0)
continue; continue;
@ -259,7 +306,11 @@ void TriangleBlock::Loop(const TriDrawTriangleArgs *args, WorkerThreadData *thre
// To do: make the stencil test use its own flag for comparison mode instead of abusing the subsector test.. // To do: make the stencil test use its own flag for comparison mode instead of abusing the subsector test..
if (!subsectorTest) if (!subsectorTest)
{ {
if (CPUType::HasSSE2 == 1)
StencilEqualTestSSE2();
else
StencilEqualTest(); StencilEqualTest();
if (Mask0 == 0 && Mask1 == 0) if (Mask0 == 0 && Mask1 == 0)
continue; continue;
} }
@ -269,7 +320,11 @@ void TriangleBlock::Loop(const TriDrawTriangleArgs *args, WorkerThreadData *thre
if (Mask0 == 0 && Mask1 == 0) if (Mask0 == 0 && Mask1 == 0)
continue; continue;
if (CPUType::HasSSE2 == 1)
SubsectorTestSSE2();
else
SubsectorTest(); SubsectorTest();
if (Mask0 == 0 && Mask1 == 0) if (Mask0 == 0 && Mask1 == 0)
continue; continue;
} }
@ -279,14 +334,18 @@ void TriangleBlock::Loop(const TriDrawTriangleArgs *args, WorkerThreadData *thre
if (writeStencil) if (writeStencil)
StencilWrite(); StencilWrite();
if (writeSubsector) if (writeSubsector)
{
if (CPUType::HasSSE2 == 1)
SubsectorWriteSSE2();
else
SubsectorWrite(); SubsectorWrite();
} }
} }
}
} }
#ifdef NO_SSE template<typename CPUType>
void TriangleBlock<CPUType>::SubsectorTest()
void TriangleBlock::SubsectorTest()
{ {
int block = (X >> 3) + (Y >> 3) * subsectorPitch; int block = (X >> 3) + (Y >> 3) * subsectorPitch;
uint32_t *subsector = subsectorGBuffer + block * 64; uint32_t *subsector = subsectorGBuffer + block * 64;
@ -312,9 +371,10 @@ void TriangleBlock::SubsectorTest()
Mask1 = Mask1 & mask1; Mask1 = Mask1 & mask1;
} }
#else #ifndef NO_SSE
void TriangleBlock::SubsectorTest() template<typename CPUType>
void TriangleBlock<CPUType>::SubsectorTestSSE2()
{ {
int block = (X >> 3) + (Y >> 3) * subsectorPitch; int block = (X >> 3) + (Y >> 3) * subsectorPitch;
uint32_t *subsector = subsectorGBuffer + block * 64; uint32_t *subsector = subsectorGBuffer + block * 64;
@ -342,7 +402,8 @@ void TriangleBlock::SubsectorTest()
#endif #endif
void TriangleBlock::ClipTest() template<typename CPUType>
void TriangleBlock<CPUType>::ClipTest()
{ {
static const uint32_t clipxmask[8] = static const uint32_t clipxmask[8] =
{ {
@ -376,9 +437,8 @@ void TriangleBlock::ClipTest()
Mask1 = Mask1 & xmask & ymask1; Mask1 = Mask1 & xmask & ymask1;
} }
#ifdef NO_SSE template<typename CPUType>
void TriangleBlock<CPUType>::StencilEqualTest()
void TriangleBlock::StencilEqualTest()
{ {
// Stencil test the whole block, if possible // Stencil test the whole block, if possible
int block = (X >> 3) + (Y >> 3) * stencilPitch; int block = (X >> 3) + (Y >> 3) * stencilPitch;
@ -421,9 +481,10 @@ void TriangleBlock::StencilEqualTest()
} }
} }
#else #ifndef NO_SSE
void TriangleBlock::StencilEqualTest() template<typename CPUType>
void TriangleBlock<CPUType>::StencilEqualTestSSE2()
{ {
// Stencil test the whole block, if possible // Stencil test the whole block, if possible
int block = (X >> 3) + (Y >> 3) * stencilPitch; int block = (X >> 3) + (Y >> 3) * stencilPitch;
@ -489,7 +550,8 @@ void TriangleBlock::StencilEqualTest()
#endif #endif
void TriangleBlock::StencilGreaterEqualTest() template<typename CPUType>
void TriangleBlock<CPUType>::StencilGreaterEqualTest()
{ {
// Stencil test the whole block, if possible // Stencil test the whole block, if possible
int block = (X >> 3) + (Y >> 3) * stencilPitch; int block = (X >> 3) + (Y >> 3) * stencilPitch;
@ -532,9 +594,8 @@ void TriangleBlock::StencilGreaterEqualTest()
} }
} }
#ifdef NO_SSE template<typename CPUType>
void TriangleBlock<CPUType>::CoverageTest()
void TriangleBlock::CoverageTest()
{ {
// Corners of block // Corners of block
int x0 = X << 4; int x0 = X << 4;
@ -631,9 +692,10 @@ void TriangleBlock::CoverageTest()
} }
} }
#else #ifndef NO_SSE
void TriangleBlock::CoverageTest() template<typename CPUType>
void TriangleBlock<CPUType>::CoverageTestSSE2()
{ {
// Corners of block // Corners of block
int x0 = X << 4; int x0 = X << 4;
@ -743,7 +805,8 @@ void TriangleBlock::CoverageTest()
#endif #endif
void TriangleBlock::StencilWrite() template<typename CPUType>
void TriangleBlock<CPUType>::StencilWrite()
{ {
int block = (X >> 3) + (Y >> 3) * stencilPitch; int block = (X >> 3) + (Y >> 3) * stencilPitch;
uint8_t *stencilBlock = &stencilValues[block * 64]; uint8_t *stencilBlock = &stencilValues[block * 64];
@ -793,9 +856,8 @@ void TriangleBlock::StencilWrite()
} }
} }
#ifdef NO_SSE template<typename CPUType>
void TriangleBlock<CPUType>::SubsectorWrite()
void TriangleBlock::SubsectorWrite()
{ {
int block = (X >> 3) + (Y >> 3) * subsectorPitch; int block = (X >> 3) + (Y >> 3) * subsectorPitch;
uint32_t *subsector = subsectorGBuffer + block * 64; uint32_t *subsector = subsectorGBuffer + block * 64;
@ -828,9 +890,10 @@ void TriangleBlock::SubsectorWrite()
} }
} }
#else #ifndef NO_SSE
void TriangleBlock::SubsectorWrite() template<typename CPUType>
void TriangleBlock<CPUType>::SubsectorWriteSSE2()
{ {
int block = (X >> 3) + (Y >> 3) * subsectorPitch; int block = (X >> 3) + (Y >> 3) * subsectorPitch;
uint32_t *subsector = subsectorGBuffer + block * 64; uint32_t *subsector = subsectorGBuffer + block * 64;
@ -887,8 +950,21 @@ void TriangleBlock::SubsectorWrite()
void ScreenTriangle::Draw(const TriDrawTriangleArgs *args, WorkerThreadData *thread) void ScreenTriangle::Draw(const TriDrawTriangleArgs *args, WorkerThreadData *thread)
{ {
TriangleBlock block(args); #ifdef NO_SSE
TriangleBlock<GenericCPU> block(args);
block.Loop(args, thread); block.Loop(args, thread);
#else
if (CPU.bSSE2)
{
TriangleBlock<SSE2CPU> block(args);
block.Loop(args, thread);
}
else
{
TriangleBlock<GenericCPU> block(args);
block.Loop(args, thread);
}
#endif
} }
void(*ScreenTriangle::TriDrawers8[])(int, int, uint32_t, uint32_t, const TriDrawTriangleArgs *) = void(*ScreenTriangle::TriDrawers8[])(int, int, uint32_t, uint32_t, const TriDrawTriangleArgs *) =
@ -918,15 +994,6 @@ void(*ScreenTriangle::TriDrawers8[])(int, int, uint32_t, uint32_t, const TriDraw
&TriScreenDrawer8<TriScreenDrawerModes::ShadedBlend, TriScreenDrawerModes::FuzzSampler>::Execute // Fuzz &TriScreenDrawer8<TriScreenDrawerModes::ShadedBlend, TriScreenDrawerModes::FuzzSampler>::Execute // Fuzz
}; };
#ifdef NO_SSE
void(*ScreenTriangle::TriDrawers32[])(int, int, uint32_t, uint32_t, const TriDrawTriangleArgs *) =
{
nullptr
};
#else
void(*ScreenTriangle::TriDrawers32[])(int, int, uint32_t, uint32_t, const TriDrawTriangleArgs *) = void(*ScreenTriangle::TriDrawers32[])(int, int, uint32_t, uint32_t, const TriDrawTriangleArgs *) =
{ {
&TriScreenDrawer32<TriScreenDrawerModes::OpaqueBlend, TriScreenDrawerModes::TextureSampler>::Execute, // TextureOpaque &TriScreenDrawer32<TriScreenDrawerModes::OpaqueBlend, TriScreenDrawerModes::TextureSampler>::Execute, // TextureOpaque
@ -954,6 +1021,35 @@ void(*ScreenTriangle::TriDrawers32[])(int, int, uint32_t, uint32_t, const TriDra
&TriScreenDrawer32<TriScreenDrawerModes::ShadedBlend, TriScreenDrawerModes::FuzzSampler>::Execute // Fuzz &TriScreenDrawer32<TriScreenDrawerModes::ShadedBlend, TriScreenDrawerModes::FuzzSampler>::Execute // Fuzz
}; };
#ifndef NO_SSE
// Table of function pointers to TriScreenDrawer32_SSE2<Blend, Sampler>::Execute,
// one entry per blend/sampler combination (23 entries). The trailing comment on
// each entry names the draw mode that entry implements. This table is only
// compiled when NO_SSE is not defined.
// NOTE(review): the entry order presumably has to match TriDrawers32 and the
// draw-mode enum used to index both tables - confirm against the header.
void(*ScreenTriangle::TriDrawers32_SSE2[])(int, int, uint32_t, uint32_t, const TriDrawTriangleArgs *) =
{
&TriScreenDrawer32_SSE2<TriScreenDrawerModes::OpaqueBlend, TriScreenDrawerModes::TextureSampler>::Execute, // TextureOpaque
&TriScreenDrawer32_SSE2<TriScreenDrawerModes::MaskedBlend, TriScreenDrawerModes::TextureSampler>::Execute, // TextureMasked
&TriScreenDrawer32_SSE2<TriScreenDrawerModes::AddClampBlend, TriScreenDrawerModes::TextureSampler>::Execute, // TextureAdd
&TriScreenDrawer32_SSE2<TriScreenDrawerModes::SubClampBlend, TriScreenDrawerModes::TextureSampler>::Execute, // TextureSub
&TriScreenDrawer32_SSE2<TriScreenDrawerModes::RevSubClampBlend, TriScreenDrawerModes::TextureSampler>::Execute, // TextureRevSub
&TriScreenDrawer32_SSE2<TriScreenDrawerModes::AddSrcColorBlend, TriScreenDrawerModes::TextureSampler>::Execute, // TextureAddSrcColor
&TriScreenDrawer32_SSE2<TriScreenDrawerModes::OpaqueBlend, TriScreenDrawerModes::TranslatedSampler>::Execute, // TranslatedOpaque
&TriScreenDrawer32_SSE2<TriScreenDrawerModes::MaskedBlend, TriScreenDrawerModes::TranslatedSampler>::Execute, // TranslatedMasked
&TriScreenDrawer32_SSE2<TriScreenDrawerModes::AddClampBlend, TriScreenDrawerModes::TranslatedSampler>::Execute, // TranslatedAdd
&TriScreenDrawer32_SSE2<TriScreenDrawerModes::SubClampBlend, TriScreenDrawerModes::TranslatedSampler>::Execute, // TranslatedSub
&TriScreenDrawer32_SSE2<TriScreenDrawerModes::RevSubClampBlend, TriScreenDrawerModes::TranslatedSampler>::Execute, // TranslatedRevSub
&TriScreenDrawer32_SSE2<TriScreenDrawerModes::AddSrcColorBlend, TriScreenDrawerModes::TranslatedSampler>::Execute, // TranslatedAddSrcColor
&TriScreenDrawer32_SSE2<TriScreenDrawerModes::ShadedBlend, TriScreenDrawerModes::ShadedSampler>::Execute, // Shaded
&TriScreenDrawer32_SSE2<TriScreenDrawerModes::AddClampShadedBlend, TriScreenDrawerModes::ShadedSampler>::Execute, // AddShaded
&TriScreenDrawer32_SSE2<TriScreenDrawerModes::ShadedBlend, TriScreenDrawerModes::StencilSampler>::Execute, // Stencil
&TriScreenDrawer32_SSE2<TriScreenDrawerModes::AddClampShadedBlend, TriScreenDrawerModes::StencilSampler>::Execute, // AddStencil
&TriScreenDrawer32_SSE2<TriScreenDrawerModes::OpaqueBlend, TriScreenDrawerModes::FillSampler>::Execute, // FillOpaque
&TriScreenDrawer32_SSE2<TriScreenDrawerModes::AddClampBlend, TriScreenDrawerModes::FillSampler>::Execute, // FillAdd
&TriScreenDrawer32_SSE2<TriScreenDrawerModes::SubClampBlend, TriScreenDrawerModes::FillSampler>::Execute, // FillSub
&TriScreenDrawer32_SSE2<TriScreenDrawerModes::RevSubClampBlend, TriScreenDrawerModes::FillSampler>::Execute, // FillRevSub
&TriScreenDrawer32_SSE2<TriScreenDrawerModes::AddSrcColorBlend, TriScreenDrawerModes::FillSampler>::Execute, // FillAddSrcColor
&TriScreenDrawer32_SSE2<TriScreenDrawerModes::OpaqueBlend, TriScreenDrawerModes::SkycapSampler>::Execute, // Skycap
&TriScreenDrawer32_SSE2<TriScreenDrawerModes::ShadedBlend, TriScreenDrawerModes::FuzzSampler>::Execute // Fuzz
};
#endif #endif
void(*ScreenTriangle::RectDrawers8[])(const void *, int, int, int, const RectDrawArgs *, WorkerThreadData *) = void(*ScreenTriangle::RectDrawers8[])(const void *, int, int, int, const RectDrawArgs *, WorkerThreadData *) =
@ -983,15 +1079,6 @@ void(*ScreenTriangle::RectDrawers8[])(const void *, int, int, int, const RectDra
&RectScreenDrawer8<TriScreenDrawerModes::ShadedBlend, TriScreenDrawerModes::FuzzSampler>::Execute // Fuzz &RectScreenDrawer8<TriScreenDrawerModes::ShadedBlend, TriScreenDrawerModes::FuzzSampler>::Execute // Fuzz
}; };
#ifdef NO_SSE
void(*ScreenTriangle::RectDrawers32[])(const void *, int, int, int, const RectDrawArgs *, WorkerThreadData *) =
{
nullptr
};
#else
void(*ScreenTriangle::RectDrawers32[])(const void *, int, int, int, const RectDrawArgs *, WorkerThreadData *) = void(*ScreenTriangle::RectDrawers32[])(const void *, int, int, int, const RectDrawArgs *, WorkerThreadData *) =
{ {
&RectScreenDrawer32<TriScreenDrawerModes::OpaqueBlend, TriScreenDrawerModes::TextureSampler>::Execute, // TextureOpaque &RectScreenDrawer32<TriScreenDrawerModes::OpaqueBlend, TriScreenDrawerModes::TextureSampler>::Execute, // TextureOpaque
@ -1019,6 +1106,35 @@ void(*ScreenTriangle::RectDrawers32[])(const void *, int, int, int, const RectDr
&RectScreenDrawer32<TriScreenDrawerModes::ShadedBlend, TriScreenDrawerModes::FuzzSampler>::Execute // Fuzz &RectScreenDrawer32<TriScreenDrawerModes::ShadedBlend, TriScreenDrawerModes::FuzzSampler>::Execute // Fuzz
}; };
#ifndef NO_SSE
// Table of function pointers to RectScreenDrawer32_SSE2<Blend, Sampler>::Execute,
// one entry per blend/sampler combination (23 entries). The trailing comment on
// each entry names the draw mode that entry implements. This table is only
// compiled when NO_SSE is not defined.
// NOTE(review): the entry order presumably has to match RectDrawers32 and the
// draw-mode enum used to index both tables - confirm against the header.
void(*ScreenTriangle::RectDrawers32_SSE2[])(const void *, int, int, int, const RectDrawArgs *, WorkerThreadData *) =
{
&RectScreenDrawer32_SSE2<TriScreenDrawerModes::OpaqueBlend, TriScreenDrawerModes::TextureSampler>::Execute, // TextureOpaque
&RectScreenDrawer32_SSE2<TriScreenDrawerModes::MaskedBlend, TriScreenDrawerModes::TextureSampler>::Execute, // TextureMasked
&RectScreenDrawer32_SSE2<TriScreenDrawerModes::AddClampBlend, TriScreenDrawerModes::TextureSampler>::Execute, // TextureAdd
&RectScreenDrawer32_SSE2<TriScreenDrawerModes::SubClampBlend, TriScreenDrawerModes::TextureSampler>::Execute, // TextureSub
&RectScreenDrawer32_SSE2<TriScreenDrawerModes::RevSubClampBlend, TriScreenDrawerModes::TextureSampler>::Execute, // TextureRevSub
&RectScreenDrawer32_SSE2<TriScreenDrawerModes::AddSrcColorBlend, TriScreenDrawerModes::TextureSampler>::Execute, // TextureAddSrcColor
&RectScreenDrawer32_SSE2<TriScreenDrawerModes::OpaqueBlend, TriScreenDrawerModes::TranslatedSampler>::Execute, // TranslatedOpaque
&RectScreenDrawer32_SSE2<TriScreenDrawerModes::MaskedBlend, TriScreenDrawerModes::TranslatedSampler>::Execute, // TranslatedMasked
&RectScreenDrawer32_SSE2<TriScreenDrawerModes::AddClampBlend, TriScreenDrawerModes::TranslatedSampler>::Execute, // TranslatedAdd
&RectScreenDrawer32_SSE2<TriScreenDrawerModes::SubClampBlend, TriScreenDrawerModes::TranslatedSampler>::Execute, // TranslatedSub
&RectScreenDrawer32_SSE2<TriScreenDrawerModes::RevSubClampBlend, TriScreenDrawerModes::TranslatedSampler>::Execute, // TranslatedRevSub
&RectScreenDrawer32_SSE2<TriScreenDrawerModes::AddSrcColorBlend, TriScreenDrawerModes::TranslatedSampler>::Execute, // TranslatedAddSrcColor
&RectScreenDrawer32_SSE2<TriScreenDrawerModes::ShadedBlend, TriScreenDrawerModes::ShadedSampler>::Execute, // Shaded
&RectScreenDrawer32_SSE2<TriScreenDrawerModes::AddClampShadedBlend, TriScreenDrawerModes::ShadedSampler>::Execute, // AddShaded
&RectScreenDrawer32_SSE2<TriScreenDrawerModes::ShadedBlend, TriScreenDrawerModes::StencilSampler>::Execute, // Stencil
&RectScreenDrawer32_SSE2<TriScreenDrawerModes::AddClampShadedBlend, TriScreenDrawerModes::StencilSampler>::Execute, // AddStencil
&RectScreenDrawer32_SSE2<TriScreenDrawerModes::OpaqueBlend, TriScreenDrawerModes::FillSampler>::Execute, // FillOpaque
&RectScreenDrawer32_SSE2<TriScreenDrawerModes::AddClampBlend, TriScreenDrawerModes::FillSampler>::Execute, // FillAdd
&RectScreenDrawer32_SSE2<TriScreenDrawerModes::SubClampBlend, TriScreenDrawerModes::FillSampler>::Execute, // FillSub
&RectScreenDrawer32_SSE2<TriScreenDrawerModes::RevSubClampBlend, TriScreenDrawerModes::FillSampler>::Execute, // FillRevSub
&RectScreenDrawer32_SSE2<TriScreenDrawerModes::AddSrcColorBlend, TriScreenDrawerModes::FillSampler>::Execute, // FillAddSrcColor
&RectScreenDrawer32_SSE2<TriScreenDrawerModes::OpaqueBlend, TriScreenDrawerModes::SkycapSampler>::Execute, // Skycap
&RectScreenDrawer32_SSE2<TriScreenDrawerModes::ShadedBlend, TriScreenDrawerModes::FuzzSampler>::Execute // Fuzz
};
#endif #endif
int ScreenTriangle::FuzzStart = 0; int ScreenTriangle::FuzzStart = 0;

View file

@ -131,6 +131,11 @@ public:
static void(*RectDrawers8[])(const void *, int, int, int, const RectDrawArgs *, WorkerThreadData *); static void(*RectDrawers8[])(const void *, int, int, int, const RectDrawArgs *, WorkerThreadData *);
static void(*RectDrawers32[])(const void *, int, int, int, const RectDrawArgs *, WorkerThreadData *); static void(*RectDrawers32[])(const void *, int, int, int, const RectDrawArgs *, WorkerThreadData *);
#ifndef NO_SSE
static void(*TriDrawers32_SSE2[])(int, int, uint32_t, uint32_t, const TriDrawTriangleArgs *);
static void(*RectDrawers32_SSE2[])(const void *, int, int, int, const RectDrawArgs *, WorkerThreadData *);
#endif
static int FuzzStart; static int FuzzStart;
}; };

View file

@ -185,7 +185,12 @@ ShadedTriVertex TriMatrix::operator*(TriVertex v) const
sv.y = vy; sv.y = vy;
sv.z = vz; sv.z = vz;
sv.w = vw; sv.w = vw;
sv.u = v.u;
sv.v = v.v;
return sv;
#else #else
if (CPU.bSSE2)
{
__m128 m0 = _mm_loadu_ps(matrix); __m128 m0 = _mm_loadu_ps(matrix);
__m128 m1 = _mm_loadu_ps(matrix + 4); __m128 m1 = _mm_loadu_ps(matrix + 4);
__m128 m2 = _mm_loadu_ps(matrix + 8); __m128 m2 = _mm_loadu_ps(matrix + 8);
@ -198,8 +203,24 @@ ShadedTriVertex TriMatrix::operator*(TriVertex v) const
mv = _mm_add_ps(_mm_add_ps(_mm_add_ps(m0, m1), m2), m3); mv = _mm_add_ps(_mm_add_ps(_mm_add_ps(m0, m1), m2), m3);
ShadedTriVertex sv; ShadedTriVertex sv;
_mm_storeu_ps(&sv.x, mv); _mm_storeu_ps(&sv.x, mv);
#endif
sv.u = v.u; sv.u = v.u;
sv.v = v.v; sv.v = v.v;
return sv; return sv;
}
else
{
float vx = matrix[0 * 4 + 0] * v.x + matrix[1 * 4 + 0] * v.y + matrix[2 * 4 + 0] * v.z + matrix[3 * 4 + 0] * v.w;
float vy = matrix[0 * 4 + 1] * v.x + matrix[1 * 4 + 1] * v.y + matrix[2 * 4 + 1] * v.z + matrix[3 * 4 + 1] * v.w;
float vz = matrix[0 * 4 + 2] * v.x + matrix[1 * 4 + 2] * v.y + matrix[2 * 4 + 2] * v.z + matrix[3 * 4 + 2] * v.w;
float vw = matrix[0 * 4 + 3] * v.x + matrix[1 * 4 + 3] * v.y + matrix[2 * 4 + 3] * v.z + matrix[3 * 4 + 3] * v.w;
ShadedTriVertex sv;
sv.x = vx;
sv.y = vy;
sv.z = vz;
sv.w = vw;
sv.u = v.u;
sv.v = v.v;
return sv;
}
#endif
} }

View file

@ -135,7 +135,6 @@ typedef struct MODMIDICFG
// PUBLIC DATA DEFINITIONS ------------------------------------------------- // PUBLIC DATA DEFINITIONS -------------------------------------------------
CVAR(Bool, mod_dumb, true, CVAR_ARCHIVE|CVAR_GLOBALCONFIG);
CVAR(Int, mod_samplerate, 0, CVAR_ARCHIVE|CVAR_GLOBALCONFIG); CVAR(Int, mod_samplerate, 0, CVAR_ARCHIVE|CVAR_GLOBALCONFIG);
CVAR(Int, mod_volramp, 2, CVAR_ARCHIVE|CVAR_GLOBALCONFIG); CVAR(Int, mod_volramp, 2, CVAR_ARCHIVE|CVAR_GLOBALCONFIG);
CVAR(Int, mod_interp, DUMB_LQ_CUBIC, CVAR_ARCHIVE|CVAR_GLOBALCONFIG); CVAR(Int, mod_interp, DUMB_LQ_CUBIC, CVAR_ARCHIVE|CVAR_GLOBALCONFIG);
@ -780,11 +779,6 @@ MusInfo *MOD_OpenSong(FileReader &reader)
long fpos = 0; long fpos = 0;
input_mod *state = NULL; input_mod *state = NULL;
if (!mod_dumb)
{
return NULL;
}
bool is_it = false; bool is_it = false;
bool is_dos = true; bool is_dos = true;

View file

@ -1218,15 +1218,16 @@ std::pair<SoundHandle,bool> OpenALSoundRenderer::LoadSound(uint8_t *sfxdata, int
if(!decoder) return std::make_pair(retval, true); if(!decoder) return std::make_pair(retval, true);
decoder->getInfo(&srate, &chans, &type); decoder->getInfo(&srate, &chans, &type);
int samplesize = 1;
if(chans == ChannelConfig_Mono || monoize) if(chans == ChannelConfig_Mono || monoize)
{ {
if(type == SampleType_UInt8) format = AL_FORMAT_MONO8; if(type == SampleType_UInt8) format = AL_FORMAT_MONO8, samplesize = 1;
if(type == SampleType_Int16) format = AL_FORMAT_MONO16; if(type == SampleType_Int16) format = AL_FORMAT_MONO16, samplesize = 2;
} }
else if(chans == ChannelConfig_Stereo) else if(chans == ChannelConfig_Stereo)
{ {
if(type == SampleType_UInt8) format = AL_FORMAT_STEREO8; if(type == SampleType_UInt8) format = AL_FORMAT_STEREO8, samplesize = 2;
if(type == SampleType_Int16) format = AL_FORMAT_STEREO16; if(type == SampleType_Int16) format = AL_FORMAT_STEREO16, samplesize = 4;
} }
if(format == AL_NONE) if(format == AL_NONE)
@ -1282,13 +1283,14 @@ std::pair<SoundHandle,bool> OpenALSoundRenderer::LoadSound(uint8_t *sfxdata, int
if (!startass) loop_start = Scale(loop_start, srate, 1000); if (!startass) loop_start = Scale(loop_start, srate, 1000);
if (!endass) loop_end = Scale(loop_end, srate, 1000); if (!endass) loop_end = Scale(loop_end, srate, 1000);
if (loop_start < 0) loop_start = 0; if (loop_start < 0) loop_start = 0;
if (loop_end >= data.Size() / samplesize) loop_end = data.Size() / samplesize - 1;
if ((loop_start > 0 || loop_end > 0) && loop_end > loop_start && AL.SOFT_loop_points) if ((loop_start > 0 || loop_end > 0) && loop_end > loop_start && AL.SOFT_loop_points)
{ {
ALint loops[2] = { static_cast<ALint>(loop_start), static_cast<ALint>(loop_end) }; ALint loops[2] = { static_cast<ALint>(loop_start), static_cast<ALint>(loop_end) };
DPrintf(DMSG_NOTIFY, "Setting loop points %d -> %d\n", loops[0], loops[1]); DPrintf(DMSG_NOTIFY, "Setting loop points %d -> %d\n", loops[0], loops[1]);
alBufferiv(buffer, AL_LOOP_POINTS_SOFT, loops); alBufferiv(buffer, AL_LOOP_POINTS_SOFT, loops);
getALError(); // no console messages here, please!
} }

View file

@ -1723,8 +1723,6 @@ OptionValue ModVolumeRamps
OptionMenu ModReplayerOptions OptionMenu ModReplayerOptions
{ {
Title "$MODMNU_TITLE" Title "$MODMNU_TITLE"
Option "$MODMNU_REPLAYERENGINE", "mod_dumb", "ModReplayers"
StaticText " "
Slider "$MODMNU_MASTERVOLUME", "mod_dumb_mastervolume", 1, 16, 0.5, 1 Slider "$MODMNU_MASTERVOLUME", "mod_dumb_mastervolume", 1, 16, 0.5, 1
Option "$ADVSNDMNU_SAMPLERATE", "mod_samplerate", "SampleRates", "mod_dumb" Option "$ADVSNDMNU_SAMPLERATE", "mod_samplerate", "SampleRates", "mod_dumb"
Option "$MODMNU_QUALITY", "mod_interp", "ModQuality", "mod_dumb" Option "$MODMNU_QUALITY", "mod_interp", "ModQuality", "mod_dumb"