diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index fdfb5d0ec..700d1c609 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -708,7 +708,11 @@ set ( SWRENDER_SOURCES set( POLYRENDER_SOURCES rendering/polyrenderer/drawers/poly_triangle.cpp + rendering/polyrenderer/drawers/poly_thread.cpp rendering/polyrenderer/drawers/screen_triangle.cpp + rendering/polyrenderer/drawers/screen_scanline_setup.cpp + rendering/polyrenderer/drawers/screen_shader.cpp + rendering/polyrenderer/drawers/screen_blend.cpp rendering/polyrenderer/math/gpu_types.cpp ) diff --git a/src/rendering/polyrenderer/backend/poly_buffers.cpp b/src/rendering/polyrenderer/backend/poly_buffers.cpp index 75f3c99dc..2aaec2da9 100644 --- a/src/rendering/polyrenderer/backend/poly_buffers.cpp +++ b/src/rendering/polyrenderer/backend/poly_buffers.cpp @@ -2,6 +2,7 @@ #include "poly_buffers.h" #include "poly_framebuffer.h" #include "poly_renderstate.h" +#include "rendering/polyrenderer/drawers/poly_thread.h" #include "doomerrors.h" PolyBuffer *PolyBuffer::First = nullptr; diff --git a/src/rendering/polyrenderer/drawers/poly_thread.cpp b/src/rendering/polyrenderer/drawers/poly_thread.cpp new file mode 100644 index 000000000..e90c02879 --- /dev/null +++ b/src/rendering/polyrenderer/drawers/poly_thread.cpp @@ -0,0 +1,821 @@ +/* +** Polygon Doom software renderer +** Copyright (c) 2016 Magnus Norddahl +** +** This software is provided 'as-is', without any express or implied +** warranty. In no event will the authors be held liable for any damages +** arising from the use of this software. +** +** Permission is granted to anyone to use this software for any purpose, +** including commercial applications, and to alter it and redistribute it +** freely, subject to the following restrictions: +** +** 1. The origin of this software must not be misrepresented; you must not +** claim that you wrote the original software. 
If you use this software +** in a product, an acknowledgment in the product documentation would be +** appreciated but is not required. +** 2. Altered source versions must be plainly marked as such, and must not be +** misrepresented as being the original software. +** 3. This notice may not be removed or altered from any source distribution. +** +*/ + +#include +#include "templates.h" +#include "doomdef.h" + +#include "w_wad.h" +#include "v_video.h" +#include "doomstat.h" +#include "st_stuff.h" +#include "g_game.h" +#include "g_level.h" +#include "r_data/r_translate.h" +#include "r_data/models/models.h" +#include "v_palette.h" +#include "r_data/colormaps.h" +#include "poly_thread.h" +#include "swrenderer/drawers/r_draw_rgba.h" +#include "screen_triangle.h" +#include "x86.h" + +PolyTriangleThreadData::PolyTriangleThreadData(int32_t core, int32_t num_cores, int32_t numa_node, int32_t num_numa_nodes, int numa_start_y, int numa_end_y) + : core(core), num_cores(num_cores), numa_node(numa_node), num_numa_nodes(num_numa_nodes), numa_start_y(numa_start_y), numa_end_y(numa_end_y) +{ +} + +void PolyTriangleThreadData::ClearDepth(float value) +{ + int width = depthstencil->Width(); + int height = depthstencil->Height(); + float *data = depthstencil->DepthValues(); + + int skip = skipped_by_thread(0); + int count = count_for_thread(0, height); + + data += skip * width; + for (int i = 0; i < count; i++) + { + for (int x = 0; x < width; x++) + data[x] = value; + data += num_cores * width; + } +} + +void PolyTriangleThreadData::ClearStencil(uint8_t value) +{ + int width = depthstencil->Width(); + int height = depthstencil->Height(); + uint8_t *data = depthstencil->StencilValues(); + + int skip = skipped_by_thread(0); + int count = count_for_thread(0, height); + + data += skip * width; + for (int i = 0; i < count; i++) + { + memset(data, value, width); + data += num_cores * width; + } +} + +void PolyTriangleThreadData::SetViewport(int x, int y, int width, int height, uint8_t 
*new_dest, int new_dest_width, int new_dest_height, int new_dest_pitch, bool new_dest_bgra, PolyDepthStencil *new_depthstencil, bool new_topdown) +{ + viewport_x = x; + viewport_y = y; + viewport_width = width; + viewport_height = height; + dest = new_dest; + dest_width = new_dest_width; + dest_height = new_dest_height; + dest_pitch = new_dest_pitch; + dest_bgra = new_dest_bgra; + depthstencil = new_depthstencil; + topdown = new_topdown; + UpdateClip(); +} + +void PolyTriangleThreadData::SetScissor(int x, int y, int w, int h) +{ + scissor.left = x; + scissor.right = x + w; + scissor.top = y; + scissor.bottom = y + h; + UpdateClip(); +} + +void PolyTriangleThreadData::UpdateClip() +{ + clip.left = MAX(MAX(viewport_x, scissor.left), 0); + clip.top = MAX(MAX(viewport_y, scissor.top), 0); + clip.right = MIN(MIN(viewport_x + viewport_width, scissor.right), dest_width); + clip.bottom = MIN(MIN(viewport_y + viewport_height, scissor.bottom), dest_height); +} + +void PolyTriangleThreadData::PushStreamData(const StreamData &data, const PolyPushConstants &constants) +{ + mainVertexShader.Data = data; + mainVertexShader.uClipSplit = constants.uClipSplit; + + PushConstants = &constants; + + AlphaThreshold = clamp((int)(PushConstants->uAlphaThreshold * 255.0f + 0.5f), 0, 255) << 24; + + numPolyLights = 0; + if (constants.uLightIndex >= 0) + { + const FVector4 &lightRange = lights[constants.uLightIndex]; + static_assert(sizeof(FVector4) == 16, "sizeof(FVector4) is not 16 bytes"); + if (lightRange.Y > lightRange.X) + { + int start = constants.uLightIndex + 1; + int modulatedStart = static_cast(lightRange.X) + start; + int modulatedEnd = static_cast(lightRange.Y) + start; + for (int i = modulatedStart; i < modulatedEnd; i += 4) + { + if (numPolyLights == maxPolyLights) + break; + + auto &lightpos = lights[i]; + auto &lightcolor = lights[i + 1]; + //auto &lightspot1 = lights[i + 2]; + //auto &lightspot2 = lights[i + 3]; + uint32_t r = (int)clamp(lightcolor.X * 255.0f, 0.0f, 255.0f); 
+ uint32_t g = (int)clamp(lightcolor.Y * 255.0f, 0.0f, 255.0f); + uint32_t b = (int)clamp(lightcolor.Z * 255.0f, 0.0f, 255.0f); + + auto& polylight = polyLights[numPolyLights++]; + polylight.x = lightpos.X; + polylight.y = lightpos.Y; + polylight.z = lightpos.Z; + polylight.radius = 256.0f / lightpos.W; + polylight.color = (r << 16) | (g << 8) | b; + if (lightcolor.W < 0.0f) + polylight.radius = -polylight.radius; + } + } + } +} + +void PolyTriangleThreadData::PushMatrices(const VSMatrix &modelMatrix, const VSMatrix &normalModelMatrix, const VSMatrix &textureMatrix) +{ + mainVertexShader.ModelMatrix = modelMatrix; + mainVertexShader.NormalModelMatrix = normalModelMatrix; + mainVertexShader.TextureMatrix = textureMatrix; +} + +void PolyTriangleThreadData::SetViewpointUniforms(const HWViewpointUniforms *uniforms) +{ + mainVertexShader.Viewpoint = uniforms; +} + +void PolyTriangleThreadData::SetDepthClamp(bool on) +{ +} + +void PolyTriangleThreadData::SetDepthMask(bool on) +{ + WriteDepth = on; +} + +void PolyTriangleThreadData::SetDepthFunc(int func) +{ + if (func == DF_LEqual || func == DF_Less) + { + DepthTest = true; + } + else // if (func == DF_Always) + { + DepthTest = false; + } +} + +void PolyTriangleThreadData::SetDepthRange(float min, float max) +{ + // The only two variants used by hwrenderer layer + if (min == 0.0f && max == 1.0f) + { + } + else if (min == 1.0f && max == 1.0f) + { + } +} + +void PolyTriangleThreadData::SetDepthBias(float depthBiasConstantFactor, float depthBiasSlopeFactor) +{ + depthbias = (float)(depthBiasConstantFactor / 2500.0); +} + +void PolyTriangleThreadData::SetColorMask(bool r, bool g, bool b, bool a) +{ + WriteColor = r; +} + +void PolyTriangleThreadData::SetStencil(int stencilRef, int op) +{ + StencilTestValue = stencilRef; + if (op == SOP_Increment) + { + WriteStencil = StencilTest; + StencilWriteValue = MIN(stencilRef + 1, (int)255); + } + else if (op == SOP_Decrement) + { + WriteStencil = StencilTest; + StencilWriteValue = 
MAX(stencilRef - 1, (int)0); + } + else // SOP_Keep + { + WriteStencil = false; + StencilWriteValue = stencilRef; + } +} + +void PolyTriangleThreadData::SetCulling(int mode) +{ + SetTwoSided(mode == Cull_None); + SetCullCCW(mode == Cull_CCW); +} + +void PolyTriangleThreadData::EnableStencil(bool on) +{ + StencilTest = on; + WriteStencil = on && (StencilTestValue != StencilWriteValue); +} + +void PolyTriangleThreadData::SetRenderStyle(FRenderStyle style) +{ + RenderStyle = style; +} + +void PolyTriangleThreadData::SetShader(int specialEffect, int effectState, bool alphaTest) +{ + SpecialEffect = specialEffect; + EffectState = effectState; + AlphaTest = alphaTest; +} + +void PolyTriangleThreadData::SetTexture(int unit, const void *pixels, int width, int height, bool bgra) +{ + textures[unit].pixels = pixels; + textures[unit].width = width; + textures[unit].height = height; + textures[unit].bgra = bgra; +} + +void PolyTriangleThreadData::DrawIndexed(int index, int vcount, PolyDrawMode drawmode) +{ + if (vcount < 3) + return; + + elements += index; + + ShadedTriVertex vertbuffer[3]; + ShadedTriVertex *vert[3] = { &vertbuffer[0], &vertbuffer[1], &vertbuffer[2] }; + if (drawmode == PolyDrawMode::Triangles) + { + for (int i = 0; i < vcount / 3; i++) + { + for (int j = 0; j < 3; j++) + *vert[j] = ShadeVertex(*(elements++)); + DrawShadedTriangle(vert, ccw); + } + } + else if (drawmode == PolyDrawMode::TriangleFan) + { + *vert[0] = ShadeVertex(*(elements++)); + *vert[1] = ShadeVertex(*(elements++)); + for (int i = 2; i < vcount; i++) + { + *vert[2] = ShadeVertex(*(elements++)); + DrawShadedTriangle(vert, ccw); + std::swap(vert[1], vert[2]); + } + } + else if (drawmode == PolyDrawMode::TriangleStrip) + { + bool toggleccw = ccw; + *vert[0] = ShadeVertex(*(elements++)); + *vert[1] = ShadeVertex(*(elements++)); + for (int i = 2; i < vcount; i++) + { + *vert[2] = ShadeVertex(*(elements++)); + DrawShadedTriangle(vert, toggleccw); + ShadedTriVertex *vtmp = vert[0]; + vert[0] = 
vert[1]; + vert[1] = vert[2]; + vert[2] = vtmp; + toggleccw = !toggleccw; + } + } + else if (drawmode == PolyDrawMode::Lines) + { + for (int i = 0; i < vcount / 2; i++) + { + *vert[0] = ShadeVertex(*(elements++)); + *vert[1] = ShadeVertex(*(elements++)); + DrawShadedLine(vert); + } + } + else if (drawmode == PolyDrawMode::Points) + { + for (int i = 0; i < vcount; i++) + { + *vert[0] = ShadeVertex(*(elements++)); + DrawShadedPoint(vert); + } + } +} + +void PolyTriangleThreadData::Draw(int index, int vcount, PolyDrawMode drawmode) +{ + if (vcount < 3) + return; + + int vinput = index; + + ShadedTriVertex vertbuffer[3]; + ShadedTriVertex *vert[3] = { &vertbuffer[0], &vertbuffer[1], &vertbuffer[2] }; + if (drawmode == PolyDrawMode::Triangles) + { + for (int i = 0; i < vcount / 3; i++) + { + for (int j = 0; j < 3; j++) + *vert[j] = ShadeVertex(vinput++); + DrawShadedTriangle(vert, ccw); + } + } + else if (drawmode == PolyDrawMode::TriangleFan) + { + *vert[0] = ShadeVertex(vinput++); + *vert[1] = ShadeVertex(vinput++); + for (int i = 2; i < vcount; i++) + { + *vert[2] = ShadeVertex(vinput++); + DrawShadedTriangle(vert, ccw); + std::swap(vert[1], vert[2]); + } + } + else if (drawmode == PolyDrawMode::TriangleStrip) + { + bool toggleccw = ccw; + *vert[0] = ShadeVertex(vinput++); + *vert[1] = ShadeVertex(vinput++); + for (int i = 2; i < vcount; i++) + { + *vert[2] = ShadeVertex(vinput++); + DrawShadedTriangle(vert, toggleccw); + ShadedTriVertex *vtmp = vert[0]; + vert[0] = vert[1]; + vert[1] = vert[2]; + vert[2] = vtmp; + toggleccw = !toggleccw; + } + } + else if (drawmode == PolyDrawMode::Lines) + { + for (int i = 0; i < vcount / 2; i++) + { + *vert[0] = ShadeVertex(vinput++); + *vert[1] = ShadeVertex(vinput++); + DrawShadedLine(vert); + } + } + else if (drawmode == PolyDrawMode::Points) + { + for (int i = 0; i < vcount; i++) + { + *vert[0] = ShadeVertex(vinput++); + DrawShadedPoint(vert); + } + } +} + +ShadedTriVertex PolyTriangleThreadData::ShadeVertex(int index) +{ + 
inputAssembly->Load(this, vertices, index); + mainVertexShader.SIMPLE = (SpecialEffect == EFF_BURN) || (SpecialEffect == EFF_STENCIL); + mainVertexShader.SPHEREMAP = (SpecialEffect == EFF_SPHEREMAP); + mainVertexShader.main(); + return mainVertexShader; +} + +bool PolyTriangleThreadData::IsDegenerate(const ShadedTriVertex *const* vert) +{ + // A degenerate triangle has a zero cross product for two of its sides. + float ax = vert[1]->gl_Position.X - vert[0]->gl_Position.X; + float ay = vert[1]->gl_Position.Y - vert[0]->gl_Position.Y; + float az = vert[1]->gl_Position.W - vert[0]->gl_Position.W; + float bx = vert[2]->gl_Position.X - vert[0]->gl_Position.X; + float by = vert[2]->gl_Position.Y - vert[0]->gl_Position.Y; + float bz = vert[2]->gl_Position.W - vert[0]->gl_Position.W; + float crossx = ay * bz - az * by; + float crossy = az * bx - ax * bz; + float crossz = ax * by - ay * bx; + float crosslengthsqr = crossx * crossx + crossy * crossy + crossz * crossz; + return crosslengthsqr <= 1.e-8f; +} + +bool PolyTriangleThreadData::IsFrontfacing(TriDrawTriangleArgs *args) +{ + float a = + args->v1->x * args->v2->y - args->v2->x * args->v1->y + + args->v2->x * args->v3->y - args->v3->x * args->v2->y + + args->v3->x * args->v1->y - args->v1->x * args->v3->y; + return a <= 0.0f; +} + +void PolyTriangleThreadData::DrawShadedPoint(const ShadedTriVertex *const* vertex) +{ +} + +void PolyTriangleThreadData::DrawShadedLine(const ShadedTriVertex *const* vert) +{ + static const int numclipdistances = 9; + float clipdistance[numclipdistances * 2]; + float *clipd = clipdistance; + for (int i = 0; i < 2; i++) + { + const auto &v = *vert[i]; + clipd[0] = v.gl_Position.X + v.gl_Position.W; + clipd[1] = v.gl_Position.W - v.gl_Position.X; + clipd[2] = v.gl_Position.Y + v.gl_Position.W; + clipd[3] = v.gl_Position.W - v.gl_Position.Y; + clipd[4] = v.gl_Position.Z + v.gl_Position.W; + clipd[5] = v.gl_Position.W - v.gl_Position.Z; + clipd[6] = v.gl_ClipDistance[0]; + clipd[7] = 
v.gl_ClipDistance[1]; + clipd[8] = v.gl_ClipDistance[2]; + clipd += numclipdistances; + } + + float t1 = 0.0f; + float t2 = 1.0f; + for (int p = 0; p < numclipdistances; p++) + { + float clipdistance1 = clipdistance[0 * numclipdistances + p]; + float clipdistance2 = clipdistance[1 * numclipdistances + p]; + if (clipdistance1 < 0.0f) t1 = MAX(-clipdistance1 / (clipdistance2 - clipdistance1), t1); + if (clipdistance2 < 0.0f) t2 = MIN(1.0f + clipdistance2 / (clipdistance1 - clipdistance2), t2); + if (t1 >= t2) + return; + } + + float weights[] = { 1.0f - t1, t1, 1.0f - t2, t2 }; + + ScreenTriVertex clippedvert[2]; + for (int i = 0; i < 2; i++) + { + auto &v = clippedvert[i]; + memset(&v, 0, sizeof(ScreenTriVertex)); + for (int w = 0; w < 2; w++) + { + float weight = weights[i * 2 + w]; + v.x += vert[w]->gl_Position.X * weight; + v.y += vert[w]->gl_Position.Y * weight; + v.z += vert[w]->gl_Position.Z * weight; + v.w += vert[w]->gl_Position.W * weight; + } + + // Calculate normalized device coordinates: + v.w = 1.0f / v.w; + v.x *= v.w; + v.y *= v.w; + v.z *= v.w; + + // Apply viewport scale to get screen coordinates: + v.x = viewport_x + viewport_width * (1.0f + v.x) * 0.5f; + if (topdown) + v.y = viewport_y + viewport_height * (1.0f - v.y) * 0.5f; + else + v.y = viewport_y + viewport_height * (1.0f + v.y) * 0.5f; + } + + uint32_t vColorA = (int)(vert[0]->vColor.W * 255.0f + 0.5f); + uint32_t vColorR = (int)(vert[0]->vColor.X * 255.0f + 0.5f); + uint32_t vColorG = (int)(vert[0]->vColor.Y * 255.0f + 0.5f); + uint32_t vColorB = (int)(vert[0]->vColor.Z * 255.0f + 0.5f); + uint32_t color = MAKEARGB(vColorA, vColorR, vColorG, vColorB); + + // Slow and naive implementation. Hopefully fast enough.. + + float x1 = clippedvert[0].x; + float y1 = clippedvert[0].y; + float x2 = clippedvert[1].x; + float y2 = clippedvert[1].y; + float dx = x2 - x1; + float dy = y2 - y1; + float step = (abs(dx) >= abs(dy)) ? 
abs(dx) : abs(dy); + dx /= step; + dy /= step; + float x = x1; + float y = y1; + int istep = (int)step; + int pixelsize = dest_bgra ? 4 : 1; + for (int i = 0; i <= istep; i++) + { + int scrx = (int)x; + int scry = (int)y; + if (scrx >= clip.left && scrx < clip.right && scry >= clip.top && scry < clip.bottom && !line_skipped_by_thread(scry)) + { + uint8_t *destpixel = dest + (scrx + scry * dest_width) * pixelsize; + if (pixelsize == 4) + { + *reinterpret_cast(destpixel) = color; + } + else + { + *destpixel = color; + } + } + x += dx; + y += dy; + } +} + +void PolyTriangleThreadData::DrawShadedTriangle(const ShadedTriVertex *const* vert, bool ccw) +{ + // Reject triangle if degenerate + if (IsDegenerate(vert)) + return; + + // Cull, clip and generate additional vertices as needed + ScreenTriVertex clippedvert[max_additional_vertices]; + int numclipvert = ClipEdge(vert); + + // Convert barycentric weights to actual vertices + for (int i = 0; i < numclipvert; i++) + { + auto &v = clippedvert[i]; + memset(&v, 0, sizeof(ScreenTriVertex)); + for (int w = 0; w < 3; w++) + { + float weight = weights[i * 3 + w]; + v.x += vert[w]->gl_Position.X * weight; + v.y += vert[w]->gl_Position.Y * weight; + v.z += vert[w]->gl_Position.Z * weight; + v.w += vert[w]->gl_Position.W * weight; + v.u += vert[w]->vTexCoord.X * weight; + v.v += vert[w]->vTexCoord.Y * weight; + v.worldX += vert[w]->pixelpos.X * weight; + v.worldY += vert[w]->pixelpos.Y * weight; + v.worldZ += vert[w]->pixelpos.Z * weight; + v.a += vert[w]->vColor.W * weight; + v.r += vert[w]->vColor.X * weight; + v.g += vert[w]->vColor.Y * weight; + v.b += vert[w]->vColor.Z * weight; + v.gradientdistZ += vert[w]->gradientdist.Z * weight; + } + } + +#ifdef NO_SSE + // Map to 2D viewport: + for (int j = 0; j < numclipvert; j++) + { + auto &v = clippedvert[j]; + + // Calculate normalized device coordinates: + v.w = 1.0f / v.w; + v.x *= v.w; + v.y *= v.w; + v.z *= v.w; + + // Apply viewport scale to get screen coordinates: + v.x = 
viewport_x + viewport_width * (1.0f + v.x) * 0.5f; + if (topdown) + v.y = viewport_y + viewport_height * (1.0f - v.y) * 0.5f; + else + v.y = viewport_y + viewport_height * (1.0f + v.y) * 0.5f; + } +#else + // Map to 2D viewport: + __m128 mviewport_x = _mm_set1_ps((float)viewport_x); + __m128 mviewport_y = _mm_set1_ps((float)viewport_y); + __m128 mviewport_halfwidth = _mm_set1_ps(viewport_width * 0.5f); + __m128 mviewport_halfheight = _mm_set1_ps(viewport_height * 0.5f); + __m128 mone = _mm_set1_ps(1.0f); + int sse_length = (numclipvert + 3) / 4 * 4; + for (int j = 0; j < sse_length; j += 4) + { + __m128 vx = _mm_loadu_ps(&clippedvert[j].x); + __m128 vy = _mm_loadu_ps(&clippedvert[j + 1].x); + __m128 vz = _mm_loadu_ps(&clippedvert[j + 2].x); + __m128 vw = _mm_loadu_ps(&clippedvert[j + 3].x); + _MM_TRANSPOSE4_PS(vx, vy, vz, vw); + + // Calculate normalized device coordinates: + vw = _mm_div_ps(mone, vw); + vx = _mm_mul_ps(vx, vw); + vy = _mm_mul_ps(vy, vw); + vz = _mm_mul_ps(vz, vw); + + // Apply viewport scale to get screen coordinates: + vx = _mm_add_ps(mviewport_x, _mm_mul_ps(mviewport_halfwidth, _mm_add_ps(mone, vx))); + if (topdown) + vy = _mm_add_ps(mviewport_y, _mm_mul_ps(mviewport_halfheight, _mm_sub_ps(mone, vy))); + else + vy = _mm_add_ps(mviewport_y, _mm_mul_ps(mviewport_halfheight, _mm_add_ps(mone, vy))); + + _MM_TRANSPOSE4_PS(vx, vy, vz, vw); + _mm_storeu_ps(&clippedvert[j].x, vx); + _mm_storeu_ps(&clippedvert[j + 1].x, vy); + _mm_storeu_ps(&clippedvert[j + 2].x, vz); + _mm_storeu_ps(&clippedvert[j + 3].x, vw); + } +#endif + + if (!topdown) ccw = !ccw; + + TriDrawTriangleArgs args; + + if (twosided && numclipvert > 2) + { + args.v1 = &clippedvert[0]; + args.v2 = &clippedvert[1]; + args.v3 = &clippedvert[2]; + ccw = !IsFrontfacing(&args); + } + + // Draw screen triangles + if (ccw) + { + for (int i = numclipvert - 1; i > 1; i--) + { + args.v1 = &clippedvert[numclipvert - 1]; + args.v2 = &clippedvert[i - 1]; + args.v3 = &clippedvert[i - 2]; + if 
(IsFrontfacing(&args) == ccw && args.CalculateGradients()) + { + ScreenTriangle::Draw(&args, this); + } + } + } + else + { + for (int i = 2; i < numclipvert; i++) + { + args.v1 = &clippedvert[0]; + args.v2 = &clippedvert[i - 1]; + args.v3 = &clippedvert[i]; + if (IsFrontfacing(&args) != ccw && args.CalculateGradients()) + { + ScreenTriangle::Draw(&args, this); + } + } + } +} + +int PolyTriangleThreadData::ClipEdge(const ShadedTriVertex *const* verts) +{ + // use barycentric weights for clipped vertices + weights = weightsbuffer; + for (int i = 0; i < 3; i++) + { + weights[i * 3 + 0] = 0.0f; + weights[i * 3 + 1] = 0.0f; + weights[i * 3 + 2] = 0.0f; + weights[i * 3 + i] = 1.0f; + } + + // Clip and cull so that the following is true for all vertices: + // -v.w <= v.x <= v.w + // -v.w <= v.y <= v.w + // -v.w <= v.z <= v.w + + // halfspace clip distances + static const int numclipdistances = 9; +#ifdef NO_SSE + float clipdistance[numclipdistances * 3]; + bool needsclipping = false; + float *clipd = clipdistance; + for (int i = 0; i < 3; i++) + { + const auto &v = *verts[i]; + clipd[0] = v.gl_Position.X + v.gl_Position.W; + clipd[1] = v.gl_Position.W - v.gl_Position.X; + clipd[2] = v.gl_Position.Y + v.gl_Position.W; + clipd[3] = v.gl_Position.W - v.gl_Position.Y; + clipd[4] = v.gl_Position.Z + v.gl_Position.W; + clipd[5] = v.gl_Position.W - v.gl_Position.Z; + clipd[6] = v.gl_ClipDistance[0]; + clipd[7] = v.gl_ClipDistance[1]; + clipd[8] = v.gl_ClipDistance[2]; + for (int j = 0; j < 9; j++) + needsclipping = needsclipping || clipd[i]; + clipd += numclipdistances; + } + + // If all halfspace clip distances are positive then the entire triangle is visible. Skip the expensive clipping step. 
+ if (!needsclipping) + { + return 3; + } +#else + __m128 mx = _mm_loadu_ps(&verts[0]->gl_Position.X); + __m128 my = _mm_loadu_ps(&verts[1]->gl_Position.X); + __m128 mz = _mm_loadu_ps(&verts[2]->gl_Position.X); + __m128 mw = _mm_setzero_ps(); + _MM_TRANSPOSE4_PS(mx, my, mz, mw); + __m128 clipd0 = _mm_add_ps(mx, mw); + __m128 clipd1 = _mm_sub_ps(mw, mx); + __m128 clipd2 = _mm_add_ps(my, mw); + __m128 clipd3 = _mm_sub_ps(mw, my); + __m128 clipd4 = _mm_add_ps(mz, mw); + __m128 clipd5 = _mm_sub_ps(mw, mz); + __m128 clipd6 = _mm_setr_ps(verts[0]->gl_ClipDistance[0], verts[1]->gl_ClipDistance[0], verts[2]->gl_ClipDistance[0], 0.0f); + __m128 clipd7 = _mm_setr_ps(verts[0]->gl_ClipDistance[1], verts[1]->gl_ClipDistance[1], verts[2]->gl_ClipDistance[1], 0.0f); + __m128 clipd8 = _mm_setr_ps(verts[0]->gl_ClipDistance[2], verts[1]->gl_ClipDistance[2], verts[2]->gl_ClipDistance[2], 0.0f); + __m128 mneedsclipping = _mm_cmplt_ps(clipd0, _mm_setzero_ps()); + mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd1, _mm_setzero_ps())); + mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd2, _mm_setzero_ps())); + mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd3, _mm_setzero_ps())); + mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd4, _mm_setzero_ps())); + mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd5, _mm_setzero_ps())); + mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd6, _mm_setzero_ps())); + mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd7, _mm_setzero_ps())); + mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd8, _mm_setzero_ps())); + if (_mm_movemask_ps(mneedsclipping) == 0) + { + return 3; + } + float clipdistance[numclipdistances * 4]; + _mm_storeu_ps(clipdistance, clipd0); + _mm_storeu_ps(clipdistance + 4, clipd1); + _mm_storeu_ps(clipdistance + 8, clipd2); + _mm_storeu_ps(clipdistance + 12, clipd3); + _mm_storeu_ps(clipdistance + 16, clipd4); + _mm_storeu_ps(clipdistance + 20, 
clipd5); + _mm_storeu_ps(clipdistance + 24, clipd6); + _mm_storeu_ps(clipdistance + 28, clipd7); + _mm_storeu_ps(clipdistance + 32, clipd8); +#endif + + // Clip against each halfspace + float *input = weights; + float *output = weights + max_additional_vertices * 3; + int inputverts = 3; + for (int p = 0; p < numclipdistances; p++) + { + // Clip each edge + int outputverts = 0; + for (int i = 0; i < inputverts; i++) + { + int j = (i + 1) % inputverts; +#ifdef NO_SSE + float clipdistance1 = + clipdistance[0 * numclipdistances + p] * input[i * 3 + 0] + + clipdistance[1 * numclipdistances + p] * input[i * 3 + 1] + + clipdistance[2 * numclipdistances + p] * input[i * 3 + 2]; + + float clipdistance2 = + clipdistance[0 * numclipdistances + p] * input[j * 3 + 0] + + clipdistance[1 * numclipdistances + p] * input[j * 3 + 1] + + clipdistance[2 * numclipdistances + p] * input[j * 3 + 2]; +#else + float clipdistance1 = + clipdistance[0 + p * 4] * input[i * 3 + 0] + + clipdistance[1 + p * 4] * input[i * 3 + 1] + + clipdistance[2 + p * 4] * input[i * 3 + 2]; + + float clipdistance2 = + clipdistance[0 + p * 4] * input[j * 3 + 0] + + clipdistance[1 + p * 4] * input[j * 3 + 1] + + clipdistance[2 + p * 4] * input[j * 3 + 2]; +#endif + + // Clip halfspace + if ((clipdistance1 >= 0.0f || clipdistance2 >= 0.0f) && outputverts + 1 < max_additional_vertices) + { + float t1 = (clipdistance1 < 0.0f) ? MAX(-clipdistance1 / (clipdistance2 - clipdistance1), 0.0f) : 0.0f; + float t2 = (clipdistance2 < 0.0f) ? 
MIN(1.0f + clipdistance2 / (clipdistance1 - clipdistance2), 1.0f) : 1.0f; + + // add t1 vertex + for (int k = 0; k < 3; k++) + output[outputverts * 3 + k] = input[i * 3 + k] * (1.0f - t1) + input[j * 3 + k] * t1; + outputverts++; + + if (t2 != 1.0f && t2 > t1) + { + // add t2 vertex + for (int k = 0; k < 3; k++) + output[outputverts * 3 + k] = input[i * 3 + k] * (1.0f - t2) + input[j * 3 + k] * t2; + outputverts++; + } + } + } + std::swap(input, output); + inputverts = outputverts; + if (inputverts == 0) + break; + } + + weights = input; + return inputverts; +} + +PolyTriangleThreadData *PolyTriangleThreadData::Get(DrawerThread *thread) +{ + if (!thread->poly) + thread->poly = std::make_shared(thread->core, thread->num_cores, thread->numa_node, thread->num_numa_nodes, thread->numa_start_y, thread->numa_end_y); + return thread->poly.get(); +} diff --git a/src/rendering/polyrenderer/drawers/poly_thread.h b/src/rendering/polyrenderer/drawers/poly_thread.h new file mode 100644 index 000000000..2dafaa2a1 --- /dev/null +++ b/src/rendering/polyrenderer/drawers/poly_thread.h @@ -0,0 +1,197 @@ +/* +** Polygon Doom software renderer +** Copyright (c) 2016 Magnus Norddahl +** +** This software is provided 'as-is', without any express or implied +** warranty. In no event will the authors be held liable for any damages +** arising from the use of this software. +** +** Permission is granted to anyone to use this software for any purpose, +** including commercial applications, and to alter it and redistribute it +** freely, subject to the following restrictions: +** +** 1. The origin of this software must not be misrepresented; you must not +** claim that you wrote the original software. If you use this software +** in a product, an acknowledgment in the product documentation would be +** appreciated but is not required. +** 2. Altered source versions must be plainly marked as such, and must not be +** misrepresented as being the original software. +** 3. 
This notice may not be removed or altered from any source distribution. +** +*/ + +#pragma once + +#include "poly_triangle.h" + +struct PolyLight +{ + uint32_t color; + float x, y, z; + float radius; +}; + +class PolyTriangleThreadData +{ +public: + PolyTriangleThreadData(int32_t core, int32_t num_cores, int32_t numa_node, int32_t num_numa_nodes, int numa_start_y, int numa_end_y); + + void ClearDepth(float value); + void ClearStencil(uint8_t value); + void SetViewport(int x, int y, int width, int height, uint8_t *dest, int dest_width, int dest_height, int dest_pitch, bool dest_bgra, PolyDepthStencil *depthstencil, bool topdown); + + void SetCullCCW(bool value) { ccw = value; } + void SetTwoSided(bool value) { twosided = value; } + + void SetInputAssembly(PolyInputAssembly *input) { inputAssembly = input; } + void SetVertexBuffer(const void *data) { vertices = data; } + void SetIndexBuffer(const void *data) { elements = (const unsigned int *)data; } + void SetLightBuffer(const void *data) { lights = (const FVector4 *)data; } + void SetViewpointUniforms(const HWViewpointUniforms *uniforms); + void SetDepthClamp(bool on); + void SetDepthMask(bool on); + void SetDepthFunc(int func); + void SetDepthRange(float min, float max); + void SetDepthBias(float depthBiasConstantFactor, float depthBiasSlopeFactor); + void SetColorMask(bool r, bool g, bool b, bool a); + void SetStencil(int stencilRef, int op); + void SetCulling(int mode); + void EnableStencil(bool on); + void SetScissor(int x, int y, int w, int h); + void SetRenderStyle(FRenderStyle style); + void SetTexture(int unit, const void *pixels, int width, int height, bool bgra); + void SetShader(int specialEffect, int effectState, bool alphaTest); + + void UpdateClip(); + + void PushStreamData(const StreamData &data, const PolyPushConstants &constants); + void PushMatrices(const VSMatrix &modelMatrix, const VSMatrix &normalModelMatrix, const VSMatrix &textureMatrix); + + void DrawIndexed(int index, int count, 
PolyDrawMode mode); + void Draw(int index, int vcount, PolyDrawMode mode); + + int32_t core; + int32_t num_cores; + int32_t numa_node; + int32_t num_numa_nodes; + + int numa_start_y; + int numa_end_y; + + bool line_skipped_by_thread(int line) + { + return line < numa_start_y || line >= numa_end_y || line % num_cores != core; + } + + int skipped_by_thread(int first_line) + { + int clip_first_line = MAX(first_line, numa_start_y); + int core_skip = (num_cores - (clip_first_line - core) % num_cores) % num_cores; + return clip_first_line + core_skip - first_line; + } + + int count_for_thread(int first_line, int count) + { + count = MIN(count, numa_end_y - first_line); + int c = (count - skipped_by_thread(first_line) + num_cores - 1) / num_cores; + return MAX(c, 0); + } + + struct Scanline + { + float W[MAXWIDTH]; + uint16_t U[MAXWIDTH]; + uint16_t V[MAXWIDTH]; + float WorldX[MAXWIDTH]; + float WorldY[MAXWIDTH]; + float WorldZ[MAXWIDTH]; + uint8_t vColorA[MAXWIDTH]; + uint8_t vColorR[MAXWIDTH]; + uint8_t vColorG[MAXWIDTH]; + uint8_t vColorB[MAXWIDTH]; + float GradientdistZ[MAXWIDTH]; + uint32_t FragColor[MAXWIDTH]; + uint16_t lightarray[MAXWIDTH]; + uint32_t dynlights[MAXWIDTH]; + uint8_t discard[MAXWIDTH]; + } scanline; + + static PolyTriangleThreadData *Get(DrawerThread *thread); + + int dest_pitch = 0; + int dest_width = 0; + int dest_height = 0; + bool dest_bgra = false; + uint8_t *dest = nullptr; + PolyDepthStencil *depthstencil = nullptr; + bool topdown = true; + + float depthbias = 0.0f; + + int viewport_y = 0; + + struct ClipRect + { + int left = 0; + int top = 0; + int right = 0; + int bottom = 0; + } clip, scissor; + + FRenderStyle RenderStyle; + int SpecialEffect = EFF_NONE; + int EffectState = 0; + bool AlphaTest = false; + uint32_t AlphaThreshold = 0x7f000000; + const PolyPushConstants* PushConstants = nullptr; + + const void *vertices = nullptr; + const unsigned int *elements = nullptr; + const FVector4 *lights = nullptr; + + enum { maxPolyLights = 16 }; + 
PolyLight polyLights[maxPolyLights]; + int numPolyLights = 0; + + PolyMainVertexShader mainVertexShader; + + struct TextureUnit + { + const void* pixels = nullptr; + int width = 0; + int height = 0; + bool bgra = true; + } textures[16]; + + bool DepthTest = false; + bool StencilTest = true; + bool WriteStencil = true; + bool WriteColor = true; + bool WriteDepth = true; + uint8_t StencilTestValue = 0; + uint8_t StencilWriteValue = 0; + + void (*FragmentShader)(int x0, int x1, PolyTriangleThreadData* thread) = nullptr; + void (*WriteColorFunc)(int y, int x0, int x1, PolyTriangleThreadData* thread) = nullptr; + +private: + ShadedTriVertex ShadeVertex(int index); + void DrawShadedPoint(const ShadedTriVertex *const* vertex); + void DrawShadedLine(const ShadedTriVertex *const* vertices); + void DrawShadedTriangle(const ShadedTriVertex *const* vertices, bool ccw); + static bool IsDegenerate(const ShadedTriVertex *const* vertices); + static bool IsFrontfacing(TriDrawTriangleArgs *args); + + int ClipEdge(const ShadedTriVertex *const* verts); + + int viewport_x = 0; + int viewport_width = 0; + int viewport_height = 0; + bool ccw = true; + bool twosided = true; + PolyInputAssembly *inputAssembly = nullptr; + + enum { max_additional_vertices = 16 }; + float weightsbuffer[max_additional_vertices * 3 * 2]; + float *weights = nullptr; +}; diff --git a/src/rendering/polyrenderer/drawers/poly_triangle.cpp b/src/rendering/polyrenderer/drawers/poly_triangle.cpp index d4b4610cb..cae71b2af 100644 --- a/src/rendering/polyrenderer/drawers/poly_triangle.cpp +++ b/src/rendering/polyrenderer/drawers/poly_triangle.cpp @@ -35,10 +35,304 @@ #include "v_palette.h" #include "r_data/colormaps.h" #include "poly_triangle.h" +#include "poly_thread.h" #include "swrenderer/drawers/r_draw_rgba.h" #include "screen_triangle.h" #include "x86.h" +///////////////////////////////////////////////////////////////////////////// + +class PolyDrawerCommand : public DrawerCommand +{ +public: +}; + +class 
PolySetDepthClampCommand : public PolyDrawerCommand // Like every command below: captures its arguments at construction and, in Execute(), forwards them to the executing thread's PolyTriangleThreadData. This one calls SetDepthClamp(on).
+{
+public:
+	PolySetDepthClampCommand(bool on) : on(on) { }
+	void Execute(DrawerThread* thread) override { PolyTriangleThreadData::Get(thread)->SetDepthClamp(on); }
+
+private:
+	bool on;
+};
+
+class PolySetDepthMaskCommand : public PolyDrawerCommand // Deferred SetDepthMask(on).
+{
+public:
+	PolySetDepthMaskCommand(bool on) : on(on) { }
+	void Execute(DrawerThread* thread) override { PolyTriangleThreadData::Get(thread)->SetDepthMask(on); }
+
+private:
+	bool on;
+};
+
+class PolySetDepthFuncCommand : public PolyDrawerCommand // Deferred SetDepthFunc(func).
+{
+public:
+	PolySetDepthFuncCommand(int func) : func(func) { }
+	void Execute(DrawerThread* thread) override { PolyTriangleThreadData::Get(thread)->SetDepthFunc(func); }
+
+private:
+	int func;
+};
+
+class PolySetDepthRangeCommand : public PolyDrawerCommand // Deferred SetDepthRange(min, max).
+{
+public:
+	PolySetDepthRangeCommand(float min, float max) : min(min), max(max) { }
+	void Execute(DrawerThread* thread) override { PolyTriangleThreadData::Get(thread)->SetDepthRange(min, max); }
+
+private:
+	float min;
+	float max;
+};
+
+class PolySetDepthBiasCommand : public PolyDrawerCommand // Deferred SetDepthBias(depthBiasConstantFactor, depthBiasSlopeFactor).
+{
+public:
+	PolySetDepthBiasCommand(float depthBiasConstantFactor, float depthBiasSlopeFactor) : depthBiasConstantFactor(depthBiasConstantFactor), depthBiasSlopeFactor(depthBiasSlopeFactor) { }
+	void Execute(DrawerThread* thread) override { PolyTriangleThreadData::Get(thread)->SetDepthBias(depthBiasConstantFactor, depthBiasSlopeFactor); }
+
+private:
+	float depthBiasConstantFactor;
+	float depthBiasSlopeFactor;
+};
+
+class PolySetColorMaskCommand : public PolyDrawerCommand // Deferred SetColorMask(r, g, b, a).
+{
+public:
+	PolySetColorMaskCommand(bool r, bool g, bool b, bool a) : r(r), g(g), b(b), a(a) { }
+	void Execute(DrawerThread* thread) override { PolyTriangleThreadData::Get(thread)->SetColorMask(r, g, b, a); }
+
+private:
+	bool r;
+	bool g;
+	bool b;
+	bool a;
+};
+
+class PolySetStencilCommand : public PolyDrawerCommand // Deferred SetStencil(stencilRef, op).
+{
+public:
+	PolySetStencilCommand(int stencilRef, int op) : stencilRef(stencilRef), op(op) { }
+	void Execute(DrawerThread* thread) override { PolyTriangleThreadData::Get(thread)->SetStencil(stencilRef, op); }
+
+private:
+	int stencilRef;
+	int op;
+};
+
+class PolySetCullingCommand : public PolyDrawerCommand // Deferred SetCulling(mode).
+{
+public:
+	PolySetCullingCommand(int mode) : mode(mode) { }
+	void Execute(DrawerThread* thread) override { PolyTriangleThreadData::Get(thread)->SetCulling(mode); }
+
+private:
+	int mode;
+};
+
+class PolyEnableStencilCommand : public PolyDrawerCommand // Deferred EnableStencil(on).
+{
+public:
+	PolyEnableStencilCommand(bool on) : on(on) { }
+	void Execute(DrawerThread* thread) override { PolyTriangleThreadData::Get(thread)->EnableStencil(on); }
+
+private:
+	bool on;
+};
+
+class PolySetScissorCommand : public PolyDrawerCommand // Deferred SetScissor(x, y, w, h).
+{
+public:
+	PolySetScissorCommand(int x, int y, int w, int h) : x(x), y(y), w(w), h(h) { }
+	void Execute(DrawerThread* thread) override { PolyTriangleThreadData::Get(thread)->SetScissor(x, y, w, h); }
+
+private:
+	int x;
+	int y;
+	int w;
+	int h;
+};
+
+class PolySetRenderStyleCommand : public PolyDrawerCommand // Deferred SetRenderStyle(style); the FRenderStyle is stored by value.
+{
+public:
+	PolySetRenderStyleCommand(FRenderStyle style) : style(style) { }
+	void Execute(DrawerThread* thread) override { PolyTriangleThreadData::Get(thread)->SetRenderStyle(style); }
+
+private:
+	FRenderStyle style;
+};
+
+class PolySetTextureCommand : public PolyDrawerCommand // Deferred SetTexture(unit, pixels, width, height, bgra); only the pixel pointer is stored, not the pixel data.
+{
+public:
+	PolySetTextureCommand(int unit, const void* pixels, int width, int height, bool bgra) : unit(unit), pixels(pixels), width(width), height(height), bgra(bgra) { }
+	void Execute(DrawerThread* thread) override { PolyTriangleThreadData::Get(thread)->SetTexture(unit, pixels, width, height, bgra); }
+
+private:
+	int unit;
+	const void* pixels; // const-qualified to match SetTexture(const void*) and TextureUnit::pixels; a void* argument still converts implicitly
+	int width;
+	int height;
+	bool bgra;
+};
+
+class PolySetShaderCommand : public PolyDrawerCommand // Deferred SetShader(specialEffect, effectState, alphaTest).
+{
+public:
+	PolySetShaderCommand(int specialEffect, int effectState, bool alphaTest) : specialEffect(specialEffect), effectState(effectState), alphaTest(alphaTest) { }
+	void Execute(DrawerThread* thread) override { PolyTriangleThreadData::Get(thread)->SetShader(specialEffect, effectState, alphaTest); }
+
+private:
+	int specialEffect;
+	int effectState;
+	bool alphaTest;
+};
+
+class PolySetVertexBufferCommand : public PolyDrawerCommand // Deferred SetVertexBuffer(vertices); NOTE(review): only the pointer is stored, so the buffer presumably must outlive execution of the queue - confirm.
+{
+public:
+	PolySetVertexBufferCommand(const void* vertices) : vertices(vertices) { }
+	void Execute(DrawerThread* thread) override { PolyTriangleThreadData::Get(thread)->SetVertexBuffer(vertices); }
+
+private:
+	const void* vertices;
+};
+
+class PolySetIndexBufferCommand : public PolyDrawerCommand // Deferred SetIndexBuffer(indices); stores the pointer only.
+{
+public:
+	PolySetIndexBufferCommand(const void* indices) : indices(indices) { }
+	void Execute(DrawerThread* thread) override { PolyTriangleThreadData::Get(thread)->SetIndexBuffer(indices); }
+
+private:
+	const void* indices;
+};
+
+class PolySetLightBufferCommand : public PolyDrawerCommand // Deferred SetLightBuffer(lights); stores the pointer only.
+{
+public:
+	PolySetLightBufferCommand(const void* lights) : lights(lights) { }
+	void Execute(DrawerThread* thread) override { PolyTriangleThreadData::Get(thread)->SetLightBuffer(lights); }
+
+private:
+	const void* lights;
+};
+
+class PolySetInputAssemblyCommand : public PolyDrawerCommand // Deferred SetInputAssembly(input); stores the pointer only.
+{
+public:
+	PolySetInputAssemblyCommand(PolyInputAssembly* input) : input(input) { }
+	void Execute(DrawerThread* thread) override { PolyTriangleThreadData::Get(thread)->SetInputAssembly(input); }
+
+private:
+	PolyInputAssembly* input;
+};
+
+class PolyClearDepthCommand : public PolyDrawerCommand // Deferred ClearDepth(value).
+{
+public:
+	PolyClearDepthCommand(float value) : value(value) { }
+	void Execute(DrawerThread* thread) override { PolyTriangleThreadData::Get(thread)->ClearDepth(value); }
+
+private:
+	float value;
+};
+
+class PolyClearStencilCommand : public PolyDrawerCommand // Deferred ClearStencil(value).
+{
+public:
+	PolyClearStencilCommand(uint8_t value) : value(value) { }
+	void Execute(DrawerThread* thread) override { PolyTriangleThreadData::Get(thread)->ClearStencil(value); }
+
+private:
+	uint8_t value;
+};
+
+class PolySetViewportCommand : public PolyDrawerCommand // Deferred SetViewport(...); dest and depthstencil are stored as raw pointers.
+{
+public:
+	PolySetViewportCommand(int x, int y, int width, int height, uint8_t* dest, int dest_width, int dest_height, int dest_pitch, bool dest_bgra, PolyDepthStencil* depthstencil, bool topdown)
+		: x(x), y(y), width(width), height(height), dest(dest), dest_width(dest_width), dest_height(dest_height), dest_pitch(dest_pitch), dest_bgra(dest_bgra), depthstencil(depthstencil), topdown(topdown) { }
+	void Execute(DrawerThread* thread) override { PolyTriangleThreadData::Get(thread)->SetViewport(x, y, width, height, dest, dest_width, dest_height, dest_pitch, dest_bgra, depthstencil, topdown); }
+
+private:
+	int x;
+	int y;
+	int width;
+	int height;
+	uint8_t* dest;
+	int dest_width;
+	int dest_height;
+	int dest_pitch;
+	bool dest_bgra;
+	PolyDepthStencil* depthstencil;
+	bool topdown;
+};
+
+class PolySetViewpointUniformsCommand : public PolyDrawerCommand // Deferred SetViewpointUniforms(uniforms); stores the pointer only.
+{
+public:
+	PolySetViewpointUniformsCommand(const HWViewpointUniforms* uniforms) : uniforms(uniforms) {}
+	void Execute(DrawerThread* thread) override { PolyTriangleThreadData::Get(thread)->SetViewpointUniforms(uniforms); }
+
+private:
+	const HWViewpointUniforms* uniforms;
+};
+
+class PolyPushMatricesCommand : public PolyDrawerCommand // Deferred PushMatrices(...); the three matrices are copied into the command.
+{
+public:
+	PolyPushMatricesCommand(const VSMatrix& modelMatrix, const VSMatrix& normalModelMatrix, const VSMatrix& textureMatrix)
+		: modelMatrix(modelMatrix), normalModelMatrix(normalModelMatrix), textureMatrix(textureMatrix) { }
+	void Execute(DrawerThread* thread) override { PolyTriangleThreadData::Get(thread)->PushMatrices(modelMatrix, normalModelMatrix, textureMatrix); }
+
+private:
+	VSMatrix modelMatrix;
+	VSMatrix normalModelMatrix;
+	VSMatrix textureMatrix;
+};
+
+class PolyPushStreamDataCommand : public PolyDrawerCommand // Deferred PushStreamData(data, constants); both arguments are copied into the command.
+{
+public:
+	PolyPushStreamDataCommand(const StreamData& data, const PolyPushConstants& constants) : data(data), constants(constants) { }
+	void Execute(DrawerThread* thread) override { PolyTriangleThreadData::Get(thread)->PushStreamData(data,
constants); } + +private: + StreamData data; + PolyPushConstants constants; +}; + +class PolyDrawCommand : public PolyDrawerCommand +{ +public: + PolyDrawCommand(int index, int count, PolyDrawMode mode) : index(index), count(count), mode(mode) { } + void Execute(DrawerThread* thread) override { PolyTriangleThreadData::Get(thread)->Draw(index, count, mode); } + +private: + int index; + int count; + PolyDrawMode mode; +}; + +class PolyDrawIndexedCommand : public PolyDrawerCommand +{ +public: + PolyDrawIndexedCommand(int index, int count, PolyDrawMode mode) : index(index), count(count), mode(mode) { } + void Execute(DrawerThread* thread) override { PolyTriangleThreadData::Get(thread)->DrawIndexed(index, count, mode); } + +private: + int index; + int count; + PolyDrawMode mode; +}; + +///////////////////////////////////////////////////////////////////////////// + PolyCommandBuffer::PolyCommandBuffer(RenderMemory* frameMemory) { mQueue = std::make_shared(frameMemory); @@ -179,781 +473,3 @@ void PolyCommandBuffer::Submit() { DrawerThreads::Execute(mQueue); } - -///////////////////////////////////////////////////////////////////////////// - -void PolyTriangleThreadData::ClearDepth(float value) -{ - int width = depthstencil->Width(); - int height = depthstencil->Height(); - float *data = depthstencil->DepthValues(); - - int skip = skipped_by_thread(0); - int count = count_for_thread(0, height); - - data += skip * width; - for (int i = 0; i < count; i++) - { - for (int x = 0; x < width; x++) - data[x] = value; - data += num_cores * width; - } -} - -void PolyTriangleThreadData::ClearStencil(uint8_t value) -{ - int width = depthstencil->Width(); - int height = depthstencil->Height(); - uint8_t *data = depthstencil->StencilValues(); - - int skip = skipped_by_thread(0); - int count = count_for_thread(0, height); - - data += skip * width; - for (int i = 0; i < count; i++) - { - memset(data, value, width); - data += num_cores * width; - } -} - -void 
PolyTriangleThreadData::SetViewport(int x, int y, int width, int height, uint8_t *new_dest, int new_dest_width, int new_dest_height, int new_dest_pitch, bool new_dest_bgra, PolyDepthStencil *new_depthstencil, bool new_topdown) -{ - viewport_x = x; - viewport_y = y; - viewport_width = width; - viewport_height = height; - dest = new_dest; - dest_width = new_dest_width; - dest_height = new_dest_height; - dest_pitch = new_dest_pitch; - dest_bgra = new_dest_bgra; - depthstencil = new_depthstencil; - topdown = new_topdown; - UpdateClip(); -} - -void PolyTriangleThreadData::SetScissor(int x, int y, int w, int h) -{ - scissor.left = x; - scissor.right = x + w; - scissor.top = y; - scissor.bottom = y + h; - UpdateClip(); -} - -void PolyTriangleThreadData::UpdateClip() -{ - clip.left = MAX(MAX(viewport_x, scissor.left), 0); - clip.top = MAX(MAX(viewport_y, scissor.top), 0); - clip.right = MIN(MIN(viewport_x + viewport_width, scissor.right), dest_width); - clip.bottom = MIN(MIN(viewport_y + viewport_height, scissor.bottom), dest_height); -} - -void PolyTriangleThreadData::PushStreamData(const StreamData &data, const PolyPushConstants &constants) -{ - mainVertexShader.Data = data; - mainVertexShader.uClipSplit = constants.uClipSplit; - - PushConstants = &constants; - - AlphaThreshold = clamp((int)(PushConstants->uAlphaThreshold * 255.0f + 0.5f), 0, 255) << 24; - - numPolyLights = 0; - if (constants.uLightIndex >= 0) - { - const FVector4 &lightRange = lights[constants.uLightIndex]; - static_assert(sizeof(FVector4) == 16, "sizeof(FVector4) is not 16 bytes"); - if (lightRange.Y > lightRange.X) - { - int start = constants.uLightIndex + 1; - int modulatedStart = static_cast(lightRange.X) + start; - int modulatedEnd = static_cast(lightRange.Y) + start; - for (int i = modulatedStart; i < modulatedEnd; i += 4) - { - if (numPolyLights == maxPolyLights) - break; - - auto &lightpos = lights[i]; - auto &lightcolor = lights[i + 1]; - //auto &lightspot1 = lights[i + 2]; - //auto &lightspot2 
= lights[i + 3]; - uint32_t r = (int)clamp(lightcolor.X * 255.0f, 0.0f, 255.0f); - uint32_t g = (int)clamp(lightcolor.Y * 255.0f, 0.0f, 255.0f); - uint32_t b = (int)clamp(lightcolor.Z * 255.0f, 0.0f, 255.0f); - - auto& polylight = polyLights[numPolyLights++]; - polylight.x = lightpos.X; - polylight.y = lightpos.Y; - polylight.z = lightpos.Z; - polylight.radius = 256.0f / lightpos.W; - polylight.color = (r << 16) | (g << 8) | b; - if (lightcolor.W < 0.0f) - polylight.radius = -polylight.radius; - } - } - } -} - -void PolyTriangleThreadData::PushMatrices(const VSMatrix &modelMatrix, const VSMatrix &normalModelMatrix, const VSMatrix &textureMatrix) -{ - mainVertexShader.ModelMatrix = modelMatrix; - mainVertexShader.NormalModelMatrix = normalModelMatrix; - mainVertexShader.TextureMatrix = textureMatrix; -} - -void PolyTriangleThreadData::SetViewpointUniforms(const HWViewpointUniforms *uniforms) -{ - mainVertexShader.Viewpoint = uniforms; -} - -void PolyTriangleThreadData::SetDepthClamp(bool on) -{ -} - -void PolyTriangleThreadData::SetDepthMask(bool on) -{ - WriteDepth = on; -} - -void PolyTriangleThreadData::SetDepthFunc(int func) -{ - if (func == DF_LEqual || func == DF_Less) - { - DepthTest = true; - } - else // if (func == DF_Always) - { - DepthTest = false; - } -} - -void PolyTriangleThreadData::SetDepthRange(float min, float max) -{ - // The only two variants used by hwrenderer layer - if (min == 0.0f && max == 1.0f) - { - } - else if (min == 1.0f && max == 1.0f) - { - } -} - -void PolyTriangleThreadData::SetDepthBias(float depthBiasConstantFactor, float depthBiasSlopeFactor) -{ - depthbias = (float)(depthBiasConstantFactor / 2500.0); -} - -void PolyTriangleThreadData::SetColorMask(bool r, bool g, bool b, bool a) -{ - WriteColor = r; -} - -void PolyTriangleThreadData::SetStencil(int stencilRef, int op) -{ - StencilTestValue = stencilRef; - if (op == SOP_Increment) - { - WriteStencil = StencilTest; - StencilWriteValue = MIN(stencilRef + 1, (int)255); - } - else if 
(op == SOP_Decrement) - { - WriteStencil = StencilTest; - StencilWriteValue = MAX(stencilRef - 1, (int)0); - } - else // SOP_Keep - { - WriteStencil = false; - StencilWriteValue = stencilRef; - } -} - -void PolyTriangleThreadData::SetCulling(int mode) -{ - SetTwoSided(mode == Cull_None); - SetCullCCW(mode == Cull_CCW); -} - -void PolyTriangleThreadData::EnableStencil(bool on) -{ - StencilTest = on; - WriteStencil = on && (StencilTestValue != StencilWriteValue); -} - -void PolyTriangleThreadData::SetRenderStyle(FRenderStyle style) -{ - RenderStyle = style; -} - -void PolyTriangleThreadData::SetShader(int specialEffect, int effectState, bool alphaTest) -{ - SpecialEffect = specialEffect; - EffectState = effectState; - AlphaTest = alphaTest; -} - -void PolyTriangleThreadData::SetTexture(int unit, const void *pixels, int width, int height, bool bgra) -{ - textures[unit].pixels = pixels; - textures[unit].width = width; - textures[unit].height = height; - textures[unit].bgra = bgra; -} - -void PolyTriangleThreadData::DrawIndexed(int index, int vcount, PolyDrawMode drawmode) -{ - if (vcount < 3) - return; - - elements += index; - - ShadedTriVertex vertbuffer[3]; - ShadedTriVertex *vert[3] = { &vertbuffer[0], &vertbuffer[1], &vertbuffer[2] }; - if (drawmode == PolyDrawMode::Triangles) - { - for (int i = 0; i < vcount / 3; i++) - { - for (int j = 0; j < 3; j++) - *vert[j] = ShadeVertex(*(elements++)); - DrawShadedTriangle(vert, ccw); - } - } - else if (drawmode == PolyDrawMode::TriangleFan) - { - *vert[0] = ShadeVertex(*(elements++)); - *vert[1] = ShadeVertex(*(elements++)); - for (int i = 2; i < vcount; i++) - { - *vert[2] = ShadeVertex(*(elements++)); - DrawShadedTriangle(vert, ccw); - std::swap(vert[1], vert[2]); - } - } - else if (drawmode == PolyDrawMode::TriangleStrip) - { - bool toggleccw = ccw; - *vert[0] = ShadeVertex(*(elements++)); - *vert[1] = ShadeVertex(*(elements++)); - for (int i = 2; i < vcount; i++) - { - *vert[2] = ShadeVertex(*(elements++)); - 
DrawShadedTriangle(vert, toggleccw); - ShadedTriVertex *vtmp = vert[0]; - vert[0] = vert[1]; - vert[1] = vert[2]; - vert[2] = vtmp; - toggleccw = !toggleccw; - } - } - else if (drawmode == PolyDrawMode::Lines) - { - for (int i = 0; i < vcount / 2; i++) - { - *vert[0] = ShadeVertex(*(elements++)); - *vert[1] = ShadeVertex(*(elements++)); - DrawShadedLine(vert); - } - } - else if (drawmode == PolyDrawMode::Points) - { - for (int i = 0; i < vcount; i++) - { - *vert[0] = ShadeVertex(*(elements++)); - DrawShadedPoint(vert); - } - } -} - -void PolyTriangleThreadData::Draw(int index, int vcount, PolyDrawMode drawmode) -{ - if (vcount < 3) - return; - - int vinput = index; - - ShadedTriVertex vertbuffer[3]; - ShadedTriVertex *vert[3] = { &vertbuffer[0], &vertbuffer[1], &vertbuffer[2] }; - if (drawmode == PolyDrawMode::Triangles) - { - for (int i = 0; i < vcount / 3; i++) - { - for (int j = 0; j < 3; j++) - *vert[j] = ShadeVertex(vinput++); - DrawShadedTriangle(vert, ccw); - } - } - else if (drawmode == PolyDrawMode::TriangleFan) - { - *vert[0] = ShadeVertex(vinput++); - *vert[1] = ShadeVertex(vinput++); - for (int i = 2; i < vcount; i++) - { - *vert[2] = ShadeVertex(vinput++); - DrawShadedTriangle(vert, ccw); - std::swap(vert[1], vert[2]); - } - } - else if (drawmode == PolyDrawMode::TriangleStrip) - { - bool toggleccw = ccw; - *vert[0] = ShadeVertex(vinput++); - *vert[1] = ShadeVertex(vinput++); - for (int i = 2; i < vcount; i++) - { - *vert[2] = ShadeVertex(vinput++); - DrawShadedTriangle(vert, toggleccw); - ShadedTriVertex *vtmp = vert[0]; - vert[0] = vert[1]; - vert[1] = vert[2]; - vert[2] = vtmp; - toggleccw = !toggleccw; - } - } - else if (drawmode == PolyDrawMode::Lines) - { - for (int i = 0; i < vcount / 2; i++) - { - *vert[0] = ShadeVertex(vinput++); - *vert[1] = ShadeVertex(vinput++); - DrawShadedLine(vert); - } - } - else if (drawmode == PolyDrawMode::Points) - { - for (int i = 0; i < vcount; i++) - { - *vert[0] = ShadeVertex(vinput++); - DrawShadedPoint(vert); 
- } - } -} - -ShadedTriVertex PolyTriangleThreadData::ShadeVertex(int index) -{ - inputAssembly->Load(this, vertices, index); - mainVertexShader.SIMPLE = (SpecialEffect == EFF_BURN) || (SpecialEffect == EFF_STENCIL); - mainVertexShader.SPHEREMAP = (SpecialEffect == EFF_SPHEREMAP); - mainVertexShader.main(); - return mainVertexShader; -} - -bool PolyTriangleThreadData::IsDegenerate(const ShadedTriVertex *const* vert) -{ - // A degenerate triangle has a zero cross product for two of its sides. - float ax = vert[1]->gl_Position.X - vert[0]->gl_Position.X; - float ay = vert[1]->gl_Position.Y - vert[0]->gl_Position.Y; - float az = vert[1]->gl_Position.W - vert[0]->gl_Position.W; - float bx = vert[2]->gl_Position.X - vert[0]->gl_Position.X; - float by = vert[2]->gl_Position.Y - vert[0]->gl_Position.Y; - float bz = vert[2]->gl_Position.W - vert[0]->gl_Position.W; - float crossx = ay * bz - az * by; - float crossy = az * bx - ax * bz; - float crossz = ax * by - ay * bx; - float crosslengthsqr = crossx * crossx + crossy * crossy + crossz * crossz; - return crosslengthsqr <= 1.e-8f; -} - -bool PolyTriangleThreadData::IsFrontfacing(TriDrawTriangleArgs *args) -{ - float a = - args->v1->x * args->v2->y - args->v2->x * args->v1->y + - args->v2->x * args->v3->y - args->v3->x * args->v2->y + - args->v3->x * args->v1->y - args->v1->x * args->v3->y; - return a <= 0.0f; -} - -void PolyTriangleThreadData::DrawShadedPoint(const ShadedTriVertex *const* vertex) -{ -} - -void PolyTriangleThreadData::DrawShadedLine(const ShadedTriVertex *const* vert) -{ - static const int numclipdistances = 9; - float clipdistance[numclipdistances * 2]; - float *clipd = clipdistance; - for (int i = 0; i < 2; i++) - { - const auto &v = *vert[i]; - clipd[0] = v.gl_Position.X + v.gl_Position.W; - clipd[1] = v.gl_Position.W - v.gl_Position.X; - clipd[2] = v.gl_Position.Y + v.gl_Position.W; - clipd[3] = v.gl_Position.W - v.gl_Position.Y; - clipd[4] = v.gl_Position.Z + v.gl_Position.W; - clipd[5] = 
v.gl_Position.W - v.gl_Position.Z; - clipd[6] = v.gl_ClipDistance[0]; - clipd[7] = v.gl_ClipDistance[1]; - clipd[8] = v.gl_ClipDistance[2]; - clipd += numclipdistances; - } - - float t1 = 0.0f; - float t2 = 1.0f; - for (int p = 0; p < numclipdistances; p++) - { - float clipdistance1 = clipdistance[0 * numclipdistances + p]; - float clipdistance2 = clipdistance[1 * numclipdistances + p]; - if (clipdistance1 < 0.0f) t1 = MAX(-clipdistance1 / (clipdistance2 - clipdistance1), t1); - if (clipdistance2 < 0.0f) t2 = MIN(1.0f + clipdistance2 / (clipdistance1 - clipdistance2), t2); - if (t1 >= t2) - return; - } - - float weights[] = { 1.0f - t1, t1, 1.0f - t2, t2 }; - - ScreenTriVertex clippedvert[2]; - for (int i = 0; i < 2; i++) - { - auto &v = clippedvert[i]; - memset(&v, 0, sizeof(ScreenTriVertex)); - for (int w = 0; w < 2; w++) - { - float weight = weights[i * 2 + w]; - v.x += vert[w]->gl_Position.X * weight; - v.y += vert[w]->gl_Position.Y * weight; - v.z += vert[w]->gl_Position.Z * weight; - v.w += vert[w]->gl_Position.W * weight; - } - - // Calculate normalized device coordinates: - v.w = 1.0f / v.w; - v.x *= v.w; - v.y *= v.w; - v.z *= v.w; - - // Apply viewport scale to get screen coordinates: - v.x = viewport_x + viewport_width * (1.0f + v.x) * 0.5f; - if (topdown) - v.y = viewport_y + viewport_height * (1.0f - v.y) * 0.5f; - else - v.y = viewport_y + viewport_height * (1.0f + v.y) * 0.5f; - } - - uint32_t vColorA = (int)(vert[0]->vColor.W * 255.0f + 0.5f); - uint32_t vColorR = (int)(vert[0]->vColor.X * 255.0f + 0.5f); - uint32_t vColorG = (int)(vert[0]->vColor.Y * 255.0f + 0.5f); - uint32_t vColorB = (int)(vert[0]->vColor.Z * 255.0f + 0.5f); - uint32_t color = MAKEARGB(vColorA, vColorR, vColorG, vColorB); - - // Slow and naive implementation. Hopefully fast enough.. 
- - float x1 = clippedvert[0].x; - float y1 = clippedvert[0].y; - float x2 = clippedvert[1].x; - float y2 = clippedvert[1].y; - float dx = x2 - x1; - float dy = y2 - y1; - float step = (abs(dx) >= abs(dy)) ? abs(dx) : abs(dy); - dx /= step; - dy /= step; - float x = x1; - float y = y1; - int istep = (int)step; - int pixelsize = dest_bgra ? 4 : 1; - for (int i = 0; i <= istep; i++) - { - int scrx = (int)x; - int scry = (int)y; - if (scrx >= clip.left && scrx < clip.right && scry >= clip.top && scry < clip.bottom && !line_skipped_by_thread(scry)) - { - uint8_t *destpixel = dest + (scrx + scry * dest_width) * pixelsize; - if (pixelsize == 4) - { - *reinterpret_cast(destpixel) = color; - } - else - { - *destpixel = color; - } - } - x += dx; - y += dy; - } -} - -void PolyTriangleThreadData::DrawShadedTriangle(const ShadedTriVertex *const* vert, bool ccw) -{ - // Reject triangle if degenerate - if (IsDegenerate(vert)) - return; - - // Cull, clip and generate additional vertices as needed - ScreenTriVertex clippedvert[max_additional_vertices]; - int numclipvert = ClipEdge(vert); - - // Convert barycentric weights to actual vertices - for (int i = 0; i < numclipvert; i++) - { - auto &v = clippedvert[i]; - memset(&v, 0, sizeof(ScreenTriVertex)); - for (int w = 0; w < 3; w++) - { - float weight = weights[i * 3 + w]; - v.x += vert[w]->gl_Position.X * weight; - v.y += vert[w]->gl_Position.Y * weight; - v.z += vert[w]->gl_Position.Z * weight; - v.w += vert[w]->gl_Position.W * weight; - v.u += vert[w]->vTexCoord.X * weight; - v.v += vert[w]->vTexCoord.Y * weight; - v.worldX += vert[w]->pixelpos.X * weight; - v.worldY += vert[w]->pixelpos.Y * weight; - v.worldZ += vert[w]->pixelpos.Z * weight; - v.a += vert[w]->vColor.W * weight; - v.r += vert[w]->vColor.X * weight; - v.g += vert[w]->vColor.Y * weight; - v.b += vert[w]->vColor.Z * weight; - v.gradientdistZ += vert[w]->gradientdist.Z * weight; - } - } - -#ifdef NO_SSE - // Map to 2D viewport: - for (int j = 0; j < numclipvert; 
j++) - { - auto &v = clippedvert[j]; - - // Calculate normalized device coordinates: - v.w = 1.0f / v.w; - v.x *= v.w; - v.y *= v.w; - v.z *= v.w; - - // Apply viewport scale to get screen coordinates: - v.x = viewport_x + viewport_width * (1.0f + v.x) * 0.5f; - if (topdown) - v.y = viewport_y + viewport_height * (1.0f - v.y) * 0.5f; - else - v.y = viewport_y + viewport_height * (1.0f + v.y) * 0.5f; - } -#else - // Map to 2D viewport: - __m128 mviewport_x = _mm_set1_ps((float)viewport_x); - __m128 mviewport_y = _mm_set1_ps((float)viewport_y); - __m128 mviewport_halfwidth = _mm_set1_ps(viewport_width * 0.5f); - __m128 mviewport_halfheight = _mm_set1_ps(viewport_height * 0.5f); - __m128 mone = _mm_set1_ps(1.0f); - int sse_length = (numclipvert + 3) / 4 * 4; - for (int j = 0; j < sse_length; j += 4) - { - __m128 vx = _mm_loadu_ps(&clippedvert[j].x); - __m128 vy = _mm_loadu_ps(&clippedvert[j + 1].x); - __m128 vz = _mm_loadu_ps(&clippedvert[j + 2].x); - __m128 vw = _mm_loadu_ps(&clippedvert[j + 3].x); - _MM_TRANSPOSE4_PS(vx, vy, vz, vw); - - // Calculate normalized device coordinates: - vw = _mm_div_ps(mone, vw); - vx = _mm_mul_ps(vx, vw); - vy = _mm_mul_ps(vy, vw); - vz = _mm_mul_ps(vz, vw); - - // Apply viewport scale to get screen coordinates: - vx = _mm_add_ps(mviewport_x, _mm_mul_ps(mviewport_halfwidth, _mm_add_ps(mone, vx))); - if (topdown) - vy = _mm_add_ps(mviewport_y, _mm_mul_ps(mviewport_halfheight, _mm_sub_ps(mone, vy))); - else - vy = _mm_add_ps(mviewport_y, _mm_mul_ps(mviewport_halfheight, _mm_add_ps(mone, vy))); - - _MM_TRANSPOSE4_PS(vx, vy, vz, vw); - _mm_storeu_ps(&clippedvert[j].x, vx); - _mm_storeu_ps(&clippedvert[j + 1].x, vy); - _mm_storeu_ps(&clippedvert[j + 2].x, vz); - _mm_storeu_ps(&clippedvert[j + 3].x, vw); - } -#endif - - if (!topdown) ccw = !ccw; - - TriDrawTriangleArgs args; - - if (twosided && numclipvert > 2) - { - args.v1 = &clippedvert[0]; - args.v2 = &clippedvert[1]; - args.v3 = &clippedvert[2]; - ccw = !IsFrontfacing(&args); - } - - // 
Draw screen triangles - if (ccw) - { - for (int i = numclipvert - 1; i > 1; i--) - { - args.v1 = &clippedvert[numclipvert - 1]; - args.v2 = &clippedvert[i - 1]; - args.v3 = &clippedvert[i - 2]; - if (IsFrontfacing(&args) == ccw && args.CalculateGradients()) - { - ScreenTriangle::Draw(&args, this); - } - } - } - else - { - for (int i = 2; i < numclipvert; i++) - { - args.v1 = &clippedvert[0]; - args.v2 = &clippedvert[i - 1]; - args.v3 = &clippedvert[i]; - if (IsFrontfacing(&args) != ccw && args.CalculateGradients()) - { - ScreenTriangle::Draw(&args, this); - } - } - } -} - -int PolyTriangleThreadData::ClipEdge(const ShadedTriVertex *const* verts) -{ - // use barycentric weights for clipped vertices - weights = weightsbuffer; - for (int i = 0; i < 3; i++) - { - weights[i * 3 + 0] = 0.0f; - weights[i * 3 + 1] = 0.0f; - weights[i * 3 + 2] = 0.0f; - weights[i * 3 + i] = 1.0f; - } - - // Clip and cull so that the following is true for all vertices: - // -v.w <= v.x <= v.w - // -v.w <= v.y <= v.w - // -v.w <= v.z <= v.w - - // halfspace clip distances - static const int numclipdistances = 9; -#ifdef NO_SSE - float clipdistance[numclipdistances * 3]; - bool needsclipping = false; - float *clipd = clipdistance; - for (int i = 0; i < 3; i++) - { - const auto &v = *verts[i]; - clipd[0] = v.gl_Position.X + v.gl_Position.W; - clipd[1] = v.gl_Position.W - v.gl_Position.X; - clipd[2] = v.gl_Position.Y + v.gl_Position.W; - clipd[3] = v.gl_Position.W - v.gl_Position.Y; - clipd[4] = v.gl_Position.Z + v.gl_Position.W; - clipd[5] = v.gl_Position.W - v.gl_Position.Z; - clipd[6] = v.gl_ClipDistance[0]; - clipd[7] = v.gl_ClipDistance[1]; - clipd[8] = v.gl_ClipDistance[2]; - for (int j = 0; j < 9; j++) - needsclipping = needsclipping || clipd[i]; - clipd += numclipdistances; - } - - // If all halfspace clip distances are positive then the entire triangle is visible. Skip the expensive clipping step. 
- if (!needsclipping) - { - return 3; - } -#else - __m128 mx = _mm_loadu_ps(&verts[0]->gl_Position.X); - __m128 my = _mm_loadu_ps(&verts[1]->gl_Position.X); - __m128 mz = _mm_loadu_ps(&verts[2]->gl_Position.X); - __m128 mw = _mm_setzero_ps(); - _MM_TRANSPOSE4_PS(mx, my, mz, mw); - __m128 clipd0 = _mm_add_ps(mx, mw); - __m128 clipd1 = _mm_sub_ps(mw, mx); - __m128 clipd2 = _mm_add_ps(my, mw); - __m128 clipd3 = _mm_sub_ps(mw, my); - __m128 clipd4 = _mm_add_ps(mz, mw); - __m128 clipd5 = _mm_sub_ps(mw, mz); - __m128 clipd6 = _mm_setr_ps(verts[0]->gl_ClipDistance[0], verts[1]->gl_ClipDistance[0], verts[2]->gl_ClipDistance[0], 0.0f); - __m128 clipd7 = _mm_setr_ps(verts[0]->gl_ClipDistance[1], verts[1]->gl_ClipDistance[1], verts[2]->gl_ClipDistance[1], 0.0f); - __m128 clipd8 = _mm_setr_ps(verts[0]->gl_ClipDistance[2], verts[1]->gl_ClipDistance[2], verts[2]->gl_ClipDistance[2], 0.0f); - __m128 mneedsclipping = _mm_cmplt_ps(clipd0, _mm_setzero_ps()); - mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd1, _mm_setzero_ps())); - mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd2, _mm_setzero_ps())); - mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd3, _mm_setzero_ps())); - mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd4, _mm_setzero_ps())); - mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd5, _mm_setzero_ps())); - mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd6, _mm_setzero_ps())); - mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd7, _mm_setzero_ps())); - mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd8, _mm_setzero_ps())); - if (_mm_movemask_ps(mneedsclipping) == 0) - { - return 3; - } - float clipdistance[numclipdistances * 4]; - _mm_storeu_ps(clipdistance, clipd0); - _mm_storeu_ps(clipdistance + 4, clipd1); - _mm_storeu_ps(clipdistance + 8, clipd2); - _mm_storeu_ps(clipdistance + 12, clipd3); - _mm_storeu_ps(clipdistance + 16, clipd4); - _mm_storeu_ps(clipdistance + 20, 
clipd5); - _mm_storeu_ps(clipdistance + 24, clipd6); - _mm_storeu_ps(clipdistance + 28, clipd7); - _mm_storeu_ps(clipdistance + 32, clipd8); -#endif - - // Clip against each halfspace - float *input = weights; - float *output = weights + max_additional_vertices * 3; - int inputverts = 3; - for (int p = 0; p < numclipdistances; p++) - { - // Clip each edge - int outputverts = 0; - for (int i = 0; i < inputverts; i++) - { - int j = (i + 1) % inputverts; -#ifdef NO_SSE - float clipdistance1 = - clipdistance[0 * numclipdistances + p] * input[i * 3 + 0] + - clipdistance[1 * numclipdistances + p] * input[i * 3 + 1] + - clipdistance[2 * numclipdistances + p] * input[i * 3 + 2]; - - float clipdistance2 = - clipdistance[0 * numclipdistances + p] * input[j * 3 + 0] + - clipdistance[1 * numclipdistances + p] * input[j * 3 + 1] + - clipdistance[2 * numclipdistances + p] * input[j * 3 + 2]; -#else - float clipdistance1 = - clipdistance[0 + p * 4] * input[i * 3 + 0] + - clipdistance[1 + p * 4] * input[i * 3 + 1] + - clipdistance[2 + p * 4] * input[i * 3 + 2]; - - float clipdistance2 = - clipdistance[0 + p * 4] * input[j * 3 + 0] + - clipdistance[1 + p * 4] * input[j * 3 + 1] + - clipdistance[2 + p * 4] * input[j * 3 + 2]; -#endif - - // Clip halfspace - if ((clipdistance1 >= 0.0f || clipdistance2 >= 0.0f) && outputverts + 1 < max_additional_vertices) - { - float t1 = (clipdistance1 < 0.0f) ? MAX(-clipdistance1 / (clipdistance2 - clipdistance1), 0.0f) : 0.0f; - float t2 = (clipdistance2 < 0.0f) ? 
MIN(1.0f + clipdistance2 / (clipdistance1 - clipdistance2), 1.0f) : 1.0f; - - // add t1 vertex - for (int k = 0; k < 3; k++) - output[outputverts * 3 + k] = input[i * 3 + k] * (1.0f - t1) + input[j * 3 + k] * t1; - outputverts++; - - if (t2 != 1.0f && t2 > t1) - { - // add t2 vertex - for (int k = 0; k < 3; k++) - output[outputverts * 3 + k] = input[i * 3 + k] * (1.0f - t2) + input[j * 3 + k] * t2; - outputverts++; - } - } - } - std::swap(input, output); - inputverts = outputverts; - if (inputverts == 0) - break; - } - - weights = input; - return inputverts; -} - -PolyTriangleThreadData *PolyTriangleThreadData::Get(DrawerThread *thread) -{ - if (!thread->poly) - thread->poly = std::make_shared(thread->core, thread->num_cores, thread->numa_node, thread->num_numa_nodes, thread->numa_start_y, thread->numa_end_y); - return thread->poly.get(); -} diff --git a/src/rendering/polyrenderer/drawers/poly_triangle.h b/src/rendering/polyrenderer/drawers/poly_triangle.h index dd281b680..80e942744 100644 --- a/src/rendering/polyrenderer/drawers/poly_triangle.h +++ b/src/rendering/polyrenderer/drawers/poly_triangle.h @@ -119,467 +119,3 @@ class PolyInputAssembly public: virtual void Load(PolyTriangleThreadData *thread, const void *vertices, int index) = 0; }; - -struct PolyLight -{ - uint32_t color; - float x, y, z; - float radius; -}; - -class PolyTriangleThreadData -{ -public: - PolyTriangleThreadData(int32_t core, int32_t num_cores, int32_t numa_node, int32_t num_numa_nodes, int numa_start_y, int numa_end_y) - : core(core), num_cores(num_cores), numa_node(numa_node), num_numa_nodes(num_numa_nodes), numa_start_y(numa_start_y), numa_end_y(numa_end_y) - { - } - - void ClearDepth(float value); - void ClearStencil(uint8_t value); - void SetViewport(int x, int y, int width, int height, uint8_t *dest, int dest_width, int dest_height, int dest_pitch, bool dest_bgra, PolyDepthStencil *depthstencil, bool topdown); - - void SetCullCCW(bool value) { ccw = value; } - void SetTwoSided(bool 
value) { twosided = value; } - - void SetInputAssembly(PolyInputAssembly *input) { inputAssembly = input; } - void SetVertexBuffer(const void *data) { vertices = data; } - void SetIndexBuffer(const void *data) { elements = (const unsigned int *)data; } - void SetLightBuffer(const void *data) { lights = (const FVector4 *)data; } - void SetViewpointUniforms(const HWViewpointUniforms *uniforms); - void SetDepthClamp(bool on); - void SetDepthMask(bool on); - void SetDepthFunc(int func); - void SetDepthRange(float min, float max); - void SetDepthBias(float depthBiasConstantFactor, float depthBiasSlopeFactor); - void SetColorMask(bool r, bool g, bool b, bool a); - void SetStencil(int stencilRef, int op); - void SetCulling(int mode); - void EnableStencil(bool on); - void SetScissor(int x, int y, int w, int h); - void SetRenderStyle(FRenderStyle style); - void SetTexture(int unit, const void *pixels, int width, int height, bool bgra); - void SetShader(int specialEffect, int effectState, bool alphaTest); - - void UpdateClip(); - - void PushStreamData(const StreamData &data, const PolyPushConstants &constants); - void PushMatrices(const VSMatrix &modelMatrix, const VSMatrix &normalModelMatrix, const VSMatrix &textureMatrix); - - void DrawIndexed(int index, int count, PolyDrawMode mode); - void Draw(int index, int vcount, PolyDrawMode mode); - - int32_t core; - int32_t num_cores; - int32_t numa_node; - int32_t num_numa_nodes; - - int numa_start_y; - int numa_end_y; - - bool line_skipped_by_thread(int line) - { - return line < numa_start_y || line >= numa_end_y || line % num_cores != core; - } - - int skipped_by_thread(int first_line) - { - int clip_first_line = MAX(first_line, numa_start_y); - int core_skip = (num_cores - (clip_first_line - core) % num_cores) % num_cores; - return clip_first_line + core_skip - first_line; - } - - int count_for_thread(int first_line, int count) - { - count = MIN(count, numa_end_y - first_line); - int c = (count - skipped_by_thread(first_line) 
+ num_cores - 1) / num_cores; - return MAX(c, 0); - } - - struct Scanline - { - float W[MAXWIDTH]; - uint16_t U[MAXWIDTH]; - uint16_t V[MAXWIDTH]; - float WorldX[MAXWIDTH]; - float WorldY[MAXWIDTH]; - float WorldZ[MAXWIDTH]; - uint8_t vColorA[MAXWIDTH]; - uint8_t vColorR[MAXWIDTH]; - uint8_t vColorG[MAXWIDTH]; - uint8_t vColorB[MAXWIDTH]; - float GradientdistZ[MAXWIDTH]; - uint32_t FragColor[MAXWIDTH]; - uint16_t lightarray[MAXWIDTH]; - uint32_t dynlights[MAXWIDTH]; - uint8_t discard[MAXWIDTH]; - } scanline; - - static PolyTriangleThreadData *Get(DrawerThread *thread); - - int dest_pitch = 0; - int dest_width = 0; - int dest_height = 0; - bool dest_bgra = false; - uint8_t *dest = nullptr; - PolyDepthStencil *depthstencil = nullptr; - bool topdown = true; - - float depthbias = 0.0f; - - int viewport_y = 0; - - struct ClipRect - { - int left = 0; - int top = 0; - int right = 0; - int bottom = 0; - } clip, scissor; - - FRenderStyle RenderStyle; - int SpecialEffect = EFF_NONE; - int EffectState = 0; - bool AlphaTest = false; - uint32_t AlphaThreshold = 0x7f000000; - const PolyPushConstants* PushConstants = nullptr; - - const void *vertices = nullptr; - const unsigned int *elements = nullptr; - const FVector4 *lights = nullptr; - - enum { maxPolyLights = 16 }; - PolyLight polyLights[maxPolyLights]; - int numPolyLights = 0; - - PolyMainVertexShader mainVertexShader; - - struct TextureUnit - { - const void* pixels = nullptr; - int width = 0; - int height = 0; - bool bgra = true; - } textures[16]; - - bool DepthTest = false; - bool StencilTest = true; - bool WriteStencil = true; - bool WriteColor = true; - bool WriteDepth = true; - uint8_t StencilTestValue = 0; - uint8_t StencilWriteValue = 0; - - void (*FragmentShader)(int x0, int x1, PolyTriangleThreadData* thread) = nullptr; - void (*WriteColorFunc)(int y, int x0, int x1, PolyTriangleThreadData* thread) = nullptr; - -private: - ShadedTriVertex ShadeVertex(int index); - void DrawShadedPoint(const ShadedTriVertex *const* 
vertex); - void DrawShadedLine(const ShadedTriVertex *const* vertices); - void DrawShadedTriangle(const ShadedTriVertex *const* vertices, bool ccw); - static bool IsDegenerate(const ShadedTriVertex *const* vertices); - static bool IsFrontfacing(TriDrawTriangleArgs *args); - - int ClipEdge(const ShadedTriVertex *const* verts); - - int viewport_x = 0; - int viewport_width = 0; - int viewport_height = 0; - bool ccw = true; - bool twosided = true; - PolyInputAssembly *inputAssembly = nullptr; - - enum { max_additional_vertices = 16 }; - float weightsbuffer[max_additional_vertices * 3 * 2]; - float *weights = nullptr; -}; - -class PolyDrawerCommand : public DrawerCommand -{ -public: -}; - -class PolySetDepthClampCommand : public PolyDrawerCommand -{ -public: - PolySetDepthClampCommand(bool on) : on(on) { } - void Execute(DrawerThread *thread) override { PolyTriangleThreadData::Get(thread)->SetDepthClamp(on); } - -private: - bool on; -}; - -class PolySetDepthMaskCommand : public PolyDrawerCommand -{ -public: - PolySetDepthMaskCommand(bool on) : on(on) { } - void Execute(DrawerThread *thread) override { PolyTriangleThreadData::Get(thread)->SetDepthMask(on); } - -private: - bool on; -}; - -class PolySetDepthFuncCommand : public PolyDrawerCommand -{ -public: - PolySetDepthFuncCommand(int func) : func(func) { } - void Execute(DrawerThread *thread) override { PolyTriangleThreadData::Get(thread)->SetDepthFunc(func); } - -private: - int func; -}; - -class PolySetDepthRangeCommand : public PolyDrawerCommand -{ -public: - PolySetDepthRangeCommand(float min, float max) : min(min), max(max) { } - void Execute(DrawerThread *thread) override { PolyTriangleThreadData::Get(thread)->SetDepthRange(min, max); } - -private: - float min; - float max; -}; - -class PolySetDepthBiasCommand : public PolyDrawerCommand -{ -public: - PolySetDepthBiasCommand(float depthBiasConstantFactor, float depthBiasSlopeFactor) : depthBiasConstantFactor(depthBiasConstantFactor), 
depthBiasSlopeFactor(depthBiasSlopeFactor) { } - void Execute(DrawerThread *thread) override { PolyTriangleThreadData::Get(thread)->SetDepthBias(depthBiasConstantFactor, depthBiasSlopeFactor); } - -private: - float depthBiasConstantFactor; - float depthBiasSlopeFactor; -}; - -class PolySetColorMaskCommand : public PolyDrawerCommand -{ -public: - PolySetColorMaskCommand(bool r, bool g, bool b, bool a) : r(r), g(g), b(b), a(a) { } - void Execute(DrawerThread *thread) override { PolyTriangleThreadData::Get(thread)->SetColorMask(r, g, b, a); } - -private: - bool r; - bool g; - bool b; - bool a; -}; - -class PolySetStencilCommand : public PolyDrawerCommand -{ -public: - PolySetStencilCommand(int stencilRef, int op) : stencilRef(stencilRef), op(op) { } - void Execute(DrawerThread *thread) override { PolyTriangleThreadData::Get(thread)->SetStencil(stencilRef, op); } - -private: - int stencilRef; - int op; -}; - -class PolySetCullingCommand : public PolyDrawerCommand -{ -public: - PolySetCullingCommand(int mode) : mode(mode) { } - void Execute(DrawerThread *thread) override { PolyTriangleThreadData::Get(thread)->SetCulling(mode); } - -private: - int mode; -}; - -class PolyEnableStencilCommand : public PolyDrawerCommand -{ -public: - PolyEnableStencilCommand(bool on) : on(on) { } - void Execute(DrawerThread *thread) override { PolyTriangleThreadData::Get(thread)->EnableStencil(on); } - -private: - bool on; -}; - -class PolySetScissorCommand : public PolyDrawerCommand -{ -public: - PolySetScissorCommand(int x, int y, int w, int h) : x(x), y(y), w(w), h(h) { } - void Execute(DrawerThread *thread) override { PolyTriangleThreadData::Get(thread)->SetScissor(x, y, w, h); } - -private: - int x; - int y; - int w; - int h; -}; - -class PolySetRenderStyleCommand : public PolyDrawerCommand -{ -public: - PolySetRenderStyleCommand(FRenderStyle style) : style(style) { } - void Execute(DrawerThread *thread) override { PolyTriangleThreadData::Get(thread)->SetRenderStyle(style); } - 
-private: - FRenderStyle style; -}; - -class PolySetTextureCommand : public PolyDrawerCommand -{ -public: - PolySetTextureCommand(int unit, void *pixels, int width, int height, bool bgra) : unit(unit), pixels(pixels), width(width), height(height), bgra(bgra) { } - void Execute(DrawerThread *thread) override { PolyTriangleThreadData::Get(thread)->SetTexture(unit, pixels, width, height, bgra); } - -private: - int unit; - void *pixels; - int width; - int height; - bool bgra; -}; - -class PolySetShaderCommand : public PolyDrawerCommand -{ -public: - PolySetShaderCommand(int specialEffect, int effectState, bool alphaTest) : specialEffect(specialEffect), effectState(effectState), alphaTest(alphaTest) { } - void Execute(DrawerThread *thread) override { PolyTriangleThreadData::Get(thread)->SetShader(specialEffect, effectState, alphaTest); } - -private: - int specialEffect; - int effectState; - bool alphaTest; -}; - -class PolySetVertexBufferCommand : public PolyDrawerCommand -{ -public: - PolySetVertexBufferCommand(const void *vertices) : vertices(vertices) { } - void Execute(DrawerThread *thread) override { PolyTriangleThreadData::Get(thread)->SetVertexBuffer(vertices); } - -private: - const void *vertices; -}; - -class PolySetIndexBufferCommand : public PolyDrawerCommand -{ -public: - PolySetIndexBufferCommand(const void *indices) : indices(indices) { } - void Execute(DrawerThread *thread) override { PolyTriangleThreadData::Get(thread)->SetIndexBuffer(indices); } - -private: - const void *indices; -}; - -class PolySetLightBufferCommand : public PolyDrawerCommand -{ -public: - PolySetLightBufferCommand(const void *lights) : lights(lights) { } - void Execute(DrawerThread *thread) override { PolyTriangleThreadData::Get(thread)->SetLightBuffer(lights); } - -private: - const void *lights; -}; - -class PolySetInputAssemblyCommand : public PolyDrawerCommand -{ -public: - PolySetInputAssemblyCommand(PolyInputAssembly *input) : input(input) { } - void Execute(DrawerThread 
*thread) override { PolyTriangleThreadData::Get(thread)->SetInputAssembly(input); } - -private: - PolyInputAssembly *input; -}; - -class PolyClearDepthCommand : public PolyDrawerCommand -{ -public: - PolyClearDepthCommand(float value) : value(value) { } - void Execute(DrawerThread *thread) override { PolyTriangleThreadData::Get(thread)->ClearDepth(value); } - -private: - float value; -}; - -class PolyClearStencilCommand : public PolyDrawerCommand -{ -public: - PolyClearStencilCommand(uint8_t value) : value(value) { } - void Execute(DrawerThread *thread) override { PolyTriangleThreadData::Get(thread)->ClearStencil(value); } - -private: - uint8_t value; -}; - -class PolySetViewportCommand : public PolyDrawerCommand -{ -public: - PolySetViewportCommand(int x, int y, int width, int height, uint8_t *dest, int dest_width, int dest_height, int dest_pitch, bool dest_bgra, PolyDepthStencil *depthstencil, bool topdown) - : x(x), y(y), width(width), height(height), dest(dest), dest_width(dest_width), dest_height(dest_height), dest_pitch(dest_pitch), dest_bgra(dest_bgra), depthstencil(depthstencil), topdown(topdown) { } - void Execute(DrawerThread *thread) override { PolyTriangleThreadData::Get(thread)->SetViewport(x, y, width, height, dest, dest_width, dest_height, dest_pitch, dest_bgra, depthstencil, topdown); } - -private: - int x; - int y; - int width; - int height; - uint8_t *dest; - int dest_width; - int dest_height; - int dest_pitch; - bool dest_bgra; - PolyDepthStencil *depthstencil; - bool topdown; -}; - -class PolySetViewpointUniformsCommand : public PolyDrawerCommand -{ -public: - PolySetViewpointUniformsCommand(const HWViewpointUniforms *uniforms) : uniforms(uniforms) {} - void Execute(DrawerThread *thread) override { PolyTriangleThreadData::Get(thread)->SetViewpointUniforms(uniforms); } - -private: - const HWViewpointUniforms *uniforms; -}; - -class PolyPushMatricesCommand : public PolyDrawerCommand -{ -public: - PolyPushMatricesCommand(const VSMatrix 
&modelMatrix, const VSMatrix &normalModelMatrix, const VSMatrix &textureMatrix) - : modelMatrix(modelMatrix), normalModelMatrix(normalModelMatrix), textureMatrix(textureMatrix) { } - void Execute(DrawerThread *thread) override { PolyTriangleThreadData::Get(thread)->PushMatrices(modelMatrix, normalModelMatrix, textureMatrix); } - -private: - VSMatrix modelMatrix; - VSMatrix normalModelMatrix; - VSMatrix textureMatrix; -}; - -class PolyPushStreamDataCommand : public PolyDrawerCommand -{ -public: - PolyPushStreamDataCommand(const StreamData &data, const PolyPushConstants &constants) : data(data), constants(constants) { } - void Execute(DrawerThread *thread) override { PolyTriangleThreadData::Get(thread)->PushStreamData(data, constants); } - -private: - StreamData data; - PolyPushConstants constants; -}; - -class PolyDrawCommand : public PolyDrawerCommand -{ -public: - PolyDrawCommand(int index, int count, PolyDrawMode mode) : index(index), count(count), mode(mode) { } - void Execute(DrawerThread *thread) override { PolyTriangleThreadData::Get(thread)->Draw(index, count, mode); } - -private: - int index; - int count; - PolyDrawMode mode; -}; - -class PolyDrawIndexedCommand : public PolyDrawerCommand -{ -public: - PolyDrawIndexedCommand(int index, int count, PolyDrawMode mode) : index(index), count(count), mode(mode) { } - void Execute(DrawerThread *thread) override { PolyTriangleThreadData::Get(thread)->DrawIndexed(index, count, mode); } - -private: - int index; - int count; - PolyDrawMode mode; -}; diff --git a/src/rendering/polyrenderer/drawers/screen_blend.cpp b/src/rendering/polyrenderer/drawers/screen_blend.cpp new file mode 100644 index 000000000..c061668af --- /dev/null +++ b/src/rendering/polyrenderer/drawers/screen_blend.cpp @@ -0,0 +1,575 @@ +/* +** Polygon Doom software renderer +** Copyright (c) 2016 Magnus Norddahl +** +** This software is provided 'as-is', without any express or implied +** warranty. 
In no event will the authors be held liable for any damages +** arising from the use of this software. +** +** Permission is granted to anyone to use this software for any purpose, +** including commercial applications, and to alter it and redistribute it +** freely, subject to the following restrictions: +** +** 1. The origin of this software must not be misrepresented; you must not +** claim that you wrote the original software. If you use this software +** in a product, an acknowledgment in the product documentation would be +** appreciated but is not required. +** 2. Altered source versions must be plainly marked as such, and must not be +** misrepresented as being the original software. +** 3. This notice may not be removed or altered from any source distribution. +** +*/ + +#include "screen_blend.h" + +static const int shiftTable[] = { + 0, 0, 0, 0, // STYLEALPHA_Zero + 0, 0, 0, 0, // STYLEALPHA_One + 24, 24, 24, 24, // STYLEALPHA_Src + 24, 24, 24, 24, // STYLEALPHA_InvSrc + 24, 16, 8, 0, // STYLEALPHA_SrcCol + 24, 16, 8, 0, // STYLEALPHA_InvSrcCol + 24, 16, 8, 0, // STYLEALPHA_DstCol + 24, 16, 8, 0 // STYLEALPHA_InvDstCol +}; + +#if 1 //#ifndef USE_AVX2 +template +static void BlendColor(int y, int x0, int x1, PolyTriangleThreadData* thread) +{ + FRenderStyle style = thread->RenderStyle; + + bool invsrc = style.SrcAlpha & 1; + bool invdst = style.DestAlpha & 1; + + const int* shiftsrc = shiftTable + (style.SrcAlpha << 2); + const int* shiftdst = shiftTable + (style.DestAlpha << 2); + + uint32_t* dest = (uint32_t*)thread->dest; + uint32_t* line = dest + y * (ptrdiff_t)thread->dest_pitch; + uint32_t* fragcolor = thread->scanline.FragColor; + + int srcSelect = style.SrcAlpha <= STYLEALPHA_One ? 0 : (style.SrcAlpha >= STYLEALPHA_DstCol ? 1 : 2); + int dstSelect = style.DestAlpha <= STYLEALPHA_One ? 0 : (style.DestAlpha >= STYLEALPHA_DstCol ? 
1 : 2); + + uint32_t inputs[3]; + inputs[0] = 0; + + for (int x = x0; x < x1; x++) + { + inputs[1] = line[x]; + inputs[2] = fragcolor[x]; + + uint32_t srcinput = inputs[srcSelect]; + uint32_t dstinput = inputs[dstSelect]; + + uint32_t out[4]; + for (int i = 0; i < 4; i++) + { + // Grab component for scale factors + int32_t src = (srcinput >> shiftsrc[i]) & 0xff; + int32_t dst = (dstinput >> shiftdst[i]) & 0xff; + + // Inverse if needed + if (invsrc) src = 0xff - src; + if (invdst) dst = 0xff - dst; + + // Rescale 0-255 to 0-256 + src = src + (src >> 7); + dst = dst + (dst >> 7); + + // Multiply with input + src = src * ((inputs[2] >> (24 - (i << 3))) & 0xff); + dst = dst * ((inputs[1] >> (24 - (i << 3))) & 0xff); + + // Apply blend operator + int32_t val; + if (OptT::Flags & SWBLEND_Sub) + { + val = src - dst; + } + else if (OptT::Flags & SWBLEND_RevSub) + { + val = dst - src; + } + else + { + val = src + dst; + } + out[i] = clamp((val + 127) >> 8, 0, 255); + } + + line[x] = MAKEARGB(out[0], out[1], out[2], out[3]); + } +} +#else +template +static void BlendColor(int y, int x0, int x1, PolyTriangleThreadData* thread) +{ + FRenderStyle style = thread->RenderStyle; + + bool invsrc = style.SrcAlpha & 1; + bool invdst = style.DestAlpha & 1; + + __m128i shiftsrc = _mm_loadu_si128((const __m128i*)(shiftTable + (style.SrcAlpha << 2))); + __m128i shiftdst = _mm_loadu_si128((const __m128i*)(shiftTable + (style.DestAlpha << 2))); + + uint32_t* dest = (uint32_t*)thread->dest; + uint32_t* line = dest + y * (ptrdiff_t)thread->dest_pitch; + uint32_t* fragcolor = thread->scanline.FragColor; + + int srcSelect = style.SrcAlpha <= STYLEALPHA_One ? 0 : (style.SrcAlpha >= STYLEALPHA_DstCol ? 1 : 2); + int dstSelect = style.DestAlpha <= STYLEALPHA_One ? 0 : (style.DestAlpha >= STYLEALPHA_DstCol ? 
1 : 2); + + uint32_t inputs[3]; + inputs[0] = 0; + + __m128i shiftmul = _mm_set_epi32(24, 16, 8, 0); + + for (int x = x0; x < x1; x++) + { + inputs[1] = line[x]; + inputs[2] = fragcolor[x]; + + __m128i srcinput = _mm_set1_epi32(inputs[srcSelect]); + __m128i dstinput = _mm_set1_epi32(inputs[dstSelect]); + + // Grab component for scale factors + __m128i src = _mm_and_si128(_mm_srlv_epi32(srcinput, shiftsrc), _mm_set1_epi32(0xff)); + __m128i dst = _mm_and_si128(_mm_srlv_epi32(dstinput, shiftdst), _mm_set1_epi32(0xff)); + + // Inverse if needed + if (invsrc) src = _mm_sub_epi32(_mm_set1_epi32(0xff), src); + if (invdst) dst = _mm_sub_epi32(_mm_set1_epi32(0xff), dst); + + // Rescale 0-255 to 0-256 + src = _mm_add_epi32(src, _mm_srli_epi32(src, 7)); + dst = _mm_add_epi32(dst, _mm_srli_epi32(dst, 7)); + + // Multiply with input + __m128i mulsrc = _mm_and_si128(_mm_srlv_epi32(_mm_set1_epi32(inputs[2]), shiftmul), _mm_set1_epi32(0xff)); + __m128i muldst = _mm_and_si128(_mm_srlv_epi32(_mm_set1_epi32(inputs[1]), shiftmul), _mm_set1_epi32(0xff)); + __m128i mulresult = _mm_mullo_epi16(_mm_packs_epi32(src, dst), _mm_packs_epi32(mulsrc, muldst)); + src = _mm_unpacklo_epi16(mulresult, _mm_setzero_si128()); + dst = _mm_unpackhi_epi16(mulresult, _mm_setzero_si128()); + + // Apply blend operator + __m128i val; + if (OptT::Flags & SWBLEND_Sub) + { + val = _mm_sub_epi32(src, dst); + } + else if (OptT::Flags & SWBLEND_RevSub) + { + val = _mm_sub_epi32(dst, src); + } + else + { + val = _mm_add_epi32(src, dst); + } + + __m128i out = _mm_srli_epi32(_mm_add_epi32(val, _mm_set1_epi32(127)), 8); + out = _mm_packs_epi32(out, out); + out = _mm_packus_epi16(out, out); + line[x] = _mm_cvtsi128_si32(out); + } +} +#endif + +#ifdef NO_SSE +static void BlendColorOpaque(int y, int x0, int x1, PolyTriangleThreadData* thread) +{ + uint32_t* dest = (uint32_t*)thread->dest; + uint32_t* line = dest + y * (ptrdiff_t)thread->dest_pitch; + uint32_t* fragcolor = thread->scanline.FragColor; + + memcpy(line + x0, 
fragcolor + x0, (x1 - x0) * sizeof(uint32_t)); +} +#else +static void BlendColorOpaque(int y, int x0, int x1, PolyTriangleThreadData* thread) +{ + uint32_t* dest = (uint32_t*)thread->dest; + uint32_t* line = dest + y * (ptrdiff_t)thread->dest_pitch; + uint32_t* fragcolor = thread->scanline.FragColor; + + int ssecount = ((x1 - x0) & ~3); + int sseend = x0 + ssecount; + + for (int x = x0; x < sseend; x += 4) + { + __m128i v = _mm_loadu_si128((__m128i*) & fragcolor[x]); + _mm_storeu_si128((__m128i*) & line[x], v); + } + + for (int x = sseend; x < x1; x++) + { + line[x] = fragcolor[x]; + } +} +#endif + +static void BlendColorAdd_Src_InvSrc(int y, int x0, int x1, PolyTriangleThreadData* thread) +{ + uint32_t* line = (uint32_t*)thread->dest + y * (ptrdiff_t)thread->dest_pitch; + uint32_t* fragcolor = thread->scanline.FragColor; + + int sseend = x0; + +#ifndef NO_SSE + int ssecount = ((x1 - x0) & ~1); + sseend = x0 + ssecount; + for (int x = x0; x < sseend; x += 2) + { + __m128i dst = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)&line[x]), _mm_setzero_si128()); + __m128i src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)&fragcolor[x]), _mm_setzero_si128()); + + __m128i srcscale = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)); + srcscale = _mm_add_epi16(srcscale, _mm_srli_epi16(srcscale, 7)); + __m128i dstscale = _mm_sub_epi16(_mm_set1_epi16(256), srcscale); + + __m128i out = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(src, srcscale), _mm_mullo_epi16(dst, dstscale)), _mm_set1_epi16(127)), 8); + _mm_storel_epi64((__m128i*)&line[x], _mm_packus_epi16(out, out)); + } +#endif + + for (int x = sseend; x < x1; x++) + { + uint32_t dst = line[x]; + uint32_t src = fragcolor[x]; + + uint32_t srcscale = APART(src); + srcscale += srcscale >> 7; + uint32_t dstscale = 256 - srcscale; + + uint32_t a = ((APART(src) * srcscale + APART(dst) * dstscale) + 127) >> 8; + uint32_t r = ((RPART(src) * srcscale + RPART(dst) * dstscale) 
+ 127) >> 8; + uint32_t g = ((GPART(src) * srcscale + GPART(dst) * dstscale) + 127) >> 8; + uint32_t b = ((BPART(src) * srcscale + BPART(dst) * dstscale) + 127) >> 8; + + line[x] = MAKEARGB(a, r, g, b); + } +} + +static void BlendColorAdd_SrcCol_InvSrcCol(int y, int x0, int x1, PolyTriangleThreadData* thread) +{ + uint32_t* line = (uint32_t*)thread->dest + y * (ptrdiff_t)thread->dest_pitch; + uint32_t* fragcolor = thread->scanline.FragColor; + + int sseend = x0; + +#ifndef NO_SSE + int ssecount = ((x1 - x0) & ~1); + sseend = x0 + ssecount; + for (int x = x0; x < sseend; x += 2) + { + __m128i dst = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*) & line[x]), _mm_setzero_si128()); + __m128i src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*) & fragcolor[x]), _mm_setzero_si128()); + + __m128i srcscale = src; + srcscale = _mm_add_epi16(srcscale, _mm_srli_epi16(srcscale, 7)); + __m128i dstscale = _mm_sub_epi16(_mm_set1_epi16(256), srcscale); + + __m128i out = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(src, srcscale), _mm_mullo_epi16(dst, dstscale)), _mm_set1_epi16(127)), 8); + _mm_storel_epi64((__m128i*) & line[x], _mm_packus_epi16(out, out)); + } +#endif + + for (int x = sseend; x < x1; x++) + { + uint32_t dst = line[x]; + uint32_t src = fragcolor[x]; + + uint32_t srcscale_a = APART(src); + uint32_t srcscale_r = RPART(src); + uint32_t srcscale_g = GPART(src); + uint32_t srcscale_b = BPART(src); + srcscale_a += srcscale_a >> 7; + srcscale_r += srcscale_r >> 7; + srcscale_g += srcscale_g >> 7; + srcscale_b += srcscale_b >> 7; + uint32_t dstscale_a = 256 - srcscale_a; + uint32_t dstscale_r = 256 - srcscale_r; + uint32_t dstscale_g = 256 - srcscale_g; + uint32_t dstscale_b = 256 - srcscale_b; + + uint32_t a = ((APART(src) * srcscale_a + APART(dst) * dstscale_a) + 127) >> 8; + uint32_t r = ((RPART(src) * srcscale_r + RPART(dst) * dstscale_r) + 127) >> 8; + uint32_t g = ((GPART(src) * srcscale_g + GPART(dst) * dstscale_g) + 127) >> 8; + uint32_t b = ((BPART(src) * 
srcscale_b + BPART(dst) * dstscale_b) + 127) >> 8; + + line[x] = MAKEARGB(a, r, g, b); + } +} + +static void BlendColorAdd_Src_One(int y, int x0, int x1, PolyTriangleThreadData* thread) +{ + uint32_t* line = (uint32_t*)thread->dest + y * (ptrdiff_t)thread->dest_pitch; + uint32_t* fragcolor = thread->scanline.FragColor; + + int sseend = x0; + +#ifndef NO_SSE + int ssecount = ((x1 - x0) & ~1); + sseend = x0 + ssecount; + for (int x = x0; x < sseend; x += 2) + { + __m128i dst = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*) & line[x]), _mm_setzero_si128()); + __m128i src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*) & fragcolor[x]), _mm_setzero_si128()); + + __m128i srcscale = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)); + srcscale = _mm_add_epi16(srcscale, _mm_srli_epi16(srcscale, 7)); + + __m128i out = _mm_add_epi16(_mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(src, srcscale), _mm_set1_epi16(127)), 8), dst); + _mm_storel_epi64((__m128i*) & line[x], _mm_packus_epi16(out, out)); + } +#endif + + for (int x = sseend; x < x1; x++) + { + uint32_t dst = line[x]; + uint32_t src = fragcolor[x]; + + uint32_t srcscale = APART(src); + srcscale += srcscale >> 7; + + uint32_t a = MIN((((APART(src) * srcscale) + 127) >> 8) + APART(dst), 255); + uint32_t r = MIN((((RPART(src) * srcscale) + 127) >> 8) + RPART(dst), 255); + uint32_t g = MIN((((GPART(src) * srcscale) + 127) >> 8) + GPART(dst), 255); + uint32_t b = MIN((((BPART(src) * srcscale) + 127) >> 8) + BPART(dst), 255); + + line[x] = MAKEARGB(a, r, g, b); + } +} + +static void BlendColorAdd_SrcCol_One(int y, int x0, int x1, PolyTriangleThreadData* thread) +{ + uint32_t* line = (uint32_t*)thread->dest + y * (ptrdiff_t)thread->dest_pitch; + uint32_t* fragcolor = thread->scanline.FragColor; + + int sseend = x0; + +#ifndef NO_SSE + int ssecount = ((x1 - x0) & ~1); + sseend = x0 + ssecount; + for (int x = x0; x < sseend; x += 2) + { + __m128i dst = 
_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*) & line[x]), _mm_setzero_si128()); + __m128i src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*) & fragcolor[x]), _mm_setzero_si128()); + + __m128i srcscale = src; + srcscale = _mm_add_epi16(srcscale, _mm_srli_epi16(srcscale, 7)); + + __m128i out = _mm_add_epi16(_mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(src, srcscale), _mm_set1_epi16(127)), 8), dst); + _mm_storel_epi64((__m128i*) & line[x], _mm_packus_epi16(out, out)); + } +#endif + + for (int x = sseend; x < x1; x++) + { + uint32_t dst = line[x]; + uint32_t src = fragcolor[x]; + + uint32_t srcscale_a = APART(src); + uint32_t srcscale_r = RPART(src); + uint32_t srcscale_g = GPART(src); + uint32_t srcscale_b = BPART(src); + srcscale_a += srcscale_a >> 7; + srcscale_r += srcscale_r >> 7; + srcscale_g += srcscale_g >> 7; + srcscale_b += srcscale_b >> 7; + + uint32_t a = MIN((((APART(src) * srcscale_a) + 127) >> 8) + APART(dst), 255); + uint32_t r = MIN((((RPART(src) * srcscale_r) + 127) >> 8) + RPART(dst), 255); + uint32_t g = MIN((((GPART(src) * srcscale_g) + 127) >> 8) + GPART(dst), 255); + uint32_t b = MIN((((BPART(src) * srcscale_b) + 127) >> 8) + BPART(dst), 255); + + line[x] = MAKEARGB(a, r, g, b); + } +} + +static void BlendColorAdd_DstCol_Zero(int y, int x0, int x1, PolyTriangleThreadData* thread) +{ + uint32_t* line = (uint32_t*)thread->dest + y * (ptrdiff_t)thread->dest_pitch; + uint32_t* fragcolor = thread->scanline.FragColor; + + int sseend = x0; + +#ifndef NO_SSE + int ssecount = ((x1 - x0) & ~1); + sseend = x0 + ssecount; + for (int x = x0; x < sseend; x += 2) + { + __m128i dst = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*) & line[x]), _mm_setzero_si128()); + __m128i src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*) & fragcolor[x]), _mm_setzero_si128()); + + __m128i srcscale = dst; + srcscale = _mm_add_epi16(srcscale, _mm_srli_epi16(srcscale, 7)); + + __m128i out = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(src, srcscale), _mm_set1_epi16(127)), 8); + 
_mm_storel_epi64((__m128i*) & line[x], _mm_packus_epi16(out, out)); + } +#endif + + for (int x = sseend; x < x1; x++) + { + uint32_t dst = line[x]; + uint32_t src = fragcolor[x]; + + uint32_t srcscale_a = APART(dst); + uint32_t srcscale_r = RPART(dst); + uint32_t srcscale_g = GPART(dst); + uint32_t srcscale_b = BPART(dst); + srcscale_a += srcscale_a >> 7; + srcscale_r += srcscale_r >> 7; + srcscale_g += srcscale_g >> 7; + srcscale_b += srcscale_b >> 7; + + uint32_t a = (((APART(src) * srcscale_a) + 127) >> 8); + uint32_t r = (((RPART(src) * srcscale_r) + 127) >> 8); + uint32_t g = (((GPART(src) * srcscale_g) + 127) >> 8); + uint32_t b = (((BPART(src) * srcscale_b) + 127) >> 8); + + line[x] = MAKEARGB(a, r, g, b); + } +} + +static void BlendColorAdd_InvDstCol_Zero(int y, int x0, int x1, PolyTriangleThreadData* thread) +{ + uint32_t* line = (uint32_t*)thread->dest + y * (ptrdiff_t)thread->dest_pitch; + uint32_t* fragcolor = thread->scanline.FragColor; + + int sseend = x0; + +#ifndef NO_SSE + int ssecount = ((x1 - x0) & ~1); + sseend = x0 + ssecount; + for (int x = x0; x < sseend; x += 2) + { + __m128i dst = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*) & line[x]), _mm_setzero_si128()); + __m128i src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*) & fragcolor[x]), _mm_setzero_si128()); + + __m128i srcscale = _mm_sub_epi16(_mm_set1_epi16(255), dst); + srcscale = _mm_add_epi16(srcscale, _mm_srli_epi16(srcscale, 7)); + + __m128i out = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(src, srcscale), _mm_set1_epi16(127)), 8); + _mm_storel_epi64((__m128i*) & line[x], _mm_packus_epi16(out, out)); + } +#endif + + for (int x = sseend; x < x1; x++) + { + uint32_t dst = line[x]; + uint32_t src = fragcolor[x]; + + uint32_t srcscale_a = 255 - APART(dst); + uint32_t srcscale_r = 255 - RPART(dst); + uint32_t srcscale_g = 255 - GPART(dst); + uint32_t srcscale_b = 255 - BPART(dst); + srcscale_a += srcscale_a >> 7; + srcscale_r += srcscale_r >> 7; + srcscale_g += srcscale_g >> 7; + 
srcscale_b += srcscale_b >> 7; + + uint32_t a = (((APART(src) * srcscale_a) + 127) >> 8); + uint32_t r = (((RPART(src) * srcscale_r) + 127) >> 8); + uint32_t g = (((GPART(src) * srcscale_g) + 127) >> 8); + uint32_t b = (((BPART(src) * srcscale_b) + 127) >> 8); + + line[x] = MAKEARGB(a, r, g, b); + } +} + +static void BlendColorRevSub_Src_One(int y, int x0, int x1, PolyTriangleThreadData* thread) +{ + uint32_t* line = (uint32_t*)thread->dest + y * (ptrdiff_t)thread->dest_pitch; + uint32_t* fragcolor = thread->scanline.FragColor; + + int sseend = x0; + +#ifndef NO_SSE + int ssecount = ((x1 - x0) & ~1); + sseend = x0 + ssecount; + for (int x = x0; x < sseend; x += 2) + { + __m128i dst = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*) & line[x]), _mm_setzero_si128()); + __m128i src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*) & fragcolor[x]), _mm_setzero_si128()); + + __m128i srcscale = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)); + srcscale = _mm_add_epi16(srcscale, _mm_srli_epi16(srcscale, 7)); + + __m128i out = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(src, srcscale), _mm_set1_epi16(127)), 8)); + _mm_storel_epi64((__m128i*) & line[x], _mm_packus_epi16(out, out)); + } +#endif + + for (int x = sseend; x < x1; x++) + { + uint32_t dst = line[x]; + uint32_t src = fragcolor[x]; + + uint32_t srcscale = APART(src); + srcscale += srcscale >> 7; + + uint32_t a = MAX(APART(dst) - (((APART(src) * srcscale) + 127) >> 8), 0); + uint32_t r = MAX(RPART(dst) - (((RPART(src) * srcscale) + 127) >> 8), 0); + uint32_t g = MAX(GPART(dst) - (((GPART(src) * srcscale) + 127) >> 8), 0); + uint32_t b = MAX(BPART(dst) - (((BPART(src) * srcscale) + 127) >> 8), 0); + + line[x] = MAKEARGB(a, r, g, b); + } +} + +void SelectWriteColorFunc(PolyTriangleThreadData* thread) +{ + FRenderStyle style = thread->RenderStyle; + if (style.BlendOp == STYLEOP_Add) + { + if (style.SrcAlpha == STYLEALPHA_One && style.DestAlpha == 
STYLEALPHA_Zero) + { + thread->WriteColorFunc = &BlendColorOpaque; + } + else if (style.SrcAlpha == STYLEALPHA_Src && style.DestAlpha == STYLEALPHA_InvSrc) + { + thread->WriteColorFunc = &BlendColorAdd_Src_InvSrc; + } + else if (style.SrcAlpha == STYLEALPHA_SrcCol && style.DestAlpha == STYLEALPHA_InvSrcCol) + { + thread->WriteColorFunc = &BlendColorAdd_SrcCol_InvSrcCol; + } + else if (style.SrcAlpha == STYLEALPHA_Src && style.DestAlpha == STYLEALPHA_One) + { + thread->WriteColorFunc = &BlendColorAdd_Src_One; + } + else if (style.SrcAlpha == STYLEALPHA_SrcCol && style.DestAlpha == STYLEALPHA_One) + { + thread->WriteColorFunc = &BlendColorAdd_SrcCol_One; + } + else if (style.SrcAlpha == STYLEALPHA_DstCol && style.DestAlpha == STYLEALPHA_Zero) + { + thread->WriteColorFunc = &BlendColorAdd_DstCol_Zero; + } + else if (style.SrcAlpha == STYLEALPHA_InvDstCol && style.DestAlpha == STYLEALPHA_Zero) + { + thread->WriteColorFunc = &BlendColorAdd_InvDstCol_Zero; + } + else + { + thread->WriteColorFunc = &BlendColor; + } + } + else if (style.BlendOp == STYLEOP_Sub) + { + thread->WriteColorFunc = &BlendColor; + } + else // if (style.BlendOp == STYLEOP_RevSub) + { + if (style.SrcAlpha == STYLEALPHA_Src && style.DestAlpha == STYLEALPHA_One) + { + thread->WriteColorFunc = &BlendColorRevSub_Src_One; + } + else + { + thread->WriteColorFunc = &BlendColor; + } + } +} diff --git a/src/rendering/polyrenderer/drawers/screen_blend.h b/src/rendering/polyrenderer/drawers/screen_blend.h new file mode 100644 index 000000000..101c151b0 --- /dev/null +++ b/src/rendering/polyrenderer/drawers/screen_blend.h @@ -0,0 +1,49 @@ +/* +** Polygon Doom software renderer +** Copyright (c) 2016 Magnus Norddahl +** +** This software is provided 'as-is', without any express or implied +** warranty. In no event will the authors be held liable for any damages +** arising from the use of this software. 
+**
+** Permission is granted to anyone to use this software for any purpose,
+** including commercial applications, and to alter it and redistribute it
+** freely, subject to the following restrictions:
+**
+** 1. The origin of this software must not be misrepresented; you must not
+** claim that you wrote the original software. If you use this software
+** in a product, an acknowledgment in the product documentation would be
+** appreciated but is not required.
+** 2. Altered source versions must be plainly marked as such, and must not be
+** misrepresented as being the original software.
+** 3. This notice may not be removed or altered from any source distribution.
+**
+*/
+
+#pragma once
+
+class PolyTriangleThreadData;
+
+// Flag values carried by the BlendColorOpt_* option types below.
+enum SWBlendColor
+{
+	SWBLEND_Sub = 1,
+	SWBLEND_RevSub = 2
+};
+
+// Option types used as the template argument of BlendColor; Flags holds the
+// matching SWBlendColor value (0 for additive blending).
+struct BlendColorOpt_Add { static const int Flags = 0; };
+struct BlendColorOpt_Sub { static const int Flags = 1; };
+struct BlendColorOpt_RevSub { static const int Flags = 2; };
+
+// Generic span blend routine, instantiated with one of the BlendColorOpt_* types.
+template<typename OptT>
+void BlendColor(int y, int x0, int x1, PolyTriangleThreadData* thread);
+
+// Specialised span blend routines. All share the signature (row y, span [x0, x1), thread state).
+void BlendColorOpaque(int y, int x0, int x1, PolyTriangleThreadData* thread);
+void BlendColorAdd_Src_InvSrc(int y, int x0, int x1, PolyTriangleThreadData* thread);
+void BlendColorAdd_SrcCol_InvSrcCol(int y, int x0, int x1, PolyTriangleThreadData* thread);
+void BlendColorAdd_Src_One(int y, int x0, int x1, PolyTriangleThreadData* thread);
+void BlendColorAdd_SrcCol_One(int y, int x0, int x1, PolyTriangleThreadData* thread);
+void BlendColorAdd_DstCol_Zero(int y, int x0, int x1, PolyTriangleThreadData* thread);
+void BlendColorAdd_InvDstCol_Zero(int y, int x0, int x1, PolyTriangleThreadData* thread);
+void BlendColorRevSub_Src_One(int y, int x0, int x1, PolyTriangleThreadData* thread);
+
+// Stores the blend routine matching thread->RenderStyle in thread->WriteColorFunc.
+void SelectWriteColorFunc(PolyTriangleThreadData* thread);
diff --git a/src/rendering/polyrenderer/drawers/screen_scanline_setup.cpp
b/src/rendering/polyrenderer/drawers/screen_scanline_setup.cpp new file mode 100644 index 000000000..1b9e2e1c3 --- /dev/null +++ b/src/rendering/polyrenderer/drawers/screen_scanline_setup.cpp @@ -0,0 +1,420 @@ +/* +** Polygon Doom software renderer +** Copyright (c) 2016 Magnus Norddahl +** +** This software is provided 'as-is', without any express or implied +** warranty. In no event will the authors be held liable for any damages +** arising from the use of this software. +** +** Permission is granted to anyone to use this software for any purpose, +** including commercial applications, and to alter it and redistribute it +** freely, subject to the following restrictions: +** +** 1. The origin of this software must not be misrepresented; you must not +** claim that you wrote the original software. If you use this software +** in a product, an acknowledgment in the product documentation would be +** appreciated but is not required. +** 2. Altered source versions must be plainly marked as such, and must not be +** misrepresented as being the original software. +** 3. This notice may not be removed or altered from any source distribution. 
+**
+*/
+
+#include <stddef.h>
+#include "templates.h"
+#include "doomdef.h"
+#include "poly_thread.h"
+#include "screen_scanline_setup.h"
+#include "x86.h"
+#include <cmath>
+
+// Fills thread->scanline.W[x0..x1) with 1/w for row y, where w is interpolated
+// linearly across the span from the triangle's first vertex and W gradients.
+#ifdef NO_SSE
+void WriteW(int y, int x0, int x1, const TriDrawTriangleArgs* args, PolyTriangleThreadData* thread)
+{
+	float startX = x0 + (0.5f - args->v1->x);
+	float startY = y + (0.5f - args->v1->y);
+
+	float posW = args->v1->w + args->gradientX.W * startX + args->gradientY.W * startY;
+	float stepW = args->gradientX.W;
+	float* w = thread->scanline.W;
+	for (int x = x0; x < x1; x++)
+	{
+		w[x] = 1.0f / posW;
+		posW += stepW;
+	}
+}
+#else
+void WriteW(int y, int x0, int x1, const TriDrawTriangleArgs* args, PolyTriangleThreadData* thread)
+{
+	float startX = x0 + (0.5f - args->v1->x);
+	float startY = y + (0.5f - args->v1->y);
+
+	float posW = args->v1->w + args->gradientX.W * startX + args->gradientY.W * startY;
+	float stepW = args->gradientX.W;
+	float* w = thread->scanline.W;
+
+	// Four pixels per SSE iteration; the remainder is handled by the scalar loop.
+	int ssecount = ((x1 - x0) & ~3);
+	int sseend = x0 + ssecount;
+
+	__m128 mstepW = _mm_set1_ps(stepW * 4.0f);
+	__m128 mposW = _mm_setr_ps(posW, posW + stepW, posW + stepW + stepW, posW + stepW + stepW + stepW);
+
+	for (int x = x0; x < sseend; x += 4)
+	{
+		// One Newton-Raphson iteration for 1/posW
+		__m128 res = _mm_rcp_ps(mposW);
+		__m128 muls = _mm_mul_ps(mposW, _mm_mul_ps(res, res));
+		_mm_storeu_ps(w + x, _mm_sub_ps(_mm_add_ps(res, res), muls));
+		mposW = _mm_add_ps(mposW, mstepW);
+	}
+
+	posW += ssecount * stepW;
+	for (int x = sseend; x < x1; x++)
+	{
+		w[x] = 1.0f / posW;
+		posW += stepW;
+	}
+}
+#endif
+
+// Accumulates the contribution of every entry in thread->polyLights for each
+// pixel of the span and stores the packed result in thread->scanline.dynlights.
+// A negative light radius selects the "point" (N.L weighted) attenuation; a
+// non-negative radius selects plain distance attenuation. Attenuation is in
+// 0..256 fixed point.
+static void WriteDynLightArray(int x0, int x1, PolyTriangleThreadData* thread)
+{
+	int num_lights = thread->numPolyLights;
+	PolyLight* lights = thread->polyLights;
+
+	float worldnormalX = thread->mainVertexShader.vWorldNormal.X;
+	float worldnormalY = thread->mainVertexShader.vWorldNormal.Y;
+	float worldnormalZ = thread->mainVertexShader.vWorldNormal.Z;
+
+	uint32_t* dynlights = thread->scanline.dynlights;
+	float* worldposX = thread->scanline.WorldX;
+	float* worldposY = thread->scanline.WorldY;
+	float* worldposZ = thread->scanline.WorldZ;
+
+	int sseend = x0;
+
+#ifndef NO_SSE
+	int ssecount = ((x1 - x0) & ~3);
+	sseend = x0 + ssecount;
+
+	__m128 mworldnormalX = _mm_set1_ps(worldnormalX);
+	__m128 mworldnormalY = _mm_set1_ps(worldnormalY);
+	__m128 mworldnormalZ = _mm_set1_ps(worldnormalZ);
+
+	for (int x = x0; x < sseend; x += 4)
+	{
+		__m128i litlo = _mm_setzero_si128();
+		//__m128i litlo = _mm_shuffle_epi32(_mm_unpacklo_epi8(_mm_cvtsi32_si128(dynlightcolor), _mm_setzero_si128()), _MM_SHUFFLE(1, 0, 1, 0));
+		__m128i lithi = litlo;
+
+		for (int i = 0; i < num_lights; i++)
+		{
+			__m128 lightposX = _mm_set1_ps(lights[i].x);
+			__m128 lightposY = _mm_set1_ps(lights[i].y);
+			__m128 lightposZ = _mm_set1_ps(lights[i].z);
+			__m128 light_radius = _mm_set1_ps(lights[i].radius);
+			__m128i light_color = _mm_shuffle_epi32(_mm_unpacklo_epi8(_mm_cvtsi32_si128(lights[i].color), _mm_setzero_si128()), _MM_SHUFFLE(1, 0, 1, 0));
+
+			__m128 is_attenuated = _mm_cmplt_ps(light_radius, _mm_setzero_ps());
+			light_radius = _mm_andnot_ps(_mm_set1_ps(-0.0f), light_radius); // clear sign bit
+
+			// L = light-pos
+			// dist = sqrt(dot(L, L))
+			// distance_attenuation = 1 - MIN(dist * (1/radius), 1)
+			__m128 Lx = _mm_sub_ps(lightposX, _mm_loadu_ps(&worldposX[x]));
+			__m128 Ly = _mm_sub_ps(lightposY, _mm_loadu_ps(&worldposY[x]));
+			__m128 Lz = _mm_sub_ps(lightposZ, _mm_loadu_ps(&worldposZ[x]));
+			__m128 dist2 = _mm_add_ps(_mm_mul_ps(Lx, Lx), _mm_add_ps(_mm_mul_ps(Ly, Ly), _mm_mul_ps(Lz, Lz)));
+			__m128 rcp_dist = _mm_rsqrt_ps(dist2);
+			__m128 dist = _mm_mul_ps(dist2, rcp_dist);
+			__m128 distance_attenuation = _mm_sub_ps(_mm_set1_ps(256.0f), _mm_min_ps(_mm_mul_ps(dist, light_radius), _mm_set1_ps(256.0f)));
+
+			// The simple light type
+			__m128 simple_attenuation = distance_attenuation;
+
+			// The point light type
+			// diffuse = max(dot(N,normalize(L)),0) * attenuation
+			Lx = _mm_mul_ps(Lx, rcp_dist);
+			Ly = _mm_mul_ps(Ly, rcp_dist);
+			Lz = _mm_mul_ps(Lz, rcp_dist);
+			__m128 dotNL = _mm_add_ps(_mm_add_ps(_mm_mul_ps(mworldnormalX, Lx), _mm_mul_ps(mworldnormalY, Ly)), _mm_mul_ps(mworldnormalZ, Lz));
+			__m128 point_attenuation = _mm_mul_ps(_mm_max_ps(dotNL, _mm_setzero_ps()), distance_attenuation);
+
+			__m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, point_attenuation), _mm_andnot_ps(is_attenuated, simple_attenuation)));
+
+			// Replicate each pixel's attenuation across its four 16-bit channel lanes.
+			attenuation = _mm_shufflehi_epi16(_mm_shufflelo_epi16(attenuation, _MM_SHUFFLE(2, 2, 0, 0)), _MM_SHUFFLE(2, 2, 0, 0));
+			__m128i attenlo = _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1, 1, 0, 0));
+			__m128i attenhi = _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(3, 3, 2, 2));
+
+			litlo = _mm_add_epi16(litlo, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenlo), 8));
+			lithi = _mm_add_epi16(lithi, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenhi), 8));
+		}
+
+		_mm_storeu_si128((__m128i*)&dynlights[x], _mm_packus_epi16(litlo, lithi));
+	}
+#endif
+
+	// Scalar path. It starts at sseend so that pixels already written by the SSE
+	// loop are not recomputed and overwritten; with NO_SSE, sseend == x0.
+	for (int x = sseend; x < x1; x++)
+	{
+		uint32_t lit_r = 0;
+		uint32_t lit_g = 0;
+		uint32_t lit_b = 0;
+
+		for (int i = 0; i < num_lights; i++)
+		{
+			float lightposX = lights[i].x;
+			float lightposY = lights[i].y;
+			float lightposZ = lights[i].z;
+			float light_radius = lights[i].radius;
+			uint32_t light_color = lights[i].color;
+
+			bool is_attenuated = light_radius < 0.0f;
+			if (is_attenuated)
+				light_radius = -light_radius;
+
+			// L = light-pos
+			// dist = sqrt(dot(L, L))
+			// distance_attenuation = 1 - MIN(dist * (1/radius), 1)
+			float Lx = lightposX - worldposX[x];
+			float Ly = lightposY - worldposY[x];
+			float Lz = lightposZ - worldposZ[x];
+			float dist2 = Lx * Lx + Ly * Ly + Lz * Lz;
+#ifdef NO_SSE
+			// NOTE(review): this stand-in makes dist (= dist2 * rcp_dist) a constant 100
+			// instead of sqrt(dist2) - confirm whether the commented-out form was intended.
+			//float rcp_dist = 1.0f / sqrt(dist2);
+			float rcp_dist = 1.0f / (dist2 * 0.01f);
+#else
+			float rcp_dist = _mm_cvtss_f32(_mm_rsqrt_ss(_mm_set_ss(dist2)));
+#endif
+			float dist = dist2 * rcp_dist;
+			float distance_attenuation = 256.0f - MIN(dist * light_radius, 256.0f);
+
+			// The simple light type
+			float simple_attenuation = distance_attenuation;
+
+			// The point light type
+			// diffuse = max(dot(N,normalize(L)),0) * attenuation
+			Lx *= rcp_dist;
+			Ly *= rcp_dist;
+			Lz *= rcp_dist;
+			float dotNL = worldnormalX * Lx + worldnormalY * Ly + worldnormalZ * Lz;
+			float point_attenuation = MAX(dotNL, 0.0f) * distance_attenuation;
+
+			uint32_t attenuation = (uint32_t)(is_attenuated ? (int32_t)point_attenuation : (int32_t)simple_attenuation);
+
+			lit_r += (RPART(light_color) * attenuation) >> 8;
+			lit_g += (GPART(light_color) * attenuation) >> 8;
+			lit_b += (BPART(light_color) * attenuation) >> 8;
+		}
+
+		lit_r = MIN<uint32_t>(lit_r, 255);
+		lit_g = MIN<uint32_t>(lit_g, 255);
+		lit_b = MIN<uint32_t>(lit_b, 255);
+		dynlights[x] = MAKEARGB(255, lit_r, lit_g, lit_b);
+
+		// Palette version:
+		// dynlights[x] = RGB256k.All[((lit_r >> 2) << 12) | ((lit_g >> 2) << 6) | (lit_b >> 2)];
+	}
+}
+
+// Fills thread->scanline.lightarray[x0..x1) with the per-pixel light factor
+// (fixed point, shifted down by 8) derived from uLightLevel, the viewpoint's
+// mGlobVis and the interpolated W of row y.
+static void WriteLightArray(int y, int x0, int x1, const TriDrawTriangleArgs* args, PolyTriangleThreadData* thread)
+{
+	float startX = x0 + (0.5f - args->v1->x);
+	float startY = y + (0.5f - args->v1->y);
+	float posW = args->v1->w + args->gradientX.W * startX + args->gradientY.W * startY;
+	float stepW = args->gradientX.W;
+
+	float globVis = thread->mainVertexShader.Viewpoint->mGlobVis;
+
+	uint32_t light = (int)(thread->PushConstants->uLightLevel * 255.0f);
+	fixed_t shade = (fixed_t)((2.0f - (light + 12.0f) / 128.0f) * (float)FRACUNIT);
+	fixed_t lightpos = (fixed_t)(globVis * posW * (float)FRACUNIT);
+	fixed_t lightstep = (fixed_t)(globVis * stepW * (float)FRACUNIT);
+
+	fixed_t maxvis = 24 * FRACUNIT / 32;
+	fixed_t maxlight = 31 * FRACUNIT / 32;
+
+	uint16_t *lightarray = thread->scanline.lightarray;
+
+	// If both ends of the span stay inside the clamp limits, no pixel in between
+	// needs clamping and the cheaper linear loop can be used.
+	fixed_t lightend = lightpos + lightstep * (x1 - x0);
+	if (lightpos < maxvis && shade >= lightpos && shade - lightpos <= maxlight &&
+		lightend < maxvis && shade >= lightend && shade - lightend <= maxlight)
+	{
+		//if (BitsPerPixel == 32)
+		{
+			lightpos += FRACUNIT - shade;
+			for (int x = x0; x < x1; x++)
+			{
+				lightarray[x] = lightpos >> 8;
+				lightpos += lightstep;
+			}
+		}
+		/*else
+		{
+			lightpos = shade - lightpos;
+			for (int x = x0; x < x1; x++)
+			{
+				lightarray[x] = (lightpos >> 3) & 0xffffff00;
+				lightpos -= lightstep;
+			}
+		}*/
+	}
+	else
+	{
+		//if (BitsPerPixel == 32)
+		{
+			for (int x = x0; x < x1; x++)
+			{
+				lightarray[x] = (FRACUNIT - clamp(shade - MIN(maxvis, lightpos), 0, maxlight)) >> 8;
+				lightpos += lightstep;
+			}
+		}
+		/*else
+		{
+			for (int x = x0; x < x1; x++)
+			{
+				lightarray[x] = (clamp(shade - MIN(maxvis, lightpos), 0, maxlight) >> 3) & 0xffffff00;
+				lightpos += lightstep;
+			}
+		}*/
+	}
+}
+
+// Writes varying[x] = (pos + (x - x0) * step) * w[x] for x in [x0, x1).
+#ifdef NO_SSE
+static void WriteVarying(float pos, float step, int x0, int x1, const float* w, float* varying)
+{
+	for (int x = x0; x < x1; x++)
+	{
+		varying[x] = pos * w[x];
+		pos += step;
+	}
+}
+#else
+static void WriteVarying(float pos, float step, int x0, int x1, const float* w, float* varying)
+{
+	int ssecount = ((x1 - x0) & ~3);
+	int sseend = x0 + ssecount;
+
+	__m128 mstep = _mm_set1_ps(step * 4.0f);
+	__m128 mpos = _mm_setr_ps(pos, pos + step, pos + step + step, pos + step + step + step);
+
+	for (int x = x0; x < sseend; x += 4)
+	{
+		_mm_storeu_ps(varying + x, _mm_mul_ps(mpos, _mm_loadu_ps(w + x)));
+		mpos = _mm_add_ps(mpos, mstep);
+	}
+
+	pos += ssecount * step;
+	for (int x = sseend; x < x1; x++)
+	{
+		varying[x] = pos * w[x];
+		pos += step;
+	}
+}
+#endif
+
+// Like WriteVarying, but keeps only the fractional part of each value
+// (value - floor(value)) and stores it as a 16-bit fixed-point fraction.
+#ifdef NO_SSE
+static void WriteVaryingWrap(float pos, float step, int x0, int x1, const float* w, uint16_t* varying)
+{
+	for (int x = x0; x < x1; x++)
+	{
+		float value = pos * w[x];
+		value = value - std::floor(value);
+		varying[x] = static_cast<uint32_t>(static_cast<int32_t>(value * static_cast<float>(0x1000'0000)) << 4) >> 16;
+		pos += step;
+	}
+}
+#else
+static void WriteVaryingWrap(float pos, float step, int x0, int x1, const float* w, uint16_t* varying)
+{
+	int ssecount = ((x1 - x0) & ~3);
+	int sseend = x0 + ssecount;
+
+	__m128 mstep = _mm_set1_ps(step * 4.0f);
+	__m128 mpos = _mm_setr_ps(pos, pos + step, pos + step + step, pos + step + step + step);
+
+	for (int x = x0; x < sseend; x += 4)
+	{
+		__m128 value = _mm_mul_ps(mpos, _mm_loadu_ps(w + x));
+		// floor(): truncate, then subtract one where truncation rounded up (negative inputs).
+		__m128 f = value;
+		__m128 t = _mm_cvtepi32_ps(_mm_cvttps_epi32(f));
+		__m128 r = _mm_sub_ps(t, _mm_and_ps(_mm_cmplt_ps(f, t), _mm_set1_ps(1.0f)));
+		value = _mm_sub_ps(f, r);
+
+		// Shift right one extra bit so the signed pack cannot saturate, then shift back.
+		__m128i ivalue = _mm_srli_epi32(_mm_slli_epi32(_mm_cvttps_epi32(_mm_mul_ps(value, _mm_set1_ps(static_cast<float>(0x1000'0000)))), 4), 17);
+		_mm_storel_epi64((__m128i*)(varying + x), _mm_slli_epi16(_mm_packs_epi32(ivalue, ivalue), 1));
+		mpos = _mm_add_ps(mpos, mstep);
+	}
+
+	pos += ssecount * step;
+	for (int x = sseend; x < x1; x++)
+	{
+		float value = pos * w[x];
+		__m128 f = _mm_set_ss(value);
+		__m128 t = _mm_cvtepi32_ps(_mm_cvttps_epi32(f));
+		__m128 r = _mm_sub_ss(t, _mm_and_ps(_mm_cmplt_ps(f, t), _mm_set_ss(1.0f)));
+		value = _mm_cvtss_f32(_mm_sub_ss(f, r));
+
+		varying[x] = static_cast<uint32_t>(static_cast<int32_t>(value * static_cast<float>(0x1000'0000)) << 4) >> 16;
+		pos += step;
+	}
+}
+#endif
+
+// Like WriteVarying, but scales each value by 255 and stores it as a byte
+// clamped to 0..255.
+#ifdef NO_SSE
+static void WriteVaryingColor(float pos, float step, int x0, int x1, const float* w, uint8_t* varying)
+{
+	for (int x = x0; x < x1; x++)
+	{
+		varying[x] = clamp(static_cast<int>(pos * w[x] * 255.0f), 0, 255);
+		pos += step;
+	}
+}
+#else
+static void WriteVaryingColor(float pos, float step, int x0, int x1, const float* w, uint8_t* varying)
+{
+	int ssecount = ((x1 - x0) & ~3);
+	int sseend = x0 + ssecount;
+
+	__m128 mstep = _mm_set1_ps(step * 4.0f);
+	__m128 mpos = _mm_setr_ps(pos, pos + step, pos + step + step, pos + step + step + step);
+
+	for (int x = x0; x < sseend; x += 4)
+	{
+		// The two saturating packs perform the 0..255 clamp.
+		__m128i value = _mm_cvttps_epi32(_mm_mul_ps(_mm_mul_ps(mpos, _mm_loadu_ps(w + x)), _mm_set1_ps(255.0f)));
+		value = _mm_packs_epi32(value, value);
+		value = _mm_packus_epi16(value, value);
+		*(uint32_t*)(varying + x) = _mm_cvtsi128_si32(value);
+		mpos = _mm_add_ps(mpos, mstep);
+	}
+
+	pos += ssecount * step;
+	for (int x = sseend; x < x1; x++)
+	{
+		varying[x] = clamp(static_cast<int>(pos * w[x] * 255.0f), 0, 255);
+		pos += step;
+	}
+}
+#endif
+
+// Fills every per-pixel varying array in thread->scanline for the span
+// [x0, x1) of row y: wrapped U/V, world position, GradientdistZ and vertex
+// colour. The light array is written when uLightLevel >= 0 and the dynamic
+// light array when the thread has lights. Reads thread->scanline.W, so
+// WriteW must already have run for the same span.
+void WriteVaryings(int y, int x0, int x1, const TriDrawTriangleArgs* args, PolyTriangleThreadData* thread)
+{
+	float startX = x0 + (0.5f - args->v1->x);
+	float startY = y + (0.5f - args->v1->y);
+
+	WriteVaryingWrap(args->v1->u * args->v1->w + args->gradientX.U * startX + args->gradientY.U * startY, args->gradientX.U, x0, x1, thread->scanline.W, thread->scanline.U);
+	WriteVaryingWrap(args->v1->v * args->v1->w + args->gradientX.V * startX + args->gradientY.V * startY, args->gradientX.V, x0, x1, thread->scanline.W, thread->scanline.V);
+	WriteVarying(args->v1->worldX * args->v1->w + args->gradientX.WorldX * startX + args->gradientY.WorldX * startY, args->gradientX.WorldX, x0, x1, thread->scanline.W, thread->scanline.WorldX);
+	WriteVarying(args->v1->worldY * args->v1->w + args->gradientX.WorldY * startX + args->gradientY.WorldY * startY, args->gradientX.WorldY, x0, x1, thread->scanline.W, thread->scanline.WorldY);
+	WriteVarying(args->v1->worldZ * args->v1->w + args->gradientX.WorldZ * startX + args->gradientY.WorldZ * startY, args->gradientX.WorldZ, x0, x1, thread->scanline.W, thread->scanline.WorldZ);
+	WriteVarying(args->v1->gradientdistZ * args->v1->w + args->gradientX.GradientdistZ * startX + args->gradientY.GradientdistZ * startY, args->gradientX.GradientdistZ, x0, x1, thread->scanline.W, thread->scanline.GradientdistZ);
+	WriteVaryingColor(args->v1->a * args->v1->w + args->gradientX.A * startX + args->gradientY.A * startY, args->gradientX.A, x0, x1, thread->scanline.W, thread->scanline.vColorA);
+	WriteVaryingColor(args->v1->r * args->v1->w + args->gradientX.R * startX + args->gradientY.R * startY, args->gradientX.R, x0, x1, thread->scanline.W, thread->scanline.vColorR);
+	WriteVaryingColor(args->v1->g * args->v1->w + args->gradientX.G * startX + args->gradientY.G * startY, args->gradientX.G, x0, x1, thread->scanline.W, thread->scanline.vColorG);
+	WriteVaryingColor(args->v1->b * args->v1->w + args->gradientX.B * startX + args->gradientY.B * startY, args->gradientX.B, x0, x1, thread->scanline.W, thread->scanline.vColorB);
+
+	if (thread->PushConstants->uLightLevel >= 0.0f)
+		WriteLightArray(y, x0, x1, args, thread);
+
+	if (thread->numPolyLights > 0)
+		WriteDynLightArray(x0, x1, thread);
+}
diff --git a/src/rendering/polyrenderer/drawers/screen_scanline_setup.h b/src/rendering/polyrenderer/drawers/screen_scanline_setup.h
new file mode 100644
index 000000000..5ea66cf96
--- /dev/null
+++ b/src/rendering/polyrenderer/drawers/screen_scanline_setup.h
@@ -0,0 +1,29 @@
+/*
+** Polygon Doom software renderer
+** Copyright (c) 2016 Magnus Norddahl
+**
+** This software is provided 'as-is', without any express or implied
+** warranty. In no event will the authors be held liable for any damages
+** arising from the use of this software.
+**
+** Permission is granted to anyone to use this software for any purpose,
+** including commercial applications, and to alter it and redistribute it
+** freely, subject to the following restrictions:
+**
+** 1. The origin of this software must not be misrepresented; you must not
+** claim that you wrote the original software. If you use this software
+** in a product, an acknowledgment in the product documentation would be
+** appreciated but is not required.
+** 2. Altered source versions must be plainly marked as such, and must not be
+** misrepresented as being the original software.
+** 3. This notice may not be removed or altered from any source distribution.
+**
+*/
+
+#pragma once
+
+struct TriDrawTriangleArgs;
+class PolyTriangleThreadData;
+
+// Per-scanline setup entry points; both operate on the span [x0, x1) of row y.
+// WriteW fills thread->scanline.W; WriteVaryings reads that array, so WriteW
+// has to be called first for the same span.
+void WriteW(int y, int x0, int x1, const TriDrawTriangleArgs* args, PolyTriangleThreadData* thread);
+void WriteVaryings(int y, int x0, int x1, const TriDrawTriangleArgs* args, PolyTriangleThreadData* thread);
diff --git a/src/rendering/polyrenderer/drawers/screen_shader.cpp b/src/rendering/polyrenderer/drawers/screen_shader.cpp
new file mode 100644
index 000000000..ed446ad2a
--- /dev/null
+++ b/src/rendering/polyrenderer/drawers/screen_shader.cpp
@@ -0,0 +1,524 @@
+/*
+** Polygon Doom software renderer
+** Copyright (c) 2016 Magnus Norddahl
+**
+** This software is provided 'as-is', without any express or implied
+** warranty. In no event will the authors be held liable for any damages
+** arising from the use of this software.
+**
+** Permission is granted to anyone to use this software for any purpose,
+** including commercial applications, and to alter it and redistribute it
+** freely, subject to the following restrictions:
+**
+** 1. The origin of this software must not be misrepresented; you must not
+** claim that you wrote the original software. If you use this software
+** in a product, an acknowledgment in the product documentation would be
+** appreciated but is not required.
+** 2. Altered source versions must be plainly marked as such, and must not be
+** misrepresented as being the original software.
+** 3. This notice may not be removed or altered from any source distribution.
+**
+*/
+
+#include <stddef.h>
+#include "templates.h"
+#include "doomdef.h"
+#include "poly_thread.h"
+#include "screen_scanline_setup.h"
+#include "x86.h"
+#include <cmath>
+
+// Fetches one texel. u and v are 16-bit fixed-point fractions (0..0xffff) that
+// are scaled by the texture size to get the texel position. BGRA textures
+// return the stored 32-bit value; 8-bit textures return the index in the red
+// channel with opaque alpha.
+static uint32_t SampleTexture(uint32_t u, uint32_t v, const void* texPixels, int texWidth, int texHeight, bool texBgra)
+{
+	int texelX = (u * texWidth) >> 16;
+	int texelY = (v * texHeight) >> 16;
+	int texelOffset = texelX + texelY * texWidth;
+	if (texBgra)
+	{
+		return static_cast<const uint32_t*>(texPixels)[texelOffset];
+	}
+	else
+	{
+		uint32_t c = static_cast<const uint8_t*>(texPixels)[texelOffset];
+		return (c << 16) | 0xff000000;
+	}
+}
+
+// fogboundary.fp stand-in: currently writes fully transparent black for the span.
+static void EffectFogBoundary(int x0, int x1, PolyTriangleThreadData* thread)
+{
+	uint32_t* fragcolor = thread->scanline.FragColor;
+	for (int x = x0; x < x1; x++)
+	{
+		/*float fogdist = pixelpos.w;
+		float fogfactor = exp2(uFogDensity * fogdist);
+		FragColor = vec4(uFogColor.rgb, 1.0 - fogfactor);*/
+		fragcolor[x] = 0;
+	}
+}
+
+// burn.fp: RGB comes from texture 0 modulated by the vertex colour; alpha comes
+// from texture 1 (sampled with V flipped) modulated by the vertex alpha.
+static void EffectBurn(int x0, int x1, PolyTriangleThreadData* thread)
+{
+	int texWidth = thread->textures[0].width;
+	int texHeight = thread->textures[0].height;
+	const void* texPixels = thread->textures[0].pixels;
+	bool texBgra = thread->textures[0].bgra;
+
+	int tex2Width = thread->textures[1].width;
+	int tex2Height = thread->textures[1].height;
+	const void* tex2Pixels = thread->textures[1].pixels;
+	bool tex2Bgra = thread->textures[1].bgra;
+
+	uint32_t* fragcolor = thread->scanline.FragColor;
+	uint16_t* u = thread->scanline.U;
+	uint16_t* v = thread->scanline.V;
+	for (int x = x0; x < x1; x++)
+	{
+		uint32_t frag_r = thread->scanline.vColorR[x];
+		uint32_t frag_g = thread->scanline.vColorG[x];
+		uint32_t frag_b = thread->scanline.vColorB[x];
+		uint32_t frag_a = thread->scanline.vColorA[x];
+		frag_r += frag_r >> 7; // 255 -> 256
+		frag_g += frag_g >> 7; // 255 -> 256
+		frag_b += frag_b >> 7; // 255 -> 256
+		frag_a += frag_a >> 7; // 255 -> 256
+
+		uint32_t t1 = SampleTexture(u[x], v[x], texPixels, texWidth, texHeight, texBgra);
+		uint32_t t2 = SampleTexture(u[x], 0xffff - v[x], tex2Pixels, tex2Width, tex2Height, tex2Bgra);
+
+		uint32_t r = (frag_r * RPART(t1)) >> 8;
+		uint32_t g = (frag_g * GPART(t1)) >> 8;
+		uint32_t b = (frag_b * BPART(t1)) >> 8;
+		uint32_t a = (frag_a * APART(t2)) >> 8;
+
+		fragcolor[x] = MAKEARGB(a, r, g, b);
+	}
+}
+
+// stencil.fp stand-in: intentionally writes nothing.
+static void EffectStencil(int x0, int x1, PolyTriangleThreadData* thread)
+{
+	/*for (int x = x0; x < x1; x++)
+	{
+		fragcolor[x] = 0x00ffffff;
+	}*/
+}
+
+// func_paletted: looks up the sampled texel's red channel in the table held in
+// texture 1 and forces opaque alpha.
+static void FuncPaletted(int x0, int x1, PolyTriangleThreadData* thread)
+{
+	int texWidth = thread->textures[0].width;
+	int texHeight = thread->textures[0].height;
+	const void* texPixels = thread->textures[0].pixels;
+	bool texBgra = thread->textures[0].bgra;
+	const uint32_t* lut = (const uint32_t*)thread->textures[1].pixels;
+	uint32_t* fragcolor = thread->scanline.FragColor;
+	uint16_t* u = thread->scanline.U;
+	uint16_t* v = thread->scanline.V;
+
+	for (int x = x0; x < x1; x++)
+	{
+		fragcolor[x] = lut[RPART(SampleTexture(u[x], v[x], texPixels, texWidth, texHeight, texBgra))] | 0xff000000;
+	}
+}
+
+// func_notexture: fills the span with uObjectColor, desaturated by
+// uDesaturationFactor when that factor is positive.
+static void FuncNoTexture(int x0, int x1, PolyTriangleThreadData* thread)
+{
+	auto& streamdata = thread->mainVertexShader.Data;
+	uint32_t a = (int)(streamdata.uObjectColor.a * 255.0f);
+	uint32_t r = (int)(streamdata.uObjectColor.r * 255.0f);
+	uint32_t g = (int)(streamdata.uObjectColor.g * 255.0f);
+	uint32_t b = (int)(streamdata.uObjectColor.b * 255.0f);
+	uint32_t texel = MAKEARGB(a, r, g, b);
+
+	if (streamdata.uDesaturationFactor > 0.0f)
+	{
+		uint32_t t = (int)(streamdata.uDesaturationFactor * 256.0f);
+		uint32_t inv_t = 256 - t;
+		uint32_t gray = (RPART(texel) * 77 + GPART(texel) * 143 + BPART(texel) * 37) >> 8;
+		texel = MAKEARGB(
+			APART(texel),
+			(RPART(texel) * inv_t + gray * t + 127) >> 8,
+			(GPART(texel) * inv_t + gray * t + 127) >> 8,
+			(BPART(texel) * inv_t + gray * t + 127) >> 8);
+	}
+
+	uint32_t* fragcolor = thread->scanline.FragColor;
+	for (int x = x0; x < x1; x++)
+	{
+		fragcolor[x] = texel;
+	}
+}
+
+// TM_NORMAL / TM_FOGLAYER: plain texture fetch.
+static void FuncNormal(int x0, int x1, PolyTriangleThreadData* thread)
+{
+	int texWidth = thread->textures[0].width;
+	int texHeight = thread->textures[0].height;
+	const void* texPixels = thread->textures[0].pixels;
+	bool texBgra = thread->textures[0].bgra;
+	uint32_t* fragcolor = thread->scanline.FragColor;
+	uint16_t* u = thread->scanline.U;
+	uint16_t* v = thread->scanline.V;
+
+	for (int x = x0; x < x1; x++)
+	{
+		uint32_t texel = SampleTexture(u[x], v[x], texPixels, texWidth, texHeight, texBgra);
+		fragcolor[x] = texel;
+	}
+}
+
+// TM_STENCIL: keeps the texel's alpha and forces RGB to white.
+static void FuncNormal_Stencil(int x0, int x1, PolyTriangleThreadData* thread)
+{
+	int texWidth = thread->textures[0].width;
+	int texHeight = thread->textures[0].height;
+	const void* texPixels = thread->textures[0].pixels;
+	bool texBgra = thread->textures[0].bgra;
+	uint32_t* fragcolor = thread->scanline.FragColor;
+	uint16_t* u = thread->scanline.U;
+	uint16_t* v = thread->scanline.V;
+
+	for (int x = x0; x < x1; x++)
+	{
+		uint32_t texel = SampleTexture(u[x], v[x], texPixels, texWidth, texHeight, texBgra);
+		fragcolor[x] = texel | 0x00ffffff;
+	}
+}
+
+// TM_OPAQUE: keeps the texel's RGB and forces alpha to 255.
+static void FuncNormal_Opaque(int x0, int x1, PolyTriangleThreadData* thread)
+{
+	int texWidth = thread->textures[0].width;
+	int texHeight = thread->textures[0].height;
+	const void* texPixels = thread->textures[0].pixels;
+	bool texBgra = thread->textures[0].bgra;
+	uint32_t* fragcolor = thread->scanline.FragColor;
+	uint16_t* u = thread->scanline.U;
+	uint16_t* v = thread->scanline.V;
+
+	for (int x = x0; x < x1; x++)
+	{
+		uint32_t texel = SampleTexture(u[x], v[x], texPixels, texWidth, texHeight, texBgra);
+		fragcolor[x] = texel | 0xff000000;
+	}
+}
+
+// TM_INVERSE: inverts each colour channel and keeps the texel's alpha.
+static void FuncNormal_Inverse(int x0, int x1, PolyTriangleThreadData* thread)
+{
+	int texWidth = thread->textures[0].width;
+	int texHeight = thread->textures[0].height;
+	const void* texPixels = thread->textures[0].pixels;
+	bool texBgra = thread->textures[0].bgra;
+	uint32_t* fragcolor = thread->scanline.FragColor;
+	uint16_t* u = thread->scanline.U;
+	uint16_t* v = thread->scanline.V;
+
+	for (int x = x0; x < x1; x++)
+	{
+		uint32_t texel = SampleTexture(u[x], v[x], texPixels, texWidth, texHeight, texBgra);
+		// MAKEARGB takes (a, r, g, b): each inverted channel goes back into its own slot.
+		fragcolor[x] = MAKEARGB(APART(texel), 0xff - RPART(texel), 0xff - GPART(texel), 0xff - BPART(texel));
+	}
+}
+
+// TM_ALPHATEXTURE: white RGB with alpha = texel alpha scaled by the texel's
+// grayscale value.
+static void FuncNormal_AlphaTexture(int x0, int x1, PolyTriangleThreadData* thread)
+{
+	int texWidth = thread->textures[0].width;
+	int texHeight = thread->textures[0].height;
+	const void* texPixels = thread->textures[0].pixels;
+	bool texBgra = thread->textures[0].bgra;
+	uint32_t* fragcolor = thread->scanline.FragColor;
+	uint16_t* u = thread->scanline.U;
+	uint16_t* v = thread->scanline.V;
+
+	for (int x = x0; x < x1; x++)
+	{
+		uint32_t texel = SampleTexture(u[x], v[x], texPixels, texWidth, texHeight, texBgra);
+		uint32_t gray = (RPART(texel) * 77 + GPART(texel) * 143 + BPART(texel) * 37) >> 8;
+		uint32_t alpha = APART(texel);
+		alpha += alpha >> 7;
+		alpha = (alpha * gray + 127) >> 8;
+		texel = (alpha << 24) | 0x00ffffff;
+		fragcolor[x] = texel;
+	}
+}
+
+// TM_CLAMPY: texture fetch that clears alpha when the V test below is true.
+static void FuncNormal_ClampY(int x0, int x1, PolyTriangleThreadData* thread)
+{
+	int texWidth = thread->textures[0].width;
+	int texHeight = thread->textures[0].height;
+	const void* texPixels = thread->textures[0].pixels;
+	bool texBgra = thread->textures[0].bgra;
+	uint32_t* fragcolor = thread->scanline.FragColor;
+	uint16_t* u = thread->scanline.U;
+	uint16_t* v = thread->scanline.V;
+
+	for (int x = x0; x < x1; x++)
+	{
+		fragcolor[x] = SampleTexture(u[x], v[x], texPixels, texWidth, texHeight, texBgra);
+		// NOTE(review): v[x] is a uint16_t fixed-point value, so "< 0.0" is never true
+		// and "> 1.0" is true for every value of 2 or more. The test looks written for
+		// a float V; the unwrapped coordinate is not available here - confirm intent.
+		if (v[x] < 0.0 || v[x] > 1.0)
+			fragcolor[x] &= 0x00ffffff;
+	}
+}
+
+// TM_INVERTOPAQUE: inverts each colour channel and forces alpha to 255.
+static void FuncNormal_InvertOpaque(int x0, int x1, PolyTriangleThreadData* thread)
+{
+	int texWidth = thread->textures[0].width;
+	int texHeight = thread->textures[0].height;
+	const void* texPixels = thread->textures[0].pixels;
+	bool texBgra = thread->textures[0].bgra;
+	uint32_t* fragcolor = thread->scanline.FragColor;
+	uint16_t* u = thread->scanline.U;
+	uint16_t* v = thread->scanline.V;
+
+	for (int x = x0; x < x1; x++)
+	{
+		uint32_t texel = SampleTexture(u[x], v[x], texPixels, texWidth, texHeight, texBgra);
+		// MAKEARGB takes (a, r, g, b): each inverted channel goes back into its own slot.
+		fragcolor[x] = MAKEARGB(0xff, 0xff - RPART(texel), 0xff - GPART(texel), 0xff - BPART(texel));
+	}
+}
+
+// Adds uAddColor to each colour channel of the span, saturating at 255.
+static void FuncNormal_AddColor(int x0, int x1, PolyTriangleThreadData* thread)
+{
+	auto& streamdata = thread->mainVertexShader.Data;
+	uint32_t r = (int)(streamdata.uAddColor.r * 255.0f);
+	uint32_t g = (int)(streamdata.uAddColor.g * 255.0f);
+	uint32_t b = (int)(streamdata.uAddColor.b * 255.0f);
+	uint32_t* fragcolor = thread->scanline.FragColor;
+	for (int x = x0; x < x1; x++)
+	{
+		uint32_t texel = fragcolor[x];
+		fragcolor[x] = MAKEARGB(
+			APART(texel),
+			MIN(r + RPART(texel), (uint32_t)255),
+			MIN(g + GPART(texel), (uint32_t)255),
+			MIN(b + BPART(texel), (uint32_t)255));
+	}
+}
+
+// Multiplies each colour channel of the span by uObjectColor (8.8 fixed point),
+// saturating at 255.
+static void FuncNormal_AddObjectColor(int x0, int x1, PolyTriangleThreadData* thread)
+{
+	auto& streamdata = thread->mainVertexShader.Data;
+	uint32_t r = (int)(streamdata.uObjectColor.r * 256.0f);
+	uint32_t g = (int)(streamdata.uObjectColor.g * 256.0f);
+	uint32_t b = (int)(streamdata.uObjectColor.b * 256.0f);
+	uint32_t* fragcolor = thread->scanline.FragColor;
+	for (int x = x0; x < x1; x++)
+	{
+		uint32_t texel = fragcolor[x];
+		fragcolor[x] = MAKEARGB(
+			APART(texel),
+			MIN((r * RPART(texel)) >> 8, (uint32_t)255),
+			MIN((g * GPART(texel)) >> 8, (uint32_t)255),
+			MIN((b * BPART(texel)) >> 8, (uint32_t)255));
+	}
+}
+
+// Like FuncNormal_AddObjectColor, but the multiplier is interpolated per pixel
+// between uObjectColor and uObjectColor2 using the GradientdistZ varying.
+static void FuncNormal_AddObjectColor2(int x0, int x1, PolyTriangleThreadData* thread)
+{
+	auto& streamdata = thread->mainVertexShader.Data;
+	float* gradientdistZ = thread->scanline.GradientdistZ;
+	uint32_t* fragcolor = thread->scanline.FragColor;
+	for (int x = x0; x < x1; x++)
+	{
+		float t = gradientdistZ[x];
+		float inv_t = 1.0f - t;
+		uint32_t r = (int)((streamdata.uObjectColor.r * inv_t + streamdata.uObjectColor2.r * t) * 256.0f);
+		uint32_t g = (int)((streamdata.uObjectColor.g * inv_t + streamdata.uObjectColor2.g * t) * 256.0f);
+		uint32_t b = (int)((streamdata.uObjectColor.b * inv_t + streamdata.uObjectColor2.b * t) * 256.0f);
+
+		uint32_t texel = fragcolor[x];
+		fragcolor[x] = MAKEARGB(
+			APART(texel),
+			MIN((r * RPART(texel)) >> 8, (uint32_t)255),
+			MIN((g * GPART(texel)) >> 8, (uint32_t)255),
+			MIN((b * BPART(texel)) >> 8, (uint32_t)255));
+	}
+}
+
+// Blends each pixel towards its grayscale value by uDesaturationFactor.
+static void FuncNormal_DesaturationFactor(int x0, int x1, PolyTriangleThreadData* thread)
+{
+	auto& streamdata = thread->mainVertexShader.Data;
+	uint32_t* fragcolor = thread->scanline.FragColor;
+	uint32_t t = (int)(streamdata.uDesaturationFactor * 256.0f);
+	uint32_t inv_t = 256 - t;
+	for (int x = x0; x < x1; x++)
+	{
+		uint32_t texel = fragcolor[x];
+		uint32_t gray = (RPART(texel) * 77 + GPART(texel) * 143 + BPART(texel) * 37) >> 8;
+		fragcolor[x] = MAKEARGB(
+			APART(texel),
+			(RPART(texel) * inv_t + gray * t + 127) >> 8,
+			(GPART(texel) * inv_t + gray * t + 127) >> 8,
+			(BPART(texel) * inv_t + gray * t + 127) >> 8);
+	}
+}
+
+// Marks pixels for discard in thread->scanline.discard.
+// NOTE(review): the whole packed ARGB value is compared against AlphaThreshold,
+// which presumably holds the threshold already shifted into the alpha bits -
+// confirm against where AlphaThreshold is set.
+static void RunAlphaTest(int x0, int x1, PolyTriangleThreadData* thread)
+{
+	uint32_t alphaThreshold = thread->AlphaThreshold;
+	uint32_t* fragcolor = thread->scanline.FragColor;
+	uint8_t* discard = thread->scanline.discard;
+	for (int x = x0; x < x1; x++)
+	{
+		discard[x] = fragcolor[x] <= alphaThreshold;
+	}
+}
+
+// Multiplies every channel of the span by the interpolated vertex colour.
+static void ApplyVertexColor(int x0, int x1, PolyTriangleThreadData* thread)
+{
+	uint32_t* fragcolor = thread->scanline.FragColor;
+	for (int x = x0; x < x1; x++)
+	{
+		uint32_t r = thread->scanline.vColorR[x];
+		uint32_t g = thread->scanline.vColorG[x];
+		uint32_t b = thread->scanline.vColorB[x];
+		uint32_t a = thread->scanline.vColorA[x];
+
+		// 255 -> 256 so that a full-intensity vertex colour leaves the texel unchanged.
+		a += a >> 7;
+		r += r >> 7;
+		g += g >> 7;
+		b += b >> 7;
+
+		uint32_t texel = fragcolor[x];
+		fragcolor[x] = MAKEARGB(
+			(APART(texel) * a + 127) >> 8,
+			(RPART(texel) * r + 127) >> 8,
+			(GPART(texel) * g + 127) >> 8,
+			(BPART(texel) * b + 127) >> 8);
+	}
+}
+
+// Default fragment shader. Produces the base colour (paletted, untextured or
+// one of the texture modes), applies the colour modifiers, runs the optional
+// alpha test, multiplies by the vertex colour and finally applies the sector
+// light and/or dynamic light arrays.
+static void MainFP(int x0, int x1, PolyTriangleThreadData* thread)
+{
+	if (thread->EffectState == SHADER_Paletted) // func_paletted
+	{
+		FuncPaletted(x0, x1, thread);
+	}
+	else if (thread->EffectState == SHADER_NoTexture) // func_notexture
+	{
+		FuncNoTexture(x0, x1, thread);
+	}
+	else // func_normal
+	{
+		auto constants = thread->PushConstants;
+
+		switch (constants->uTextureMode)
+		{
+		default:
+		case TM_NORMAL:
+		case TM_FOGLAYER: FuncNormal(x0, x1, thread); break;
+		case TM_STENCIL: FuncNormal_Stencil(x0, x1, thread); break;
+		case TM_OPAQUE: FuncNormal_Opaque(x0, x1, thread); break;
+		case TM_INVERSE: FuncNormal_Inverse(x0, x1, thread); break;
+		case TM_ALPHATEXTURE: FuncNormal_AlphaTexture(x0, x1, thread); break;
+		case TM_CLAMPY: FuncNormal_ClampY(x0, x1, thread); break;
+		case TM_INVERTOPAQUE: FuncNormal_InvertOpaque(x0, x1, thread); break;
+		}
+
+		if (constants->uTextureMode != TM_FOGLAYER)
+		{
+			auto& streamdata = thread->mainVertexShader.Data;
+
+			if (streamdata.uAddColor.r != 0.0f || streamdata.uAddColor.g != 0.0f || streamdata.uAddColor.b != 0.0f)
+			{
+				FuncNormal_AddColor(x0, x1, thread);
+			}
+
+			if (streamdata.uObjectColor2.a == 0.0f)
+			{
+				if (streamdata.uObjectColor.r != 1.0f || streamdata.uObjectColor.g != 1.0f || streamdata.uObjectColor.b != 1.0f)
+				{
+					FuncNormal_AddObjectColor(x0, x1, thread);
+				}
+			}
+			else
+			{
+				FuncNormal_AddObjectColor2(x0, x1, thread);
+			}
+
+			if (streamdata.uDesaturationFactor > 0.0f)
+			{
+				FuncNormal_DesaturationFactor(x0, x1, thread);
+			}
+		}
+	}
+
+	if (thread->AlphaTest)
+		RunAlphaTest(x0, x1, thread);
+
+	ApplyVertexColor(x0, x1, thread);
+
+	auto constants = thread->PushConstants;
+	uint32_t* fragcolor = thread->scanline.FragColor;
+	if (constants->uLightLevel >= 0.0f && thread->numPolyLights > 0)
+	{
+		// Sector light plus dynamic lights.
+		uint16_t* lightarray = thread->scanline.lightarray;
+		uint32_t* dynlights = thread->scanline.dynlights;
+		for (int x = x0; x < x1; x++)
+		{
+			uint32_t fg = fragcolor[x];
+			int lightshade = lightarray[x];
+			uint32_t dynlight = dynlights[x];
+
+			uint32_t a = APART(fg);
+			uint32_t r = MIN((RPART(fg) * (lightshade + RPART(dynlight))) >> 8, (uint32_t)255);
+			uint32_t g = MIN((GPART(fg) * (lightshade + GPART(dynlight))) >> 8, (uint32_t)255);
+			uint32_t b = MIN((BPART(fg) * (lightshade + BPART(dynlight))) >> 8, (uint32_t)255);
+
+			fragcolor[x] = MAKEARGB(a, r, g, b);
+		}
+	}
+	else if (constants->uLightLevel >= 0.0f)
+	{
+		// Sector light only.
+		uint16_t* lightarray = thread->scanline.lightarray;
+		for (int x = x0; x < x1; x++)
+		{
+			uint32_t fg = fragcolor[x];
+			int lightshade = lightarray[x];
+
+			uint32_t a = APART(fg);
+			uint32_t r = (RPART(fg) * lightshade) >> 8;
+			uint32_t g = (GPART(fg) * lightshade) >> 8;
+			uint32_t b = (BPART(fg) * lightshade) >> 8;
+
+			fragcolor[x] = MAKEARGB(a, r, g, b);
+		}
+
+		// To do: apply fog
+	}
+	else if (thread->numPolyLights > 0)
+	{
+		// Dynamic lights only.
+		uint32_t* dynlights = thread->scanline.dynlights;
+		for (int x = x0; x < x1; x++)
+		{
+			uint32_t fg = fragcolor[x];
+			uint32_t dynlight = dynlights[x];
+
+			uint32_t a = APART(fg);
+			uint32_t r = MIN((RPART(fg) * RPART(dynlight)) >> 8, (uint32_t)255);
+			uint32_t g = MIN((GPART(fg) * GPART(dynlight)) >> 8, (uint32_t)255);
+			uint32_t b = MIN((BPART(fg) * BPART(dynlight)) >> 8, (uint32_t)255);
+
+			fragcolor[x] = MAKEARGB(a, r, g, b);
+		}
+	}
+}
+
+// Stores the fragment shader matching thread->SpecialEffect in
+// thread->FragmentShader; anything that is not a special effect uses MainFP.
+void SelectFragmentShader(PolyTriangleThreadData* thread)
+{
+	void (*fragshader)(int x0, int x1, PolyTriangleThreadData * thread);
+
+	if (thread->SpecialEffect == EFF_FOGBOUNDARY) // fogboundary.fp
+	{
+		fragshader = &EffectFogBoundary;
+	}
+	else if (thread->SpecialEffect == EFF_BURN) // burn.fp
+	{
+		fragshader = &EffectBurn;
+	}
+	else if (thread->SpecialEffect == EFF_STENCIL) // stencil.fp
+	{
+		fragshader = &EffectStencil;
+	}
+	else
+	{
+		fragshader = &MainFP;
+	}
+
+	thread->FragmentShader = fragshader;
+}
diff --git a/src/rendering/polyrenderer/drawers/screen_shader.h b/src/rendering/polyrenderer/drawers/screen_shader.h
new file mode 100644
index
000000000..567b4d6c9 --- /dev/null +++ b/src/rendering/polyrenderer/drawers/screen_shader.h @@ -0,0 +1,27 @@ +/* +** Polygon Doom software renderer +** Copyright (c) 2016 Magnus Norddahl +** +** This software is provided 'as-is', without any express or implied +** warranty. In no event will the authors be held liable for any damages +** arising from the use of this software. +** +** Permission is granted to anyone to use this software for any purpose, +** including commercial applications, and to alter it and redistribute it +** freely, subject to the following restrictions: +** +** 1. The origin of this software must not be misrepresented; you must not +** claim that you wrote the original software. If you use this software +** in a product, an acknowledgment in the product documentation would be +** appreciated but is not required. +** 2. Altered source versions must be plainly marked as such, and must not be +** misrepresented as being the original software. +** 3. This notice may not be removed or altered from any source distribution. 
+** +*/ + +#pragma once + +class PolyTriangleThreadData; + +void SelectFragmentShader(PolyTriangleThreadData* thread); diff --git a/src/rendering/polyrenderer/drawers/screen_triangle.cpp b/src/rendering/polyrenderer/drawers/screen_triangle.cpp index 53c54954e..c286b8c65 100644 --- a/src/rendering/polyrenderer/drawers/screen_triangle.cpp +++ b/src/rendering/polyrenderer/drawers/screen_triangle.cpp @@ -36,946 +36,12 @@ #include "poly_triangle.h" #include "swrenderer/drawers/r_draw_rgba.h" #include "screen_triangle.h" +#include "screen_blend.h" +#include "screen_scanline_setup.h" +#include "screen_shader.h" #include "x86.h" #include -#ifdef NO_SSE -static void WriteW(int y, int x0, int x1, const TriDrawTriangleArgs* args, PolyTriangleThreadData* thread) -{ - float startX = x0 + (0.5f - args->v1->x); - float startY = y + (0.5f - args->v1->y); - - float posW = args->v1->w + args->gradientX.W * startX + args->gradientY.W * startY; - float stepW = args->gradientX.W; - float* w = thread->scanline.W; - for (int x = x0; x < x1; x++) - { - w[x] = 1.0f / posW; - posW += stepW; - } -} -#else -static void WriteW(int y, int x0, int x1, const TriDrawTriangleArgs* args, PolyTriangleThreadData* thread) -{ - float startX = x0 + (0.5f - args->v1->x); - float startY = y + (0.5f - args->v1->y); - - float posW = args->v1->w + args->gradientX.W * startX + args->gradientY.W * startY; - float stepW = args->gradientX.W; - float* w = thread->scanline.W; - - int ssecount = ((x1 - x0) & ~3); - int sseend = x0 + ssecount; - - __m128 mstepW = _mm_set1_ps(stepW * 4.0f); - __m128 mposW = _mm_setr_ps(posW, posW + stepW, posW + stepW + stepW, posW + stepW + stepW + stepW); - - for (int x = x0; x < sseend; x += 4) - { - // One Newton-Raphson iteration for 1/posW - __m128 res = _mm_rcp_ps(mposW); - __m128 muls = _mm_mul_ps(mposW, _mm_mul_ps(res, res)); - _mm_storeu_ps(w + x, _mm_sub_ps(_mm_add_ps(res, res), muls)); - mposW = _mm_add_ps(mposW, mstepW); - } - - posW += ssecount * stepW; - for (int x = 
sseend; x < x1; x++) - { - w[x] = 1.0f / posW; - posW += stepW; - } -} -#endif - -static void WriteDynLightArray(int x0, int x1, PolyTriangleThreadData* thread) -{ - int num_lights = thread->numPolyLights; - PolyLight* lights = thread->polyLights; - - float worldnormalX = thread->mainVertexShader.vWorldNormal.X; - float worldnormalY = thread->mainVertexShader.vWorldNormal.Y; - float worldnormalZ = thread->mainVertexShader.vWorldNormal.Z; - - uint32_t* dynlights = thread->scanline.dynlights; - float* worldposX = thread->scanline.WorldX; - float* worldposY = thread->scanline.WorldY; - float* worldposZ = thread->scanline.WorldZ; - - int sseend = x0; - -#ifndef NO_SSE - int ssecount = ((x1 - x0) & ~3); - sseend = x0 + ssecount; - - __m128 mworldnormalX = _mm_set1_ps(worldnormalX); - __m128 mworldnormalY = _mm_set1_ps(worldnormalY); - __m128 mworldnormalZ = _mm_set1_ps(worldnormalZ); - - for (int x = x0; x < sseend; x += 4) - { - __m128i litlo = _mm_setzero_si128(); - //__m128i litlo = _mm_shuffle_epi32(_mm_unpacklo_epi8(_mm_cvtsi32_si128(dynlightcolor), _mm_setzero_si128()), _MM_SHUFFLE(1, 0, 1, 0)); - __m128i lithi = litlo; - - for (int i = 0; i < num_lights; i++) - { - __m128 lightposX = _mm_set1_ps(lights[i].x); - __m128 lightposY = _mm_set1_ps(lights[i].y); - __m128 lightposZ = _mm_set1_ps(lights[i].z); - __m128 light_radius = _mm_set1_ps(lights[i].radius); - __m128i light_color = _mm_shuffle_epi32(_mm_unpacklo_epi8(_mm_cvtsi32_si128(lights[i].color), _mm_setzero_si128()), _MM_SHUFFLE(1, 0, 1, 0)); - - __m128 is_attenuated = _mm_cmplt_ps(light_radius, _mm_setzero_ps()); - light_radius = _mm_andnot_ps(_mm_set1_ps(-0.0f), light_radius); // clear sign bit - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - __m128 Lx = _mm_sub_ps(lightposX, _mm_loadu_ps(&worldposX[x])); - __m128 Ly = _mm_sub_ps(lightposY, _mm_loadu_ps(&worldposY[x])); - __m128 Lz = _mm_sub_ps(lightposZ, _mm_loadu_ps(&worldposZ[x])); - __m128 
dist2 = _mm_add_ps(_mm_mul_ps(Lx, Lx), _mm_add_ps(_mm_mul_ps(Ly, Ly), _mm_mul_ps(Lz, Lz))); - __m128 rcp_dist = _mm_rsqrt_ps(dist2); - __m128 dist = _mm_mul_ps(dist2, rcp_dist); - __m128 distance_attenuation = _mm_sub_ps(_mm_set1_ps(256.0f), _mm_min_ps(_mm_mul_ps(dist, light_radius), _mm_set1_ps(256.0f))); - - // The simple light type - __m128 simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = max(dot(N,normalize(L)),0) * attenuation - Lx = _mm_mul_ps(Lx, rcp_dist); - Ly = _mm_mul_ps(Ly, rcp_dist); - Lz = _mm_mul_ps(Lz, rcp_dist); - __m128 dotNL = _mm_add_ps(_mm_add_ps(_mm_mul_ps(mworldnormalX, Lx), _mm_mul_ps(mworldnormalY, Ly)), _mm_mul_ps(mworldnormalZ, Lz)); - __m128 point_attenuation = _mm_mul_ps(_mm_max_ps(dotNL, _mm_setzero_ps()), distance_attenuation); - - __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, point_attenuation), _mm_andnot_ps(is_attenuated, simple_attenuation))); - - attenuation = _mm_shufflehi_epi16(_mm_shufflelo_epi16(attenuation, _MM_SHUFFLE(2, 2, 0, 0)), _MM_SHUFFLE(2, 2, 0, 0)); - __m128i attenlo = _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1, 1, 0, 0)); - __m128i attenhi = _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(3, 3, 2, 2)); - - litlo = _mm_add_epi16(litlo, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenlo), 8)); - lithi = _mm_add_epi16(lithi, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenhi), 8)); - } - - _mm_storeu_si128((__m128i*)&dynlights[x], _mm_packus_epi16(litlo, lithi)); - } -#endif - - for (int x = x0; x < x1; x++) - { - uint32_t lit_r = 0; - uint32_t lit_g = 0; - uint32_t lit_b = 0; - - for (int i = 0; i < num_lights; i++) - { - float lightposX = lights[i].x; - float lightposY = lights[i].y; - float lightposZ = lights[i].z; - float light_radius = lights[i].radius; - uint32_t light_color = lights[i].color; - - bool is_attenuated = light_radius < 0.0f; - if (is_attenuated) - light_radius = -light_radius; - - // L = light-pos - // dist = sqrt(dot(L, L)) - // 
distance_attenuation = 1 - MIN(dist * (1/radius), 1) - float Lx = lightposX - worldposX[x]; - float Ly = lightposY - worldposY[x]; - float Lz = lightposZ - worldposZ[x]; - float dist2 = Lx * Lx + Ly * Ly + Lz * Lz; -#ifdef NO_SSE - //float rcp_dist = 1.0f / sqrt(dist2); - float rcp_dist = 1.0f / (dist2 * 0.01f); -#else - float rcp_dist = _mm_cvtss_f32(_mm_rsqrt_ss(_mm_set_ss(dist2))); -#endif - float dist = dist2 * rcp_dist; - float distance_attenuation = 256.0f - MIN(dist * light_radius, 256.0f); - - // The simple light type - float simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = max(dot(N,normalize(L)),0) * attenuation - Lx *= rcp_dist; - Ly *= rcp_dist; - Lz *= rcp_dist; - float dotNL = worldnormalX * Lx + worldnormalY * Ly + worldnormalZ * Lz; - float point_attenuation = MAX(dotNL, 0.0f) * distance_attenuation; - - uint32_t attenuation = (uint32_t)(is_attenuated ? (int32_t)point_attenuation : (int32_t)simple_attenuation); - - lit_r += (RPART(light_color) * attenuation) >> 8; - lit_g += (GPART(light_color) * attenuation) >> 8; - lit_b += (BPART(light_color) * attenuation) >> 8; - } - - lit_r = MIN(lit_r, 255); - lit_g = MIN(lit_g, 255); - lit_b = MIN(lit_b, 255); - dynlights[x] = MAKEARGB(255, lit_r, lit_g, lit_b); - - // Palette version: - // dynlights[x] = RGB256k.All[((lit_r >> 2) << 12) | ((lit_g >> 2) << 6) | (lit_b >> 2)]; - } -} - -static void WriteLightArray(int y, int x0, int x1, const TriDrawTriangleArgs* args, PolyTriangleThreadData* thread) -{ - float startX = x0 + (0.5f - args->v1->x); - float startY = y + (0.5f - args->v1->y); - float posW = args->v1->w + args->gradientX.W * startX + args->gradientY.W * startY; - float stepW = args->gradientX.W; - - float globVis = thread->mainVertexShader.Viewpoint->mGlobVis; - - uint32_t light = (int)(thread->PushConstants->uLightLevel * 255.0f); - fixed_t shade = (fixed_t)((2.0f - (light + 12.0f) / 128.0f) * (float)FRACUNIT); - fixed_t lightpos = (fixed_t)(globVis * posW * 
(float)FRACUNIT); - fixed_t lightstep = (fixed_t)(globVis * stepW * (float)FRACUNIT); - - fixed_t maxvis = 24 * FRACUNIT / 32; - fixed_t maxlight = 31 * FRACUNIT / 32; - - uint16_t *lightarray = thread->scanline.lightarray; - - fixed_t lightend = lightpos + lightstep * (x1 - x0); - if (lightpos < maxvis && shade >= lightpos && shade - lightpos <= maxlight && - lightend < maxvis && shade >= lightend && shade - lightend <= maxlight) - { - //if (BitsPerPixel == 32) - { - lightpos += FRACUNIT - shade; - for (int x = x0; x < x1; x++) - { - lightarray[x] = lightpos >> 8; - lightpos += lightstep; - } - } - /*else - { - lightpos = shade - lightpos; - for (int x = x0; x < x1; x++) - { - lightarray[x] = (lightpos >> 3) & 0xffffff00; - lightpos -= lightstep; - } - }*/ - } - else - { - //if (BitsPerPixel == 32) - { - for (int x = x0; x < x1; x++) - { - lightarray[x] = (FRACUNIT - clamp(shade - MIN(maxvis, lightpos), 0, maxlight)) >> 8; - lightpos += lightstep; - } - } - /*else - { - for (int x = x0; x < x1; x++) - { - lightarray[x] = (clamp(shade - MIN(maxvis, lightpos), 0, maxlight) >> 3) & 0xffffff00; - lightpos += lightstep; - } - }*/ - } -} - -#ifdef NO_SSE -static void WriteVarying(float pos, float step, int x0, int x1, const float* w, float* varying) -{ - for (int x = x0; x < x1; x++) - { - varying[x] = pos * w[x]; - pos += step; - } -} -#else -static void WriteVarying(float pos, float step, int x0, int x1, const float* w, float* varying) -{ - int ssecount = ((x1 - x0) & ~3); - int sseend = x0 + ssecount; - - __m128 mstep = _mm_set1_ps(step * 4.0f); - __m128 mpos = _mm_setr_ps(pos, pos + step, pos + step + step, pos + step + step + step); - - for (int x = x0; x < sseend; x += 4) - { - _mm_storeu_ps(varying + x, _mm_mul_ps(mpos, _mm_loadu_ps(w + x))); - mpos = _mm_add_ps(mpos, mstep); - } - - pos += ssecount * step; - for (int x = sseend; x < x1; x++) - { - varying[x] = pos * w[x]; - pos += step; - } -} -#endif - -#ifdef NO_SSE -static void WriteVaryingWrap(float pos, 
float step, int x0, int x1, const float* w, uint16_t* varying) -{ - for (int x = x0; x < x1; x++) - { - float value = pos * w[x]; - value = value - std::floor(value); - varying[x] = static_cast(static_cast(value * static_cast(0x1000'0000)) << 4) >> 16; - pos += step; - } -} -#else -static void WriteVaryingWrap(float pos, float step, int x0, int x1, const float* w, uint16_t* varying) -{ - int ssecount = ((x1 - x0) & ~3); - int sseend = x0 + ssecount; - - __m128 mstep = _mm_set1_ps(step * 4.0f); - __m128 mpos = _mm_setr_ps(pos, pos + step, pos + step + step, pos + step + step + step); - - for (int x = x0; x < sseend; x += 4) - { - __m128 value = _mm_mul_ps(mpos, _mm_loadu_ps(w + x)); - __m128 f = value; - __m128 t = _mm_cvtepi32_ps(_mm_cvttps_epi32(f)); - __m128 r = _mm_sub_ps(t, _mm_and_ps(_mm_cmplt_ps(f, t), _mm_set1_ps(1.0f))); - value = _mm_sub_ps(f, r); - - __m128i ivalue = _mm_srli_epi32(_mm_slli_epi32(_mm_cvttps_epi32(_mm_mul_ps(value, _mm_set1_ps(static_cast(0x1000'0000)))), 4), 17); - _mm_storel_epi64((__m128i*)(varying + x), _mm_slli_epi16(_mm_packs_epi32(ivalue, ivalue), 1)); - mpos = _mm_add_ps(mpos, mstep); - } - - pos += ssecount * step; - for (int x = sseend; x < x1; x++) - { - float value = pos * w[x]; - __m128 f = _mm_set_ss(value); - __m128 t = _mm_cvtepi32_ps(_mm_cvttps_epi32(f)); - __m128 r = _mm_sub_ss(t, _mm_and_ps(_mm_cmplt_ps(f, t), _mm_set_ss(1.0f))); - value = _mm_cvtss_f32(_mm_sub_ss(f, r)); - - varying[x] = static_cast(static_cast(value * static_cast(0x1000'0000)) << 4) >> 16; - pos += step; - } -} -#endif - -#ifdef NO_SSE -static void WriteVaryingColor(float pos, float step, int x0, int x1, const float* w, uint8_t* varying) -{ - for (int x = x0; x < x1; x++) - { - varying[x] = clamp(static_cast(pos * w[x] * 255.0f), 0, 255); - pos += step; - } -} -#else -static void WriteVaryingColor(float pos, float step, int x0, int x1, const float* w, uint8_t* varying) -{ - int ssecount = ((x1 - x0) & ~3); - int sseend = x0 + ssecount; - - __m128 mstep 
= _mm_set1_ps(step * 4.0f); - __m128 mpos = _mm_setr_ps(pos, pos + step, pos + step + step, pos + step + step + step); - - for (int x = x0; x < sseend; x += 4) - { - __m128i value = _mm_cvttps_epi32(_mm_mul_ps(_mm_mul_ps(mpos, _mm_loadu_ps(w + x)), _mm_set1_ps(255.0f))); - value = _mm_packs_epi32(value, value); - value = _mm_packus_epi16(value, value); - *(uint32_t*)(varying + x) = _mm_cvtsi128_si32(value); - mpos = _mm_add_ps(mpos, mstep); - } - - pos += ssecount * step; - for (int x = sseend; x < x1; x++) - { - varying[x] = clamp(static_cast(pos * w[x] * 255.0f), 0, 255); - pos += step; - } -} -#endif - -static void WriteVaryings(int y, int x0, int x1, const TriDrawTriangleArgs* args, PolyTriangleThreadData* thread) -{ - float startX = x0 + (0.5f - args->v1->x); - float startY = y + (0.5f - args->v1->y); - - WriteVaryingWrap(args->v1->u * args->v1->w + args->gradientX.U * startX + args->gradientY.U * startY, args->gradientX.U, x0, x1, thread->scanline.W, thread->scanline.U); - WriteVaryingWrap(args->v1->v * args->v1->w + args->gradientX.V * startX + args->gradientY.V * startY, args->gradientX.V, x0, x1, thread->scanline.W, thread->scanline.V); - WriteVarying(args->v1->worldX * args->v1->w + args->gradientX.WorldX * startX + args->gradientY.WorldX * startY, args->gradientX.WorldX, x0, x1, thread->scanline.W, thread->scanline.WorldX); - WriteVarying(args->v1->worldY * args->v1->w + args->gradientX.WorldY * startX + args->gradientY.WorldY * startY, args->gradientX.WorldY, x0, x1, thread->scanline.W, thread->scanline.WorldY); - WriteVarying(args->v1->worldZ * args->v1->w + args->gradientX.WorldZ * startX + args->gradientY.WorldZ * startY, args->gradientX.WorldZ, x0, x1, thread->scanline.W, thread->scanline.WorldZ); - WriteVarying(args->v1->gradientdistZ * args->v1->w + args->gradientX.GradientdistZ * startX + args->gradientY.GradientdistZ * startY, args->gradientX.GradientdistZ, x0, x1, thread->scanline.W, thread->scanline.GradientdistZ); - 
WriteVaryingColor(args->v1->a * args->v1->w + args->gradientX.A * startX + args->gradientY.A * startY, args->gradientX.A, x0, x1, thread->scanline.W, thread->scanline.vColorA); - WriteVaryingColor(args->v1->r * args->v1->w + args->gradientX.R * startX + args->gradientY.R * startY, args->gradientX.R, x0, x1, thread->scanline.W, thread->scanline.vColorR); - WriteVaryingColor(args->v1->g * args->v1->w + args->gradientX.G * startX + args->gradientY.G * startY, args->gradientX.G, x0, x1, thread->scanline.W, thread->scanline.vColorG); - WriteVaryingColor(args->v1->b * args->v1->w + args->gradientX.B * startX + args->gradientY.B * startY, args->gradientX.B, x0, x1, thread->scanline.W, thread->scanline.vColorB); -} - -static const int shiftTable[] = { - 0, 0, 0, 0, // STYLEALPHA_Zero - 0, 0, 0, 0, // STYLEALPHA_One - 24, 24, 24, 24, // STYLEALPHA_Src - 24, 24, 24, 24, // STYLEALPHA_InvSrc - 24, 16, 8, 0, // STYLEALPHA_SrcCol - 24, 16, 8, 0, // STYLEALPHA_InvSrcCol - 24, 16, 8, 0, // STYLEALPHA_DstCol - 24, 16, 8, 0 // STYLEALPHA_InvDstCol -}; - -#if 1 //#ifndef USE_AVX2 -template -static void BlendColor(int y, int x0, int x1, PolyTriangleThreadData* thread) -{ - FRenderStyle style = thread->RenderStyle; - - bool invsrc = style.SrcAlpha & 1; - bool invdst = style.DestAlpha & 1; - - const int* shiftsrc = shiftTable + (style.SrcAlpha << 2); - const int* shiftdst = shiftTable + (style.DestAlpha << 2); - - uint32_t* dest = (uint32_t*)thread->dest; - uint32_t* line = dest + y * (ptrdiff_t)thread->dest_pitch; - uint32_t* fragcolor = thread->scanline.FragColor; - - int srcSelect = style.SrcAlpha <= STYLEALPHA_One ? 0 : (style.SrcAlpha >= STYLEALPHA_DstCol ? 1 : 2); - int dstSelect = style.DestAlpha <= STYLEALPHA_One ? 0 : (style.DestAlpha >= STYLEALPHA_DstCol ? 
1 : 2); - - uint32_t inputs[3]; - inputs[0] = 0; - - for (int x = x0; x < x1; x++) - { - inputs[1] = line[x]; - inputs[2] = fragcolor[x]; - - uint32_t srcinput = inputs[srcSelect]; - uint32_t dstinput = inputs[dstSelect]; - - uint32_t out[4]; - for (int i = 0; i < 4; i++) - { - // Grab component for scale factors - int32_t src = (srcinput >> shiftsrc[i]) & 0xff; - int32_t dst = (dstinput >> shiftdst[i]) & 0xff; - - // Inverse if needed - if (invsrc) src = 0xff - src; - if (invdst) dst = 0xff - dst; - - // Rescale 0-255 to 0-256 - src = src + (src >> 7); - dst = dst + (dst >> 7); - - // Multiply with input - src = src * ((inputs[2] >> (24 - (i << 3))) & 0xff); - dst = dst * ((inputs[1] >> (24 - (i << 3))) & 0xff); - - // Apply blend operator - int32_t val; - if (OptT::Flags & SWBLEND_Sub) - { - val = src - dst; - } - else if (OptT::Flags & SWBLEND_RevSub) - { - val = dst - src; - } - else - { - val = src + dst; - } - out[i] = clamp((val + 127) >> 8, 0, 255); - } - - line[x] = MAKEARGB(out[0], out[1], out[2], out[3]); - } -} -#else -template -static void BlendColor(int y, int x0, int x1, PolyTriangleThreadData* thread) -{ - FRenderStyle style = thread->RenderStyle; - - bool invsrc = style.SrcAlpha & 1; - bool invdst = style.DestAlpha & 1; - - __m128i shiftsrc = _mm_loadu_si128((const __m128i*)(shiftTable + (style.SrcAlpha << 2))); - __m128i shiftdst = _mm_loadu_si128((const __m128i*)(shiftTable + (style.DestAlpha << 2))); - - uint32_t* dest = (uint32_t*)thread->dest; - uint32_t* line = dest + y * (ptrdiff_t)thread->dest_pitch; - uint32_t* fragcolor = thread->scanline.FragColor; - - int srcSelect = style.SrcAlpha <= STYLEALPHA_One ? 0 : (style.SrcAlpha >= STYLEALPHA_DstCol ? 1 : 2); - int dstSelect = style.DestAlpha <= STYLEALPHA_One ? 0 : (style.DestAlpha >= STYLEALPHA_DstCol ? 
1 : 2); - - uint32_t inputs[3]; - inputs[0] = 0; - - __m128i shiftmul = _mm_set_epi32(24, 16, 8, 0); - - for (int x = x0; x < x1; x++) - { - inputs[1] = line[x]; - inputs[2] = fragcolor[x]; - - __m128i srcinput = _mm_set1_epi32(inputs[srcSelect]); - __m128i dstinput = _mm_set1_epi32(inputs[dstSelect]); - - // Grab component for scale factors - __m128i src = _mm_and_si128(_mm_srlv_epi32(srcinput, shiftsrc), _mm_set1_epi32(0xff)); - __m128i dst = _mm_and_si128(_mm_srlv_epi32(dstinput, shiftdst), _mm_set1_epi32(0xff)); - - // Inverse if needed - if (invsrc) src = _mm_sub_epi32(_mm_set1_epi32(0xff), src); - if (invdst) dst = _mm_sub_epi32(_mm_set1_epi32(0xff), dst); - - // Rescale 0-255 to 0-256 - src = _mm_add_epi32(src, _mm_srli_epi32(src, 7)); - dst = _mm_add_epi32(dst, _mm_srli_epi32(dst, 7)); - - // Multiply with input - __m128i mulsrc = _mm_and_si128(_mm_srlv_epi32(_mm_set1_epi32(inputs[2]), shiftmul), _mm_set1_epi32(0xff)); - __m128i muldst = _mm_and_si128(_mm_srlv_epi32(_mm_set1_epi32(inputs[1]), shiftmul), _mm_set1_epi32(0xff)); - __m128i mulresult = _mm_mullo_epi16(_mm_packs_epi32(src, dst), _mm_packs_epi32(mulsrc, muldst)); - src = _mm_unpacklo_epi16(mulresult, _mm_setzero_si128()); - dst = _mm_unpackhi_epi16(mulresult, _mm_setzero_si128()); - - // Apply blend operator - __m128i val; - if (OptT::Flags & SWBLEND_Sub) - { - val = _mm_sub_epi32(src, dst); - } - else if (OptT::Flags & SWBLEND_RevSub) - { - val = _mm_sub_epi32(dst, src); - } - else - { - val = _mm_add_epi32(src, dst); - } - - __m128i out = _mm_srli_epi32(_mm_add_epi32(val, _mm_set1_epi32(127)), 8); - out = _mm_packs_epi32(out, out); - out = _mm_packus_epi16(out, out); - line[x] = _mm_cvtsi128_si32(out); - } -} -#endif - -#ifdef NO_SSE -static void BlendColorOpaque(int y, int x0, int x1, PolyTriangleThreadData* thread) -{ - uint32_t* dest = (uint32_t*)thread->dest; - uint32_t* line = dest + y * (ptrdiff_t)thread->dest_pitch; - uint32_t* fragcolor = thread->scanline.FragColor; - - memcpy(line + x0, 
fragcolor + x0, (x1 - x0) * sizeof(uint32_t)); -} -#else -static void BlendColorOpaque(int y, int x0, int x1, PolyTriangleThreadData* thread) -{ - uint32_t* dest = (uint32_t*)thread->dest; - uint32_t* line = dest + y * (ptrdiff_t)thread->dest_pitch; - uint32_t* fragcolor = thread->scanline.FragColor; - - int ssecount = ((x1 - x0) & ~3); - int sseend = x0 + ssecount; - - for (int x = x0; x < sseend; x += 4) - { - __m128i v = _mm_loadu_si128((__m128i*) & fragcolor[x]); - _mm_storeu_si128((__m128i*) & line[x], v); - } - - for (int x = sseend; x < x1; x++) - { - line[x] = fragcolor[x]; - } -} -#endif - -static void BlendColorAdd_Src_InvSrc(int y, int x0, int x1, PolyTriangleThreadData* thread) -{ - uint32_t* line = (uint32_t*)thread->dest + y * (ptrdiff_t)thread->dest_pitch; - uint32_t* fragcolor = thread->scanline.FragColor; - - int sseend = x0; - -#ifndef NO_SSE - int ssecount = ((x1 - x0) & ~1); - sseend = x0 + ssecount; - for (int x = x0; x < sseend; x += 2) - { - __m128i dst = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)&line[x]), _mm_setzero_si128()); - __m128i src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)&fragcolor[x]), _mm_setzero_si128()); - - __m128i srcscale = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)); - srcscale = _mm_add_epi16(srcscale, _mm_srli_epi16(srcscale, 7)); - __m128i dstscale = _mm_sub_epi16(_mm_set1_epi16(256), srcscale); - - __m128i out = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(src, srcscale), _mm_mullo_epi16(dst, dstscale)), _mm_set1_epi16(127)), 8); - _mm_storel_epi64((__m128i*)&line[x], _mm_packus_epi16(out, out)); - } -#endif - - for (int x = sseend; x < x1; x++) - { - uint32_t dst = line[x]; - uint32_t src = fragcolor[x]; - - uint32_t srcscale = APART(src); - srcscale += srcscale >> 7; - uint32_t dstscale = 256 - srcscale; - - uint32_t a = ((APART(src) * srcscale + APART(dst) * dstscale) + 127) >> 8; - uint32_t r = ((RPART(src) * srcscale + RPART(dst) * dstscale) 
+ 127) >> 8; - uint32_t g = ((GPART(src) * srcscale + GPART(dst) * dstscale) + 127) >> 8; - uint32_t b = ((BPART(src) * srcscale + BPART(dst) * dstscale) + 127) >> 8; - - line[x] = MAKEARGB(a, r, g, b); - } -} - -static void BlendColorAdd_SrcCol_InvSrcCol(int y, int x0, int x1, PolyTriangleThreadData* thread) -{ - uint32_t* line = (uint32_t*)thread->dest + y * (ptrdiff_t)thread->dest_pitch; - uint32_t* fragcolor = thread->scanline.FragColor; - - int sseend = x0; - -#ifndef NO_SSE - int ssecount = ((x1 - x0) & ~1); - sseend = x0 + ssecount; - for (int x = x0; x < sseend; x += 2) - { - __m128i dst = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*) & line[x]), _mm_setzero_si128()); - __m128i src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*) & fragcolor[x]), _mm_setzero_si128()); - - __m128i srcscale = src; - srcscale = _mm_add_epi16(srcscale, _mm_srli_epi16(srcscale, 7)); - __m128i dstscale = _mm_sub_epi16(_mm_set1_epi16(256), srcscale); - - __m128i out = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(src, srcscale), _mm_mullo_epi16(dst, dstscale)), _mm_set1_epi16(127)), 8); - _mm_storel_epi64((__m128i*) & line[x], _mm_packus_epi16(out, out)); - } -#endif - - for (int x = sseend; x < x1; x++) - { - uint32_t dst = line[x]; - uint32_t src = fragcolor[x]; - - uint32_t srcscale_a = APART(src); - uint32_t srcscale_r = RPART(src); - uint32_t srcscale_g = GPART(src); - uint32_t srcscale_b = BPART(src); - srcscale_a += srcscale_a >> 7; - srcscale_r += srcscale_r >> 7; - srcscale_g += srcscale_g >> 7; - srcscale_b += srcscale_b >> 7; - uint32_t dstscale_a = 256 - srcscale_a; - uint32_t dstscale_r = 256 - srcscale_r; - uint32_t dstscale_g = 256 - srcscale_g; - uint32_t dstscale_b = 256 - srcscale_b; - - uint32_t a = ((APART(src) * srcscale_a + APART(dst) * dstscale_a) + 127) >> 8; - uint32_t r = ((RPART(src) * srcscale_r + RPART(dst) * dstscale_r) + 127) >> 8; - uint32_t g = ((GPART(src) * srcscale_g + GPART(dst) * dstscale_g) + 127) >> 8; - uint32_t b = ((BPART(src) * 
srcscale_b + BPART(dst) * dstscale_b) + 127) >> 8; - - line[x] = MAKEARGB(a, r, g, b); - } -} - -static void BlendColorAdd_Src_One(int y, int x0, int x1, PolyTriangleThreadData* thread) -{ - uint32_t* line = (uint32_t*)thread->dest + y * (ptrdiff_t)thread->dest_pitch; - uint32_t* fragcolor = thread->scanline.FragColor; - - int sseend = x0; - -#ifndef NO_SSE - int ssecount = ((x1 - x0) & ~1); - sseend = x0 + ssecount; - for (int x = x0; x < sseend; x += 2) - { - __m128i dst = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*) & line[x]), _mm_setzero_si128()); - __m128i src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*) & fragcolor[x]), _mm_setzero_si128()); - - __m128i srcscale = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)); - srcscale = _mm_add_epi16(srcscale, _mm_srli_epi16(srcscale, 7)); - - __m128i out = _mm_add_epi16(_mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(src, srcscale), _mm_set1_epi16(127)), 8), dst); - _mm_storel_epi64((__m128i*) & line[x], _mm_packus_epi16(out, out)); - } -#endif - - for (int x = sseend; x < x1; x++) - { - uint32_t dst = line[x]; - uint32_t src = fragcolor[x]; - - uint32_t srcscale = APART(src); - srcscale += srcscale >> 7; - - uint32_t a = MIN((((APART(src) * srcscale) + 127) >> 8) + APART(dst), 255); - uint32_t r = MIN((((RPART(src) * srcscale) + 127) >> 8) + RPART(dst), 255); - uint32_t g = MIN((((GPART(src) * srcscale) + 127) >> 8) + GPART(dst), 255); - uint32_t b = MIN((((BPART(src) * srcscale) + 127) >> 8) + BPART(dst), 255); - - line[x] = MAKEARGB(a, r, g, b); - } -} - -static void BlendColorAdd_SrcCol_One(int y, int x0, int x1, PolyTriangleThreadData* thread) -{ - uint32_t* line = (uint32_t*)thread->dest + y * (ptrdiff_t)thread->dest_pitch; - uint32_t* fragcolor = thread->scanline.FragColor; - - int sseend = x0; - -#ifndef NO_SSE - int ssecount = ((x1 - x0) & ~1); - sseend = x0 + ssecount; - for (int x = x0; x < sseend; x += 2) - { - __m128i dst = 
_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*) & line[x]), _mm_setzero_si128()); - __m128i src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*) & fragcolor[x]), _mm_setzero_si128()); - - __m128i srcscale = src; - srcscale = _mm_add_epi16(srcscale, _mm_srli_epi16(srcscale, 7)); - - __m128i out = _mm_add_epi16(_mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(src, srcscale), _mm_set1_epi16(127)), 8), dst); - _mm_storel_epi64((__m128i*) & line[x], _mm_packus_epi16(out, out)); - } -#endif - - for (int x = sseend; x < x1; x++) - { - uint32_t dst = line[x]; - uint32_t src = fragcolor[x]; - - uint32_t srcscale_a = APART(src); - uint32_t srcscale_r = RPART(src); - uint32_t srcscale_g = GPART(src); - uint32_t srcscale_b = BPART(src); - srcscale_a += srcscale_a >> 7; - srcscale_r += srcscale_r >> 7; - srcscale_g += srcscale_g >> 7; - srcscale_b += srcscale_b >> 7; - - uint32_t a = MIN((((APART(src) * srcscale_a) + 127) >> 8) + APART(dst), 255); - uint32_t r = MIN((((RPART(src) * srcscale_r) + 127) >> 8) + RPART(dst), 255); - uint32_t g = MIN((((GPART(src) * srcscale_g) + 127) >> 8) + GPART(dst), 255); - uint32_t b = MIN((((BPART(src) * srcscale_b) + 127) >> 8) + BPART(dst), 255); - - line[x] = MAKEARGB(a, r, g, b); - } -} - -static void BlendColorAdd_DstCol_Zero(int y, int x0, int x1, PolyTriangleThreadData* thread) -{ - uint32_t* line = (uint32_t*)thread->dest + y * (ptrdiff_t)thread->dest_pitch; - uint32_t* fragcolor = thread->scanline.FragColor; - - int sseend = x0; - -#ifndef NO_SSE - int ssecount = ((x1 - x0) & ~1); - sseend = x0 + ssecount; - for (int x = x0; x < sseend; x += 2) - { - __m128i dst = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*) & line[x]), _mm_setzero_si128()); - __m128i src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*) & fragcolor[x]), _mm_setzero_si128()); - - __m128i srcscale = dst; - srcscale = _mm_add_epi16(srcscale, _mm_srli_epi16(srcscale, 7)); - - __m128i out = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(src, srcscale), _mm_set1_epi16(127)), 8); - 
_mm_storel_epi64((__m128i*) & line[x], _mm_packus_epi16(out, out)); - } -#endif - - for (int x = sseend; x < x1; x++) - { - uint32_t dst = line[x]; - uint32_t src = fragcolor[x]; - - uint32_t srcscale_a = APART(dst); - uint32_t srcscale_r = RPART(dst); - uint32_t srcscale_g = GPART(dst); - uint32_t srcscale_b = BPART(dst); - srcscale_a += srcscale_a >> 7; - srcscale_r += srcscale_r >> 7; - srcscale_g += srcscale_g >> 7; - srcscale_b += srcscale_b >> 7; - - uint32_t a = (((APART(src) * srcscale_a) + 127) >> 8); - uint32_t r = (((RPART(src) * srcscale_r) + 127) >> 8); - uint32_t g = (((GPART(src) * srcscale_g) + 127) >> 8); - uint32_t b = (((BPART(src) * srcscale_b) + 127) >> 8); - - line[x] = MAKEARGB(a, r, g, b); - } -} - -static void BlendColorAdd_InvDstCol_Zero(int y, int x0, int x1, PolyTriangleThreadData* thread) -{ - uint32_t* line = (uint32_t*)thread->dest + y * (ptrdiff_t)thread->dest_pitch; - uint32_t* fragcolor = thread->scanline.FragColor; - - int sseend = x0; - -#ifndef NO_SSE - int ssecount = ((x1 - x0) & ~1); - sseend = x0 + ssecount; - for (int x = x0; x < sseend; x += 2) - { - __m128i dst = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*) & line[x]), _mm_setzero_si128()); - __m128i src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*) & fragcolor[x]), _mm_setzero_si128()); - - __m128i srcscale = _mm_sub_epi16(_mm_set1_epi16(255), dst); - srcscale = _mm_add_epi16(srcscale, _mm_srli_epi16(srcscale, 7)); - - __m128i out = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(src, srcscale), _mm_set1_epi16(127)), 8); - _mm_storel_epi64((__m128i*) & line[x], _mm_packus_epi16(out, out)); - } -#endif - - for (int x = sseend; x < x1; x++) - { - uint32_t dst = line[x]; - uint32_t src = fragcolor[x]; - - uint32_t srcscale_a = 255 - APART(dst); - uint32_t srcscale_r = 255 - RPART(dst); - uint32_t srcscale_g = 255 - GPART(dst); - uint32_t srcscale_b = 255 - BPART(dst); - srcscale_a += srcscale_a >> 7; - srcscale_r += srcscale_r >> 7; - srcscale_g += srcscale_g >> 7; - 
srcscale_b += srcscale_b >> 7; - - uint32_t a = (((APART(src) * srcscale_a) + 127) >> 8); - uint32_t r = (((RPART(src) * srcscale_r) + 127) >> 8); - uint32_t g = (((GPART(src) * srcscale_g) + 127) >> 8); - uint32_t b = (((BPART(src) * srcscale_b) + 127) >> 8); - - line[x] = MAKEARGB(a, r, g, b); - } -} - -static void BlendColorRevSub_Src_One(int y, int x0, int x1, PolyTriangleThreadData* thread) -{ - uint32_t* line = (uint32_t*)thread->dest + y * (ptrdiff_t)thread->dest_pitch; - uint32_t* fragcolor = thread->scanline.FragColor; - - int sseend = x0; - -#ifndef NO_SSE - int ssecount = ((x1 - x0) & ~1); - sseend = x0 + ssecount; - for (int x = x0; x < sseend; x += 2) - { - __m128i dst = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*) & line[x]), _mm_setzero_si128()); - __m128i src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*) & fragcolor[x]), _mm_setzero_si128()); - - __m128i srcscale = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)); - srcscale = _mm_add_epi16(srcscale, _mm_srli_epi16(srcscale, 7)); - - __m128i out = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(src, srcscale), _mm_set1_epi16(127)), 8)); - _mm_storel_epi64((__m128i*) & line[x], _mm_packus_epi16(out, out)); - } -#endif - - for (int x = sseend; x < x1; x++) - { - uint32_t dst = line[x]; - uint32_t src = fragcolor[x]; - - uint32_t srcscale = APART(src); - srcscale += srcscale >> 7; - - uint32_t a = MAX(APART(dst) - (((APART(src) * srcscale) + 127) >> 8), 0); - uint32_t r = MAX(RPART(dst) - (((RPART(src) * srcscale) + 127) >> 8), 0); - uint32_t g = MAX(GPART(dst) - (((GPART(src) * srcscale) + 127) >> 8), 0); - uint32_t b = MAX(BPART(dst) - (((BPART(src) * srcscale) + 127) >> 8), 0); - - line[x] = MAKEARGB(a, r, g, b); - } -} - -static void SelectWriteColorFunc(PolyTriangleThreadData* thread) -{ - FRenderStyle style = thread->RenderStyle; - if (style.BlendOp == STYLEOP_Add) - { - if (style.SrcAlpha == STYLEALPHA_One && style.DestAlpha == 
STYLEALPHA_Zero) - { - thread->WriteColorFunc = &BlendColorOpaque; - } - else if (style.SrcAlpha == STYLEALPHA_Src && style.DestAlpha == STYLEALPHA_InvSrc) - { - thread->WriteColorFunc = &BlendColorAdd_Src_InvSrc; - } - else if (style.SrcAlpha == STYLEALPHA_SrcCol && style.DestAlpha == STYLEALPHA_InvSrcCol) - { - thread->WriteColorFunc = &BlendColorAdd_SrcCol_InvSrcCol; - } - else if (style.SrcAlpha == STYLEALPHA_Src && style.DestAlpha == STYLEALPHA_One) - { - thread->WriteColorFunc = &BlendColorAdd_Src_One; - } - else if (style.SrcAlpha == STYLEALPHA_SrcCol && style.DestAlpha == STYLEALPHA_One) - { - thread->WriteColorFunc = &BlendColorAdd_SrcCol_One; - } - else if (style.SrcAlpha == STYLEALPHA_DstCol && style.DestAlpha == STYLEALPHA_Zero) - { - thread->WriteColorFunc = &BlendColorAdd_DstCol_Zero; - } - else if (style.SrcAlpha == STYLEALPHA_InvDstCol && style.DestAlpha == STYLEALPHA_Zero) - { - thread->WriteColorFunc = &BlendColorAdd_InvDstCol_Zero; - } - else - { - thread->WriteColorFunc = &BlendColor; - } - } - else if (style.BlendOp == STYLEOP_Sub) - { - thread->WriteColorFunc = &BlendColor; - } - else // if (style.BlendOp == STYLEOP_RevSub) - { - if (style.SrcAlpha == STYLEALPHA_Src && style.DestAlpha == STYLEALPHA_One) - { - thread->WriteColorFunc = &BlendColorRevSub_Src_One; - } - else - { - thread->WriteColorFunc = &BlendColor; - } - } -} - static void WriteDepth(int y, int x0, int x1, PolyTriangleThreadData* thread) { size_t pitch = thread->depthstencil->Width(); @@ -998,511 +64,10 @@ static void WriteStencil(int y, int x0, int x1, PolyTriangleThreadData* thread) } } -static uint32_t SampleTexture(uint32_t u, uint32_t v, const void* texPixels, int texWidth, int texHeight, bool texBgra) -{ - int texelX = (u * texWidth) >> 16; - int texelY = (v * texHeight) >> 16; - int texelOffset = texelX + texelY * texWidth; - if (texBgra) - { - return static_cast(texPixels)[texelOffset]; - } - else - { - uint32_t c = static_cast(texPixels)[texelOffset]; - return (c << 
16) | 0xff000000; - } -} - -static void EffectFogBoundary(int x0, int x1, PolyTriangleThreadData* thread) -{ - uint32_t* fragcolor = thread->scanline.FragColor; - for (int x = x0; x < x1; x++) - { - /*float fogdist = pixelpos.w; - float fogfactor = exp2(uFogDensity * fogdist); - FragColor = vec4(uFogColor.rgb, 1.0 - fogfactor);*/ - fragcolor[x] = 0; - } -} - -static void EffectBurn(int x0, int x1, PolyTriangleThreadData* thread) -{ - int texWidth = thread->textures[0].width; - int texHeight = thread->textures[0].height; - const void* texPixels = thread->textures[0].pixels; - bool texBgra = thread->textures[0].bgra; - - int tex2Width = thread->textures[1].width; - int tex2Height = thread->textures[1].height; - const void* tex2Pixels = thread->textures[1].pixels; - bool tex2Bgra = thread->textures[1].bgra; - - uint32_t* fragcolor = thread->scanline.FragColor; - uint16_t* u = thread->scanline.U; - uint16_t* v = thread->scanline.V; - for (int x = x0; x < x1; x++) - { - uint32_t frag_r = thread->scanline.vColorR[x]; - uint32_t frag_g = thread->scanline.vColorG[x]; - uint32_t frag_b = thread->scanline.vColorB[x]; - uint32_t frag_a = thread->scanline.vColorA[x]; - frag_r += frag_r >> 7; // 255 -> 256 - frag_g += frag_g >> 7; // 255 -> 256 - frag_b += frag_b >> 7; // 255 -> 256 - frag_a += frag_a >> 7; // 255 -> 256 - - uint32_t t1 = SampleTexture(u[x], v[x], texPixels, texWidth, texHeight, texBgra); - uint32_t t2 = SampleTexture(u[x], 0xffff - v[x], tex2Pixels, tex2Width, tex2Height, tex2Bgra); - - uint32_t r = (frag_r * RPART(t1)) >> 8; - uint32_t g = (frag_g * GPART(t1)) >> 8; - uint32_t b = (frag_b * BPART(t1)) >> 8; - uint32_t a = (frag_a * APART(t2)) >> 8; - - fragcolor[x] = MAKEARGB(a, r, g, b); - } -} - -static void EffectStencil(int x0, int x1, PolyTriangleThreadData* thread) -{ - /*for (int x = x0; x < x1; x++) - { - fragcolor[x] = 0x00ffffff; - }*/ -} - -static void FuncPaletted(int x0, int x1, PolyTriangleThreadData* thread) -{ - int texWidth = 
thread->textures[0].width; - int texHeight = thread->textures[0].height; - const void* texPixels = thread->textures[0].pixels; - bool texBgra = thread->textures[0].bgra; - const uint32_t* lut = (const uint32_t*)thread->textures[1].pixels; - uint32_t* fragcolor = thread->scanline.FragColor; - uint16_t* u = thread->scanline.U; - uint16_t* v = thread->scanline.V; - - for (int x = x0; x < x1; x++) - { - fragcolor[x] = lut[RPART(SampleTexture(u[x], v[x], texPixels, texWidth, texHeight, texBgra))] | 0xff000000; - } -} - -static void FuncNoTexture(int x0, int x1, PolyTriangleThreadData* thread) -{ - auto& streamdata = thread->mainVertexShader.Data; - uint32_t a = (int)(streamdata.uObjectColor.a * 255.0f); - uint32_t r = (int)(streamdata.uObjectColor.r * 255.0f); - uint32_t g = (int)(streamdata.uObjectColor.g * 255.0f); - uint32_t b = (int)(streamdata.uObjectColor.b * 255.0f); - uint32_t texel = MAKEARGB(a, r, g, b); - - if (streamdata.uDesaturationFactor > 0.0f) - { - uint32_t t = (int)(streamdata.uDesaturationFactor * 256.0f); - uint32_t inv_t = 256 - t; - uint32_t gray = (RPART(texel) * 77 + GPART(texel) * 143 + BPART(texel) * 37) >> 8; - texel = MAKEARGB( - APART(texel), - (RPART(texel) * inv_t + gray * t + 127) >> 8, - (GPART(texel) * inv_t + gray * t + 127) >> 8, - (BPART(texel) * inv_t + gray * t + 127) >> 8); - } - - uint32_t* fragcolor = thread->scanline.FragColor; - for (int x = x0; x < x1; x++) - { - fragcolor[x] = texel; - } -} - -static void FuncNormal(int x0, int x1, PolyTriangleThreadData* thread) -{ - int texWidth = thread->textures[0].width; - int texHeight = thread->textures[0].height; - const void* texPixels = thread->textures[0].pixels; - bool texBgra = thread->textures[0].bgra; - uint32_t* fragcolor = thread->scanline.FragColor; - uint16_t* u = thread->scanline.U; - uint16_t* v = thread->scanline.V; - - for (int x = x0; x < x1; x++) - { - uint32_t texel = SampleTexture(u[x], v[x], texPixels, texWidth, texHeight, texBgra); - fragcolor[x] = texel; - } -} 
- -static void FuncNormal_Stencil(int x0, int x1, PolyTriangleThreadData* thread) -{ - int texWidth = thread->textures[0].width; - int texHeight = thread->textures[0].height; - const void* texPixels = thread->textures[0].pixels; - bool texBgra = thread->textures[0].bgra; - uint32_t* fragcolor = thread->scanline.FragColor; - uint16_t* u = thread->scanline.U; - uint16_t* v = thread->scanline.V; - - for (int x = x0; x < x1; x++) - { - uint32_t texel = SampleTexture(u[x], v[x], texPixels, texWidth, texHeight, texBgra); - fragcolor[x] = texel | 0x00ffffff; - } -} - -static void FuncNormal_Opaque(int x0, int x1, PolyTriangleThreadData* thread) -{ - int texWidth = thread->textures[0].width; - int texHeight = thread->textures[0].height; - const void* texPixels = thread->textures[0].pixels; - bool texBgra = thread->textures[0].bgra; - uint32_t* fragcolor = thread->scanline.FragColor; - uint16_t* u = thread->scanline.U; - uint16_t* v = thread->scanline.V; - - for (int x = x0; x < x1; x++) - { - uint32_t texel = SampleTexture(u[x], v[x], texPixels, texWidth, texHeight, texBgra); - fragcolor[x] = texel | 0xff000000; - } -} - -static void FuncNormal_Inverse(int x0, int x1, PolyTriangleThreadData* thread) -{ - int texWidth = thread->textures[0].width; - int texHeight = thread->textures[0].height; - const void* texPixels = thread->textures[0].pixels; - bool texBgra = thread->textures[0].bgra; - uint32_t* fragcolor = thread->scanline.FragColor; - uint16_t* u = thread->scanline.U; - uint16_t* v = thread->scanline.V; - - for (int x = x0; x < x1; x++) - { - uint32_t texel = SampleTexture(u[x], v[x], texPixels, texWidth, texHeight, texBgra); - fragcolor[x] = MAKEARGB(APART(texel), 0xff - RPART(texel), 0xff - BPART(texel), 0xff - GPART(texel)); - } -} - -static void FuncNormal_AlphaTexture(int x0, int x1, PolyTriangleThreadData* thread) -{ - int texWidth = thread->textures[0].width; - int texHeight = thread->textures[0].height; - const void* texPixels = thread->textures[0].pixels; - 
bool texBgra = thread->textures[0].bgra; - uint32_t* fragcolor = thread->scanline.FragColor; - uint16_t* u = thread->scanline.U; - uint16_t* v = thread->scanline.V; - - for (int x = x0; x < x1; x++) - { - uint32_t texel = SampleTexture(u[x], v[x], texPixels, texWidth, texHeight, texBgra); - uint32_t gray = (RPART(texel) * 77 + GPART(texel) * 143 + BPART(texel) * 37) >> 8; - uint32_t alpha = APART(texel); - alpha += alpha >> 7; - alpha = (alpha * gray + 127) >> 8; - texel = (alpha << 24) | 0x00ffffff; - fragcolor[x] = texel; - } -} - -static void FuncNormal_ClampY(int x0, int x1, PolyTriangleThreadData* thread) -{ - int texWidth = thread->textures[0].width; - int texHeight = thread->textures[0].height; - const void* texPixels = thread->textures[0].pixels; - bool texBgra = thread->textures[0].bgra; - uint32_t* fragcolor = thread->scanline.FragColor; - uint16_t* u = thread->scanline.U; - uint16_t* v = thread->scanline.V; - - for (int x = x0; x < x1; x++) - { - fragcolor[x] = SampleTexture(u[x], v[x], texPixels, texWidth, texHeight, texBgra); - if (v[x] < 0.0 || v[x] > 1.0) - fragcolor[x] &= 0x00ffffff; - } -} - -static void FuncNormal_InvertOpaque(int x0, int x1, PolyTriangleThreadData* thread) -{ - int texWidth = thread->textures[0].width; - int texHeight = thread->textures[0].height; - const void* texPixels = thread->textures[0].pixels; - bool texBgra = thread->textures[0].bgra; - uint32_t* fragcolor = thread->scanline.FragColor; - uint16_t* u = thread->scanline.U; - uint16_t* v = thread->scanline.V; - - for (int x = x0; x < x1; x++) - { - uint32_t texel = SampleTexture(u[x], v[x], texPixels, texWidth, texHeight, texBgra); - fragcolor[x] = MAKEARGB(0xff, 0xff - RPART(texel), 0xff - BPART(texel), 0xff - GPART(texel)); - } -} - -static void FuncNormal_AddColor(int x0, int x1, PolyTriangleThreadData* thread) -{ - auto& streamdata = thread->mainVertexShader.Data; - uint32_t r = (int)(streamdata.uAddColor.r * 255.0f); - uint32_t g = (int)(streamdata.uAddColor.g * 
255.0f); - uint32_t b = (int)(streamdata.uAddColor.b * 255.0f); - uint32_t* fragcolor = thread->scanline.FragColor; - for (int x = x0; x < x1; x++) - { - uint32_t texel = fragcolor[x]; - fragcolor[x] = MAKEARGB( - APART(texel), - MIN(r + RPART(texel), (uint32_t)255), - MIN(g + GPART(texel), (uint32_t)255), - MIN(b + BPART(texel), (uint32_t)255)); - } -} - -static void FuncNormal_AddObjectColor(int x0, int x1, PolyTriangleThreadData* thread) -{ - auto& streamdata = thread->mainVertexShader.Data; - uint32_t r = (int)(streamdata.uObjectColor.r * 256.0f); - uint32_t g = (int)(streamdata.uObjectColor.g * 256.0f); - uint32_t b = (int)(streamdata.uObjectColor.b * 256.0f); - uint32_t* fragcolor = thread->scanline.FragColor; - for (int x = x0; x < x1; x++) - { - uint32_t texel = fragcolor[x]; - fragcolor[x] = MAKEARGB( - APART(texel), - MIN((r * RPART(texel)) >> 8, (uint32_t)255), - MIN((g * GPART(texel)) >> 8, (uint32_t)255), - MIN((b * BPART(texel)) >> 8, (uint32_t)255)); - } -} - -static void FuncNormal_AddObjectColor2(int x0, int x1, PolyTriangleThreadData* thread) -{ - auto& streamdata = thread->mainVertexShader.Data; - float* gradientdistZ = thread->scanline.GradientdistZ; - uint32_t* fragcolor = thread->scanline.FragColor; - for (int x = x0; x < x1; x++) - { - float t = gradientdistZ[x]; - float inv_t = 1.0f - t; - uint32_t r = (int)((streamdata.uObjectColor.r * inv_t + streamdata.uObjectColor2.r * t) * 256.0f); - uint32_t g = (int)((streamdata.uObjectColor.g * inv_t + streamdata.uObjectColor2.g * t) * 256.0f); - uint32_t b = (int)((streamdata.uObjectColor.b * inv_t + streamdata.uObjectColor2.b * t) * 256.0f); - - uint32_t texel = fragcolor[x]; - fragcolor[x] = MAKEARGB( - APART(texel), - MIN((r * RPART(texel)) >> 8, (uint32_t)255), - MIN((g * GPART(texel)) >> 8, (uint32_t)255), - MIN((b * BPART(texel)) >> 8, (uint32_t)255)); - } -} - -static void FuncNormal_DesaturationFactor(int x0, int x1, PolyTriangleThreadData* thread) -{ - auto& streamdata = 
thread->mainVertexShader.Data; - uint32_t* fragcolor = thread->scanline.FragColor; - uint32_t t = (int)(streamdata.uDesaturationFactor * 256.0f); - uint32_t inv_t = 256 - t; - for (int x = x0; x < x1; x++) - { - uint32_t texel = fragcolor[x]; - uint32_t gray = (RPART(texel) * 77 + GPART(texel) * 143 + BPART(texel) * 37) >> 8; - fragcolor[x] = MAKEARGB( - APART(texel), - (RPART(texel) * inv_t + gray * t + 127) >> 8, - (GPART(texel) * inv_t + gray * t + 127) >> 8, - (BPART(texel) * inv_t + gray * t + 127) >> 8); - } -} - -static void RunAlphaTest(int x0, int x1, PolyTriangleThreadData* thread) -{ - uint32_t alphaThreshold = thread->AlphaThreshold; - uint32_t* fragcolor = thread->scanline.FragColor; - uint8_t* discard = thread->scanline.discard; - for (int x = x0; x < x1; x++) - { - discard[x] = fragcolor[x] <= alphaThreshold; - } -} - -static void ApplyVertexColor(int x0, int x1, PolyTriangleThreadData* thread) -{ - uint32_t* fragcolor = thread->scanline.FragColor; - for (int x = x0; x < x1; x++) - { - uint32_t r = thread->scanline.vColorR[x]; - uint32_t g = thread->scanline.vColorG[x]; - uint32_t b = thread->scanline.vColorB[x]; - uint32_t a = thread->scanline.vColorA[x]; - - a += a >> 7; - r += r >> 7; - g += g >> 7; - b += b >> 7; - - uint32_t texel = fragcolor[x]; - fragcolor[x] = MAKEARGB( - (APART(texel) * a + 127) >> 8, - (RPART(texel) * r + 127) >> 8, - (GPART(texel) * g + 127) >> 8, - (BPART(texel) * b + 127) >> 8); - } -} - -static void MainFP(int x0, int x1, PolyTriangleThreadData* thread) -{ - if (thread->numPolyLights > 0) - WriteDynLightArray(x0, x1, thread); - - if (thread->EffectState == SHADER_Paletted) // func_paletted - { - FuncPaletted(x0, x1, thread); - } - else if (thread->EffectState == SHADER_NoTexture) // func_notexture - { - FuncNoTexture(x0, x1, thread); - } - else // func_normal - { - auto constants = thread->PushConstants; - - switch (constants->uTextureMode) - { - default: - case TM_NORMAL: - case TM_FOGLAYER: FuncNormal(x0, x1, thread); 
break; - case TM_STENCIL: FuncNormal_Stencil(x0, x1, thread); break; - case TM_OPAQUE: FuncNormal_Opaque(x0, x1, thread); break; - case TM_INVERSE: FuncNormal_Inverse(x0, x1, thread); break; - case TM_ALPHATEXTURE: FuncNormal_AlphaTexture(x0, x1, thread); break; - case TM_CLAMPY: FuncNormal_ClampY(x0, x1, thread); break; - case TM_INVERTOPAQUE: FuncNormal_InvertOpaque(x0, x1, thread); break; - } - - if (constants->uTextureMode != TM_FOGLAYER) - { - auto& streamdata = thread->mainVertexShader.Data; - - if (streamdata.uAddColor.r != 0.0f || streamdata.uAddColor.g != 0.0f || streamdata.uAddColor.b != 0.0f) - { - FuncNormal_AddColor(x0, x1, thread); - } - - if (streamdata.uObjectColor2.a == 0.0f) - { - if (streamdata.uObjectColor.r != 1.0f || streamdata.uObjectColor.g != 1.0f || streamdata.uObjectColor.b != 1.0f) - { - FuncNormal_AddObjectColor(x0, x1, thread); - } - } - else - { - FuncNormal_AddObjectColor2(x0, x1, thread); - } - - if (streamdata.uDesaturationFactor > 0.0f) - { - FuncNormal_DesaturationFactor(x0, x1, thread); - } - } - } - - if (thread->AlphaTest) - RunAlphaTest(x0, x1, thread); - - ApplyVertexColor(x0, x1, thread); - - auto constants = thread->PushConstants; - uint32_t* fragcolor = thread->scanline.FragColor; - if (constants->uLightLevel >= 0.0f && thread->numPolyLights > 0) - { - uint16_t* lightarray = thread->scanline.lightarray; - uint32_t* dynlights = thread->scanline.dynlights; - for (int x = x0; x < x1; x++) - { - uint32_t fg = fragcolor[x]; - int lightshade = lightarray[x]; - uint32_t dynlight = dynlights[x]; - - uint32_t a = APART(fg); - uint32_t r = MIN((RPART(fg) * (lightshade + RPART(dynlight))) >> 8, (uint32_t)255); - uint32_t g = MIN((GPART(fg) * (lightshade + GPART(dynlight))) >> 8, (uint32_t)255); - uint32_t b = MIN((BPART(fg) * (lightshade + BPART(dynlight))) >> 8, (uint32_t)255); - - fragcolor[x] = MAKEARGB(a, r, g, b); - } - } - else if (constants->uLightLevel >= 0.0f) - { - uint16_t* lightarray = thread->scanline.lightarray; - for 
(int x = x0; x < x1; x++) - { - uint32_t fg = fragcolor[x]; - int lightshade = lightarray[x]; - - uint32_t a = APART(fg); - uint32_t r = (RPART(fg) * lightshade) >> 8; - uint32_t g = (GPART(fg) * lightshade) >> 8; - uint32_t b = (BPART(fg) * lightshade) >> 8; - - fragcolor[x] = MAKEARGB(a, r, g, b); - } - - // To do: apply fog - } - else if (thread->numPolyLights > 0) - { - uint32_t* dynlights = thread->scanline.dynlights; - for (int x = x0; x < x1; x++) - { - uint32_t fg = fragcolor[x]; - uint32_t dynlight = dynlights[x]; - - uint32_t a = APART(fg); - uint32_t r = MIN((RPART(fg) * RPART(dynlight)) >> 8, (uint32_t)255); - uint32_t g = MIN((GPART(fg) * GPART(dynlight)) >> 8, (uint32_t)255); - uint32_t b = MIN((BPART(fg) * BPART(dynlight)) >> 8, (uint32_t)255); - - fragcolor[x] = MAKEARGB(a, r, g, b); - } - } -} - -static void SelectFragmentShader(PolyTriangleThreadData* thread) -{ - void (*fragshader)(int x0, int x1, PolyTriangleThreadData * thread); - - if (thread->SpecialEffect == EFF_FOGBOUNDARY) // fogboundary.fp - { - fragshader = &EffectFogBoundary; - } - else if (thread->SpecialEffect == EFF_BURN) // burn.fp - { - fragshader = &EffectBurn; - } - else if (thread->SpecialEffect == EFF_STENCIL) // stencil.fp - { - fragshader = &EffectStencil; - } - else - { - fragshader = &MainFP; - } - - thread->FragmentShader = fragshader; -} - static void DrawSpan(int y, int x0, int x1, const TriDrawTriangleArgs* args, PolyTriangleThreadData* thread) { WriteVaryings(y, x0, x1, args, thread); - if (thread->PushConstants->uLightLevel >= 0.0f) - WriteLightArray(y, x0, x1, args, thread); - thread->FragmentShader(x0, x1, thread); if (!thread->AlphaTest) diff --git a/src/rendering/polyrenderer/drawers/screen_triangle.h b/src/rendering/polyrenderer/drawers/screen_triangle.h index 83633a4f3..40674ab14 100644 --- a/src/rendering/polyrenderer/drawers/screen_triangle.h +++ b/src/rendering/polyrenderer/drawers/screen_triangle.h @@ -126,13 +126,3 @@ struct TestSpanOpt0 { static const int 
Flags = 0; }; struct TestSpanOpt1 { static const int Flags = 1; }; struct TestSpanOpt2 { static const int Flags = 2; }; struct TestSpanOpt3 { static const int Flags = 3; }; - -enum SWBlendColor -{ - SWBLEND_Sub = 1, - SWBLEND_RevSub = 2 -}; - -struct BlendColorOpt_Add { static const int Flags = 0; }; -struct BlendColorOpt_Sub { static const int Flags = 1; }; -struct BlendColorOpt_RevSub { static const int Flags = 2; }; diff --git a/src/rendering/polyrenderer/poly_all.cpp b/src/rendering/polyrenderer/poly_all.cpp index ca3a93e8e..d8a53842e 100644 --- a/src/rendering/polyrenderer/poly_all.cpp +++ b/src/rendering/polyrenderer/poly_all.cpp @@ -1,4 +1,8 @@ #include "../swrenderer/textures/r_swtexture.h" #include "drawers/poly_triangle.cpp" +#include "drawers/poly_thread.cpp" #include "drawers/screen_triangle.cpp" +#include "drawers/screen_scanline_setup.cpp" +#include "drawers/screen_shader.cpp" +#include "drawers/screen_blend.cpp" #include "math/gpu_types.cpp" diff --git a/src/rendering/swrenderer/drawers/r_draw.cpp b/src/rendering/swrenderer/drawers/r_draw.cpp index 5881ee93e..42f50c00c 100644 --- a/src/rendering/swrenderer/drawers/r_draw.cpp +++ b/src/rendering/swrenderer/drawers/r_draw.cpp @@ -52,7 +52,7 @@ #include "r_thread.h" #include "swrenderer/scene/r_light.h" #include "playsim/a_dynlight.h" -#include "polyrenderer/drawers/poly_triangle.h" +#include "polyrenderer/drawers/poly_thread.h" CVAR(Bool, r_dynlights, 1, CVAR_ARCHIVE | CVAR_GLOBALCONFIG); CVAR(Bool, r_fuzzscale, 1, CVAR_ARCHIVE | CVAR_GLOBALCONFIG);