From 6db89a2ce11a91ae12d05c75291b7b191e22d40f Mon Sep 17 00:00:00 2001 From: Magnus Norddahl Date: Sat, 1 Apr 2017 03:44:45 +0200 Subject: [PATCH 01/12] - clean up the triangle setup functions --- src/polyrenderer/drawers/poly_drawer32_sse2.h | 214 ++- src/polyrenderer/drawers/poly_drawer8.h | 138 +- src/polyrenderer/drawers/poly_triangle.cpp | 42 +- src/polyrenderer/drawers/poly_triangle.h | 2 +- src/polyrenderer/drawers/screen_triangle.cpp | 1376 ++++++++--------- src/polyrenderer/drawers/screen_triangle.h | 36 +- src/swrenderer/drawers/r_thread.h | 11 - 7 files changed, 785 insertions(+), 1034 deletions(-) diff --git a/src/polyrenderer/drawers/poly_drawer32_sse2.h b/src/polyrenderer/drawers/poly_drawer32_sse2.h index bb94632bc..fba694f61 100644 --- a/src/polyrenderer/drawers/poly_drawer32_sse2.h +++ b/src/polyrenderer/drawers/poly_drawer32_sse2.h @@ -268,7 +268,7 @@ template class TriScreenDrawer32 { public: - static void Execute(const TriDrawTriangleArgs *args, WorkerThreadData *thread) + static void Execute(int x, int y, uint32_t mask0, uint32_t mask1, const TriDrawTriangleArgs *args) { using namespace TriScreenDrawerModes; @@ -281,44 +281,37 @@ public: if (is_simple_shade) { if (is_nearest_filter) - Loop(args, thread); + DrawBlock(x, y, mask0, mask1, args); else - Loop(args, thread); + DrawBlock(x, y, mask0, mask1, args); } else { if (is_nearest_filter) - Loop(args, thread); + DrawBlock(x, y, mask0, mask1, args); else - Loop(args, thread); + DrawBlock(x, y, mask0, mask1, args); } } else // no linear filtering for translated, shaded, stencil, fill or skycap { if (is_simple_shade) { - Loop(args, thread); + DrawBlock(x, y, mask0, mask1, args); } else { - Loop(args, thread); + DrawBlock(x, y, mask0, mask1, args); } } } private: template - FORCEINLINE static void VECTORCALL Loop(const TriDrawTriangleArgs *args, WorkerThreadData *thread) + FORCEINLINE static void VECTORCALL DrawBlock(int destX, int destY, uint32_t mask0, uint32_t mask1, const TriDrawTriangleArgs *args) { using namespace TriScreenDrawerModes; - int numSpans = thread->NumFullSpans; - auto fullSpans = thread->FullSpans; - int numBlocks = thread->NumPartialBlocks; - auto partialBlocks = thread->PartialBlocks; - int startX = thread->StartX; - int startY = thread->StartY; - bool is_fixed_light = args->uniforms->FixedLight(); uint32_t lightmask = is_fixed_light ? 0 : 0xffffffff; uint32_t srcalpha = args->uniforms->SrcAlpha(); @@ -330,20 +323,24 @@ private: const TriVertex &v3 = *args->v3; ScreenTriangleStepVariables gradientX; ScreenTriangleStepVariables gradientY; - ScreenTriangleStepVariables start; + ScreenTriangleStepVariables blockPosY; gradientX.W = FindGradientX(v1.x, v1.y, v2.x, v2.y, v3.x, v3.y, v1.w, v2.w, v3.w); gradientY.W = FindGradientY(v1.x, v1.y, v2.x, v2.y, v3.x, v3.y, v1.w, v2.w, v3.w); gradientX.U = FindGradientX(v1.x, v1.y, v2.x, v2.y, v3.x, v3.y, v1.u * v1.w, v2.u * v2.w, v3.u * v3.w); gradientY.U = FindGradientY(v1.x, v1.y, v2.x, v2.y, v3.x, v3.y, v1.u * v1.w, v2.u * v2.w, v3.u * v3.w); gradientX.V = FindGradientX(v1.x, v1.y, v2.x, v2.y, v3.x, v3.y, v1.v * v1.w, v2.v * v2.w, v3.v * v3.w); gradientY.V = FindGradientY(v1.x, v1.y, v2.x, v2.y, v3.x, v3.y, v1.v * v1.w, v2.v * v2.w, v3.v * v3.w); - start.W = v1.w + gradientX.W * (startX - v1.x) + gradientY.W * (startY - v1.y); - start.U = v1.u * v1.w + gradientX.U * (startX - v1.x) + gradientY.U * (startY - v1.y); - start.V = v1.v * v1.w + gradientX.V * (startX - v1.x) + gradientY.V * (startY - v1.y); + blockPosY.W = v1.w + gradientX.W * (destX - v1.x) + gradientY.W * (destY - v1.y); + blockPosY.U = v1.u * v1.w + gradientX.U * (destX - v1.x) + gradientY.U * (destY - v1.y); + blockPosY.V = v1.v * v1.w + gradientX.V * (destX - v1.x) + gradientY.V * (destY - v1.y); + gradientX.W *= 8.0f; + gradientX.U *= 8.0f; + gradientX.V *= 8.0f; // Output uint32_t * RESTRICT destOrg = (uint32_t*)args->dest; int pitch = args->pitch; + uint32_t *dest = destOrg + destX + destY * pitch; // Light uint32_t light = args->uniforms->Light(); @@ -388,93 +385,78 @@ private: desaturate = 0; } - for (int i = 0; i < numSpans; i++) + if (mask0 == 0xffffffff && mask1 == 0xffffffff) { - const auto &span = fullSpans[i]; - - uint32_t *dest = destOrg + span.X + span.Y * pitch; - int width = span.Length; - int height = 8; - - ScreenTriangleStepVariables blockPosY; - blockPosY.W = start.W + gradientX.W * (span.X - startX) + gradientY.W * (span.Y - startY); - blockPosY.U = start.U + gradientX.U * (span.X - startX) + gradientY.U * (span.Y - startY); - blockPosY.V = start.V + gradientX.V * (span.X - startX) + gradientY.V * (span.Y - startY); - - for (int y = 0; y < height; y++) + for (int y = 0; y < 8; y++) { - ScreenTriangleStepVariables blockPosX = blockPosY; - - float rcpW = 0x01000000 / blockPosX.W; - int32_t posU = (int32_t)(blockPosX.U * rcpW); - int32_t posV = (int32_t)(blockPosX.V * rcpW); + float rcpW = 0x01000000 / blockPosY.W; + int32_t posU = (int32_t)(blockPosY.U * rcpW); + int32_t posV = (int32_t)(blockPosY.V * rcpW); fixed_t lightpos = FRACUNIT - (int)(clamp(shade - MIN(24.0f / 32.0f, globVis * blockPosY.W), 0.0f, 31.0f / 32.0f) * (float)FRACUNIT); lightpos = (lightpos & lightmask) | ((light << 8) & ~lightmask); - for (int x = 0; x < width; x++) + ScreenTriangleStepVariables blockPosX = blockPosY; + blockPosX.W += gradientX.W; + blockPosX.U += gradientX.U; + blockPosX.V += gradientX.V; + + rcpW = 0x01000000 / blockPosX.W; + int32_t nextU = (int32_t)(blockPosX.U * rcpW); + int32_t nextV = (int32_t)(blockPosX.V * rcpW); + int32_t stepU = (nextU - posU) / 8; + int32_t stepV = (nextV - posV) / 8; + + fixed_t lightnext = FRACUNIT - (fixed_t)(clamp(shade - MIN(24.0f / 32.0f, globVis * blockPosX.W), 0.0f, 31.0f / 32.0f) * (float)FRACUNIT); + fixed_t lightstep = (lightnext - lightpos) / 8; + lightstep = lightstep & lightmask; + + for (int ix = 0; ix < 4; ix++) { - blockPosX.W += gradientX.W * 8; - blockPosX.U += gradientX.U * 8; - blockPosX.V += gradientX.V * 8; + // Load bgcolor + __m128i bgcolor; + if (BlendT::Mode != (int)BlendModes::Opaque) + bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + ix * 2)), _mm_setzero_si128()); + else + bgcolor = _mm_setzero_si128(); - rcpW = 0x01000000 / blockPosX.W; - int32_t nextU = (int32_t)(blockPosX.U * rcpW); - int32_t nextV = (int32_t)(blockPosX.V * rcpW); - int32_t stepU = (nextU - posU) / 8; - int32_t stepV = (nextV - posV) / 8; + // Sample fgcolor + unsigned int ifgcolor[2], ifgshade[2]; + ifgcolor[0] = Sample32(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); + ifgshade[0] = SampleShade32(posU, posV, texPixels, texWidth, texHeight); + posU += stepU; + posV += stepV; - fixed_t lightnext = FRACUNIT - (fixed_t)(clamp(shade - MIN(24.0f / 32.0f, globVis * blockPosX.W), 0.0f, 31.0f / 32.0f) * (float)FRACUNIT); - fixed_t lightstep = (lightnext - lightpos) / 8; - lightstep = lightstep & lightmask; + ifgcolor[1] = Sample32(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); + ifgshade[1] = SampleShade32(posU, posV, texPixels, texWidth, texHeight); + posU += stepU; + posV += stepV; - for (int ix = 0; ix < 4; ix++) + // Setup light + int lightpos0 = lightpos >> 8; + lightpos += lightstep; + int lightpos1 = lightpos >> 8; + lightpos += lightstep; + __m128i mlight = _mm_set_epi16(256, lightpos1, lightpos1, lightpos1, 256, lightpos0, lightpos0, lightpos0); + + __m128i shade_fade_lit; + if (ShadeModeT::Mode == (int)ShadeMode::Advanced) { - // Load bgcolor - __m128i bgcolor; - if (BlendT::Mode != (int)BlendModes::Opaque) - bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(dest + x * 8 + ix * 2)), _mm_setzero_si128()); - else - bgcolor = _mm_setzero_si128(); - - // Sample fgcolor - unsigned int ifgcolor[2], ifgshade[2]; - ifgcolor[0] = Sample32(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); - ifgshade[0] = SampleShade32(posU, posV, texPixels, texWidth, texHeight); - posU += stepU; - posV += stepV; - - ifgcolor[1] = Sample32(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); - ifgshade[1] = SampleShade32(posU, posV, texPixels, texWidth, texHeight); - posU += stepU; - posV += stepV; - - // Setup light - int lightpos0 = lightpos >> 8; - lightpos += lightstep; - int lightpos1 = lightpos >> 8; - lightpos += lightstep; - __m128i mlight = _mm_set_epi16(256, lightpos1, lightpos1, lightpos1, 256, lightpos0, lightpos0, lightpos0); - - __m128i shade_fade_lit; - if (ShadeModeT::Mode == (int)ShadeMode::Advanced) - { - __m128i inv_light = _mm_sub_epi16(_mm_set_epi16(0, 256, 256, 256, 0, 256, 256, 256), mlight); - shade_fade_lit = _mm_mullo_epi16(shade_fade, inv_light); - } - else - { - shade_fade_lit = _mm_setzero_si128(); - } - - // Shade and blend - __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - fgcolor = Shade32(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light); - __m128i outcolor = Blend32(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha); - - // Store result - _mm_storel_epi64((__m128i*)(dest + x * 8 + ix * 2), outcolor); + __m128i inv_light = _mm_sub_epi16(_mm_set_epi16(0, 256, 256, 256, 0, 256, 256, 256), mlight); + shade_fade_lit = _mm_mullo_epi16(shade_fade, inv_light); } + else + { + shade_fade_lit = _mm_setzero_si128(); + } + + // Shade and blend + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + fgcolor = Shade32(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light); + __m128i outcolor = Blend32(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha); + + // Store result + _mm_storel_epi64((__m128i*)(dest + ix * 2), outcolor); } blockPosY.W += gradientY.W; @@ -484,35 +466,22 @@ private: dest += pitch; } } - - for (int i = 0; i < numBlocks; i++) + else { - const auto &block = partialBlocks[i]; - - ScreenTriangleStepVariables blockPosY; - blockPosY.W = start.W + gradientX.W * (block.X - startX) + gradientY.W * (block.Y - startY); - blockPosY.U = start.U + gradientX.U * (block.X - startX) + gradientY.U * (block.Y - startY); - blockPosY.V = start.V + gradientX.V * (block.X - startX) + gradientY.V * (block.Y - startY); - - uint32_t *dest = destOrg + block.X + block.Y * pitch; - uint32_t mask0 = block.Mask0; - uint32_t mask1 = block.Mask1; - // mask0 loop: for (int y = 0; y < 4; y++) { - ScreenTriangleStepVariables blockPosX = blockPosY; - - float rcpW = 0x01000000 / blockPosX.W; - int32_t posU = (int32_t)(blockPosX.U * rcpW); - int32_t posV = (int32_t)(blockPosX.V * rcpW); + float rcpW = 0x01000000 / blockPosY.W; + int32_t posU = (int32_t)(blockPosY.U * rcpW); + int32_t posV = (int32_t)(blockPosY.V * rcpW); fixed_t lightpos = FRACUNIT - (fixed_t)(clamp(shade - MIN(24.0f / 32.0f, globVis * blockPosY.W), 0.0f, 31.0f / 32.0f) * (float)FRACUNIT); lightpos = (lightpos & lightmask) | ((light << 8) & ~lightmask); - blockPosX.W += gradientX.W * 8; - blockPosX.U += gradientX.U * 8; - blockPosX.V += gradientX.V * 8; + ScreenTriangleStepVariables blockPosX = blockPosY; + blockPosX.W += gradientX.W; + blockPosX.U += gradientX.U; + blockPosX.V += gradientX.V; rcpW = 0x01000000 / blockPosX.W; int32_t nextU = (int32_t)(blockPosX.U * rcpW); @@ -590,18 +559,17 @@ private: // mask1 loop: for (int y = 0; y < 4; y++) { - ScreenTriangleStepVariables blockPosX = blockPosY; - - float rcpW = 0x01000000 / blockPosX.W; - int32_t posU = (int32_t)(blockPosX.U * rcpW); - int32_t posV = (int32_t)(blockPosX.V * rcpW); + float rcpW = 0x01000000 / blockPosY.W; + int32_t posU = (int32_t)(blockPosY.U * rcpW); + int32_t posV = (int32_t)(blockPosY.V * rcpW); fixed_t lightpos = FRACUNIT - (fixed_t)(clamp(shade - MIN(24.0f / 32.0f, globVis * blockPosY.W), 0.0f, 31.0f / 32.0f) * (float)FRACUNIT); lightpos = (lightpos & lightmask) | ((light << 8) & ~lightmask); - blockPosX.W += gradientX.W * 8; - blockPosX.U += gradientX.U * 8; - blockPosX.V += gradientX.V * 8; + ScreenTriangleStepVariables blockPosX = blockPosY; + blockPosX.W += gradientX.W; + blockPosX.U += gradientX.U; + blockPosX.V += gradientX.V; rcpW = 0x01000000 / blockPosX.W; int32_t nextU = (int32_t)(blockPosX.U * rcpW); diff --git a/src/polyrenderer/drawers/poly_drawer8.h b/src/polyrenderer/drawers/poly_drawer8.h index c6f98bd09..b916b7c2d 100644 --- a/src/polyrenderer/drawers/poly_drawer8.h +++ b/src/polyrenderer/drawers/poly_drawer8.h @@ -209,17 +209,10 @@ template class TriScreenDrawer8 { public: - static void Execute(const TriDrawTriangleArgs *args, WorkerThreadData *thread) + static void Execute(int destX, int destY, uint32_t mask0, uint32_t mask1, const TriDrawTriangleArgs *args) { using namespace TriScreenDrawerModes; - int numSpans = thread->NumFullSpans; - auto fullSpans = thread->FullSpans; - int numBlocks = thread->NumPartialBlocks; - auto partialBlocks = thread->PartialBlocks; - int startX = thread->StartX; - int startY = thread->StartY; - bool is_fixed_light = args->uniforms->FixedLight(); uint32_t lightmask = is_fixed_light ? 0 : 0xffffffff; auto colormaps = args->uniforms->BaseColormap(); @@ -232,20 +225,24 @@ public: const TriVertex &v3 = *args->v3; ScreenTriangleStepVariables gradientX; ScreenTriangleStepVariables gradientY; - ScreenTriangleStepVariables start; + ScreenTriangleStepVariables blockPosY; gradientX.W = FindGradientX(v1.x, v1.y, v2.x, v2.y, v3.x, v3.y, v1.w, v2.w, v3.w); gradientY.W = FindGradientY(v1.x, v1.y, v2.x, v2.y, v3.x, v3.y, v1.w, v2.w, v3.w); gradientX.U = FindGradientX(v1.x, v1.y, v2.x, v2.y, v3.x, v3.y, v1.u * v1.w, v2.u * v2.w, v3.u * v3.w); gradientY.U = FindGradientY(v1.x, v1.y, v2.x, v2.y, v3.x, v3.y, v1.u * v1.w, v2.u * v2.w, v3.u * v3.w); gradientX.V = FindGradientX(v1.x, v1.y, v2.x, v2.y, v3.x, v3.y, v1.v * v1.w, v2.v * v2.w, v3.v * v3.w); gradientY.V = FindGradientY(v1.x, v1.y, v2.x, v2.y, v3.x, v3.y, v1.v * v1.w, v2.v * v2.w, v3.v * v3.w); - start.W = v1.w + gradientX.W * (startX - v1.x) + gradientY.W * (startY - v1.y); - start.U = v1.u * v1.w + gradientX.U * (startX - v1.x) + gradientY.U * (startY - v1.y); - start.V = v1.v * v1.w + gradientX.V * (startX - v1.x) + gradientY.V * (startY - v1.y); + blockPosY.W = v1.w + gradientX.W * (destX - v1.x) + gradientY.W * (destY - v1.y); + blockPosY.U = v1.u * v1.w + gradientX.U * (destX - v1.x) + gradientY.U * (destY - v1.y); + blockPosY.V = v1.v * v1.w + gradientX.V * (destX - v1.x) + gradientY.V * (destY - v1.y); + gradientX.W *= 8.0f; + gradientX.U *= 8.0f; + gradientX.V *= 8.0f; // Output uint8_t * RESTRICT destOrg = args->dest; int pitch = args->pitch; + uint8_t *dest = destOrg + destX + destY * pitch; // Light uint32_t light = args->uniforms->Light(); @@ -260,57 +257,42 @@ public: uint32_t texWidth = args->uniforms->TextureWidth(); uint32_t texHeight = args->uniforms->TextureHeight(); - for (int i = 0; i < numSpans; i++) + if (mask0 == 0xffffffff && mask1 == 0xffffffff) { - const auto &span = fullSpans[i]; - - uint8_t *dest = destOrg + span.X + span.Y * pitch; - int width = span.Length; - int height = 8; - - ScreenTriangleStepVariables blockPosY; - blockPosY.W = start.W + gradientX.W * (span.X - startX) + gradientY.W * (span.Y - startY); - blockPosY.U = start.U + gradientX.U * (span.X - startX) + gradientY.U * (span.Y - startY); - blockPosY.V = start.V + gradientX.V * (span.X - startX) + gradientY.V * (span.Y - startY); - - for (int y = 0; y < height; y++) + for (int y = 0; y < 8; y++) { - ScreenTriangleStepVariables blockPosX = blockPosY; - - float rcpW = 0x01000000 / blockPosX.W; - int32_t posU = (int32_t)(blockPosX.U * rcpW); - int32_t posV = (int32_t)(blockPosX.V * rcpW); + float rcpW = 0x01000000 / blockPosY.W; + int32_t posU = (int32_t)(blockPosY.U * rcpW); + int32_t posV = (int32_t)(blockPosY.V * rcpW); fixed_t lightpos = FRACUNIT - (int)(clamp(shade - MIN(24.0f / 32.0f, globVis * blockPosY.W), 0.0f, 31.0f / 32.0f) * (float)FRACUNIT); lightpos = (lightpos & lightmask) | ((light << 8) & ~lightmask); - for (int x = 0; x < width; x++) + ScreenTriangleStepVariables blockPosX = blockPosY; + blockPosX.W += gradientX.W; + blockPosX.U += gradientX.U; + blockPosX.V += gradientX.V; + + rcpW = 0x01000000 / blockPosX.W; + int32_t nextU = (int32_t)(blockPosX.U * rcpW); + int32_t nextV = (int32_t)(blockPosX.V * rcpW); + int32_t stepU = (nextU - posU) / 8; + int32_t stepV = (nextV - posV) / 8; + + fixed_t lightnext = FRACUNIT - (fixed_t)(clamp(shade - MIN(24.0f / 32.0f, globVis * blockPosX.W), 0.0f, 31.0f / 32.0f) * (float)FRACUNIT); + fixed_t lightstep = (lightnext - lightpos) / 8; + lightstep = lightstep & lightmask; + + for (int ix = 0; ix < 8; ix++) { - blockPosX.W += gradientX.W * 8; - blockPosX.U += gradientX.U * 8; - blockPosX.V += gradientX.V * 8; - - rcpW = 0x01000000 / blockPosX.W; - int32_t nextU = (int32_t)(blockPosX.U * rcpW); - int32_t nextV = (int32_t)(blockPosX.V * rcpW); - int32_t stepU = (nextU - posU) / 8; - int32_t stepV = (nextV - posV) / 8; - - fixed_t lightnext = FRACUNIT - (fixed_t)(clamp(shade - MIN(24.0f / 32.0f, globVis * blockPosX.W), 0.0f, 31.0f / 32.0f) * (float)FRACUNIT); - fixed_t lightstep = (lightnext - lightpos) / 8; - lightstep = lightstep & lightmask; - - for (int ix = 0; ix < 8; ix++) - { - int lightshade = lightpos >> 8; - uint8_t bgcolor = dest[x * 8 + ix]; - uint8_t fgcolor = Sample8(posU, posV, texPixels, texWidth, texHeight, color, translation); - uint32_t fgshade = SampleShade8(posU, posV, texPixels, texWidth, texHeight); - dest[x * 8 + ix] = ShadeAndBlend8(fgcolor, bgcolor, fgshade, lightshade, colormaps, srcalpha, destalpha); - posU += stepU; - posV += stepV; - lightpos += lightstep; - } + int lightshade = lightpos >> 8; + uint8_t bgcolor = dest[ix]; + uint8_t fgcolor = Sample8(posU, posV, texPixels, texWidth, texHeight, color, translation); + uint32_t fgshade = SampleShade8(posU, posV, texPixels, texWidth, texHeight); + dest[ix] = ShadeAndBlend8(fgcolor, bgcolor, fgshade, lightshade, colormaps, srcalpha, destalpha); + posU += stepU; + posV += stepV; + lightpos += lightstep; } blockPosY.W += gradientY.W; @@ -320,35 +302,22 @@ public: dest += pitch; } } - - for (int i = 0; i < numBlocks; i++) + else { - const auto &block = partialBlocks[i]; - - ScreenTriangleStepVariables blockPosY; - blockPosY.W = start.W + gradientX.W * (block.X - startX) + gradientY.W * (block.Y - startY); - blockPosY.U = start.U + gradientX.U * (block.X - startX) + gradientY.U * (block.Y - startY); - blockPosY.V = start.V + gradientX.V * (block.X - startX) + gradientY.V * (block.Y - startY); - - uint8_t *dest = destOrg + block.X + block.Y * pitch; - uint32_t mask0 = block.Mask0; - uint32_t mask1 = block.Mask1; - // mask0 loop: for (int y = 0; y < 4; y++) { - ScreenTriangleStepVariables blockPosX = blockPosY; - - float rcpW = 0x01000000 / blockPosX.W; - int32_t posU = (int32_t)(blockPosX.U * rcpW); - int32_t posV = (int32_t)(blockPosX.V * rcpW); + float rcpW = 0x01000000 / blockPosY.W; + int32_t posU = (int32_t)(blockPosY.U * rcpW); + int32_t posV = (int32_t)(blockPosY.V * rcpW); fixed_t lightpos = FRACUNIT - (fixed_t)(clamp(shade - MIN(24.0f / 32.0f, globVis * blockPosY.W), 0.0f, 31.0f / 32.0f) * (float)FRACUNIT); lightpos = (lightpos & lightmask) | ((light << 8) & ~lightmask); - blockPosX.W += gradientX.W * 8; - blockPosX.U += gradientX.U * 8; - blockPosX.V += gradientX.V * 8; + ScreenTriangleStepVariables blockPosX = blockPosY; + blockPosX.W += gradientX.W; + blockPosX.U += gradientX.U; + blockPosX.V += gradientX.V; rcpW = 0x01000000 / blockPosX.W; int32_t nextU = (int32_t)(blockPosX.U * rcpW); @@ -388,18 +357,17 @@ public: // mask1 loop: for (int y = 0; y < 4; y++) { - ScreenTriangleStepVariables blockPosX = blockPosY; - - float rcpW = 0x01000000 / blockPosX.W; - int32_t posU = (int32_t)(blockPosX.U * rcpW); - int32_t posV = (int32_t)(blockPosX.V * rcpW); + float rcpW = 0x01000000 / blockPosY.W; + int32_t posU = (int32_t)(blockPosY.U * rcpW); + int32_t posV = (int32_t)(blockPosY.V * rcpW); fixed_t lightpos = FRACUNIT - (fixed_t)(clamp(shade - MIN(24.0f / 32.0f, globVis * blockPosY.W), 0.0f, 31.0f / 32.0f) * (float)FRACUNIT); lightpos = (lightpos & lightmask) | ((light << 8) & ~lightmask); - blockPosX.W += gradientX.W * 8; - blockPosX.U += gradientX.U * 8; - blockPosX.V += gradientX.V * 8; + ScreenTriangleStepVariables blockPosX = blockPosY; + blockPosX.W += gradientX.W; + blockPosX.U += gradientX.U; + blockPosX.V += gradientX.V; rcpW = 0x01000000 / blockPosX.W; int32_t nextU = (int32_t)(blockPosX.U * rcpW); diff --git a/src/polyrenderer/drawers/poly_triangle.cpp b/src/polyrenderer/drawers/poly_triangle.cpp index 4bf57e12e..0bd03821c 100644 --- a/src/polyrenderer/drawers/poly_triangle.cpp +++ b/src/polyrenderer/drawers/poly_triangle.cpp @@ -38,8 +38,6 @@ #include "swrenderer/drawers/r_draw_rgba.h" #include "screen_triangle.h" -CVAR(Bool, r_debug_trisetup, false, 0); - int PolyTriangleDrawer::viewport_x; int PolyTriangleDrawer::viewport_y; int PolyTriangleDrawer::viewport_width; @@ -90,33 +88,13 @@ void PolyTriangleDrawer::draw_arrays(const PolyDrawArgs &drawargs, WorkerThreadD if (drawargs.VertexCount() < 3) return; - PolyDrawFuncPtr drawfuncs[4]; - int num_drawfuncs = 0; - - drawfuncs[num_drawfuncs++] = drawargs.SubsectorTest() ? &ScreenTriangle::SetupSubsector : &ScreenTriangle::SetupNormal; - - if (!r_debug_trisetup) // For profiling how much time is spent in setup vs drawal - { - int bmode = (int)drawargs.BlendMode(); - - if (drawargs.WriteColor()) - drawfuncs[num_drawfuncs++] = dest_bgra ? ScreenTriangle::TriDrawers32[bmode] : ScreenTriangle::TriDrawers8[bmode]; - } - - if (drawargs.WriteStencil()) - drawfuncs[num_drawfuncs++] = &ScreenTriangle::StencilWrite; - - if (drawargs.WriteSubsector()) - drawfuncs[num_drawfuncs++] = &ScreenTriangle::SubsectorWrite; - TriDrawTriangleArgs args; args.dest = dest; args.pitch = dest_pitch; - args.clipleft = 0; args.clipright = dest_width; - args.cliptop = 0; args.clipbottom = dest_height; args.uniforms = &drawargs; + args.destBgra = dest_bgra; args.stencilPitch = PolyStencilBuffer::Instance()->BlockWidth(); args.stencilValues = PolyStencilBuffer::Instance()->Values(); args.stencilMasks = PolyStencilBuffer::Instance()->Masks(); @@ -133,7 +111,7 @@ void PolyTriangleDrawer::draw_arrays(const PolyDrawArgs &drawargs, WorkerThreadD { for (int j = 0; j < 3; j++) vert[j] = shade_vertex(*drawargs.ObjectToClip(), drawargs.ClipPlane(), *(vinput++)); - draw_shaded_triangle(vert, ccw, &args, thread, drawfuncs, num_drawfuncs); + draw_shaded_triangle(vert, ccw, &args, thread); } } else if (drawargs.DrawMode() == PolyDrawMode::TriangleFan) @@ -143,7 +121,7 @@ void PolyTriangleDrawer::draw_arrays(const PolyDrawArgs &drawargs, WorkerThreadD for (int i = 2; i < vcount; i++) { vert[2] = shade_vertex(*drawargs.ObjectToClip(), drawargs.ClipPlane(), *(vinput++)); - draw_shaded_triangle(vert, ccw, &args, thread, drawfuncs, num_drawfuncs); + draw_shaded_triangle(vert, ccw, &args, thread); vert[1] = vert[2]; } } @@ -154,7 +132,7 @@ void PolyTriangleDrawer::draw_arrays(const PolyDrawArgs &drawargs, WorkerThreadD for (int i = 2; i < vcount; i++) { vert[2] = shade_vertex(*drawargs.ObjectToClip(), drawargs.ClipPlane(), *(vinput++)); - draw_shaded_triangle(vert, ccw, &args, thread, drawfuncs, num_drawfuncs); + draw_shaded_triangle(vert, ccw, &args, thread); vert[0] = vert[1]; vert[1] = vert[2]; ccw = !ccw; @@ -173,7 +151,7 @@ ShadedTriVertex PolyTriangleDrawer::shade_vertex(const TriMatrix &objectToClip, return sv; } -void PolyTriangleDrawer::draw_shaded_triangle(const ShadedTriVertex *vert, bool ccw, TriDrawTriangleArgs *args, WorkerThreadData *thread, PolyDrawFuncPtr *drawfuncs, int num_drawfuncs) +void PolyTriangleDrawer::draw_shaded_triangle(const ShadedTriVertex *vert, bool ccw, TriDrawTriangleArgs *args, WorkerThreadData *thread) { // Cull, clip and generate additional vertices as needed TriVertex clippedvert[max_additional_vertices]; @@ -249,9 +227,8 @@ void PolyTriangleDrawer::draw_shaded_triangle(const ShadedTriVertex *vert, bool args->v1 = &clippedvert[numclipvert - 1]; args->v2 = &clippedvert[i - 1]; args->v3 = &clippedvert[i - 2]; - - for (int j = 0; j < num_drawfuncs; j++) - drawfuncs[j](args, thread); + + ScreenTriangle::Draw(args, thread); } } else @@ -262,8 +239,7 @@ void PolyTriangleDrawer::draw_shaded_triangle(const ShadedTriVertex *vert, bool args->v2 = &clippedvert[i - 1]; args->v3 = &clippedvert[i]; - for (int j = 0; j < num_drawfuncs; j++) - drawfuncs[j](args, thread); + ScreenTriangle::Draw(args, thread); } } } @@ -444,8 +420,6 @@ void DrawPolyTrianglesCommand::Execute(DrawerThread *thread) WorkerThreadData thread_data; thread_data.core = thread->core; thread_data.num_cores = thread->num_cores; - thread_data.FullSpans = thread->FullSpansBuffer.data(); - thread_data.PartialBlocks = thread->PartialBlocksBuffer.data(); PolyTriangleDrawer::draw_arrays(args, &thread_data); } diff --git a/src/polyrenderer/drawers/poly_triangle.h b/src/polyrenderer/drawers/poly_triangle.h index 7cd6dbe76..c939149d3 100644 --- a/src/polyrenderer/drawers/poly_triangle.h +++ b/src/polyrenderer/drawers/poly_triangle.h @@ -46,7 +46,7 @@ public: private: static ShadedTriVertex shade_vertex(const TriMatrix &objectToClip, const float *clipPlane, const TriVertex &v); static void draw_arrays(const PolyDrawArgs &args, WorkerThreadData *thread); - static void draw_shaded_triangle(const ShadedTriVertex *vertices, bool ccw, TriDrawTriangleArgs *args, WorkerThreadData *thread, PolyDrawFuncPtr *drawfuncs, int num_drawfuncs); + static void draw_shaded_triangle(const ShadedTriVertex *vertices, bool ccw, TriDrawTriangleArgs *args, WorkerThreadData *thread); static int clipedge(const ShadedTriVertex *verts, TriVertex *clippedvert); diff --git a/src/polyrenderer/drawers/screen_triangle.cpp b/src/polyrenderer/drawers/screen_triangle.cpp index d61bd79a2..ea34c175c 100644 --- a/src/polyrenderer/drawers/screen_triangle.cpp +++ b/src/polyrenderer/drawers/screen_triangle.cpp @@ -41,38 +41,108 @@ #endif #include "poly_drawer8.h" -void ScreenTriangle::SetupNormal(const TriDrawTriangleArgs *args, WorkerThreadData *thread) +class TriangleBlock +{ +public: + TriangleBlock(const TriDrawTriangleArgs *args); + void Loop(const TriDrawTriangleArgs *args, WorkerThreadData *thread); + +private: + // Block size, standard 8x8 (must be power of two) + static const int q = 8; + + // Deltas + int DX12, DX23, DX31; + int DY12, DY23, DY31; + + // Fixed-point deltas + int FDX12, FDX23, FDX31; + int FDY12, FDY23, FDY31; + + // Half-edge constants + int C1, C2, C3; + + // Stencil buffer + int stencilPitch; + uint8_t * RESTRICT stencilValues; + uint32_t * RESTRICT stencilMasks; + uint8_t stencilTestValue; + uint32_t stencilWriteValue; + + // Viewport clipping + int clipright; + int clipbottom; + + // Subsector buffer + uint32_t * RESTRICT subsectorGBuffer; + uint32_t subsectorDepth; + int32_t subsectorPitch; + + // Triangle bounding block + int minx, miny; + int maxx, maxy; + + // Active block + int X, Y; + uint32_t Mask0, Mask1; + +#ifndef NO_SSE + __m128i mFDY12Offset; + __m128i mFDY23Offset; + __m128i mFDY31Offset; + __m128i mFDY12x4; + __m128i mFDY23x4; + __m128i mFDY31x4; + __m128i mFDX12; + __m128i mFDX23; + __m128i mFDX31; +#endif + + void CoverageTest(); + void StencilEqualTest(); + void StencilGreaterEqualTest(); + void SubsectorTest(); + void ClipTest(); + void StencilWrite(); + void SubsectorWrite(); +}; + +TriangleBlock::TriangleBlock(const TriDrawTriangleArgs *args) { const TriVertex &v1 = *args->v1; const TriVertex &v2 = *args->v2; const TriVertex &v3 = *args->v3; - int clipright = args->clipright; - int clipbottom = args->clipbottom; - - int stencilPitch = args->stencilPitch; - uint8_t * RESTRICT stencilValues = args->stencilValues; - uint32_t * RESTRICT stencilMasks = args->stencilMasks; - uint8_t stencilTestValue = args->uniforms->StencilTestValue(); - - TriFullSpan * RESTRICT span = thread->FullSpans; - TriPartialBlock * RESTRICT partial = thread->PartialBlocks; - + + clipright = args->clipright; + clipbottom = args->clipbottom; + + stencilPitch = args->stencilPitch; + stencilValues = args->stencilValues; + stencilMasks = args->stencilMasks; + stencilTestValue = args->uniforms->StencilTestValue(); + stencilWriteValue = args->uniforms->StencilWriteValue(); + + subsectorGBuffer = args->subsectorGBuffer; + subsectorDepth = args->uniforms->SubsectorDepth(); + subsectorPitch = args->pitch; + // 28.4 fixed-point coordinates #ifdef NO_SSE const int Y1 = (int)round(16.0f * v1.y); const int Y2 = (int)round(16.0f * v2.y); const int Y3 = (int)round(16.0f * v3.y); - + const int X1 = (int)round(16.0f * v1.x); const int X2 = (int)round(16.0f * v2.x); const int X3 = (int)round(16.0f * v3.x); #else int tempround[4 * 3]; __m128 m16 = _mm_set1_ps(16.0f); - __m128 mhalf = _mm_set1_ps(0.5f); - _mm_storeu_si128((__m128i*)tempround, _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_loadu_ps((const float*)&v1), m16), mhalf))); - _mm_storeu_si128((__m128i*)(tempround + 4), _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_loadu_ps((const float*)&v2), m16), mhalf))); - _mm_storeu_si128((__m128i*)(tempround + 8), _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_loadu_ps((const float*)&v3), m16), mhalf))); + __m128 mhalf = _mm_set1_ps(65536.5f); + __m128i m65536 = _mm_set1_epi32(65536); + _mm_storeu_si128((__m128i*)tempround, _mm_sub_epi32(_mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_loadu_ps((const float*)&v1), m16), mhalf)), m65536)); + _mm_storeu_si128((__m128i*)(tempround + 4), _mm_sub_epi32(_mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_loadu_ps((const float*)&v2), m16), mhalf)), m65536)); + _mm_storeu_si128((__m128i*)(tempround + 8), _mm_sub_epi32(_mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_loadu_ps((const float*)&v3), m16), mhalf)), m65536)); const int X1 = tempround[0]; const int X2 = tempround[4]; const int X3 = tempround[8]; @@ -80,825 +150,629 @@ void ScreenTriangle::SetupNormal(const TriDrawTriangleArgs *args, WorkerThreadDa const int Y2 = tempround[5]; const int Y3 = tempround[9]; #endif - + // Deltas - const int DX12 = X1 - X2; - const int DX23 = X2 - X3; - const int DX31 = X3 - X1; - - const int DY12 = Y1 - Y2; - const int DY23 = Y2 - Y3; - const int DY31 = Y3 - Y1; - + DX12 = X1 - X2; + DX23 = X2 - X3; + DX31 = X3 - X1; + + DY12 = Y1 - Y2; + DY23 = Y2 - Y3; + DY31 = Y3 - Y1; + // Fixed-point deltas - const int FDX12 = DX12 << 4; - const int FDX23 = DX23 << 4; - const int FDX31 = DX31 << 4; - - const int FDY12 = DY12 << 4; - const int FDY23 = DY23 << 4; - const int FDY31 = DY31 << 4; - + FDX12 = DX12 << 4; + FDX23 = DX23 << 4; + FDX31 = DX31 << 4; + + FDY12 = DY12 << 4; + FDY23 = DY23 << 4; + FDY31 = DY31 << 4; + // Bounding rectangle - int minx = MAX((MIN(MIN(X1, X2), X3) + 0xF) >> 4, 0); - int maxx = MIN((MAX(MAX(X1, X2), X3) + 0xF) >> 4, clipright - 1); - int miny = MAX((MIN(MIN(Y1, Y2), Y3) + 0xF) >> 4, 0); - int maxy = MIN((MAX(MAX(Y1, Y2), Y3) + 0xF) >> 4, clipbottom - 1); + minx = MAX((MIN(MIN(X1, X2), X3) + 0xF) >> 4, 0); + maxx = MIN((MAX(MAX(X1, X2), X3) + 0xF) >> 4, clipright - 1); + miny = MAX((MIN(MIN(Y1, Y2), Y3) + 0xF) >> 4, 0); + maxy = MIN((MAX(MAX(Y1, Y2), Y3) + 0xF) >> 4, clipbottom - 1); if (minx >= maxx || miny >= maxy) { - thread->NumFullSpans = 0; - thread->NumPartialBlocks = 0; return; } - - // Block size, standard 8x8 (must be power of two) - const int q = 8; - + // Start in corner of 8x8 block minx &= ~(q - 1); miny &= ~(q - 1); - + // Half-edge constants - int C1 = DY12 * X1 - DX12 * Y1; - int C2 = DY23 * X2 - DX23 * Y2; - int C3 = DY31 * X3 - DX31 * Y3; - + C1 = DY12 * X1 - DX12 * Y1; + C2 = DY23 * X2 - DX23 * Y2; + C3 = DY31 * X3 - DX31 * Y3; + // Correct for fill convention if (DY12 < 0 || (DY12 == 0 && DX12 > 0)) C1++; if (DY23 < 0 || (DY23 == 0 && DX23 > 0)) C2++; if (DY31 < 0 || (DY31 == 0 && DX31 > 0)) C3++; - + +#ifndef NO_SSE + mFDY12Offset = _mm_setr_epi32(0, FDY12, FDY12 * 2, FDY12 * 3); + mFDY23Offset = _mm_setr_epi32(0, FDY23, FDY23 * 2, FDY23 * 3); + mFDY31Offset = _mm_setr_epi32(0, FDY31, FDY31 * 2, FDY31 * 3); + mFDY12x4 = _mm_set1_epi32(FDY12 * 4); + mFDY23x4 = _mm_set1_epi32(FDY23 * 4); + mFDY31x4 = _mm_set1_epi32(FDY31 * 4); + mFDX12 = _mm_set1_epi32(FDX12); + mFDX23 = _mm_set1_epi32(FDX23); + mFDX31 = _mm_set1_epi32(FDX31); +#endif +} + +void TriangleBlock::Loop(const TriDrawTriangleArgs *args, WorkerThreadData *thread) +{ // First block line for this thread int core = thread->core; int num_cores = thread->num_cores; int core_skip = (num_cores - ((miny / q) - core) % num_cores) % num_cores; - miny += core_skip * q; + int start_miny = miny + core_skip * q; - thread->StartX = minx; - thread->StartY = miny; - span->Length = 0; + bool subsectorTest = args->uniforms->SubsectorTest(); + bool writeColor = args->uniforms->WriteColor(); + bool writeStencil = args->uniforms->WriteStencil(); + bool writeSubsector = args->uniforms->WriteSubsector(); -#ifndef NO_SSE - __m128i mnotxor = _mm_set1_epi32(0xffffffff); - __m128i mstencilTestValue = _mm_set1_epi16(stencilTestValue); - __m128i mFDY12Offset = _mm_setr_epi32(0, FDY12, FDY12 * 2, FDY12 * 3); - __m128i mFDY23Offset = _mm_setr_epi32(0, FDY23, FDY23 * 2, FDY23 * 3); - __m128i mFDY31Offset = _mm_setr_epi32(0, FDY31, FDY31 * 2, FDY31 * 3); - __m128i mFDY12x4 = _mm_set1_epi32(FDY12 * 4); - __m128i mFDY23x4 = _mm_set1_epi32(FDY23 * 4); - __m128i mFDY31x4 = _mm_set1_epi32(FDY31 * 4); - __m128i mFDX12 = _mm_set1_epi32(FDX12); - __m128i mFDX23 = _mm_set1_epi32(FDX23); - __m128i mFDX31 = _mm_set1_epi32(FDX31); - __m128i mClipCompare0 = _mm_setr_epi32(clipright, clipright - 1, clipright - 2, clipright - 3); - __m128i mClipCompare1 = _mm_setr_epi32(clipright - 4, clipright - 5, clipright - 6, clipright - 7); -#endif + int bmode = (int)args->uniforms->BlendMode(); + auto drawFunc = args->destBgra ? ScreenTriangle::TriDrawers32[bmode] : ScreenTriangle::TriDrawers8[bmode]; // Loop through blocks - for (int y = miny; y < maxy; y += q * num_cores) + for (int y = start_miny; y < maxy; y += q * num_cores) { for (int x = minx; x < maxx; x += q) { - // Corners of block - int x0 = x << 4; - int x1 = (x + q - 1) << 4; - int y0 = y << 4; - int y1 = (y + q - 1) << 4; - - // Evaluate half-space functions - bool a00 = C1 + DX12 * y0 - DY12 * x0 > 0; - bool a10 = C1 + DX12 * y0 - DY12 * x1 > 0; - bool a01 = C1 + DX12 * y1 - DY12 * x0 > 0; - bool a11 = C1 + DX12 * y1 - DY12 * x1 > 0; - int a = (a00 << 0) | (a10 << 1) | (a01 << 2) | (a11 << 3); - - bool b00 = C2 + DX23 * y0 - DY23 * x0 > 0; - bool b10 = C2 + DX23 * y0 - DY23 * x1 > 0; - bool b01 = C2 + DX23 * y1 - DY23 * x0 > 0; - bool b11 = C2 + DX23 * y1 - DY23 * x1 > 0; - int b = (b00 << 0) | (b10 << 1) | (b01 << 2) | (b11 << 3); - - bool c00 = C3 + DX31 * y0 - DY31 * x0 > 0; - bool c10 = C3 + DX31 * y0 - DY31 * x1 > 0; - bool c01 = C3 + DX31 * y1 - DY31 * x0 > 0; - bool c11 = C3 + DX31 * y1 - DY31 * x1 > 0; - int c = (c00 << 0) | (c10 << 1) | (c01 << 2) | (c11 << 3); - - // Stencil test the whole block, if possible - int block = x / 8 + y / 8 * stencilPitch; - uint8_t *stencilBlock = &stencilValues[block * 64]; - uint32_t *stencilBlockMask = &stencilMasks[block]; - bool blockIsSingleStencil = ((*stencilBlockMask) & 0xffffff00) == 0xffffff00; - bool skipBlock = blockIsSingleStencil && ((*stencilBlockMask) & 0xff) != stencilTestValue; + X = x; + Y = y; - // Skip block when outside an edge - if (a == 0 || b == 0 || c == 0 || skipBlock) - { - if (span->Length != 0) - { - span++; - span->Length = 0; - } + CoverageTest(); + if (Mask0 == 0 && Mask1 == 0) continue; - } - // Accept whole block when totally covered - if (a == 0xf && b == 0xf && c == 0xf && x + q <= clipright && y + q <= clipbottom && blockIsSingleStencil) + ClipTest(); + if (Mask0 == 0 && Mask1 == 0) + continue; + + // To do: make the stencil test use its own flag for comparison mode instead of abusing the subsector test.. + if (!subsectorTest) { - if (span->Length != 0) - { - span->Length++; - } - else - { - span->X = x; - span->Y = y; - span->Length = 1; - } + StencilEqualTest(); + if (Mask0 == 0 && Mask1 == 0) + continue; } - else // Partially covered block + else { - x0 = x << 4; - x1 = (x + q - 1) << 4; - int CY1 = C1 + DX12 * y0 - DY12 * x0; - int CY2 = C2 + DX23 * y0 - DY23 * x0; - int CY3 = C3 + DX31 * y0 - DY31 * x0; + StencilGreaterEqualTest(); + if (Mask0 == 0 && Mask1 == 0) + continue; - uint32_t mask0 = 0; - uint32_t mask1 = 0; - -#ifdef NO_SSE - for (int iy = 0; iy < 4; iy++) - { - int CX1 = CY1; - int CX2 = CY2; - int CX3 = CY3; - - for (int ix = 0; ix < q; ix++) - { - bool passStencilTest = blockIsSingleStencil || stencilBlock[ix + iy * q] == stencilTestValue; - bool covered = (CX1 > 0 && CX2 > 0 && CX3 > 0 && (x + ix) < clipright && (y + iy) < clipbottom && passStencilTest); - mask0 <<= 1; - mask0 |= (uint32_t)covered; - - CX1 -= FDY12; - CX2 -= FDY23; - CX3 -= FDY31; - } - - CY1 += FDX12; - CY2 += FDX23; - CY3 += FDX31; - } - - for (int iy = 4; iy < q; iy++) - { - int CX1 = CY1; - int CX2 = CY2; - int CX3 = CY3; - - for (int ix = 0; ix < q; ix++) - { - bool passStencilTest = blockIsSingleStencil || stencilBlock[ix + iy * q] == stencilTestValue; - bool covered = (CX1 > 0 && CX2 > 0 && CX3 > 0 && (x + ix) < clipright && (y + iy) < clipbottom && passStencilTest); - mask1 <<= 1; - mask1 |= (uint32_t)covered; - - CX1 -= FDY12; - CX2 -= FDY23; - CX3 -= FDY31; - } - - CY1 += FDX12; - CY2 += FDX23; - CY3 += FDX31; - } -#else - __m128i mSingleStencilMask = _mm_set1_epi32(blockIsSingleStencil ? 0xffffffff : 0); - __m128i mCY1 = _mm_sub_epi32(_mm_set1_epi32(CY1), mFDY12Offset); - __m128i mCY2 = _mm_sub_epi32(_mm_set1_epi32(CY2), mFDY23Offset); - __m128i mCY3 = _mm_sub_epi32(_mm_set1_epi32(CY3), mFDY31Offset); - __m128i mx = _mm_set1_epi32(x); - __m128i mClipTest0 = _mm_cmplt_epi32(mx, mClipCompare0); - __m128i mClipTest1 = _mm_cmplt_epi32(mx, mClipCompare1); - int iy; - for (iy = 0; iy < 4 && iy < clipbottom - y; iy++) - { - __m128i mstencilBlock = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(stencilBlock + iy * 8)), _mm_setzero_si128()); - __m128i mstencilTest = _mm_or_si128(_mm_cmpeq_epi16(mstencilBlock, mstencilTestValue), mSingleStencilMask); - __m128i mstencilTest0 = _mm_unpacklo_epi16(mstencilTest, mstencilTest); - __m128i mstencilTest1 = _mm_unpackhi_epi16(mstencilTest, mstencilTest); - __m128i mtest0 = _mm_and_si128(mstencilTest0, mClipTest0); - __m128i mtest1 = _mm_and_si128(mstencilTest1, mClipTest1); - - mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY1, _mm_setzero_si128()), mtest0); - mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY2, _mm_setzero_si128()), mtest0); - mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY3, _mm_setzero_si128()), mtest0); - mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY1, mFDY12x4), _mm_setzero_si128()), mtest1); - mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY2, mFDY23x4), _mm_setzero_si128()), mtest1); - mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY3, mFDY31x4), _mm_setzero_si128()), mtest1); - - mCY1 = _mm_add_epi32(mCY1, mFDX12); - mCY2 = _mm_add_epi32(mCY2, mFDX23); - mCY3 = _mm_add_epi32(mCY3, mFDX31); - - mask0 <<= 4; - mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mtest0, _MM_SHUFFLE(0, 1, 2, 3)))); - mask0 <<= 4; - mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mtest1, _MM_SHUFFLE(0, 1, 2, 3)))); - } - mask0 <<= (4 - iy) * 8; - - for (iy = 4; iy < q && iy < clipbottom - y; iy++) - { - __m128i mstencilBlock = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(stencilBlock + iy * 8)), _mm_setzero_si128()); - __m128i mstencilTest = _mm_or_si128(_mm_cmpeq_epi16(mstencilBlock, mstencilTestValue), mSingleStencilMask); - __m128i mstencilTest0 = _mm_unpacklo_epi16(mstencilTest, mstencilTest); - __m128i mstencilTest1 = _mm_unpackhi_epi16(mstencilTest, mstencilTest); - __m128i mtest0 = _mm_and_si128(mstencilTest0, mClipTest0); - __m128i mtest1 = _mm_and_si128(mstencilTest1, mClipTest1); - - mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY1, _mm_setzero_si128()), mtest0); - mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY2, _mm_setzero_si128()), mtest0); - mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY3, _mm_setzero_si128()), mtest0); - mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY1, mFDY12x4), _mm_setzero_si128()), mtest1); - mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY2, mFDY23x4), _mm_setzero_si128()), mtest1); - mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY3, mFDY31x4), _mm_setzero_si128()), mtest1); - - mCY1 = _mm_add_epi32(mCY1, mFDX12); - mCY2 = _mm_add_epi32(mCY2, mFDX23); - mCY3 = _mm_add_epi32(mCY3, mFDX31); - - mask1 <<= 4; - mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mtest0, _MM_SHUFFLE(0, 1, 2, 3)))); - mask1 <<= 4; - mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mtest1, _MM_SHUFFLE(0, 1, 2, 3)))); - } - mask1 <<= (q - iy) * 8; -#endif - - if (mask0 != 0xffffffff || mask1 != 0xffffffff) - { - if (span->Length > 0) - { - span++; - span->Length = 0; - } - - if (mask0 == 0 && mask1 == 0) - continue; - - partial->X = x; - partial->Y = y; - partial->Mask0 = mask0; - partial->Mask1 = mask1; - partial++; - } - else if (span->Length != 0) - { - span->Length++; - } - else - { - span->X = x; - span->Y = y; - span->Length = 1; - } + SubsectorTest(); + if (Mask0 == 0 && Mask1 == 0) + continue; } - } - - if (span->Length != 0) - { - span++; - span->Length = 0; + + if (writeColor) + drawFunc(X, Y, Mask0, Mask1, args); + if (writeStencil) + StencilWrite(); + if (writeSubsector) + SubsectorWrite(); } } - - thread->NumFullSpans = (int)(span - thread->FullSpans); - thread->NumPartialBlocks = (int)(partial - thread->PartialBlocks); } -void ScreenTriangle::SetupSubsector(const TriDrawTriangleArgs *args, WorkerThreadData *thread) -{ - const TriVertex &v1 = *args->v1; - const TriVertex &v2 = *args->v2; - const TriVertex &v3 = *args->v3; - int clipright = args->clipright; - int clipbottom = args->clipbottom; - - int stencilPitch = args->stencilPitch; - uint8_t * RESTRICT stencilValues = args->stencilValues; - uint32_t * RESTRICT stencilMasks = args->stencilMasks; - uint8_t stencilTestValue = args->uniforms->StencilTestValue(); - - uint32_t * RESTRICT subsectorGBuffer = args->subsectorGBuffer; - uint32_t subsectorDepth = args->uniforms->SubsectorDepth(); - int32_t pitch = args->pitch; - - TriFullSpan * RESTRICT span = thread->FullSpans; - TriPartialBlock * RESTRICT partial = thread->PartialBlocks; - - // 28.4 fixed-point coordinates #ifdef NO_SSE - const int Y1 = (int)round(16.0f * v1.y); - const int Y2 = (int)round(16.0f * v2.y); - const int Y3 = (int)round(16.0f * v3.y); - const int X1 = (int)round(16.0f * v1.x); - const int X2 = (int)round(16.0f * v2.x); - const int X3 = (int)round(16.0f * v3.x); -#else - int tempround[4 * 3]; - __m128 m16 = _mm_set1_ps(16.0f); - __m128 mhalf = _mm_set1_ps(0.5f); - _mm_storeu_si128((__m128i*)tempround, _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_loadu_ps((const float*)&v1), m16), mhalf))); - _mm_storeu_si128((__m128i*)(tempround + 4), _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_loadu_ps((const float*)&v2), m16), mhalf))); - _mm_storeu_si128((__m128i*)(tempround + 8), _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_loadu_ps((const float*)&v3), m16), mhalf))); - const int X1 = tempround[0]; - const int X2 = tempround[4]; - const int X3 = tempround[8]; - const int Y1 = tempround[1]; - const int Y2 = tempround[5]; - const int Y3 = tempround[9]; -#endif +void TriangleBlock::SubsectorTest() +{ + uint32_t *subsector = subsectorGBuffer + X + Y * subsectorPitch; + uint32_t mask0 = 0; + uint32_t mask1 = 0; - // Deltas - const int DX12 = X1 - X2; - const int DX23 = X2 - X3; - const int DX31 = X3 - X1; - - const int DY12 = Y1 - Y2; - const int DY23 = Y2 - Y3; - const int DY31 = Y3 - Y1; - - // Fixed-point deltas - const int FDX12 = DX12 << 4; - const int FDX23 = DX23 << 4; - const int FDX31 = DX31 << 4; - - const int FDY12 = DY12 << 4; - const int FDY23 = DY23 << 4; - const int FDY31 = DY31 << 4; - - // Bounding rectangle - int minx = MAX((MIN(MIN(X1, X2), X3) + 0xF) >> 4, 0); - int maxx = MIN((MAX(MAX(X1, X2), X3) + 0xF) >> 4, clipright - 1); - int miny = MAX((MIN(MIN(Y1, Y2), Y3) + 0xF) >> 4, 0); - int maxy = MIN((MAX(MAX(Y1, Y2), Y3) + 0xF) >> 4, clipbottom - 1); - if (minx >= maxx || miny >= maxy) + for (int iy = 0; iy < 4; iy++) { - thread->NumFullSpans = 0; - thread->NumPartialBlocks = 0; - return; + for (int ix = 0; ix < q; ix++) + { + bool covered = subsector[ix] >= subsectorDepth; + mask0 <<= 1; + mask0 |= (uint32_t)covered; + } + subsector += subsectorPitch; + } + for (int iy = 4; iy < q; iy++) + { + for (int ix = 0; ix < q; ix++) + { + bool covered = subsector[ix] >= subsectorDepth; + mask1 <<= 1; + mask1 |= (uint32_t)covered; + } + subsector += subsectorPitch; } - // Block size, standard 8x8 (must be power of two) - const int q = 8; + Mask0 = Mask0 & mask0; + Mask1 = Mask1 & mask1; +} - // Start in corner of 8x8 block - minx &= ~(q - 1); - miny &= ~(q - 1); +#else - // Half-edge constants - int C1 = DY12 * X1 - DX12 * Y1; - int C2 = DY23 * X2 - DX23 * Y2; - int C3 = DY31 * X3 - DX31 * Y3; - - // Correct for fill convention - if (DY12 < 0 || (DY12 == 0 && DX12 > 0)) C1++; - if (DY23 < 0 || (DY23 == 0 && DX23 > 0)) C2++; - if (DY31 < 0 || (DY31 == 0 && DX31 > 0)) C3++; - - // First block line for this thread - int core = thread->core; - int num_cores = thread->num_cores; - int core_skip = (num_cores - ((miny / q) - core) % num_cores) % num_cores; - miny += core_skip * q; - - thread->StartX = minx; - thread->StartY = miny; - span->Length = 0; - -#ifndef NO_SSE +void TriangleBlock::SubsectorTest() +{ + uint32_t *subsector = subsectorGBuffer + X + Y * subsectorPitch; + uint32_t mask0 = 0; + uint32_t mask1 = 0; __m128i msubsectorDepth = _mm_set1_epi32(subsectorDepth); __m128i mnotxor = _mm_set1_epi32(0xffffffff); - __m128i mstencilTestValue = _mm_set1_epi16(stencilTestValue); - __m128i mFDY12Offset = _mm_setr_epi32(0, FDY12, FDY12 * 2, FDY12 * 3); - __m128i mFDY23Offset = _mm_setr_epi32(0, FDY23, FDY23 * 2, FDY23 * 3); - __m128i mFDY31Offset = _mm_setr_epi32(0, FDY31, FDY31 * 2, FDY31 * 3); - __m128i mFDY12x4 = _mm_set1_epi32(FDY12 * 4); - __m128i mFDY23x4 = _mm_set1_epi32(FDY23 * 4); - __m128i mFDY31x4 = _mm_set1_epi32(FDY31 * 4); - __m128i mFDX12 = _mm_set1_epi32(FDX12); - __m128i mFDX23 = _mm_set1_epi32(FDX23); - __m128i mFDX31 = _mm_set1_epi32(FDX31); - __m128i mClipCompare0 = _mm_setr_epi32(clipright, clipright - 1, clipright - 2, clipright - 3); - __m128i mClipCompare1 = _mm_setr_epi32(clipright - 4, clipright - 5, clipright - 6, clipright - 7); -#endif - // Loop through blocks - for (int y = miny; y < maxy; y += q * num_cores) + for (int iy = 0; iy < 4; iy++) { - for (int x = minx; x < maxx; x += q) - { - // Corners of block - int x0 = x << 4; - int x1 = (x + q - 1) << 4; - int y0 = y << 4; - int y1 = (y + q - 1) << 4; - - // Evaluate half-space functions - bool a00 = C1 + DX12 * y0 - DY12 * x0 > 0; - bool a10 = C1 + DX12 * y0 - DY12 * x1 > 0; - bool a01 = C1 + DX12 * y1 - DY12 * x0 > 0; - bool a11 = C1 + DX12 * y1 - DY12 * x1 > 0; - int a = (a00 << 0) | (a10 << 1) | (a01 << 2) | (a11 << 3); - - bool b00 = C2 + DX23 * y0 - DY23 * x0 > 0; - bool b10 = C2 + DX23 * y0 - DY23 * x1 > 0; - bool b01 = C2 + DX23 * y1 - DY23 * x0 > 0; - bool b11 = C2 + DX23 * y1 - DY23 * x1 > 0; - int b = (b00 << 0) | (b10 << 1) | (b01 << 2) | (b11 << 3); - - bool c00 = C3 + DX31 * y0 - DY31 * x0 > 0; - bool c10 = C3 + DX31 * y0 - DY31 * x1 > 0; - bool c01 = C3 + DX31 * y1 - DY31 * x0 > 0; - bool c11 = C3 + DX31 * y1 - DY31 * x1 > 0; - int c = (c00 << 0) | (c10 << 1) | (c01 << 2) | (c11 << 3); - - // Stencil test the whole block, if possible - int block = x / 8 + y / 8 * stencilPitch; - uint8_t *stencilBlock = &stencilValues[block * 64]; - uint32_t *stencilBlockMask = &stencilMasks[block]; - bool blockIsSingleStencil = ((*stencilBlockMask) & 0xffffff00) == 0xffffff00; - bool skipBlock = blockIsSingleStencil && ((*stencilBlockMask) & 0xff) < stencilTestValue; - - // Skip block when outside an edge - if (a == 0 || b == 0 || c == 0 || skipBlock) - { - if (span->Length != 0) - { - span++; - span->Length = 0; - } - continue; - } - - // Accept whole block when totally covered - if (a == 0xf && b == 0xf && c == 0xf && x + q <= clipright && y + q <= clipbottom && blockIsSingleStencil) - { - // Totally covered block still needs a subsector coverage test: - - uint32_t *subsector = subsectorGBuffer + x + y * pitch; - - uint32_t mask0 = 0; - uint32_t mask1 = 0; - -#ifdef NO_SSE - for (int iy = 0; iy < 4; iy++) - { - for (int ix = 0; ix < q; ix++) - { - bool covered = subsector[ix] >= subsectorDepth; - mask0 <<= 1; - mask0 |= (uint32_t)covered; - } - subsector += pitch; - } - for (int iy = 4; iy < q; iy++) - { - for (int ix = 0; ix < q; ix++) - { - bool covered = subsector[ix] >= subsectorDepth; - mask1 <<= 1; - mask1 |= (uint32_t)covered; - } - subsector += pitch; - } -#else - for (int iy = 0; iy < 4; iy++) - { - mask0 <<= 4; - mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_xor_si128(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)subsector), msubsectorDepth), mnotxor), _MM_SHUFFLE(0, 1, 2, 3)))); - mask0 <<= 4; - mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_xor_si128(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(subsector + 4)), msubsectorDepth), mnotxor), _MM_SHUFFLE(0, 1, 2, 3)))); - subsector += pitch; - } - for (int iy = 4; iy < q; iy++) - { - mask1 <<= 4; - mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_xor_si128(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)subsector), msubsectorDepth), mnotxor), _MM_SHUFFLE(0, 1, 2, 3)))); - mask1 <<= 4; - mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_xor_si128(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(subsector + 4)), msubsectorDepth), mnotxor), _MM_SHUFFLE(0, 1, 2, 3)))); - subsector += pitch; - } -#endif - - if (mask0 != 0xffffffff || mask1 != 0xffffffff) - { - if (span->Length > 0) - { - span++; - span->Length = 0; - } - - if (mask0 == 0 && mask1 == 0) - continue; - - partial->X = x; - partial->Y = y; - partial->Mask0 = mask0; - partial->Mask1 = mask1; - partial++; - } - else if (span->Length != 0) - { - span->Length++; - } - else - { - span->X = x; - span->Y = y; - span->Length = 1; - } - } - else // Partially covered block - { - x0 = x << 4; - x1 = (x + q - 1) << 4; - int CY1 = C1 + DX12 * y0 - DY12 * x0; - int CY2 = C2 + DX23 * y0 - DY23 * x0; - int CY3 = C3 + DX31 * y0 - DY31 * x0; - - uint32_t *subsector = subsectorGBuffer + x + y * pitch; - - uint32_t mask0 = 0; - uint32_t mask1 = 0; - -#ifdef NO_SSE - for (int iy = 0; iy < 4; iy++) - { - int CX1 = CY1; - int CX2 = CY2; - int CX3 = CY3; - - for (int ix = 0; ix < q; ix++) - { - bool passStencilTest = blockIsSingleStencil || stencilBlock[ix + iy * q] >= stencilTestValue; - bool covered = (CX1 > 0 && CX2 > 0 && CX3 > 0 && (x + ix) < clipright && (y + iy) < clipbottom && passStencilTest && subsector[ix] >= subsectorDepth); - mask0 <<= 1; - mask0 |= (uint32_t)covered; - - CX1 -= FDY12; - CX2 -= FDY23; - CX3 -= FDY31; - } - - CY1 += FDX12; - CY2 += FDX23; - CY3 += FDX31; - subsector += pitch; - } - - for (int iy = 4; iy < q; iy++) - { - int CX1 = CY1; - int CX2 = CY2; - int CX3 = CY3; - - for (int ix = 0; ix < q; ix++) - { - bool passStencilTest = blockIsSingleStencil || stencilBlock[ix + iy * q] >= stencilTestValue; - bool covered = (CX1 > 0 && CX2 > 0 && CX3 > 0 && (x + ix) < clipright && (y + iy) < clipbottom && passStencilTest && subsector[ix] >= subsectorDepth); - mask1 <<= 1; - mask1 |= (uint32_t)covered; - - CX1 -= FDY12; - CX2 -= FDY23; - CX3 -= FDY31; - } - - CY1 += FDX12; - CY2 += FDX23; - CY3 += FDX31; - subsector += pitch; - } -#else - __m128i mSingleStencilMask = _mm_set1_epi32(blockIsSingleStencil ? 0 : 0xffffffff); - __m128i mCY1 = _mm_sub_epi32(_mm_set1_epi32(CY1), mFDY12Offset); - __m128i mCY2 = _mm_sub_epi32(_mm_set1_epi32(CY2), mFDY23Offset); - __m128i mCY3 = _mm_sub_epi32(_mm_set1_epi32(CY3), mFDY31Offset); - __m128i mx = _mm_set1_epi32(x); - __m128i mClipTest0 = _mm_cmplt_epi32(mx, mClipCompare0); - __m128i mClipTest1 = _mm_cmplt_epi32(mx, mClipCompare1); - int iy; - for (iy = 0; iy < 4 && iy < clipbottom - y; iy++) - { - __m128i mstencilBlock = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(stencilBlock + iy * 8)), _mm_setzero_si128()); - __m128i mstencilTest = _mm_and_si128(_mm_cmplt_epi16(mstencilBlock, mstencilTestValue), mSingleStencilMask); - __m128i mstencilTest0 = _mm_unpacklo_epi16(mstencilTest, mstencilTest); - __m128i mstencilTest1 = _mm_unpackhi_epi16(mstencilTest, mstencilTest); - __m128i msubsectorTest0 = _mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)subsector), msubsectorDepth); - __m128i msubsectorTest1 = _mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(subsector + 4)), msubsectorDepth); - __m128i mtest0 = _mm_and_si128(_mm_xor_si128(_mm_or_si128(mstencilTest0, msubsectorTest0), mnotxor), mClipTest0); - __m128i mtest1 = _mm_and_si128(_mm_xor_si128(_mm_or_si128(mstencilTest1, msubsectorTest1), mnotxor), mClipTest1); - - mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY1, _mm_setzero_si128()), mtest0); - mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY2, _mm_setzero_si128()), mtest0); - mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY3, _mm_setzero_si128()), mtest0); - mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY1, mFDY12x4), _mm_setzero_si128()), mtest1); - mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY2, mFDY23x4), _mm_setzero_si128()), mtest1); - mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY3, mFDY31x4), _mm_setzero_si128()), mtest1); - - mCY1 = _mm_add_epi32(mCY1, mFDX12); - mCY2 = _mm_add_epi32(mCY2, mFDX23); - mCY3 = _mm_add_epi32(mCY3, mFDX31); - - mask0 <<= 4; - mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mtest0, _MM_SHUFFLE(0, 1, 2, 3)))); - mask0 <<= 4; - mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mtest1, _MM_SHUFFLE(0, 1, 2, 3)))); - - subsector += pitch; - } - mask0 <<= (4 - iy) * 8; - - for (iy = 4; iy < q && iy < clipbottom - y; iy++) - { - __m128i mstencilBlock = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(stencilBlock + iy * 8)), _mm_setzero_si128()); - __m128i mstencilTest = _mm_and_si128(_mm_cmplt_epi16(mstencilBlock, mstencilTestValue), mSingleStencilMask); - __m128i mstencilTest0 = _mm_unpacklo_epi16(mstencilTest, mstencilTest); - __m128i mstencilTest1 = _mm_unpackhi_epi16(mstencilTest, mstencilTest); - __m128i msubsectorTest0 = _mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)subsector), msubsectorDepth); - __m128i msubsectorTest1 = _mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(subsector + 4)), msubsectorDepth); - __m128i mtest0 = _mm_and_si128(_mm_xor_si128(_mm_or_si128(mstencilTest0, msubsectorTest0), mnotxor), mClipTest0); - __m128i mtest1 = _mm_and_si128(_mm_xor_si128(_mm_or_si128(mstencilTest1, msubsectorTest1), mnotxor), mClipTest1); - - mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY1, _mm_setzero_si128()), mtest0); - mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY2, _mm_setzero_si128()), mtest0); - mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY3, _mm_setzero_si128()), mtest0); - mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY1, mFDY12x4), _mm_setzero_si128()), mtest1); - mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY2, mFDY23x4), _mm_setzero_si128()), mtest1); - mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY3, mFDY31x4), _mm_setzero_si128()), mtest1); - - mCY1 = _mm_add_epi32(mCY1, mFDX12); - mCY2 = _mm_add_epi32(mCY2, mFDX23); - mCY3 = _mm_add_epi32(mCY3, mFDX31); - - mask1 <<= 4; - mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mtest0, _MM_SHUFFLE(0, 1, 2, 3)))); - mask1 <<= 4; - mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mtest1, _MM_SHUFFLE(0, 1, 2, 3)))); - - subsector += pitch; - } - mask1 <<= (q - iy) * 8; -#endif - - if (mask0 != 0xffffffff || mask1 != 0xffffffff) - { - if (span->Length > 0) - { - span++; - span->Length = 0; - } - - if (mask0 == 0 && mask1 == 0) - continue; - - partial->X = x; - partial->Y = y; - partial->Mask0 = mask0; - partial->Mask1 = mask1; - partial++; - } - else if (span->Length != 0) - { - span->Length++; - } - else - { - span->X = x; - span->Y = y; - span->Length = 1; - } - } - } - - if (span->Length != 0) - { - span++; - span->Length = 0; - } + mask0 <<= 4; + mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_xor_si128(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)subsector), msubsectorDepth), mnotxor), _MM_SHUFFLE(0, 1, 2, 3)))); + mask0 <<= 4; + mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_xor_si128(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(subsector + 4)), msubsectorDepth), mnotxor), _MM_SHUFFLE(0, 1, 2, 3)))); + subsector += subsectorPitch; + } + for (int iy = 4; iy < q; iy++) + { + mask1 <<= 4; + mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_xor_si128(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)subsector), msubsectorDepth), mnotxor), _MM_SHUFFLE(0, 1, 2, 3)))); + mask1 <<= 4; + mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_xor_si128(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(subsector + 4)), msubsectorDepth), mnotxor), _MM_SHUFFLE(0, 1, 2, 3)))); + subsector += subsectorPitch; } - thread->NumFullSpans = (int)(span - thread->FullSpans); - thread->NumPartialBlocks = (int)(partial - thread->PartialBlocks); + Mask0 = Mask0 & mask0; + Mask1 = Mask1 & mask1; } -void ScreenTriangle::StencilWrite(const TriDrawTriangleArgs *args, WorkerThreadData *thread) +#endif + +void TriangleBlock::ClipTest() { - uint8_t * RESTRICT stencilValues = args->stencilValues; - uint32_t * RESTRICT stencilMasks = args->stencilMasks; - uint32_t stencilWriteValue = args->uniforms->StencilWriteValue(); - uint32_t stencilPitch = args->stencilPitch; - - int numSpans = thread->NumFullSpans; - auto fullSpans = thread->FullSpans; - int numBlocks = thread->NumPartialBlocks; - auto partialBlocks = thread->PartialBlocks; - - for (int i = 0; i < numSpans; i++) + static const uint32_t clipxmask[8] = { - const auto &span = fullSpans[i]; - - int block = span.X / 8 + span.Y / 8 * stencilPitch; - uint8_t *stencilBlock = &stencilValues[block * 64]; - uint32_t *stencilBlockMask = &stencilMasks[block]; - - int width = span.Length; - for (int x = 0; x < width; x++) - stencilBlockMask[x] = 0xffffff00 | stencilWriteValue; + 0, + 0x80808080, + 0xc0c0c0c0, + 0xe0e0e0e0, + 0xf0f0f0f0, + 0xf8f8f8f8, + 0xfcfcfcfc, + 0xfefefefe + }; + + static const uint32_t clipymask[8] = + { + 0, + 0xff000000, + 0xffff0000, + 0xffffff00, + 0xffffffff, + 0xffffffff, + 0xffffffff, + 0xffffffff + }; + + uint32_t xmask = (X + 8 <= clipright) ? 0xffffffff : clipxmask[clipright - X]; + uint32_t ymask0 = (Y + 4 <= clipbottom) ? 0xffffffff : clipymask[clipbottom - Y]; + uint32_t ymask1 = (Y + 8 <= clipbottom) ? 0xffffffff : clipymask[clipbottom - Y - 4]; + + Mask0 = Mask0 & xmask & ymask0; + Mask1 = Mask1 & xmask & ymask1; +} + +#ifdef NO_SSE + +void TriangleBlock::StencilEqualTest() +{ + // Stencil test the whole block, if possible + int block = (X >> 3) + (Y >> 3) * stencilPitch; + uint8_t *stencilBlock = &stencilValues[block * 64]; + uint32_t *stencilBlockMask = &stencilMasks[block]; + bool blockIsSingleStencil = ((*stencilBlockMask) & 0xffffff00) == 0xffffff00; + bool skipBlock = blockIsSingleStencil && ((*stencilBlockMask) & 0xff) != stencilTestValue; + if (skipBlock) + { + Mask0 = 0; + Mask1 = 0; } - - for (int i = 0; i < numBlocks; i++) + else if (!blockIsSingleStencil) { - const auto &block = partialBlocks[i]; - - uint32_t mask0 = block.Mask0; - uint32_t mask1 = block.Mask1; - - int sblock = block.X / 8 + block.Y / 8 * stencilPitch; - uint8_t *stencilBlock = &stencilValues[sblock * 64]; - uint32_t *stencilBlockMask = &stencilMasks[sblock]; - - bool isSingleValue = ((*stencilBlockMask) & 0xffffff00) == 0xffffff00; + uint32_t mask0 = 0; + uint32_t mask1 = 0; + + for (int iy = 0; iy < 4; iy++) + { + for (int ix = 0; ix < q; ix++) + { + bool passStencilTest = stencilBlock[ix + iy * q] == stencilTestValue; + mask0 <<= 1; + mask0 |= (uint32_t)passStencilTest; + } + } + + for (int iy = 4; iy < q; iy++) + { + for (int ix = 0; ix < q; ix++) + { + bool passStencilTest = stencilBlock[ix + iy * q] == stencilTestValue; + mask1 <<= 1; + mask1 |= (uint32_t)passStencilTest; + } + } + + Mask0 = Mask0 & mask0; + Mask1 = Mask1 & mask1; + } +} + +#else + +void TriangleBlock::StencilEqualTest() +{ + // Stencil test the whole block, if possible + int block = (X >> 3) + (Y >> 3) * stencilPitch; + uint8_t *stencilBlock = &stencilValues[block * 64]; + uint32_t *stencilBlockMask = &stencilMasks[block]; + bool blockIsSingleStencil = ((*stencilBlockMask) & 0xffffff00) == 0xffffff00; + bool skipBlock = blockIsSingleStencil && ((*stencilBlockMask) & 0xff) != stencilTestValue; + if (skipBlock) + { + Mask0 = 0; + Mask1 = 0; + } + else if (!blockIsSingleStencil) + { + __m128i mstencilTestValue = _mm_set1_epi16(stencilTestValue); + uint32_t mask0 = 0; + uint32_t mask1 = 0; + + for (int iy = 0; iy < 4; iy++) + { + __m128i mstencilBlock = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(stencilBlock + iy * 8)), _mm_setzero_si128()); + __m128i mstencilTest = _mm_cmpeq_epi16(mstencilBlock, mstencilTestValue); + __m128i mstencilTest0 = _mm_unpacklo_epi16(mstencilTest, mstencilTest); + __m128i mstencilTest1 = _mm_unpackhi_epi16(mstencilTest, mstencilTest); + + mask0 <<= 4; + mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mstencilTest0, _MM_SHUFFLE(0, 1, 2, 3)))); + mask0 <<= 4; + mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mstencilTest1, _MM_SHUFFLE(0, 1, 2, 3)))); + } + + for (int iy = 4; iy < q; iy++) + { + __m128i mstencilBlock = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(stencilBlock + iy * 8)), _mm_setzero_si128()); + __m128i mstencilTest = _mm_cmpeq_epi16(mstencilBlock, mstencilTestValue); + __m128i mstencilTest0 = _mm_unpacklo_epi16(mstencilTest, mstencilTest); + __m128i mstencilTest1 = _mm_unpackhi_epi16(mstencilTest, mstencilTest); + + mask1 <<= 4; + mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mstencilTest0, _MM_SHUFFLE(0, 1, 2, 3)))); + mask1 <<= 4; + mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mstencilTest1, _MM_SHUFFLE(0, 1, 2, 3)))); + } + + Mask0 = Mask0 & mask0; + Mask1 = Mask1 & mask1; + } +} + +#endif + +void TriangleBlock::StencilGreaterEqualTest() +{ + // Stencil test the whole block, if possible + int block = (X >> 3) + (Y >> 3) * stencilPitch; + uint8_t *stencilBlock = &stencilValues[block * 64]; + uint32_t *stencilBlockMask = &stencilMasks[block]; + bool blockIsSingleStencil = ((*stencilBlockMask) & 0xffffff00) == 0xffffff00; + bool skipBlock = blockIsSingleStencil && ((*stencilBlockMask) & 0xff) < stencilTestValue; + if (skipBlock) + { + Mask0 = 0; + Mask1 = 0; + } + else if (!blockIsSingleStencil) + { + uint32_t mask0 = 0; + uint32_t mask1 = 0; + + for (int iy = 0; iy < 4; iy++) + { + for (int ix = 0; ix < q; ix++) + { + bool passStencilTest = stencilBlock[ix + iy * q] >= stencilTestValue; + mask0 <<= 1; + mask0 |= (uint32_t)passStencilTest; + } + } + + for (int iy = 4; iy < q; iy++) + { + for (int ix = 0; ix < q; ix++) + { + bool passStencilTest = stencilBlock[ix + iy * q] >= stencilTestValue; + mask1 <<= 1; + mask1 |= (uint32_t)passStencilTest; + } + } + + Mask0 = Mask0 & mask0; + Mask1 = Mask1 & mask1; + } +} + +#ifdef NO_SSE + +void TriangleBlock::CoverageTest() +{ + // Corners of block + int x0 = X << 4; + int x1 = (X + q - 1) << 4; + int y0 = Y << 4; + int y1 = (Y + q - 1) << 4; + + // Evaluate half-space functions + bool a00 = C1 + DX12 * y0 - DY12 * x0 > 0; + bool a10 = C1 + DX12 * y0 - DY12 * x1 > 0; + bool a01 = C1 + DX12 * y1 - DY12 * x0 > 0; + bool a11 = C1 + DX12 * y1 - DY12 * x1 > 0; + int a = (a00 << 0) | (a10 << 1) | (a01 << 2) | (a11 << 3); + + bool b00 = C2 + DX23 * y0 - DY23 * x0 > 0; + bool b10 = C2 + DX23 * y0 - DY23 * x1 > 0; + bool b01 = C2 + DX23 * y1 - DY23 * x0 > 0; + bool b11 = C2 + DX23 * y1 - DY23 * x1 > 0; + int b = (b00 << 0) | (b10 << 1) | (b01 << 2) | (b11 << 3); + + bool c00 = C3 + DX31 * y0 - DY31 * x0 > 0; + bool c10 = C3 + DX31 * y0 - DY31 * x1 > 0; + bool c01 = C3 + DX31 * y1 - DY31 * x0 > 0; + bool c11 = C3 + DX31 * y1 - DY31 * x1 > 0; + int c = (c00 << 0) | (c10 << 1) | (c01 << 2) | (c11 << 3); + + if (a == 0 || b == 0 || c == 0) // Skip block when outside an edge + { + Mask0 = 0; + Mask1 = 0; + } + else if (a == 0xf && b == 0xf && c == 0xf) // Accept whole block when totally covered + { + Mask0 = 0xffffffff; + Mask1 = 0xffffffff; + } + else // Partially covered block + { + x0 = X << 4; + x1 = (X + q - 1) << 4; + int CY1 = C1 + DX12 * y0 - DY12 * x0; + int CY2 = C2 + DX23 * y0 - DY23 * x0; + int CY3 = C3 + DX31 * y0 - DY31 * x0; + + uint32_t mask0 = 0; + uint32_t mask1 = 0; + + for (int iy = 0; iy < 4; iy++) + { + int CX1 = CY1; + int CX2 = CY2; + int CX3 = CY3; + + for (int ix = 0; ix < q; ix++) + { + bool covered = CX1 > 0 && CX2 > 0 && CX3 > 0; + mask0 <<= 1; + mask0 |= (uint32_t)covered; + + CX1 -= FDY12; + CX2 -= FDY23; + CX3 -= FDY31; + } + + CY1 += FDX12; + CY2 += FDX23; + CY3 += FDX31; + } + + for (int iy = 4; iy < q; iy++) + { + int CX1 = CY1; + int CX2 = CY2; + int CX3 = CY3; + + for (int ix = 0; ix < q; ix++) + { + bool covered = CX1 > 0 && CX2 > 0 && CX3 > 0; + mask1 <<= 1; + mask1 |= (uint32_t)covered; + + CX1 -= FDY12; + CX2 -= FDY23; + CX3 -= FDY31; + } + + CY1 += FDX12; + CY2 += FDX23; + CY3 += FDX31; + } + + Mask0 = mask0; + Mask1 = mask1; + } +} + +#else + +void TriangleBlock::CoverageTest() +{ + // Corners of block + int x0 = X << 4; + int x1 = (X + q - 1) << 4; + int y0 = Y << 4; + int y1 = (Y + q - 1) << 4; + + // Evaluate half-space functions + bool a00 = C1 + DX12 * y0 - DY12 * x0 > 0; + bool a10 = C1 + DX12 * y0 - DY12 * x1 > 0; + bool a01 = C1 + DX12 * y1 - DY12 * x0 > 0; + bool a11 = C1 + DX12 * y1 - DY12 * x1 > 0; + int a = (a00 << 0) | (a10 << 1) | (a01 << 2) | (a11 << 3); + + bool b00 = C2 + DX23 * y0 - DY23 * x0 > 0; + bool b10 = C2 + DX23 * y0 - DY23 * x1 > 0; + bool b01 = C2 + DX23 * y1 - DY23 * x0 > 0; + bool b11 = C2 + DX23 * y1 - DY23 * x1 > 0; + int b = (b00 << 0) | (b10 << 1) | (b01 << 2) | (b11 << 3); + + bool c00 = C3 + DX31 * y0 - DY31 * x0 > 0; + bool c10 = C3 + DX31 * y0 - DY31 * x1 > 0; + bool c01 = C3 + DX31 * y1 - DY31 * x0 > 0; + bool c11 = C3 + DX31 * y1 - DY31 * x1 > 0; + int c = (c00 << 0) | (c10 << 1) | (c01 << 2) | (c11 << 3); + + if (a == 0 || b == 0 || c == 0) // Skip block when outside an edge + { + Mask0 = 0; + Mask1 = 0; + } + else if (a == 0xf && b == 0xf && c == 0xf) // Accept whole block when totally covered + { + Mask0 = 0xffffffff; + Mask1 = 0xffffffff; + } + else // Partially covered block + { + x0 = X << 4; + x1 = (X + q - 1) << 4; + int CY1 = C1 + DX12 * y0 - DY12 * x0; + int CY2 = C2 + DX23 * y0 - DY23 * x0; + int CY3 = C3 + DX31 * y0 - DY31 * x0; + + uint32_t mask0 = 0; + uint32_t mask1 = 0; + + __m128i mCY1 = _mm_sub_epi32(_mm_set1_epi32(CY1), mFDY12Offset); + __m128i mCY2 = _mm_sub_epi32(_mm_set1_epi32(CY2), mFDY23Offset); + __m128i mCY3 = _mm_sub_epi32(_mm_set1_epi32(CY3), mFDY31Offset); + for (int iy = 0; iy < 4; iy++) + { + __m128i mtest0 = _mm_cmpgt_epi32(mCY1, _mm_setzero_si128()); + mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY2, _mm_setzero_si128()), mtest0); + mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY3, _mm_setzero_si128()), mtest0); + __m128i mtest1 = _mm_cmpgt_epi32(_mm_sub_epi32(mCY1, mFDY12x4), _mm_setzero_si128()); + mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY2, mFDY23x4), _mm_setzero_si128()), mtest1); + mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY3, mFDY31x4), _mm_setzero_si128()), mtest1); + + mCY1 = _mm_add_epi32(mCY1, mFDX12); + mCY2 = _mm_add_epi32(mCY2, mFDX23); + mCY3 = _mm_add_epi32(mCY3, mFDX31); + + mask0 <<= 4; + mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mtest0, _MM_SHUFFLE(0, 1, 2, 3)))); + mask0 <<= 4; + mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mtest1, _MM_SHUFFLE(0, 1, 2, 3)))); + } + + for (int iy = 4; iy < q; iy++) + { + __m128i mtest0 = _mm_cmpgt_epi32(mCY1, _mm_setzero_si128()); + mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY2, _mm_setzero_si128()), mtest0); + mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY3, _mm_setzero_si128()), mtest0); + __m128i mtest1 = _mm_cmpgt_epi32(_mm_sub_epi32(mCY1, mFDY12x4), _mm_setzero_si128()); + mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY2, mFDY23x4), _mm_setzero_si128()), mtest1); + mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY3, mFDY31x4), _mm_setzero_si128()), mtest1); + + mCY1 = _mm_add_epi32(mCY1, mFDX12); + mCY2 = _mm_add_epi32(mCY2, mFDX23); + mCY3 = _mm_add_epi32(mCY3, mFDX31); + + mask1 <<= 4; + mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mtest0, _MM_SHUFFLE(0, 1, 2, 3)))); + mask1 <<= 4; + mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mtest1, _MM_SHUFFLE(0, 1, 2, 3)))); + } + + Mask0 = mask0; + Mask1 = mask1; + } +} + +#endif + +void TriangleBlock::StencilWrite() +{ + int block = (X >> 3) + (Y >> 3) * stencilPitch; + uint8_t *stencilBlock = &stencilValues[block * 64]; + uint32_t &stencilBlockMask = stencilMasks[block]; + uint32_t writeValue = stencilWriteValue; + + if (Mask0 == 0xffffffff && Mask1 == 0xffffffff) + { + stencilBlockMask = 0xffffff00 | writeValue; + } + else + { + uint32_t mask0 = Mask0; + uint32_t mask1 = Mask1; + + bool isSingleValue = (stencilBlockMask & 0xffffff00) == 0xffffff00; if (isSingleValue) { - uint8_t value = (*stencilBlockMask) & 0xff; + uint8_t value = stencilBlockMask & 0xff; for (int v = 0; v < 64; v++) stencilBlock[v] = value; - *stencilBlockMask = 0; + stencilBlockMask = 0; } - + int count = 0; for (int v = 0; v < 32; v++) { - if ((mask0 & (1 << 31)) || stencilBlock[v] == stencilWriteValue) + if ((mask0 & (1 << 31)) || stencilBlock[v] == writeValue) { - stencilBlock[v] = stencilWriteValue; + stencilBlock[v] = writeValue; count++; } mask0 <<= 1; } for (int v = 32; v < 64; v++) { - if ((mask1 & (1 << 31)) || stencilBlock[v] == stencilWriteValue) + if ((mask1 & (1 << 31)) || stencilBlock[v] == writeValue) { - stencilBlock[v] = stencilWriteValue; + stencilBlock[v] = writeValue; count++; } mask1 <<= 1; } - + if (count == 64) - *stencilBlockMask = 0xffffff00 | stencilWriteValue; + stencilBlockMask = 0xffffff00 | writeValue; } } -void ScreenTriangle::SubsectorWrite(const TriDrawTriangleArgs *args, WorkerThreadData *thread) +void TriangleBlock::SubsectorWrite() { - uint32_t * RESTRICT subsectorGBuffer = args->subsectorGBuffer; - uint32_t subsectorDepth = args->uniforms->SubsectorDepth(); - int pitch = args->pitch; + auto pitch = subsectorPitch; + uint32_t *subsector = subsectorGBuffer + X + Y * pitch; - int numSpans = thread->NumFullSpans; - auto fullSpans = thread->FullSpans; - int numBlocks = thread->NumPartialBlocks; - auto partialBlocks = thread->PartialBlocks; - - for (int i = 0; i < numSpans; i++) + if (Mask0 == 0xffffffff && Mask1 == 0xffffffff) { - const auto &span = fullSpans[i]; - - uint32_t *subsector = subsectorGBuffer + span.X + span.Y * pitch; - int width = span.Length * 8; - int height = 8; - for (int y = 0; y < height; y++) + for (int y = 0; y < 8; y++) { - for (int x = 0; x < width; x++) + for (int x = 0; x < 8; x++) subsector[x] = subsectorDepth; subsector += pitch; } } - - for (int i = 0; i < numBlocks; i++) + else { - const auto &block = partialBlocks[i]; - - uint32_t *subsector = subsectorGBuffer + block.X + block.Y * pitch; - uint32_t mask0 = block.Mask0; - uint32_t mask1 = block.Mask1; + uint32_t mask0 = Mask0; + uint32_t mask1 = Mask1; for (int y = 0; y < 4; y++) { for (int x = 0; x < 8; x++) @@ -922,7 +796,13 @@ void ScreenTriangle::SubsectorWrite(const TriDrawTriangleArgs *args, WorkerThrea } } -void(*ScreenTriangle::TriDrawers8[])(const TriDrawTriangleArgs *, WorkerThreadData *) = +void ScreenTriangle::Draw(const TriDrawTriangleArgs *args, WorkerThreadData *thread) +{ + TriangleBlock block(args); + block.Loop(args, thread); +} + +void(*ScreenTriangle::TriDrawers8[])(int, int, uint32_t, uint32_t, const TriDrawTriangleArgs *) = { &TriScreenDrawer8::Execute, // TextureOpaque &TriScreenDrawer8::Execute, // TextureMasked @@ -950,14 +830,14 @@ void(*ScreenTriangle::TriDrawers8[])(const TriDrawTriangleArgs *, WorkerThreadDa #ifdef NO_SSE -void(*ScreenTriangle::TriDrawers32[])(const TriDrawTriangleArgs *, WorkerThreadData *) = +void(*ScreenTriangle::TriDrawers32[])(int, int, uint32_t, uint32_t, const TriDrawTriangleArgs *) = { nullptr }; #else -void(*ScreenTriangle::TriDrawers32[])(const TriDrawTriangleArgs *, WorkerThreadData *) = +void(*ScreenTriangle::TriDrawers32[])(int, int, uint32_t, uint32_t, const TriDrawTriangleArgs *) = { &TriScreenDrawer32::Execute, // TextureOpaque &TriScreenDrawer32::Execute, // TextureMasked diff --git a/src/polyrenderer/drawers/screen_triangle.h b/src/polyrenderer/drawers/screen_triangle.h index ab1364990..5c7787cb4 100644 --- a/src/polyrenderer/drawers/screen_triangle.h +++ b/src/polyrenderer/drawers/screen_triangle.h @@ -28,34 +28,10 @@ class FString; class PolyDrawArgs; -struct TriFullSpan -{ - uint16_t X; - uint16_t Y; - uint32_t Length; -}; - -struct TriPartialBlock -{ - uint16_t X; - uint16_t Y; - uint32_t Mask0; - uint32_t Mask1; -}; - struct WorkerThreadData { int32_t core; int32_t num_cores; - uint32_t *temp; - - // Triangle working data: - TriFullSpan *FullSpans; - TriPartialBlock *PartialBlocks; - uint32_t NumFullSpans; - uint32_t NumPartialBlocks; - int32_t StartX; - int32_t StartY; }; struct TriVertex @@ -74,15 +50,14 @@ struct TriDrawTriangleArgs TriVertex *v1; TriVertex *v2; TriVertex *v3; - int32_t clipleft; int32_t clipright; - int32_t cliptop; int32_t clipbottom; uint8_t *stencilValues; uint32_t *stencilMasks; int32_t stencilPitch; uint32_t *subsectorGBuffer; const PolyDrawArgs *uniforms; + bool destBgra; }; class RectDrawArgs; @@ -116,13 +91,10 @@ enum class TriBlendMode class ScreenTriangle { public: - static void SetupNormal(const TriDrawTriangleArgs *args, WorkerThreadData *thread); - static void SetupSubsector(const TriDrawTriangleArgs *args, WorkerThreadData *thread); - static void StencilWrite(const TriDrawTriangleArgs *args, WorkerThreadData *thread); - static void SubsectorWrite(const TriDrawTriangleArgs *args, WorkerThreadData *thread); + static void Draw(const TriDrawTriangleArgs *args, WorkerThreadData *thread); - static void(*TriDrawers8[])(const TriDrawTriangleArgs *, WorkerThreadData *); - static void(*TriDrawers32[])(const TriDrawTriangleArgs *, WorkerThreadData *); + static void(*TriDrawers8[])(int, int, uint32_t, uint32_t, const TriDrawTriangleArgs *); + static void(*TriDrawers32[])(int, int, uint32_t, uint32_t, const TriDrawTriangleArgs *); static void(*RectDrawers8[])(const void *, int, int, int, const RectDrawArgs *, WorkerThreadData *); static void(*RectDrawers32[])(const void *, int, int, int, const RectDrawArgs *, WorkerThreadData *); }; diff --git a/src/swrenderer/drawers/r_thread.h b/src/swrenderer/drawers/r_thread.h index 8e1af8ae9..6927d14c7 100644 --- a/src/swrenderer/drawers/r_thread.h +++ b/src/swrenderer/drawers/r_thread.h @@ -23,7 +23,6 @@ #pragma once #include "r_draw.h" -#include "polyrenderer/drawers/screen_triangle.h" #include #include #include @@ -37,12 +36,6 @@ EXTERN_CVAR(Bool, r_multithreaded) class DrawerThread { public: - DrawerThread() - { - FullSpansBuffer.resize(MAXWIDTH / 8 * (MAXHEIGHT / 8)); - PartialBlocksBuffer.resize(MAXWIDTH / 8 * (MAXHEIGHT / 8)); - } - std::thread thread; size_t current_queue = 0; @@ -55,10 +48,6 @@ public: // Working buffer used by the tilted (sloped) span drawer const uint8_t *tiltlighting[MAXWIDTH]; - // Working buffer used by the triangler drawer - std::vector FullSpansBuffer; - std::vector PartialBlocksBuffer; - // Checks if a line is rendered by this thread bool line_skipped_by_thread(int line) { From da6bfe65ff35b3d408de6515562c6ce4372782ad Mon Sep 17 00:00:00 2001 From: Magnus Norddahl Date: Sat, 1 Apr 2017 04:38:15 +0200 Subject: [PATCH 02/12] - only calculate gradients once --- src/polyrenderer/drawers/poly_drawer32_sse2.h | 26 +------------ src/polyrenderer/drawers/poly_drawer8.h | 27 +------------- src/polyrenderer/drawers/poly_triangle.cpp | 2 + src/polyrenderer/drawers/screen_triangle.h | 37 ++++++++++++++++--- 4 files changed, 38 insertions(+), 54 deletions(-) diff --git a/src/polyrenderer/drawers/poly_drawer32_sse2.h b/src/polyrenderer/drawers/poly_drawer32_sse2.h index fba694f61..119c50746 100644 --- a/src/polyrenderer/drawers/poly_drawer32_sse2.h +++ b/src/polyrenderer/drawers/poly_drawer32_sse2.h @@ -319,17 +319,9 @@ private: // Calculate gradients const TriVertex &v1 = *args->v1; - const TriVertex &v2 = *args->v2; - const TriVertex &v3 = *args->v3; - ScreenTriangleStepVariables gradientX; - ScreenTriangleStepVariables gradientY; + ScreenTriangleStepVariables gradientX = args->gradientX; + ScreenTriangleStepVariables gradientY = args->gradientY; ScreenTriangleStepVariables blockPosY; - gradientX.W = FindGradientX(v1.x, v1.y, v2.x, v2.y, v3.x, v3.y, v1.w, v2.w, v3.w); - gradientY.W = FindGradientY(v1.x, v1.y, v2.x, v2.y, v3.x, v3.y, v1.w, v2.w, v3.w); - gradientX.U = FindGradientX(v1.x, v1.y, v2.x, v2.y, v3.x, v3.y, v1.u * v1.w, v2.u * v2.w, v3.u * v3.w); - gradientY.U = FindGradientY(v1.x, v1.y, v2.x, v2.y, v3.x, v3.y, v1.u * v1.w, v2.u * v2.w, v3.u * v3.w); - gradientX.V = FindGradientX(v1.x, v1.y, v2.x, v2.y, v3.x, v3.y, v1.v * v1.w, v2.v * v2.w, v3.v * v3.w); - gradientY.V = FindGradientY(v1.x, v1.y, v2.x, v2.y, v3.x, v3.y, v1.v * v1.w, v2.v * v2.w, v3.v * v3.w); blockPosY.W = v1.w + gradientX.W * (destX - v1.x) + gradientY.W * (destY - v1.y); blockPosY.U = v1.u * v1.w + gradientX.U * (destX - v1.x) + gradientY.U * (destY - v1.y); blockPosY.V = v1.v * v1.w + gradientX.V * (destX - v1.x) + gradientY.V * (destY - v1.y); @@ -645,20 +637,6 @@ private: } } } - - static float FindGradientX(float x0, float y0, float x1, float y1, float x2, float y2, float c0, float c1, float c2) - { - float top = (c1 - c2) * (y0 - y2) - (c0 - c2) * (y1 - y2); - float bottom = (x1 - x2) * (y0 - y2) - (x0 - x2) * (y1 - y2); - return top / bottom; - } - - static float FindGradientY(float x0, float y0, float x1, float y1, float x2, float y2, float c0, float c1, float c2) - { - float top = (c1 - c2) * (x0 - x2) - (c0 - c2) * (x1 - x2); - float bottom = (x0 - x2) * (y1 - y2) - (x1 - x2) * (y0 - y2); - return top / bottom; - } }; template diff --git a/src/polyrenderer/drawers/poly_drawer8.h b/src/polyrenderer/drawers/poly_drawer8.h index b916b7c2d..8838035be 100644 --- a/src/polyrenderer/drawers/poly_drawer8.h +++ b/src/polyrenderer/drawers/poly_drawer8.h @@ -221,17 +221,9 @@ public: // Calculate gradients const TriVertex &v1 = *args->v1; - const TriVertex &v2 = *args->v2; - const TriVertex &v3 = *args->v3; - ScreenTriangleStepVariables gradientX; - ScreenTriangleStepVariables gradientY; + ScreenTriangleStepVariables gradientX = args->gradientX; + ScreenTriangleStepVariables gradientY = args->gradientY; ScreenTriangleStepVariables blockPosY; - gradientX.W = FindGradientX(v1.x, v1.y, v2.x, v2.y, v3.x, v3.y, v1.w, v2.w, v3.w); - gradientY.W = FindGradientY(v1.x, v1.y, v2.x, v2.y, v3.x, v3.y, v1.w, v2.w, v3.w); - gradientX.U = FindGradientX(v1.x, v1.y, v2.x, v2.y, v3.x, v3.y, v1.u * v1.w, v2.u * v2.w, v3.u * v3.w); - gradientY.U = FindGradientY(v1.x, v1.y, v2.x, v2.y, v3.x, v3.y, v1.u * v1.w, v2.u * v2.w, v3.u * v3.w); - gradientX.V = FindGradientX(v1.x, v1.y, v2.x, v2.y, v3.x, v3.y, v1.v * v1.w, v2.v * v2.w, v3.v * v3.w); - gradientY.V = FindGradientY(v1.x, v1.y, v2.x, v2.y, v3.x, v3.y, v1.v * v1.w, v2.v * v2.w, v3.v * v3.w); blockPosY.W = v1.w + gradientX.W * (destX - v1.x) + gradientY.W * (destY - v1.y); blockPosY.U = v1.u * v1.w + gradientX.U * (destX - v1.x) + gradientY.U * (destY - v1.y); blockPosY.V = v1.v * v1.w + gradientX.V * (destX - v1.x) + gradientY.V * (destY - v1.y); @@ -405,21 +397,6 @@ public: } } } - -private: - static float FindGradientX(float x0, float y0, float x1, float y1, float x2, float y2, float c0, float c1, float c2) - { - float top = (c1 - c2) * (y0 - y2) - (c0 - c2) * (y1 - y2); - float bottom = (x1 - x2) * (y0 - y2) - (x0 - x2) * (y1 - y2); - return top / bottom; - } - - static float FindGradientY(float x0, float y0, float x1, float y1, float x2, float y2, float c0, float c1, float c2) - { - float top = (c1 - c2) * (x0 - x2) - (c0 - c2) * (x1 - x2); - float bottom = (x0 - x2) * (y1 - y2) - (x1 - x2) * (y0 - y2); - return top / bottom; - } }; template diff --git a/src/polyrenderer/drawers/poly_triangle.cpp b/src/polyrenderer/drawers/poly_triangle.cpp index 0bd03821c..3e7caec90 100644 --- a/src/polyrenderer/drawers/poly_triangle.cpp +++ b/src/polyrenderer/drawers/poly_triangle.cpp @@ -227,6 +227,7 @@ void PolyTriangleDrawer::draw_shaded_triangle(const ShadedTriVertex *vert, bool args->v1 = &clippedvert[numclipvert - 1]; args->v2 = &clippedvert[i - 1]; args->v3 = &clippedvert[i - 2]; + args->CalculateGradients(); ScreenTriangle::Draw(args, thread); } @@ -238,6 +239,7 @@ void PolyTriangleDrawer::draw_shaded_triangle(const ShadedTriVertex *vert, bool args->v1 = &clippedvert[0]; args->v2 = &clippedvert[i - 1]; args->v3 = &clippedvert[i]; + args->CalculateGradients(); ScreenTriangle::Draw(args, thread); } diff --git a/src/polyrenderer/drawers/screen_triangle.h b/src/polyrenderer/drawers/screen_triangle.h index 5c7787cb4..3f8d33f73 100644 --- a/src/polyrenderer/drawers/screen_triangle.h +++ b/src/polyrenderer/drawers/screen_triangle.h @@ -43,6 +43,11 @@ struct TriVertex float u, v; }; +struct ScreenTriangleStepVariables +{ + float W, U, V; +}; + struct TriDrawTriangleArgs { uint8_t *dest; @@ -58,6 +63,33 @@ struct TriDrawTriangleArgs uint32_t *subsectorGBuffer; const PolyDrawArgs *uniforms; bool destBgra; + ScreenTriangleStepVariables gradientX; + ScreenTriangleStepVariables gradientY; + + void CalculateGradients() + { + gradientX.W = FindGradientX(v1->x, v1->y, v2->x, v2->y, v3->x, v3->y, v1->w, v2->w, v3->w); + gradientY.W = FindGradientY(v1->x, v1->y, v2->x, v2->y, v3->x, v3->y, v1->w, v2->w, v3->w); + gradientX.U = FindGradientX(v1->x, v1->y, v2->x, v2->y, v3->x, v3->y, v1->u * v1->w, v2->u * v2->w, v3->u * v3->w); + gradientY.U = FindGradientY(v1->x, v1->y, v2->x, v2->y, v3->x, v3->y, v1->u * v1->w, v2->u * v2->w, v3->u * v3->w); + gradientX.V = FindGradientX(v1->x, v1->y, v2->x, v2->y, v3->x, v3->y, v1->v * v1->w, v2->v * v2->w, v3->v * v3->w); + gradientY.V = FindGradientY(v1->x, v1->y, v2->x, v2->y, v3->x, v3->y, v1->v * v1->w, v2->v * v2->w, v3->v * v3->w); + } + +private: + static float FindGradientX(float x0, float y0, float x1, float y1, float x2, float y2, float c0, float c1, float c2) + { + float top = (c1 - c2) * (y0 - y2) - (c0 - c2) * (y1 - y2); + float bottom = (x1 - x2) * (y0 - y2) - (x0 - x2) * (y1 - y2); + return top / bottom; + } + + static float FindGradientY(float x0, float y0, float x1, float y1, float x2, float y2, float c0, float c1, float c2) + { + float top = (c1 - c2) * (x0 - x2) - (c0 - c2) * (x1 - x2); + float bottom = (x0 - x2) * (y1 - y2) - (x1 - x2) * (y0 - y2); + return top / bottom; + } }; class RectDrawArgs; @@ -99,11 +131,6 @@ public: static void(*RectDrawers32[])(const void *, int, int, int, const RectDrawArgs *, WorkerThreadData *); }; -struct ScreenTriangleStepVariables -{ - float W, U, V; -}; - namespace TriScreenDrawerModes { enum class BlendModes { Opaque, Masked, AddClamp, SubClamp, RevSubClamp, AddSrcColorOneMinusSrcColor, Shaded, AddClampShaded }; From 59ae50aecb7a33a4c475a8a364800e92f22f554f Mon Sep 17 00:00:00 2001 From: "alexey.lysiuk" Date: Sat, 1 Apr 2017 10:24:05 +0300 Subject: [PATCH 03/12] Fixed compilation warning reported by GCC/Clang src/gl/textures/gl_texture.cpp:691:25: warning: comparison of integers of different signs: 'unsigned int' and 'int' [-Wsign-compare] --- src/gl/textures/gl_texture.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gl/textures/gl_texture.cpp b/src/gl/textures/gl_texture.cpp index 63e29d240..bef0da5d5 100644 --- a/src/gl/textures/gl_texture.cpp +++ b/src/gl/textures/gl_texture.cpp @@ -688,7 +688,7 @@ void gl_ParseBrightmap(FScanner &sc, int deflump) void AddAutoBrightmaps() { int num = Wads.GetNumLumps(); - for (unsigned i = 0; i < num; i++) + for (int i = 0; i < num; i++) { const char *name = Wads.GetLumpFullName(i); if (strstr(name, "brightmaps/auto/") == name) From 893607c96c1d61539d7cd33f0ab10cc435ca2132 Mon Sep 17 00:00:00 2001 From: "alexey.lysiuk" Date: Sat, 1 Apr 2017 10:36:35 +0300 Subject: [PATCH 04/12] Fixed crash in decals handling caused by reference to undefined class https://forum.drdteam.org/viewtopic.php?t=7589 --- src/decallib.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/decallib.cpp b/src/decallib.cpp index d4ca5984b..d9f22499e 100644 --- a/src/decallib.cpp +++ b/src/decallib.cpp @@ -370,6 +370,12 @@ void FDecalLib::ReadAllDecals () for (i = 0; i < PClassActor::AllActorClasses.Size(); i++) { AActor *def = (AActor*)GetDefaultByType (PClassActor::AllActorClasses[i]); + if (nullptr == def) + { + // This is referenced but undefined class + // The corresponding warning should be already reported by DECORATE parser + continue; + } FName v = ENamedName(intptr_t(def->DecalGenerator)); if (v.IsValidName()) From 41f2f61b94f26f9ce035caa7e72dcfa52345af15 Mon Sep 17 00:00:00 2001 From: Christoph Oelckers Date: Sat, 1 Apr 2017 10:42:47 +0200 Subject: [PATCH 05/12] - minor VM optimization: Precalculate a function's frame size when compiling it instead of doing it each time it gets called. This made up ca. 10% of the 'call' instruction's execution time. --- src/scripting/backend/vmbuilder.cpp | 1 + src/scripting/vm/vm.h | 1 + src/scripting/vm/vmframe.cpp | 4 +--- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/scripting/backend/vmbuilder.cpp b/src/scripting/backend/vmbuilder.cpp index 4ed4f5c4e..e4682432c 100644 --- a/src/scripting/backend/vmbuilder.cpp +++ b/src/scripting/backend/vmbuilder.cpp @@ -139,6 +139,7 @@ void VMFunctionBuilder::MakeFunction(VMScriptFunction *func) func->NumRegA = Registers[REGT_POINTER].MostUsed; func->NumRegS = Registers[REGT_STRING].MostUsed; func->MaxParam = MaxParam; + func->StackSize = VMFrame::FrameSize(func->NumRegD, func->NumRegF, func->NumRegS, func->NumRegA, func->MaxParam, func->ExtraSpace); // Technically, there's no reason why we can't end the function with // entries on the parameter stack, but it means the caller probably diff --git a/src/scripting/vm/vm.h b/src/scripting/vm/vm.h index 42558da09..b95be74e3 100644 --- a/src/scripting/vm/vm.h +++ b/src/scripting/vm/vm.h @@ -716,6 +716,7 @@ public: int ExtraSpace; int CodeSize; // Size of code in instructions (not bytes) unsigned LineInfoCount; + unsigned StackSize; VM_UBYTE NumRegD; VM_UBYTE NumRegF; VM_UBYTE NumRegS; diff --git a/src/scripting/vm/vmframe.cpp b/src/scripting/vm/vmframe.cpp index e2ec3865a..4a0a2538a 100644 --- a/src/scripting/vm/vmframe.cpp +++ b/src/scripting/vm/vmframe.cpp @@ -273,10 +273,8 @@ VMFrameStack::~VMFrameStack() VMFrame *VMFrameStack::AllocFrame(VMScriptFunction *func) { - int size = VMFrame::FrameSize(func->NumRegD, func->NumRegF, func->NumRegS, func->NumRegA, - func->MaxParam, func->ExtraSpace); - VMFrame *frame = Alloc(size); frame->Func = func; + VMFrame *frame = Alloc(func->StackSize); frame->NumRegD = func->NumRegD; frame->NumRegF = func->NumRegF; frame->NumRegS = func->NumRegS; From e780cd82977ab50fd134f4e92b1a969cc33deab6 Mon Sep 17 00:00:00 2001 From: Christoph Oelckers Date: Sat, 1 Apr 2017 12:04:31 +0200 Subject: [PATCH 06/12] - seems the wrong version of this got committed... --- src/scripting/vm/vmframe.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripting/vm/vmframe.cpp b/src/scripting/vm/vmframe.cpp index 4a0a2538a..9cbf2adae 100644 --- a/src/scripting/vm/vmframe.cpp +++ b/src/scripting/vm/vmframe.cpp @@ -273,8 +273,8 @@ VMFrameStack::~VMFrameStack() VMFrame *VMFrameStack::AllocFrame(VMScriptFunction *func) { - frame->Func = func; VMFrame *frame = Alloc(func->StackSize); + frame->Func = func; frame->NumRegD = func->NumRegD; frame->NumRegF = func->NumRegF; frame->NumRegS = func->NumRegS; From 36ad485eddd4ef157dacd436bef32ca70f347204 Mon Sep 17 00:00:00 2001 From: "alexey.lysiuk" Date: Sat, 1 Apr 2017 13:16:31 +0300 Subject: [PATCH 07/12] Proper default value for GL framebuffer lock in Cocoa backend No more error when running with +map command line parameter with classic HUD: > VM execution aborted: Attempt to draw to screen outside a draw function > Called from BaseStatusBar.DrawImage [Native] > Called from DoomStatusBar.DrawFullScreenStuff at gzdoom.pk3:zscript/statusbar/doom_sbar.txt, line 140 > Called from DoomStatusBar.Draw at gzdoom.pk3:zscript/statusbar/doom_sbar.txt, line 41 --- src/posix/cocoa/i_video.mm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/posix/cocoa/i_video.mm b/src/posix/cocoa/i_video.mm index 906fc413d..e11a7c7f5 100644 --- a/src/posix/cocoa/i_video.mm +++ b/src/posix/cocoa/i_video.mm @@ -1146,7 +1146,7 @@ void CocoaFrameBuffer::Flip() SDLGLFB::SDLGLFB(void*, const int width, const int height, int, int, const bool fullscreen, bool bgra) : DFrameBuffer(width, height, bgra) -, m_lock(-1) +, m_lock(0) , m_isUpdatePending(false) { CGGammaValue gammaTable[GAMMA_TABLE_SIZE]; From 9dc89331097037d6de6d443c7cd938a18a3cb862 Mon Sep 17 00:00:00 2001 From: Christoph Oelckers Date: Sat, 1 Apr 2017 12:30:03 +0200 Subject: [PATCH 08/12] - let 'stat think' also print the number of active thinkers. --- src/dthinker.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/dthinker.cpp b/src/dthinker.cpp index cbac457c6..af1771c5b 100644 --- a/src/dthinker.cpp +++ b/src/dthinker.cpp @@ -43,6 +43,7 @@ #include "virtual.h" +static int ThinkCount; static cycle_t ThinkCycles; extern cycle_t BotSupportCycles; extern cycle_t ActionCycles; @@ -462,6 +463,7 @@ void DThinker::RunThinkers () { int i, count; + ThinkCount = 0; ThinkCycles.Reset(); BotSupportCycles.Reset(); ActionCycles.Reset(); @@ -525,6 +527,7 @@ int DThinker::TickThinkers (FThinkerList *list, FThinkerList *dest) if (!(node->ObjectFlags & OF_EuthanizeMe)) { // Only tick thinkers not scheduled for destruction + ThinkCount++; node->CallTick(); node->ObjectFlags &= ~OF_JustSpawned; GC::CheckGC(); @@ -753,6 +756,6 @@ DEFINE_ACTION_FUNCTION(DThinkerIterator, Reinit) ADD_STAT (think) { FString out; - out.Format ("Think time = %04.2f ms, Action = %04.2f ms", ThinkCycles.TimeMS(), ActionCycles.TimeMS()); + out.Format ("Think time = %04.2f ms - %d thinkers, Action = %04.2f ms", ThinkCycles.TimeMS(), ThinkCount, ActionCycles.TimeMS()); return out; } From 54764c136d7f425134853e7f077a7f4b1baa8394 Mon Sep 17 00:00:00 2001 From: Christoph Oelckers Date: Sat, 1 Apr 2017 12:59:58 +0200 Subject: [PATCH 09/12] - changed the 2D draw abort check to use a dedicated variable, that gets set in Begin2D and unset at the end of D_Display. This is really all the extent where 2D draw operations may be allowed. Trying to detect this from other variables is not reliable. --- src/d_main.cpp | 2 +- src/gl/system/gl_framebuffer.cpp | 3 ++- src/gl/system/gl_swframebuffer.h | 1 - src/v_video.cpp | 1 + src/v_video.h | 4 +++- src/win32/fb_d3d9.cpp | 2 +- src/win32/win32swiface.h | 1 - 7 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/d_main.cpp b/src/d_main.cpp index 80a37b21e..21433acdb 100644 --- a/src/d_main.cpp +++ b/src/d_main.cpp @@ -937,7 +937,7 @@ void D_Display () I_FreezeTime(false); GSnd->SetSfxPaused(false, 1); } - + screen->End2D(); cycles.Unclock(); FrameCycles = cycles; } diff --git a/src/gl/system/gl_framebuffer.cpp b/src/gl/system/gl_framebuffer.cpp index 08afb72ae..cef60ed7a 100644 --- a/src/gl/system/gl_framebuffer.cpp +++ b/src/gl/system/gl_framebuffer.cpp @@ -359,8 +359,9 @@ FNativePalette *OpenGLFrameBuffer::CreatePalette(FRemapTable *remap) // // //========================================================================== -bool OpenGLFrameBuffer::Begin2D(bool) +bool OpenGLFrameBuffer::Begin2D(bool copy3d) { + Super::Begin2D(copy3d); ClearClipRect(); gl_RenderState.mViewMatrix.loadIdentity(); gl_RenderState.mProjectionMatrix.ortho(0, GetWidth(), GetHeight(), 0, -1.0f, 1.0f); diff --git a/src/gl/system/gl_swframebuffer.h b/src/gl/system/gl_swframebuffer.h index 04f961493..c37cf6bf0 100644 --- a/src/gl/system/gl_swframebuffer.h +++ b/src/gl/system/gl_swframebuffer.h @@ -33,7 +33,6 @@ public: OpenGLSWFrameBuffer(void *hMonitor, int width, int height, int bits, int refreshHz, bool fullscreen, bool bgra); ~OpenGLSWFrameBuffer(); - bool HasBegun2D() override { return In2D || IsLocked(); } bool IsValid() override; bool Lock(bool buffered) override; void Unlock() override; diff --git a/src/v_video.cpp b/src/v_video.cpp index 3073e259c..f27c0c499 100644 --- a/src/v_video.cpp +++ b/src/v_video.cpp @@ -1122,6 +1122,7 @@ void DFrameBuffer::SetBlendingRect (int x1, int y1, int x2, int y2) bool DFrameBuffer::Begin2D (bool copy3d) { + isIn2D = true; ClearClipRect(); return false; } diff --git a/src/v_video.h b/src/v_video.h index d836e1a1f..1ea7c7bbf 100644 --- a/src/v_video.h +++ b/src/v_video.h @@ -413,9 +413,10 @@ public: // avoid copying the software buffer to the screen. // Returns true if hardware-accelerated 2D has been entered, false if not. virtual bool Begin2D(bool copy3d); + void End2D() { isIn2D = false; } // Returns true if Begin2D has been called and 2D drawing is now active - virtual bool HasBegun2D() { return IsLocked(); } + bool HasBegun2D() { return isIn2D; } // DrawTexture calls after Begin2D use native textures. @@ -463,6 +464,7 @@ protected: private: uint32_t LastMS, LastSec, FrameCount, LastCount, LastTic; + bool isIn2D = false; }; diff --git a/src/win32/fb_d3d9.cpp b/src/win32/fb_d3d9.cpp index 332e2416d..85331a347 100644 --- a/src/win32/fb_d3d9.cpp +++ b/src/win32/fb_d3d9.cpp @@ -2557,7 +2557,7 @@ bool D3DPal::Update() bool D3DFB::Begin2D(bool copy3d) { - ClearClipRect(); + Super::Begin2D(copy3d); if (!Accel2D) { return false; diff --git a/src/win32/win32swiface.h b/src/win32/win32swiface.h index 7cf8728e0..5c0a456e4 100644 --- a/src/win32/win32swiface.h +++ b/src/win32/win32swiface.h @@ -105,7 +105,6 @@ public: D3DFB (UINT adapter, int width, int height, bool bgra, bool fullscreen); ~D3DFB (); - bool HasBegun2D() override { return In2D || IsLocked(); } bool IsValid (); bool Lock (bool buffered); void Unlock (); From 699d4882d5374d13bda42352b9ae202c14a40ad4 Mon Sep 17 00:00:00 2001 From: Christoph Oelckers Date: Sat, 1 Apr 2017 13:08:45 +0200 Subject: [PATCH 10/12] - fixed: The ammo display in Doom should not be drawn if the current weapon does not use ammo. --- wadsrc/static/zscript/statusbar/doom_sbar.txt | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/wadsrc/static/zscript/statusbar/doom_sbar.txt b/wadsrc/static/zscript/statusbar/doom_sbar.txt index f5ac66ac2..4b61a169e 100644 --- a/wadsrc/static/zscript/statusbar/doom_sbar.txt +++ b/wadsrc/static/zscript/statusbar/doom_sbar.txt @@ -48,10 +48,8 @@ class DoomStatusBar : BaseStatusBar DrawImage("STTPRCNT", (90, 171), DI_ITEM_OFFSETS); DrawImage("STTPRCNT", (221, 171), DI_ITEM_OFFSETS); - Inventory a1, a2; - int amt1; - [a1, a2, amt1] = GetCurrentAmmo(); - DrawString(mHUDFont, FormatNumber(amt1, 3), (44, 171), DI_TEXT_ALIGN_RIGHT|DI_NOSHADOW); + Inventory a1 = GetCurrentAmmo(); + if (a1 != null) DrawString(mHUDFont, FormatNumber(a1.Amount, 3), (44, 171), DI_TEXT_ALIGN_RIGHT|DI_NOSHADOW); DrawString(mHUDFont, FormatNumber(CPlayer.health, 3), (90, 171), DI_TEXT_ALIGN_RIGHT|DI_NOSHADOW); DrawString(mHUDFont, FormatNumber(GetArmorAmount(), 3), (221, 171), DI_TEXT_ALIGN_RIGHT|DI_NOSHADOW); @@ -76,7 +74,7 @@ class DoomStatusBar : BaseStatusBar else image = ""; DrawImage(image, (239, 191), DI_ITEM_OFFSETS); - int maxamt; + int amt1, maxamt; [amt1, maxamt] = GetAmount("Clip"); DrawString(mIndexFont, FormatNumber(amt1, 3), (288, 173), DI_TEXT_ALIGN_RIGHT); DrawString(mIndexFont, FormatNumber(maxamt, 3), (314, 173), DI_TEXT_ALIGN_RIGHT); From 7dae43bf7f0a04cc93c69d8a416ed5c82493337d Mon Sep 17 00:00:00 2001 From: Christoph Oelckers Date: Sat, 1 Apr 2017 13:18:19 +0200 Subject: [PATCH 11/12] - fixed: The health chain in Hexen wasn't drawn on the main status bar. Did some reordering to ensure it won't get skipped. --- .../static/zscript/statusbar/hexen_sbar.txt | 25 +++++++++---------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/wadsrc/static/zscript/statusbar/hexen_sbar.txt b/wadsrc/static/zscript/statusbar/hexen_sbar.txt index d1418763f..35475a258 100644 --- a/wadsrc/static/zscript/statusbar/hexen_sbar.txt +++ b/wadsrc/static/zscript/statusbar/hexen_sbar.txt @@ -104,13 +104,24 @@ class HexenStatusBar : BaseStatusBar protected void DrawMainBar (double TicFrac) { DrawImage("H2BAR", (0, 134), DI_ITEM_OFFSETS); + + String Gem; + if (CPlayer.mo is "ClericPlayer") Gem = "LIFEGMC2"; + else if (CPlayer.mo is "MagePlayer") Gem = "LIFEGMM2"; + else Gem = "LIFEGMF2"; + + int inthealth = mHealthInterpolator2.GetValue(); + DrawGem("CHAIN", "LIFEGMF2", inthealth, CPlayer.mo.GetMaxHealth(true), (30, 193), -23, 49, 15, (multiplayer? DI_TRANSLATABLE : 0) | DI_ITEM_LEFT_TOP); + + DrawImage("LFEDGE", (0, 192), DI_ITEM_OFFSETS); + DrawImage("RTEDGE", (277, 192), DI_ITEM_OFFSETS); + if (!automapactive) { if (isInventoryBarVisible()) { DrawImage("INVBAR", (38, 162), DI_ITEM_OFFSETS); DrawInventoryBar(diparms_sbar, (52, 163), 7, DI_ITEM_LEFT_TOP, HX_SHADOW); - return; } else { @@ -229,7 +240,6 @@ class HexenStatusBar : BaseStatusBar } else // automap { - DrawImage("H2BAR", (0, 134), DI_ITEM_OFFSETS); DrawImage("KEYBAR", (38, 162), DI_ITEM_OFFSETS); int cnt = 0; Vector2 keypos = (46, 164); @@ -247,17 +257,6 @@ class HexenStatusBar : BaseStatusBar DrawHexenArmor(HEXENARMOR_HELM, "ARMSLOT3", (212, 164), DI_ITEM_OFFSETS); DrawHexenArmor(HEXENARMOR_AMULET, "ARMSLOT4", (243, 164), DI_ITEM_OFFSETS); } - - String Gem; - if (CPlayer.mo is "ClericPlayer") Gem = "LIFEGMC2"; - else if (CPlayer.mo is "MagePlayer") Gem = "LIFEGMM2"; - else Gem = "LIFEGMF2"; - - int inthealth = mHealthInterpolator2.GetValue(); - DrawGem("CHAIN", "LIFEGMF2", inthealth, CPlayer.mo.GetMaxHealth(true), (30, 193), -23, 49, 15, (multiplayer? DI_TRANSLATABLE : 0) | DI_ITEM_LEFT_TOP); - - DrawImage("LFEDGE", (0, 193), DI_ITEM_OFFSETS); - DrawImage("RTEDGE", (277, 193), DI_ITEM_OFFSETS); } } From 553906b186de9dc8e58c0ab5509e8db2ca2eeab9 Mon Sep 17 00:00:00 2001 From: Christoph Oelckers Date: Sat, 1 Apr 2017 13:33:42 +0200 Subject: [PATCH 12/12] - fixed: BaseStatusbar::GetAmount returned the default item's amount if the player held none of the given type. This should only be done for MaxAmount. --- wadsrc/static/zscript/statusbar/statusbar.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wadsrc/static/zscript/statusbar/statusbar.txt b/wadsrc/static/zscript/statusbar/statusbar.txt index 0af3442d2..434d4bd6a 100644 --- a/wadsrc/static/zscript/statusbar/statusbar.txt +++ b/wadsrc/static/zscript/statusbar/statusbar.txt @@ -452,7 +452,7 @@ class BaseStatusBar native ui int, int GetAmount(class item) { let it = CPlayer.mo.FindInventory(item); - int ret1 = it? it.Amount : GetDefaultByType(item).Amount; + int ret1 = it? it.Amount : 0; int ret2 = it? it.MaxAmount : GetDefaultByType(item).MaxAmount; return ret1, ret2; }