/* ** Triangle drawers ** Copyright (c) 2016 Magnus Norddahl ** ** This software is provided 'as-is', without any express or implied ** warranty. In no event will the authors be held liable for any damages ** arising from the use of this software. ** ** Permission is granted to anyone to use this software for any purpose, ** including commercial applications, and to alter it and redistribute it ** freely, subject to the following restrictions: ** ** 1. The origin of this software must not be misrepresented; you must not ** claim that you wrote the original software. If you use this software ** in a product, an acknowledgment in the product documentation would be ** appreciated but is not required. ** 2. Altered source versions must be plainly marked as such, and must not be ** misrepresented as being the original software. ** 3. This notice may not be removed or altered from any source distribution. ** */ #include #include "templates.h" #include "doomdef.h" #include "i_system.h" #include "w_wad.h" #include "v_video.h" #include "doomstat.h" #include "st_stuff.h" #include "g_game.h" #include "g_level.h" #include "r_data/r_translate.h" #include "v_palette.h" #include "r_data/colormaps.h" #include "poly_triangle.h" #include "swrenderer/drawers/r_draw_rgba.h" #include "screen_triangle.h" #ifndef NO_SSE #include "poly_drawer32_sse2.h" #endif #include "poly_drawer8.h" class TriangleBlock { public: TriangleBlock(const TriDrawTriangleArgs *args); void Loop(const TriDrawTriangleArgs *args, WorkerThreadData *thread); private: // Block size, standard 8x8 (must be power of two) static const int q = 8; // Deltas int DX12, DX23, DX31; int DY12, DY23, DY31; // Fixed-point deltas int FDX12, FDX23, FDX31; int FDY12, FDY23, FDY31; // Half-edge constants int C1, C2, C3; // Stencil buffer int stencilPitch; uint8_t * RESTRICT stencilValues; uint32_t * RESTRICT stencilMasks; uint8_t stencilTestValue; uint32_t stencilWriteValue; // Viewport clipping int clipright; int clipbottom; // Subsector buffer uint32_t * RESTRICT subsectorGBuffer; uint32_t subsectorDepth; int32_t subsectorPitch; // Triangle bounding block int minx, miny; int maxx, maxy; // Active block int X, Y; uint32_t Mask0, Mask1; #ifndef NO_SSE __m128i mFDY12Offset; __m128i mFDY23Offset; __m128i mFDY31Offset; __m128i mFDY12x4; __m128i mFDY23x4; __m128i mFDY31x4; __m128i mFDX12; __m128i mFDX23; __m128i mFDX31; #endif void CoverageTest(); void StencilEqualTest(); void StencilGreaterEqualTest(); void SubsectorTest(); void ClipTest(); void StencilWrite(); void SubsectorWrite(); }; TriangleBlock::TriangleBlock(const TriDrawTriangleArgs *args) { const TriVertex &v1 = *args->v1; const TriVertex &v2 = *args->v2; const TriVertex &v3 = *args->v3; clipright = args->clipright; clipbottom = args->clipbottom; stencilPitch = args->stencilPitch; stencilValues = args->stencilValues; stencilMasks = args->stencilMasks; stencilTestValue = args->uniforms->StencilTestValue(); stencilWriteValue = args->uniforms->StencilWriteValue(); subsectorGBuffer = args->subsectorGBuffer; subsectorDepth = args->uniforms->SubsectorDepth(); subsectorPitch = args->pitch; // 28.4 fixed-point coordinates #ifdef NO_SSE const int Y1 = (int)round(16.0f * v1.y); const int Y2 = (int)round(16.0f * v2.y); const int Y3 = (int)round(16.0f * v3.y); const int X1 = (int)round(16.0f * v1.x); const int X2 = (int)round(16.0f * v2.x); const int X3 = (int)round(16.0f * v3.x); #else int tempround[4 * 3]; __m128 m16 = _mm_set1_ps(16.0f); __m128 mhalf = _mm_set1_ps(65536.5f); __m128i m65536 = _mm_set1_epi32(65536); _mm_storeu_si128((__m128i*)tempround, _mm_sub_epi32(_mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_loadu_ps((const float*)&v1), m16), mhalf)), m65536)); _mm_storeu_si128((__m128i*)(tempround + 4), _mm_sub_epi32(_mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_loadu_ps((const float*)&v2), m16), mhalf)), m65536)); _mm_storeu_si128((__m128i*)(tempround + 8), _mm_sub_epi32(_mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_loadu_ps((const float*)&v3), m16), mhalf)), m65536)); const int X1 = tempround[0]; const int X2 = tempround[4]; const int X3 = tempround[8]; const int Y1 = tempround[1]; const int Y2 = tempround[5]; const int Y3 = tempround[9]; #endif // Deltas DX12 = X1 - X2; DX23 = X2 - X3; DX31 = X3 - X1; DY12 = Y1 - Y2; DY23 = Y2 - Y3; DY31 = Y3 - Y1; // Fixed-point deltas FDX12 = DX12 << 4; FDX23 = DX23 << 4; FDX31 = DX31 << 4; FDY12 = DY12 << 4; FDY23 = DY23 << 4; FDY31 = DY31 << 4; // Bounding rectangle minx = MAX((MIN(MIN(X1, X2), X3) + 0xF) >> 4, 0); maxx = MIN((MAX(MAX(X1, X2), X3) + 0xF) >> 4, clipright - 1); miny = MAX((MIN(MIN(Y1, Y2), Y3) + 0xF) >> 4, 0); maxy = MIN((MAX(MAX(Y1, Y2), Y3) + 0xF) >> 4, clipbottom - 1); if (minx >= maxx || miny >= maxy) { return; } // Start in corner of 8x8 block minx &= ~(q - 1); miny &= ~(q - 1); // Half-edge constants C1 = DY12 * X1 - DX12 * Y1; C2 = DY23 * X2 - DX23 * Y2; C3 = DY31 * X3 - DX31 * Y3; // Correct for fill convention if (DY12 < 0 || (DY12 == 0 && DX12 > 0)) C1++; if (DY23 < 0 || (DY23 == 0 && DX23 > 0)) C2++; if (DY31 < 0 || (DY31 == 0 && DX31 > 0)) C3++; #ifndef NO_SSE mFDY12Offset = _mm_setr_epi32(0, FDY12, FDY12 * 2, FDY12 * 3); mFDY23Offset = _mm_setr_epi32(0, FDY23, FDY23 * 2, FDY23 * 3); mFDY31Offset = _mm_setr_epi32(0, FDY31, FDY31 * 2, FDY31 * 3); mFDY12x4 = _mm_set1_epi32(FDY12 * 4); mFDY23x4 = _mm_set1_epi32(FDY23 * 4); mFDY31x4 = _mm_set1_epi32(FDY31 * 4); mFDX12 = _mm_set1_epi32(FDX12); mFDX23 = _mm_set1_epi32(FDX23); mFDX31 = _mm_set1_epi32(FDX31); #endif } void TriangleBlock::Loop(const TriDrawTriangleArgs *args, WorkerThreadData *thread) { // First block line for this thread int core = thread->core; int num_cores = thread->num_cores; int core_skip = (num_cores - ((miny / q) - core) % num_cores) % num_cores; int start_miny = miny + core_skip * q; bool subsectorTest = args->uniforms->SubsectorTest(); bool writeColor = args->uniforms->WriteColor(); bool writeStencil = args->uniforms->WriteStencil(); bool writeSubsector = args->uniforms->WriteSubsector(); int bmode = (int)args->uniforms->BlendMode(); auto drawFunc = args->destBgra ? ScreenTriangle::TriDrawers32[bmode] : ScreenTriangle::TriDrawers8[bmode]; // Loop through blocks for (int y = start_miny; y < maxy; y += q * num_cores) { for (int x = minx; x < maxx; x += q) { X = x; Y = y; CoverageTest(); if (Mask0 == 0 && Mask1 == 0) continue; ClipTest(); if (Mask0 == 0 && Mask1 == 0) continue; // To do: make the stencil test use its own flag for comparison mode instead of abusing the subsector test.. if (!subsectorTest) { StencilEqualTest(); if (Mask0 == 0 && Mask1 == 0) continue; } else { StencilGreaterEqualTest(); if (Mask0 == 0 && Mask1 == 0) continue; SubsectorTest(); if (Mask0 == 0 && Mask1 == 0) continue; } if (writeColor) drawFunc(X, Y, Mask0, Mask1, args); if (writeStencil) StencilWrite(); if (writeSubsector) SubsectorWrite(); } } } #ifdef NO_SSE void TriangleBlock::SubsectorTest() { uint32_t *subsector = subsectorGBuffer + X + Y * subsectorPitch; uint32_t mask0 = 0; uint32_t mask1 = 0; for (int iy = 0; iy < 4; iy++) { for (int ix = 0; ix < q; ix++) { bool covered = subsector[ix] >= subsectorDepth; mask0 <<= 1; mask0 |= (uint32_t)covered; } subsector += subsectorPitch; } for (int iy = 4; iy < q; iy++) { for (int ix = 0; ix < q; ix++) { bool covered = subsector[ix] >= subsectorDepth; mask1 <<= 1; mask1 |= (uint32_t)covered; } subsector += subsectorPitch; } Mask0 = Mask0 & mask0; Mask1 = Mask1 & mask1; } #else void TriangleBlock::SubsectorTest() { uint32_t *subsector = subsectorGBuffer + X + Y * subsectorPitch; uint32_t mask0 = 0; uint32_t mask1 = 0; __m128i msubsectorDepth = _mm_set1_epi32(subsectorDepth); __m128i mnotxor = _mm_set1_epi32(0xffffffff); for (int iy = 0; iy < 4; iy++) { mask0 <<= 4; mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_xor_si128(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)subsector), msubsectorDepth), mnotxor), _MM_SHUFFLE(0, 1, 2, 3)))); mask0 <<= 4; mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_xor_si128(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(subsector + 4)), msubsectorDepth), mnotxor), _MM_SHUFFLE(0, 1, 2, 3)))); subsector += subsectorPitch; } for (int iy = 4; iy < q; iy++) { mask1 <<= 4; mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_xor_si128(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)subsector), msubsectorDepth), mnotxor), _MM_SHUFFLE(0, 1, 2, 3)))); mask1 <<= 4; mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_xor_si128(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(subsector + 4)), msubsectorDepth), mnotxor), _MM_SHUFFLE(0, 1, 2, 3)))); subsector += subsectorPitch; } Mask0 = Mask0 & mask0; Mask1 = Mask1 & mask1; } #endif void TriangleBlock::ClipTest() { static const uint32_t clipxmask[8] = { 0, 0x80808080, 0xc0c0c0c0, 0xe0e0e0e0, 0xf0f0f0f0, 0xf8f8f8f8, 0xfcfcfcfc, 0xfefefefe }; static const uint32_t clipymask[8] = { 0, 0xff000000, 0xffff0000, 0xffffff00, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }; uint32_t xmask = (X + 8 <= clipright) ? 0xffffffff : clipxmask[clipright - X]; uint32_t ymask0 = (Y + 4 <= clipbottom) ? 0xffffffff : clipymask[clipbottom - Y]; uint32_t ymask1 = (Y + 8 <= clipbottom) ? 0xffffffff : clipymask[clipbottom - Y - 4]; Mask0 = Mask0 & xmask & ymask0; Mask1 = Mask1 & xmask & ymask1; } #ifdef NO_SSE void TriangleBlock::StencilEqualTest() { // Stencil test the whole block, if possible int block = (X >> 3) + (Y >> 3) * stencilPitch; uint8_t *stencilBlock = &stencilValues[block * 64]; uint32_t *stencilBlockMask = &stencilMasks[block]; bool blockIsSingleStencil = ((*stencilBlockMask) & 0xffffff00) == 0xffffff00; bool skipBlock = blockIsSingleStencil && ((*stencilBlockMask) & 0xff) != stencilTestValue; if (skipBlock) { Mask0 = 0; Mask1 = 0; } else if (!blockIsSingleStencil) { uint32_t mask0 = 0; uint32_t mask1 = 0; for (int iy = 0; iy < 4; iy++) { for (int ix = 0; ix < q; ix++) { bool passStencilTest = stencilBlock[ix + iy * q] == stencilTestValue; mask0 <<= 1; mask0 |= (uint32_t)passStencilTest; } } for (int iy = 4; iy < q; iy++) { for (int ix = 0; ix < q; ix++) { bool passStencilTest = stencilBlock[ix + iy * q] == stencilTestValue; mask1 <<= 1; mask1 |= (uint32_t)passStencilTest; } } Mask0 = Mask0 & mask0; Mask1 = Mask1 & mask1; } } #else void TriangleBlock::StencilEqualTest() { // Stencil test the whole block, if possible int block = (X >> 3) + (Y >> 3) * stencilPitch; uint8_t *stencilBlock = &stencilValues[block * 64]; uint32_t *stencilBlockMask = &stencilMasks[block]; bool blockIsSingleStencil = ((*stencilBlockMask) & 0xffffff00) == 0xffffff00; bool skipBlock = blockIsSingleStencil && ((*stencilBlockMask) & 0xff) != stencilTestValue; if (skipBlock) { Mask0 = 0; Mask1 = 0; } else if (!blockIsSingleStencil) { __m128i mstencilTestValue = _mm_set1_epi16(stencilTestValue); uint32_t mask0 = 0; uint32_t mask1 = 0; for (int iy = 0; iy < 4; iy++) { __m128i mstencilBlock = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(stencilBlock + iy * 8)), _mm_setzero_si128()); __m128i mstencilTest = _mm_cmpeq_epi16(mstencilBlock, mstencilTestValue); __m128i mstencilTest0 = _mm_unpacklo_epi16(mstencilTest, mstencilTest); __m128i mstencilTest1 = _mm_unpackhi_epi16(mstencilTest, mstencilTest); mask0 <<= 4; mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mstencilTest0, _MM_SHUFFLE(0, 1, 2, 3)))); mask0 <<= 4; mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mstencilTest1, _MM_SHUFFLE(0, 1, 2, 3)))); } for (int iy = 4; iy < q; iy++) { __m128i mstencilBlock = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(stencilBlock + iy * 8)), _mm_setzero_si128()); __m128i mstencilTest = _mm_cmpeq_epi16(mstencilBlock, mstencilTestValue); __m128i mstencilTest0 = _mm_unpacklo_epi16(mstencilTest, mstencilTest); __m128i mstencilTest1 = _mm_unpackhi_epi16(mstencilTest, mstencilTest); mask1 <<= 4; mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mstencilTest0, _MM_SHUFFLE(0, 1, 2, 3)))); mask1 <<= 4; mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mstencilTest1, _MM_SHUFFLE(0, 1, 2, 3)))); } Mask0 = Mask0 & mask0; Mask1 = Mask1 & mask1; } } #endif void TriangleBlock::StencilGreaterEqualTest() { // Stencil test the whole block, if possible int block = (X >> 3) + (Y >> 3) * stencilPitch; uint8_t *stencilBlock = &stencilValues[block * 64]; uint32_t *stencilBlockMask = &stencilMasks[block]; bool blockIsSingleStencil = ((*stencilBlockMask) & 0xffffff00) == 0xffffff00; bool skipBlock = blockIsSingleStencil && ((*stencilBlockMask) & 0xff) < stencilTestValue; if (skipBlock) { Mask0 = 0; Mask1 = 0; } else if (!blockIsSingleStencil) { uint32_t mask0 = 0; uint32_t mask1 = 0; for (int iy = 0; iy < 4; iy++) { for (int ix = 0; ix < q; ix++) { bool passStencilTest = stencilBlock[ix + iy * q] >= stencilTestValue; mask0 <<= 1; mask0 |= (uint32_t)passStencilTest; } } for (int iy = 4; iy < q; iy++) { for (int ix = 0; ix < q; ix++) { bool passStencilTest = stencilBlock[ix + iy * q] >= stencilTestValue; mask1 <<= 1; mask1 |= (uint32_t)passStencilTest; } } Mask0 = Mask0 & mask0; Mask1 = Mask1 & mask1; } } #ifdef NO_SSE void TriangleBlock::CoverageTest() { // Corners of block int x0 = X << 4; int x1 = (X + q - 1) << 4; int y0 = Y << 4; int y1 = (Y + q - 1) << 4; // Evaluate half-space functions bool a00 = C1 + DX12 * y0 - DY12 * x0 > 0; bool a10 = C1 + DX12 * y0 - DY12 * x1 > 0; bool a01 = C1 + DX12 * y1 - DY12 * x0 > 0; bool a11 = C1 + DX12 * y1 - DY12 * x1 > 0; int a = (a00 << 0) | (a10 << 1) | (a01 << 2) | (a11 << 3); bool b00 = C2 + DX23 * y0 - DY23 * x0 > 0; bool b10 = C2 + DX23 * y0 - DY23 * x1 > 0; bool b01 = C2 + DX23 * y1 - DY23 * x0 > 0; bool b11 = C2 + DX23 * y1 - DY23 * x1 > 0; int b = (b00 << 0) | (b10 << 1) | (b01 << 2) | (b11 << 3); bool c00 = C3 + DX31 * y0 - DY31 * x0 > 0; bool c10 = C3 + DX31 * y0 - DY31 * x1 > 0; bool c01 = C3 + DX31 * y1 - DY31 * x0 > 0; bool c11 = C3 + DX31 * y1 - DY31 * x1 > 0; int c = (c00 << 0) | (c10 << 1) | (c01 << 2) | (c11 << 3); if (a == 0 || b == 0 || c == 0) // Skip block when outside an edge { Mask0 = 0; Mask1 = 0; } else if (a == 0xf && b == 0xf && c == 0xf) // Accept whole block when totally covered { Mask0 = 0xffffffff; Mask1 = 0xffffffff; } else // Partially covered block { x0 = X << 4; x1 = (X + q - 1) << 4; int CY1 = C1 + DX12 * y0 - DY12 * x0; int CY2 = C2 + DX23 * y0 - DY23 * x0; int CY3 = C3 + DX31 * y0 - DY31 * x0; uint32_t mask0 = 0; uint32_t mask1 = 0; for (int iy = 0; iy < 4; iy++) { int CX1 = CY1; int CX2 = CY2; int CX3 = CY3; for (int ix = 0; ix < q; ix++) { bool covered = CX1 > 0 && CX2 > 0 && CX3 > 0; mask0 <<= 1; mask0 |= (uint32_t)covered; CX1 -= FDY12; CX2 -= FDY23; CX3 -= FDY31; } CY1 += FDX12; CY2 += FDX23; CY3 += FDX31; } for (int iy = 4; iy < q; iy++) { int CX1 = CY1; int CX2 = CY2; int CX3 = CY3; for (int ix = 0; ix < q; ix++) { bool covered = CX1 > 0 && CX2 > 0 && CX3 > 0; mask1 <<= 1; mask1 |= (uint32_t)covered; CX1 -= FDY12; CX2 -= FDY23; CX3 -= FDY31; } CY1 += FDX12; CY2 += FDX23; CY3 += FDX31; } Mask0 = mask0; Mask1 = mask1; } } #else void TriangleBlock::CoverageTest() { // Corners of block int x0 = X << 4; int x1 = (X + q - 1) << 4; int y0 = Y << 4; int y1 = (Y + q - 1) << 4; // Evaluate half-space functions bool a00 = C1 + DX12 * y0 - DY12 * x0 > 0; bool a10 = C1 + DX12 * y0 - DY12 * x1 > 0; bool a01 = C1 + DX12 * y1 - DY12 * x0 > 0; bool a11 = C1 + DX12 * y1 - DY12 * x1 > 0; int a = (a00 << 0) | (a10 << 1) | (a01 << 2) | (a11 << 3); bool b00 = C2 + DX23 * y0 - DY23 * x0 > 0; bool b10 = C2 + DX23 * y0 - DY23 * x1 > 0; bool b01 = C2 + DX23 * y1 - DY23 * x0 > 0; bool b11 = C2 + DX23 * y1 - DY23 * x1 > 0; int b = (b00 << 0) | (b10 << 1) | (b01 << 2) | (b11 << 3); bool c00 = C3 + DX31 * y0 - DY31 * x0 > 0; bool c10 = C3 + DX31 * y0 - DY31 * x1 > 0; bool c01 = C3 + DX31 * y1 - DY31 * x0 > 0; bool c11 = C3 + DX31 * y1 - DY31 * x1 > 0; int c = (c00 << 0) | (c10 << 1) | (c01 << 2) | (c11 << 3); if (a == 0 || b == 0 || c == 0) // Skip block when outside an edge { Mask0 = 0; Mask1 = 0; } else if (a == 0xf && b == 0xf && c == 0xf) // Accept whole block when totally covered { Mask0 = 0xffffffff; Mask1 = 0xffffffff; } else // Partially covered block { x0 = X << 4; x1 = (X + q - 1) << 4; int CY1 = C1 + DX12 * y0 - DY12 * x0; int CY2 = C2 + DX23 * y0 - DY23 * x0; int CY3 = C3 + DX31 * y0 - DY31 * x0; uint32_t mask0 = 0; uint32_t mask1 = 0; __m128i mCY1 = _mm_sub_epi32(_mm_set1_epi32(CY1), mFDY12Offset); __m128i mCY2 = _mm_sub_epi32(_mm_set1_epi32(CY2), mFDY23Offset); __m128i mCY3 = _mm_sub_epi32(_mm_set1_epi32(CY3), mFDY31Offset); for (int iy = 0; iy < 4; iy++) { __m128i mtest0 = _mm_cmpgt_epi32(mCY1, _mm_setzero_si128()); mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY2, _mm_setzero_si128()), mtest0); mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY3, _mm_setzero_si128()), mtest0); __m128i mtest1 = _mm_cmpgt_epi32(_mm_sub_epi32(mCY1, mFDY12x4), _mm_setzero_si128()); mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY2, mFDY23x4), _mm_setzero_si128()), mtest1); mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY3, mFDY31x4), _mm_setzero_si128()), mtest1); mCY1 = _mm_add_epi32(mCY1, mFDX12); mCY2 = _mm_add_epi32(mCY2, mFDX23); mCY3 = _mm_add_epi32(mCY3, mFDX31); mask0 <<= 4; mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mtest0, _MM_SHUFFLE(0, 1, 2, 3)))); mask0 <<= 4; mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mtest1, _MM_SHUFFLE(0, 1, 2, 3)))); } for (int iy = 4; iy < q; iy++) { __m128i mtest0 = _mm_cmpgt_epi32(mCY1, _mm_setzero_si128()); mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY2, _mm_setzero_si128()), mtest0); mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY3, _mm_setzero_si128()), mtest0); __m128i mtest1 = _mm_cmpgt_epi32(_mm_sub_epi32(mCY1, mFDY12x4), _mm_setzero_si128()); mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY2, mFDY23x4), _mm_setzero_si128()), mtest1); mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY3, mFDY31x4), _mm_setzero_si128()), mtest1); mCY1 = _mm_add_epi32(mCY1, mFDX12); mCY2 = _mm_add_epi32(mCY2, mFDX23); mCY3 = _mm_add_epi32(mCY3, mFDX31); mask1 <<= 4; mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mtest0, _MM_SHUFFLE(0, 1, 2, 3)))); mask1 <<= 4; mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mtest1, _MM_SHUFFLE(0, 1, 2, 3)))); } Mask0 = mask0; Mask1 = mask1; } } #endif void TriangleBlock::StencilWrite() { int block = (X >> 3) + (Y >> 3) * stencilPitch; uint8_t *stencilBlock = &stencilValues[block * 64]; uint32_t &stencilBlockMask = stencilMasks[block]; uint32_t writeValue = stencilWriteValue; if (Mask0 == 0xffffffff && Mask1 == 0xffffffff) { stencilBlockMask = 0xffffff00 | writeValue; } else { uint32_t mask0 = Mask0; uint32_t mask1 = Mask1; bool isSingleValue = (stencilBlockMask & 0xffffff00) == 0xffffff00; if (isSingleValue) { uint8_t value = stencilBlockMask & 0xff; for (int v = 0; v < 64; v++) stencilBlock[v] = value; stencilBlockMask = 0; } int count = 0; for (int v = 0; v < 32; v++) { if ((mask0 & (1 << 31)) || stencilBlock[v] == writeValue) { stencilBlock[v] = writeValue; count++; } mask0 <<= 1; } for (int v = 32; v < 64; v++) { if ((mask1 & (1 << 31)) || stencilBlock[v] == writeValue) { stencilBlock[v] = writeValue; count++; } mask1 <<= 1; } if (count == 64) stencilBlockMask = 0xffffff00 | writeValue; } } void TriangleBlock::SubsectorWrite() { auto pitch = subsectorPitch; uint32_t *subsector = subsectorGBuffer + X + Y * pitch; if (Mask0 == 0xffffffff && Mask1 == 0xffffffff) { for (int y = 0; y < 8; y++) { for (int x = 0; x < 8; x++) subsector[x] = subsectorDepth; subsector += pitch; } } else { uint32_t mask0 = Mask0; uint32_t mask1 = Mask1; for (int y = 0; y < 4; y++) { for (int x = 0; x < 8; x++) { if (mask0 & (1 << 31)) subsector[x] = subsectorDepth; mask0 <<= 1; } subsector += pitch; } for (int y = 4; y < 8; y++) { for (int x = 0; x < 8; x++) { if (mask1 & (1 << 31)) subsector[x] = subsectorDepth; mask1 <<= 1; } subsector += pitch; } } } void ScreenTriangle::Draw(const TriDrawTriangleArgs *args, WorkerThreadData *thread) { TriangleBlock block(args); block.Loop(args, thread); } void(*ScreenTriangle::TriDrawers8[])(int, int, uint32_t, uint32_t, const TriDrawTriangleArgs *) = { &TriScreenDrawer8::Execute, // TextureOpaque &TriScreenDrawer8::Execute, // TextureMasked &TriScreenDrawer8::Execute, // TextureAdd &TriScreenDrawer8::Execute, // TextureSub &TriScreenDrawer8::Execute, // TextureRevSub &TriScreenDrawer8::Execute, // TextureAddSrcColor &TriScreenDrawer8::Execute, // TranslatedOpaque &TriScreenDrawer8::Execute, // TranslatedMasked &TriScreenDrawer8::Execute, // TranslatedAdd &TriScreenDrawer8::Execute, // TranslatedSub &TriScreenDrawer8::Execute, // TranslatedRevSub &TriScreenDrawer8::Execute, // TranslatedAddSrcColor &TriScreenDrawer8::Execute, // Shaded &TriScreenDrawer8::Execute, // AddShaded &TriScreenDrawer8::Execute, // Stencil &TriScreenDrawer8::Execute, // AddStencil &TriScreenDrawer8::Execute, // FillOpaque &TriScreenDrawer8::Execute, // FillAdd &TriScreenDrawer8::Execute, // FillSub &TriScreenDrawer8::Execute, // FillRevSub &TriScreenDrawer8::Execute, // FillAddSrcColor &TriScreenDrawer8::Execute // Skycap }; #ifdef NO_SSE void(*ScreenTriangle::TriDrawers32[])(int, int, uint32_t, uint32_t, const TriDrawTriangleArgs *) = { nullptr }; #else void(*ScreenTriangle::TriDrawers32[])(int, int, uint32_t, uint32_t, const TriDrawTriangleArgs *) = { &TriScreenDrawer32::Execute, // TextureOpaque &TriScreenDrawer32::Execute, // TextureMasked &TriScreenDrawer32::Execute, // TextureAdd &TriScreenDrawer32::Execute, // TextureSub &TriScreenDrawer32::Execute, // TextureRevSub &TriScreenDrawer32::Execute, // TextureAddSrcColor &TriScreenDrawer32::Execute, // TranslatedOpaque &TriScreenDrawer32::Execute, // TranslatedMasked &TriScreenDrawer32::Execute, // TranslatedAdd &TriScreenDrawer32::Execute, // TranslatedSub &TriScreenDrawer32::Execute, // TranslatedRevSub &TriScreenDrawer32::Execute, // TranslatedAddSrcColor &TriScreenDrawer32::Execute, // Shaded &TriScreenDrawer32::Execute, // AddShaded &TriScreenDrawer32::Execute, // Stencil &TriScreenDrawer32::Execute, // AddStencil &TriScreenDrawer32::Execute, // FillOpaque &TriScreenDrawer32::Execute, // FillAdd &TriScreenDrawer32::Execute, // FillSub &TriScreenDrawer32::Execute, // FillRevSub &TriScreenDrawer32::Execute, // FillAddSrcColor &TriScreenDrawer32::Execute // Skycap }; #endif void(*ScreenTriangle::RectDrawers8[])(const void *, int, int, int, const RectDrawArgs *, WorkerThreadData *) = { &RectScreenDrawer8::Execute, // TextureOpaque &RectScreenDrawer8::Execute, // TextureMasked &RectScreenDrawer8::Execute, // TextureAdd &RectScreenDrawer8::Execute, // TextureSub &RectScreenDrawer8::Execute, // TextureRevSub &RectScreenDrawer8::Execute, // TextureAddSrcColor &RectScreenDrawer8::Execute, // TranslatedOpaque &RectScreenDrawer8::Execute, // TranslatedMasked &RectScreenDrawer8::Execute, // TranslatedAdd &RectScreenDrawer8::Execute, // TranslatedSub &RectScreenDrawer8::Execute, // TranslatedRevSub &RectScreenDrawer8::Execute, // TranslatedAddSrcColor &RectScreenDrawer8::Execute, // Shaded &RectScreenDrawer8::Execute, // AddShaded &RectScreenDrawer8::Execute, // Stencil &RectScreenDrawer8::Execute, // AddStencil &RectScreenDrawer8::Execute, // FillOpaque &RectScreenDrawer8::Execute, // FillAdd &RectScreenDrawer8::Execute, // FillSub &RectScreenDrawer8::Execute, // FillRevSub &RectScreenDrawer8::Execute, // FillAddSrcColor &RectScreenDrawer8::Execute // Skycap }; #ifdef NO_SSE void(*ScreenTriangle::RectDrawers32[])(const void *, int, int, int, const RectDrawArgs *, WorkerThreadData *) = { nullptr }; #else void(*ScreenTriangle::RectDrawers32[])(const void *, int, int, int, const RectDrawArgs *, WorkerThreadData *) = { &RectScreenDrawer32::Execute, // TextureOpaque &RectScreenDrawer32::Execute, // TextureMasked &RectScreenDrawer32::Execute, // TextureAdd &RectScreenDrawer32::Execute, // TextureSub &RectScreenDrawer32::Execute, // TextureRevSub &RectScreenDrawer32::Execute, // TextureAddSrcColor &RectScreenDrawer32::Execute, // TranslatedOpaque &RectScreenDrawer32::Execute, // TranslatedMasked &RectScreenDrawer32::Execute, // TranslatedAdd &RectScreenDrawer32::Execute, // TranslatedSub &RectScreenDrawer32::Execute, // TranslatedRevSub &RectScreenDrawer32::Execute, // TranslatedAddSrcColor &RectScreenDrawer32::Execute, // Shaded &RectScreenDrawer32::Execute, // AddShaded &RectScreenDrawer32::Execute, // Stencil &RectScreenDrawer32::Execute, // AddStencil &RectScreenDrawer32::Execute, // FillOpaque &RectScreenDrawer32::Execute, // FillAdd &RectScreenDrawer32::Execute, // FillSub &RectScreenDrawer32::Execute, // FillRevSub &RectScreenDrawer32::Execute, // FillAddSrcColor &RectScreenDrawer32::Execute // Skycap }; #endif