From 4a0b3c3bab8706c27d73c20a13a9775403861a8e Mon Sep 17 00:00:00 2001 From: Magnus Norddahl Date: Sat, 1 Apr 2017 16:42:24 +0200 Subject: [PATCH 1/6] - speed up triangle setup slightly --- src/polyrenderer/drawers/screen_triangle.cpp | 226 ++++++++++++++----- 1 file changed, 174 insertions(+), 52 deletions(-) diff --git a/src/polyrenderer/drawers/screen_triangle.cpp b/src/polyrenderer/drawers/screen_triangle.cpp index ea34c175c..6b3a14098 100644 --- a/src/polyrenderer/drawers/screen_triangle.cpp +++ b/src/polyrenderer/drawers/screen_triangle.cpp @@ -96,6 +96,15 @@ private: __m128i mFDX12; __m128i mFDX23; __m128i mFDX31; + __m128i mC1; + __m128i mC2; + __m128i mC3; + __m128i mDX12; + __m128i mDY12; + __m128i mDX23; + __m128i mDY23; + __m128i mDX31; + __m128i mDY31; #endif void CoverageTest(); @@ -203,6 +212,15 @@ TriangleBlock::TriangleBlock(const TriDrawTriangleArgs *args) mFDX12 = _mm_set1_epi32(FDX12); mFDX23 = _mm_set1_epi32(FDX23); mFDX31 = _mm_set1_epi32(FDX31); + mC1 = _mm_set1_epi32(C1); + mC2 = _mm_set1_epi32(C2); + mC3 = _mm_set1_epi32(C3); + mDX12 = _mm_set1_epi32(DX12); + mDY12 = _mm_set1_epi32(DY12); + mDX23 = _mm_set1_epi32(DX23); + mDY23 = _mm_set1_epi32(DY23); + mDX31 = _mm_set1_epi32(DX31); + mDY31 = _mm_set1_epi32(DY31); #endif } @@ -432,30 +450,44 @@ void TriangleBlock::StencilEqualTest() uint32_t mask0 = 0; uint32_t mask1 = 0; - for (int iy = 0; iy < 4; iy++) + for (int iy = 0; iy < 2; iy++) { - __m128i mstencilBlock = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(stencilBlock + iy * 8)), _mm_setzero_si128()); - __m128i mstencilTest = _mm_cmpeq_epi16(mstencilBlock, mstencilTestValue); + __m128i mstencilBlock = _mm_loadu_si128((const __m128i *)stencilBlock); + + __m128i mstencilTest = _mm_cmpeq_epi16(_mm_unpacklo_epi8(mstencilBlock, _mm_setzero_si128()), mstencilTestValue); __m128i mstencilTest0 = _mm_unpacklo_epi16(mstencilTest, mstencilTest); __m128i mstencilTest1 = _mm_unpackhi_epi16(mstencilTest, mstencilTest); + __m128i first = 
_mm_packs_epi32(_mm_shuffle_epi32(mstencilTest1, _MM_SHUFFLE(0, 1, 2, 3)), _mm_shuffle_epi32(mstencilTest0, _MM_SHUFFLE(0, 1, 2, 3))); - mask0 <<= 4; - mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mstencilTest0, _MM_SHUFFLE(0, 1, 2, 3)))); - mask0 <<= 4; - mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mstencilTest1, _MM_SHUFFLE(0, 1, 2, 3)))); + mstencilTest = _mm_cmpeq_epi16(_mm_unpackhi_epi8(mstencilBlock, _mm_setzero_si128()), mstencilTestValue); + mstencilTest0 = _mm_unpacklo_epi16(mstencilTest, mstencilTest); + mstencilTest1 = _mm_unpackhi_epi16(mstencilTest, mstencilTest); + __m128i second = _mm_packs_epi32(_mm_shuffle_epi32(mstencilTest1, _MM_SHUFFLE(0, 1, 2, 3)), _mm_shuffle_epi32(mstencilTest0, _MM_SHUFFLE(0, 1, 2, 3))); + + mask0 <<= 16; + mask0 |= _mm_movemask_epi8(_mm_packs_epi16(second, first)); + + stencilBlock += 16; } - for (int iy = 4; iy < q; iy++) + for (int iy = 0; iy < 2; iy++) { - __m128i mstencilBlock = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(stencilBlock + iy * 8)), _mm_setzero_si128()); - __m128i mstencilTest = _mm_cmpeq_epi16(mstencilBlock, mstencilTestValue); + __m128i mstencilBlock = _mm_loadu_si128((const __m128i *)stencilBlock); + + __m128i mstencilTest = _mm_cmpeq_epi16(_mm_unpacklo_epi8(mstencilBlock, _mm_setzero_si128()), mstencilTestValue); __m128i mstencilTest0 = _mm_unpacklo_epi16(mstencilTest, mstencilTest); __m128i mstencilTest1 = _mm_unpackhi_epi16(mstencilTest, mstencilTest); + __m128i first = _mm_packs_epi32(_mm_shuffle_epi32(mstencilTest1, _MM_SHUFFLE(0, 1, 2, 3)), _mm_shuffle_epi32(mstencilTest0, _MM_SHUFFLE(0, 1, 2, 3))); - mask1 <<= 4; - mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mstencilTest0, _MM_SHUFFLE(0, 1, 2, 3)))); - mask1 <<= 4; - mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mstencilTest1, _MM_SHUFFLE(0, 1, 2, 3)))); + mstencilTest = _mm_cmpeq_epi16(_mm_unpackhi_epi8(mstencilBlock, _mm_setzero_si128()), mstencilTestValue); + mstencilTest0 = 
_mm_unpacklo_epi16(mstencilTest, mstencilTest); + mstencilTest1 = _mm_unpackhi_epi16(mstencilTest, mstencilTest); + __m128i second = _mm_packs_epi32(_mm_shuffle_epi32(mstencilTest1, _MM_SHUFFLE(0, 1, 2, 3)), _mm_shuffle_epi32(mstencilTest0, _MM_SHUFFLE(0, 1, 2, 3))); + + mask1 <<= 16; + mask1 |= _mm_movemask_epi8(_mm_packs_epi16(second, first)); + + stencilBlock += 16; } Mask0 = Mask0 & mask0; @@ -617,50 +649,46 @@ void TriangleBlock::CoverageTest() int y0 = Y << 4; int y1 = (Y + q - 1) << 4; + __m128i mY = _mm_set_epi32(y0, y0, y1, y1); + __m128i mX = _mm_set_epi32(x0, x0, x1, x1); + // Evaluate half-space functions - bool a00 = C1 + DX12 * y0 - DY12 * x0 > 0; - bool a10 = C1 + DX12 * y0 - DY12 * x1 > 0; - bool a01 = C1 + DX12 * y1 - DY12 * x0 > 0; - bool a11 = C1 + DX12 * y1 - DY12 * x1 > 0; - int a = (a00 << 0) | (a10 << 1) | (a01 << 2) | (a11 << 3); + __m128i mCY1 = _mm_sub_epi32( + _mm_add_epi32(mC1, _mm_shuffle_epi32(_mm_mul_epu32(mDX12, mY), _MM_SHUFFLE(0, 0, 2, 2))), + _mm_shuffle_epi32(_mm_mul_epu32(mDY12, mX), _MM_SHUFFLE(0, 2, 0, 2))); + __m128i mA = _mm_cmpgt_epi32(mCY1, _mm_setzero_si128()); - bool b00 = C2 + DX23 * y0 - DY23 * x0 > 0; - bool b10 = C2 + DX23 * y0 - DY23 * x1 > 0; - bool b01 = C2 + DX23 * y1 - DY23 * x0 > 0; - bool b11 = C2 + DX23 * y1 - DY23 * x1 > 0; - int b = (b00 << 0) | (b10 << 1) | (b01 << 2) | (b11 << 3); + __m128i mCY2 = _mm_sub_epi32( + _mm_add_epi32(mC2, _mm_shuffle_epi32(_mm_mul_epu32(mDX23, mY), _MM_SHUFFLE(0, 0, 2, 2))), + _mm_shuffle_epi32(_mm_mul_epu32(mDY23, mX), _MM_SHUFFLE(0, 2, 0, 2))); + __m128i mB = _mm_cmpgt_epi32(mCY2, _mm_setzero_si128()); - bool c00 = C3 + DX31 * y0 - DY31 * x0 > 0; - bool c10 = C3 + DX31 * y0 - DY31 * x1 > 0; - bool c01 = C3 + DX31 * y1 - DY31 * x0 > 0; - bool c11 = C3 + DX31 * y1 - DY31 * x1 > 0; - int c = (c00 << 0) | (c10 << 1) | (c01 << 2) | (c11 << 3); + __m128i mCY3 = _mm_sub_epi32( + _mm_add_epi32(mC3, _mm_shuffle_epi32(_mm_mul_epu32(mDX31, mY), _MM_SHUFFLE(0, 0, 2, 2))), + 
_mm_shuffle_epi32(_mm_mul_epu32(mDY31, mX), _MM_SHUFFLE(0, 2, 0, 2))); + __m128i mC = _mm_cmpgt_epi32(mCY3, _mm_setzero_si128()); - if (a == 0 || b == 0 || c == 0) // Skip block when outside an edge + int abc = _mm_movemask_epi8(_mm_packs_epi16(_mm_packs_epi32(mA, mB), _mm_packs_epi32(mC, _mm_setzero_si128()))); + + if ((abc & 0xf) == 0 || (abc & 0xf0) == 0 || (abc & 0xf00) == 0) // Skip block when outside an edge { Mask0 = 0; Mask1 = 0; } - else if (a == 0xf && b == 0xf && c == 0xf) // Accept whole block when totally covered + else if (abc == 0xfff) // Accept whole block when totally covered { Mask0 = 0xffffffff; Mask1 = 0xffffffff; } else // Partially covered block { - x0 = X << 4; - x1 = (X + q - 1) << 4; - int CY1 = C1 + DX12 * y0 - DY12 * x0; - int CY2 = C2 + DX23 * y0 - DY23 * x0; - int CY3 = C3 + DX31 * y0 - DY31 * x0; - uint32_t mask0 = 0; uint32_t mask1 = 0; - __m128i mCY1 = _mm_sub_epi32(_mm_set1_epi32(CY1), mFDY12Offset); - __m128i mCY2 = _mm_sub_epi32(_mm_set1_epi32(CY2), mFDY23Offset); - __m128i mCY3 = _mm_sub_epi32(_mm_set1_epi32(CY3), mFDY31Offset); - for (int iy = 0; iy < 4; iy++) + mCY1 = _mm_sub_epi32(_mm_shuffle_epi32(mCY1, _MM_SHUFFLE(0, 0, 0, 0)), mFDY12Offset); + mCY2 = _mm_sub_epi32(_mm_shuffle_epi32(mCY2, _MM_SHUFFLE(0, 0, 0, 0)), mFDY23Offset); + mCY3 = _mm_sub_epi32(_mm_shuffle_epi32(mCY3, _MM_SHUFFLE(0, 0, 0, 0)), mFDY31Offset); + for (int iy = 0; iy < 2; iy++) { __m128i mtest0 = _mm_cmpgt_epi32(mCY1, _mm_setzero_si128()); mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY2, _mm_setzero_si128()), mtest0); @@ -668,18 +696,27 @@ void TriangleBlock::CoverageTest() __m128i mtest1 = _mm_cmpgt_epi32(_mm_sub_epi32(mCY1, mFDY12x4), _mm_setzero_si128()); mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY2, mFDY23x4), _mm_setzero_si128()), mtest1); mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY3, mFDY31x4), _mm_setzero_si128()), mtest1); - mCY1 = _mm_add_epi32(mCY1, mFDX12); mCY2 = _mm_add_epi32(mCY2, mFDX23); mCY3 = _mm_add_epi32(mCY3, 
mFDX31); + __m128i first = _mm_packs_epi32(_mm_shuffle_epi32(mtest1, _MM_SHUFFLE(0, 1, 2, 3)), _mm_shuffle_epi32(mtest0, _MM_SHUFFLE(0, 1, 2, 3))); - mask0 <<= 4; - mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mtest0, _MM_SHUFFLE(0, 1, 2, 3)))); - mask0 <<= 4; - mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mtest1, _MM_SHUFFLE(0, 1, 2, 3)))); + mtest0 = _mm_cmpgt_epi32(mCY1, _mm_setzero_si128()); + mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY2, _mm_setzero_si128()), mtest0); + mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY3, _mm_setzero_si128()), mtest0); + mtest1 = _mm_cmpgt_epi32(_mm_sub_epi32(mCY1, mFDY12x4), _mm_setzero_si128()); + mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY2, mFDY23x4), _mm_setzero_si128()), mtest1); + mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY3, mFDY31x4), _mm_setzero_si128()), mtest1); + mCY1 = _mm_add_epi32(mCY1, mFDX12); + mCY2 = _mm_add_epi32(mCY2, mFDX23); + mCY3 = _mm_add_epi32(mCY3, mFDX31); + __m128i second = _mm_packs_epi32(_mm_shuffle_epi32(mtest1, _MM_SHUFFLE(0, 1, 2, 3)), _mm_shuffle_epi32(mtest0, _MM_SHUFFLE(0, 1, 2, 3))); + + mask0 <<= 16; + mask0 |= _mm_movemask_epi8(_mm_packs_epi16(second, first)); } - for (int iy = 4; iy < q; iy++) + for (int iy = 0; iy < 2; iy++) { __m128i mtest0 = _mm_cmpgt_epi32(mCY1, _mm_setzero_si128()); mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY2, _mm_setzero_si128()), mtest0); @@ -687,15 +724,24 @@ void TriangleBlock::CoverageTest() __m128i mtest1 = _mm_cmpgt_epi32(_mm_sub_epi32(mCY1, mFDY12x4), _mm_setzero_si128()); mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY2, mFDY23x4), _mm_setzero_si128()), mtest1); mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY3, mFDY31x4), _mm_setzero_si128()), mtest1); - mCY1 = _mm_add_epi32(mCY1, mFDX12); mCY2 = _mm_add_epi32(mCY2, mFDX23); mCY3 = _mm_add_epi32(mCY3, mFDX31); + __m128i first = _mm_packs_epi32(_mm_shuffle_epi32(mtest1, _MM_SHUFFLE(0, 1, 2, 3)), _mm_shuffle_epi32(mtest0, _MM_SHUFFLE(0, 1, 2, 
3))); - mask1 <<= 4; - mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mtest0, _MM_SHUFFLE(0, 1, 2, 3)))); - mask1 <<= 4; - mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mtest1, _MM_SHUFFLE(0, 1, 2, 3)))); + mtest0 = _mm_cmpgt_epi32(mCY1, _mm_setzero_si128()); + mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY2, _mm_setzero_si128()), mtest0); + mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY3, _mm_setzero_si128()), mtest0); + mtest1 = _mm_cmpgt_epi32(_mm_sub_epi32(mCY1, mFDY12x4), _mm_setzero_si128()); + mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY2, mFDY23x4), _mm_setzero_si128()), mtest1); + mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY3, mFDY31x4), _mm_setzero_si128()), mtest1); + mCY1 = _mm_add_epi32(mCY1, mFDX12); + mCY2 = _mm_add_epi32(mCY2, mFDX23); + mCY3 = _mm_add_epi32(mCY3, mFDX31); + __m128i second = _mm_packs_epi32(_mm_shuffle_epi32(mtest1, _MM_SHUFFLE(0, 1, 2, 3)), _mm_shuffle_epi32(mtest0, _MM_SHUFFLE(0, 1, 2, 3))); + + mask1 <<= 16; + mask1 |= _mm_movemask_epi8(_mm_packs_epi16(second, first)); } Mask0 = mask0; @@ -755,6 +801,8 @@ void TriangleBlock::StencilWrite() } } +#ifdef NO_SSE + void TriangleBlock::SubsectorWrite() { auto pitch = subsectorPitch; @@ -796,6 +844,80 @@ void TriangleBlock::SubsectorWrite() } } +#else + +void TriangleBlock::SubsectorWrite() +{ + auto pitch = subsectorPitch; + uint32_t *subsector = subsectorGBuffer + X + Y * pitch; + __m128i msubsectorDepth = _mm_set1_epi32(subsectorDepth); + + if (Mask0 == 0xffffffff && Mask1 == 0xffffffff) + { + _mm_storeu_si128((__m128i*)subsector, msubsectorDepth); + _mm_storeu_si128((__m128i*)(subsector + 4), msubsectorDepth); subsector += pitch; + _mm_storeu_si128((__m128i*)subsector, msubsectorDepth); + _mm_storeu_si128((__m128i*)(subsector + 4), msubsectorDepth); subsector += pitch; + _mm_storeu_si128((__m128i*)subsector, msubsectorDepth); + _mm_storeu_si128((__m128i*)(subsector + 4), msubsectorDepth); subsector += pitch; + 
_mm_storeu_si128((__m128i*)subsector, msubsectorDepth); + _mm_storeu_si128((__m128i*)(subsector + 4), msubsectorDepth); subsector += pitch; + _mm_storeu_si128((__m128i*)subsector, msubsectorDepth); + _mm_storeu_si128((__m128i*)(subsector + 4), msubsectorDepth); subsector += pitch; + _mm_storeu_si128((__m128i*)subsector, msubsectorDepth); + _mm_storeu_si128((__m128i*)(subsector + 4), msubsectorDepth); subsector += pitch; + _mm_storeu_si128((__m128i*)subsector, msubsectorDepth); + _mm_storeu_si128((__m128i*)(subsector + 4), msubsectorDepth); subsector += pitch; + _mm_storeu_si128((__m128i*)subsector, msubsectorDepth); + _mm_storeu_si128((__m128i*)(subsector + 4), msubsectorDepth); + } + else + { + __m128i mxormask = _mm_set1_epi32(0xffffffff); + __m128i topfour = _mm_setr_epi32(1 << 31, 1 << 30, 1 << 29, 1 << 28); + __m128i bottomfour = _mm_setr_epi32(1 << 27, 1 << 26, 1 << 25, 1 << 24); + + __m128i mmask0 = _mm_set1_epi32(Mask0); + __m128i mmask1 = _mm_set1_epi32(Mask1); + + _mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); + _mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, bottomfour), _mm_setzero_si128()), mxormask), (char*)(subsector + 4)); + mmask0 = _mm_slli_si128(mmask0, 1); + subsector += pitch; + _mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); + _mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, bottomfour), _mm_setzero_si128()), mxormask), (char*)(subsector + 4)); + mmask0 = _mm_slli_si128(mmask0, 1); + subsector += pitch; + _mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); + _mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, bottomfour), 
_mm_setzero_si128()), mxormask), (char*)(subsector + 4)); + mmask0 = _mm_slli_si128(mmask0, 1); + subsector += pitch; + _mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); + _mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, bottomfour), _mm_setzero_si128()), mxormask), (char*)(subsector + 4)); + mmask0 = _mm_slli_si128(mmask0, 1); + subsector += pitch; + + _mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); + _mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, bottomfour), _mm_setzero_si128()), mxormask), (char*)(subsector + 4)); + mmask1 = _mm_slli_si128(mmask1, 1); + subsector += pitch; + _mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); + _mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, bottomfour), _mm_setzero_si128()), mxormask), (char*)(subsector + 4)); + mmask1 = _mm_slli_si128(mmask1, 1); + subsector += pitch; + _mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); + _mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, bottomfour), _mm_setzero_si128()), mxormask), (char*)(subsector + 4)); + mmask1 = _mm_slli_si128(mmask1, 1); + subsector += pitch; + _mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); + _mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, bottomfour), _mm_setzero_si128()), mxormask), (char*)(subsector + 4)); + mmask1 = _mm_slli_si128(mmask1, 1); + subsector += pitch; + } 
+} + +#endif + void ScreenTriangle::Draw(const TriDrawTriangleArgs *args, WorkerThreadData *thread) { TriangleBlock block(args); From dfd3535e0250b2f3e47163cb66f16766bc5a9200 Mon Sep 17 00:00:00 2001 From: Christoph Oelckers Date: Sat, 1 Apr 2017 19:46:38 +0200 Subject: [PATCH 2/6] - added a dedicated player class for streamed music formats (i.e. MP3, Ogg and Flac) The idea is to have more control on the game side instead of dealing with these formats in the backend, which was done for FMod because it already had the decoders implemented. However, with OpenAL this setup makes no sense and only complicates future extensions that can be better handled at a higher level. --- src/CMakeLists.txt | 1 + src/sound/i_music.cpp | 5 + src/sound/i_musicinterns.h | 1 + src/sound/i_sound.h | 3 +- src/sound/i_soundinternal.h | 2 +- src/sound/mpg123_decoder.cpp | 4 +- src/sound/mpg123_decoder.h | 2 +- src/sound/musicformats/music_libsndfile.cpp | 264 ++++++++++++++++++++ src/sound/oalsound.cpp | 4 +- src/sound/sndfile_decoder.cpp | 4 +- src/sound/sndfile_decoder.h | 2 +- 11 files changed, 281 insertions(+), 11 deletions(-) create mode 100644 src/sound/musicformats/music_libsndfile.cpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9af325791..dcabeb2a0 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1245,6 +1245,7 @@ set (PCH_SOURCES sound/musicformats/music_cd.cpp sound/musicformats/music_dumb.cpp sound/musicformats/music_gme.cpp + sound/musicformats/music_libsndfile.cpp sound/musicformats/music_mus_midiout.cpp sound/musicformats/music_smf_midiout.cpp sound/musicformats/music_hmi_midiout.cpp diff --git a/src/sound/i_music.cpp b/src/sound/i_music.cpp index bf3ca81de..e00fdae4a 100644 --- a/src/sound/i_music.cpp +++ b/src/sound/i_music.cpp @@ -488,6 +488,11 @@ retry_as_sndsys: { info = MOD_OpenSong(*reader); } + if (info == nullptr) + { + info = SndFile_OpenSong(*reader); + if (info != nullptr) reader = nullptr; + } if (info == NULL) { diff --git 
a/src/sound/i_musicinterns.h b/src/sound/i_musicinterns.h index ba69741a5..227464c11 100644 --- a/src/sound/i_musicinterns.h +++ b/src/sound/i_musicinterns.h @@ -672,6 +672,7 @@ MusInfo *MOD_OpenSong(FileReader &reader); const char *GME_CheckFormat(uint32_t header); MusInfo *GME_OpenSong(FileReader &reader, const char *fmt); +MusInfo *SndFile_OpenSong(FileReader &fr); // -------------------------------------------------------------------------- diff --git a/src/sound/i_sound.h b/src/sound/i_sound.h index cd58c6f64..5f1917d68 100644 --- a/src/sound/i_sound.h +++ b/src/sound/i_sound.h @@ -160,8 +160,7 @@ public: virtual MIDIDevice* CreateMIDIDevice() const = 0; -protected: - virtual SoundDecoder *CreateDecoder(FileReader *reader); + static SoundDecoder *CreateDecoder(FileReader *reader); }; extern SoundRenderer *GSnd; diff --git a/src/sound/i_soundinternal.h b/src/sound/i_soundinternal.h index 786ff5dca..bb154a195 100644 --- a/src/sound/i_soundinternal.h +++ b/src/sound/i_soundinternal.h @@ -132,7 +132,7 @@ struct SoundDecoder virtual size_t read(char *buffer, size_t bytes) = 0; virtual TArray readAll(); - virtual bool seek(size_t ms_offset) = 0; + virtual bool seek(size_t ms_offset, bool ms) = 0; virtual size_t getSampleOffset() = 0; virtual size_t getSampleLength() { return 0; } diff --git a/src/sound/mpg123_decoder.cpp b/src/sound/mpg123_decoder.cpp index 605970bc9..1aa5a0e2f 100644 --- a/src/sound/mpg123_decoder.cpp +++ b/src/sound/mpg123_decoder.cpp @@ -134,14 +134,14 @@ size_t MPG123Decoder::read(char *buffer, size_t bytes) return amt; } -bool MPG123Decoder::seek(size_t ms_offset) +bool MPG123Decoder::seek(size_t ms_offset, bool ms) { int enc, channels; long srate; if(mpg123_getformat(MPG123, &srate, &channels, &enc) == MPG123_OK) { - size_t smp_offset = (size_t)((double)ms_offset / 1000. * srate); + size_t smp_offset = ms? (size_t)((double)ms_offset / 1000. 
* srate) : ms_offset; if(mpg123_seek(MPG123, (off_t)smp_offset, SEEK_SET) >= 0) { Done = false; diff --git a/src/sound/mpg123_decoder.h b/src/sound/mpg123_decoder.h index 59e1df2ca..e1c786888 100644 --- a/src/sound/mpg123_decoder.h +++ b/src/sound/mpg123_decoder.h @@ -16,7 +16,7 @@ struct MPG123Decoder : public SoundDecoder virtual void getInfo(int *samplerate, ChannelConfig *chans, SampleType *type); virtual size_t read(char *buffer, size_t bytes); - virtual bool seek(size_t ms_offset); + virtual bool seek(size_t ms_offset, bool ms); virtual size_t getSampleOffset(); virtual size_t getSampleLength(); diff --git a/src/sound/musicformats/music_libsndfile.cpp b/src/sound/musicformats/music_libsndfile.cpp new file mode 100644 index 000000000..6676d498f --- /dev/null +++ b/src/sound/musicformats/music_libsndfile.cpp @@ -0,0 +1,264 @@ +/* +** music_libsndfile.cpp +** Uses libsndfile for streaming music formats +** +**--------------------------------------------------------------------------- +** Copyright 2017 Christoph Oelckers +** All rights reserved. +** +** Redistribution and use in source and binary forms, with or without +** modification, are permitted provided that the following conditions +** are met: +** +** 1. Redistributions of source code must retain the above copyright +** notice, this list of conditions and the following disclaimer. +** 2. Redistributions in binary form must reproduce the above copyright +** notice, this list of conditions and the following disclaimer in the +** documentation and/or other materials provided with the distribution. +** 3. The name of the author may not be used to endorse or promote products +** derived from this software without specific prior written permission. +** +** THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR +** IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +** OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+** IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, +** INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT +** NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +** DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +** THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +** (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +** THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**--------------------------------------------------------------------------- +** +*/ + +// HEADER FILES ------------------------------------------------------------ + +#include "i_musicinterns.h" +#include "c_cvars.h" +#include "critsec.h" +#include "v_text.h" +#include "files.h" +#include "templates.h" +#include "sndfile_decoder.h" +#include "mpg123_decoder.h" + +// MACROS ------------------------------------------------------------------ + +// TYPES ------------------------------------------------------------------- + +class SndFileSong : public StreamSong +{ +public: + SndFileSong(FileReader *reader, SoundDecoder *decoder, uint32_t loop_start, uint32_t loop_end); + ~SndFileSong(); + bool SetSubsong(int subsong); + void Play(bool looping, int subsong); + FString GetStats(); + +protected: + FCriticalSection CritSec; + FileReader *Reader; + SoundDecoder *Decoder; + int Channels; + int SampleRate; + + uint32_t Loop_Start; + uint32_t Loop_End; + + int CalcSongLength(); + + static bool Read(SoundStream *stream, void *buff, int len, void *userdata); +}; + +// EXTERNAL FUNCTION PROTOTYPES -------------------------------------------- + +// PUBLIC FUNCTION PROTOTYPES ---------------------------------------------- + +// PRIVATE FUNCTION PROTOTYPES --------------------------------------------- + +// EXTERNAL DATA DECLARATIONS ---------------------------------------------- + +// PUBLIC DATA DEFINITIONS ------------------------------------------------- + +// PRIVATE 
DATA DEFINITIONS ------------------------------------------------ + +// CODE -------------------------------------------------------------------- + +//========================================================================== +// +// GME_OpenSong +// +//========================================================================== + +MusInfo *SndFile_OpenSong(FileReader &fr) +{ + uint8_t signature[4]; + + fr.Seek(0, SEEK_SET); + fr.Read(signature, 4); + uint32_t loop_start = 0, loop_end = ~0u; + + if (!memcmp(signature, "OggS", 4) || !memcmp(signature, "fLaC", 4)) + { + // Todo: Read loop points from metadata + + + // ms to samples. + //size_t smp_offset = ms? (size_t)((double)ms_offset / 1000. * SndInfo.samplerate) : ms_offset; + + } + fr.Seek(0, SEEK_SET); + auto decoder = SoundRenderer::CreateDecoder(&fr); + if (decoder == nullptr) return nullptr; + return new SndFileSong(&fr, decoder, loop_start, loop_end); +} + +//========================================================================== +// +// SndFileSong - Constructor +// +//========================================================================== + +SndFileSong::SndFileSong(FileReader *reader, SoundDecoder *decoder, uint32_t loop_start, uint32_t loop_end) +{ + ChannelConfig iChannels; + SampleType Type; + + decoder->getInfo(&SampleRate, &iChannels, &Type); + + Loop_Start = loop_start; + Loop_End = clamp(loop_end, 0, (uint32_t)decoder->getSampleLength()); + Reader = reader; + Decoder = decoder; + Channels = iChannels == ChannelConfig_Stereo? 2:1; + m_Stream = GSnd->CreateStream(Read, 32*1024, iChannels == ChannelConfig_Stereo? 
0 : SoundStream::Mono, SampleRate, this); +} + +//========================================================================== +// +// SndFileSong - Destructor +// +//========================================================================== + +SndFileSong::~SndFileSong() +{ + Stop(); + if (m_Stream != nullptr) + { + delete m_Stream; + m_Stream = nullptr; + } + if (Decoder != nullptr) + { + delete Decoder; + } + if (Reader != nullptr) + { + delete Reader; + } +} + + +//========================================================================== +// +// SndFileSong :: Play +// +//========================================================================== + +void SndFileSong::Play(bool looping, int track) +{ + m_Status = STATE_Stopped; + m_Looping = looping; + if (m_Stream->Play(looping, 1)) + { + m_Status = STATE_Playing; + } +} + +//========================================================================== +// +// SndFileSong :: SetSubsong +// +//========================================================================== + +bool SndFileSong::SetSubsong(int track) +{ + return false; +} + +//========================================================================== +// +// SndFileSong :: GetStats +// +//========================================================================== + +FString SndFileSong::GetStats() +{ + FString out; + + size_t SamplePos; + + SamplePos = Decoder->getSampleOffset(); + int time = int (SamplePos / SampleRate); + + out.Format( + "Track: " TEXTCOLOR_YELLOW "%s, %dHz" TEXTCOLOR_NORMAL + " Time:" TEXTCOLOR_YELLOW "%02d:%02d" TEXTCOLOR_NORMAL, + Channels == 2? 
"Stereo" : "Mono", SampleRate, + time/60, + time % 60); + return out; +} + +//========================================================================== +// +// SndFileSong :: Read STATIC +// +//========================================================================== + +bool SndFileSong::Read(SoundStream *stream, void *vbuff, int ilen, void *userdata) +{ + char *buff = (char*)vbuff; + SndFileSong *song = (SndFileSong *)userdata; + song->CritSec.Enter(); + + size_t len = size_t(ilen); + size_t currentpos = song->Decoder->getSampleOffset(); + size_t framestoread = len / (song->Channels*2); + bool err = false; + if (!song->m_Looping) + { + size_t maxpos = song->Decoder->getSampleLength(); + if (currentpos == maxpos) + { + memset(buff, 0, len); + song->CritSec.Leave(); + return false; + } + if (currentpos + framestoread > maxpos) + { + size_t got = song->Decoder->read(buff, (maxpos - currentpos) * song->Channels * 2); + memset(buff + got, 0, len - got); + } + else + { + size_t got = song->Decoder->read(buff, len); + err = (got != len); + } + } + else + { + if (currentpos + framestoread > song->Loop_End) + { + size_t endblock = (song->Loop_End - currentpos) * song->Channels * 2; + err = (song->Decoder->read(buff, endblock) != endblock); + buff = buff + endblock; + len -= endblock; + song->Decoder->seek(song->Loop_Start, false); + } + err |= song->Decoder->read(buff, len) != len; + } + song->CritSec.Leave(); + return !err; +} diff --git a/src/sound/oalsound.cpp b/src/sound/oalsound.cpp index 90639624a..d38435be1 100644 --- a/src/sound/oalsound.cpp +++ b/src/sound/oalsound.cpp @@ -212,7 +212,7 @@ class OpenALSoundStream : public SoundStream size_t got = self->Decoder->read((char*)ptr, length); if(got < (unsigned int)length) { - if(!self->Looping || !self->Decoder->seek(0)) + if(!self->Looping || !self->Decoder->seek(0, false)) return false; got += self->Decoder->read((char*)ptr+got, length-got); } @@ -361,7 +361,7 @@ public: virtual bool SetPosition(unsigned int ms_pos) 
{ std::unique_lock lock(Renderer->StreamLock); - if(!Decoder->seek(ms_pos)) + if(!Decoder->seek(ms_pos, true)) return false; if(!Playing.load()) diff --git a/src/sound/sndfile_decoder.cpp b/src/sound/sndfile_decoder.cpp index 5a957eb27..9d0d2331b 100644 --- a/src/sound/sndfile_decoder.cpp +++ b/src/sound/sndfile_decoder.cpp @@ -132,9 +132,9 @@ TArray SndFileDecoder::readAll() return output; } -bool SndFileDecoder::seek(size_t ms_offset) +bool SndFileDecoder::seek(size_t ms_offset, bool ms) { - size_t smp_offset = (size_t)((double)ms_offset / 1000. * SndInfo.samplerate); + size_t smp_offset = ms? (size_t)((double)ms_offset / 1000. * SndInfo.samplerate) : ms_offset; if(sf_seek(SndFile, smp_offset, SEEK_SET) < 0) return false; return true; diff --git a/src/sound/sndfile_decoder.h b/src/sound/sndfile_decoder.h index f53f7e52a..0c4cfe935 100644 --- a/src/sound/sndfile_decoder.h +++ b/src/sound/sndfile_decoder.h @@ -13,7 +13,7 @@ struct SndFileDecoder : public SoundDecoder virtual size_t read(char *buffer, size_t bytes); virtual TArray readAll(); - virtual bool seek(size_t ms_offset); + virtual bool seek(size_t ms_offset, bool ms); virtual size_t getSampleOffset(); virtual size_t getSampleLength(); From 3f552ea95f716d4255e28f51339ac401873056fe Mon Sep 17 00:00:00 2001 From: Christoph Oelckers Date: Sat, 1 Apr 2017 21:40:36 +0200 Subject: [PATCH 3/6] - added loop tag reading to the new streaming music class. This is somewhat brute-force thanks to the surprising lack of good documentation for the Ogg headers. The only other option would have been some rather bloated library for a function that should be 25-30 lines at most. 
--- src/sound/musicformats/music_libsndfile.cpp | 97 ++++++++++++++++++--- 1 file changed, 87 insertions(+), 10 deletions(-) diff --git a/src/sound/musicformats/music_libsndfile.cpp b/src/sound/musicformats/music_libsndfile.cpp index 6676d498f..a338d344f 100644 --- a/src/sound/musicformats/music_libsndfile.cpp +++ b/src/sound/musicformats/music_libsndfile.cpp @@ -42,6 +42,7 @@ #include "templates.h" #include "sndfile_decoder.h" #include "mpg123_decoder.h" +#include "m_fixed.h" // MACROS ------------------------------------------------------------------ @@ -50,7 +51,7 @@ class SndFileSong : public StreamSong { public: - SndFileSong(FileReader *reader, SoundDecoder *decoder, uint32_t loop_start, uint32_t loop_end); + SndFileSong(FileReader *reader, SoundDecoder *decoder, uint32_t loop_start, uint32_t loop_end, bool startass, bool endass); ~SndFileSong(); bool SetSubsong(int subsong); void Play(bool looping, int subsong); @@ -87,7 +88,83 @@ protected: //========================================================================== // -// GME_OpenSong +// try to find the LOOP_START/LOOP_END tags +// +// This is a brute force implementation, thanks in no small part to the fact +// that no decent documentation of Ogg headers seems to exist and +// all available tag libraries are horrendously bloated. +// So if we want to do this without any new third party dependencies, +// thanks to the lack of anything that would help to do this properly, +// this was the only solution. +// +//========================================================================== + +void FindLoopTags(FileReader *fr, uint32_t *start, bool *startass, uint32_t *end, bool *endass) +{ + unsigned char testbuf[256]; + + fr->Seek(0, SEEK_SET); + long got = fr->Read(testbuf, 256); + auto eqp = testbuf - 1; + int count; + while(true) + { + unsigned char *c = (unsigned char *)memchr(eqp + 1, '=', 256 - (eqp + 1 - testbuf)); + if (c == nullptr) return; // If there is no '=' in the first 256 bytes there's also no metadata. 
+ + eqp = c; + while (*c >= 32 && *c < 127) c--; + if (*c != 0) + { + // doesn't look like a valid tag, so try again + continue; + } + c -= 3; + int len = LittleLong(*(int*)c); + if (len > 1000000 || len <= (eqp - c + 1)) + { + // length looks fishy so retry with the next '=' + continue; + } + c -= 4; + count = LittleLong(*(int*)c); + if (count <= 0 || count > 1000) + { + // very unlikely to have 1000 tags + continue; + } + c += 4; + fr->Seek(long(c - testbuf), SEEK_SET); + break; // looks like we found something. + } + for (int i = 0; i < count; i++) + { + int length = 0; + fr->Read(&length, 4); + length = LittleLong(length); + if (length == 0 || length > 1000000) return; // looks like we lost it... + if (length > 25) + { + // This tag is too long to be a valid time stamp so don't even bother. + fr->Seek(length, SEEK_CUR); + continue; + } + fr->Read(testbuf, length); + testbuf[length] = 0; + if (strnicmp((char*)testbuf, "LOOP_START=", 11) == 0) + { + S_ParseTimeTag((char*)testbuf + 11, startass, start); + } + else if (strnicmp((char*)testbuf, "LOOP_END=", 9) == 0) + { + S_ParseTimeTag((char*)testbuf + 9, endass, end); + } + } +} + +//========================================================================== +// +// SndFile_OpenSong // //========================================================================== @@ -98,20 +175,17 @@ MusInfo *SndFile_OpenSong(FileReader &fr) fr.Seek(0, SEEK_SET); fr.Read(signature, 4); uint32_t loop_start = 0, loop_end = ~0u; + bool startass = false, endass = false; if (!memcmp(signature, "OggS", 4) || !memcmp(signature, "fLaC", 4)) { // Todo: Read loop points from metadata - - - // ms to samples. - //size_t smp_offset = ms? (size_t)((double)ms_offset / 1000. 
* SndInfo.samplerate) : ms_offset; - + FindLoopTags(&fr, &loop_start, &startass, &loop_end, &endass); } fr.Seek(0, SEEK_SET); auto decoder = SoundRenderer::CreateDecoder(&fr); if (decoder == nullptr) return nullptr; - return new SndFileSong(&fr, decoder, loop_start, loop_end); + return new SndFileSong(&fr, decoder, loop_start, loop_end, startass, endass); } //========================================================================== @@ -120,13 +194,16 @@ MusInfo *SndFile_OpenSong(FileReader &fr) // //========================================================================== -SndFileSong::SndFileSong(FileReader *reader, SoundDecoder *decoder, uint32_t loop_start, uint32_t loop_end) +SndFileSong::SndFileSong(FileReader *reader, SoundDecoder *decoder, uint32_t loop_start, uint32_t loop_end, bool startass, bool endass) { ChannelConfig iChannels; SampleType Type; decoder->getInfo(&SampleRate, &iChannels, &Type); - + + if (!startass) loop_start = Scale(loop_start, SampleRate, 1000); + if (!endass) loop_end = Scale(loop_end, SampleRate, 1000); + Loop_Start = loop_start; Loop_End = clamp(loop_end, 0, (uint32_t)decoder->getSampleLength()); Reader = reader; From 0ed60b8df6b21bc5dfc5041c59cf1104f4431148 Mon Sep 17 00:00:00 2001 From: Magnus Norddahl Date: Sat, 1 Apr 2017 22:14:04 +0200 Subject: [PATCH 4/6] - fix OpenGLSWFrameBuffer::Begin2D --- src/gl/system/gl_swframebuffer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gl/system/gl_swframebuffer.cpp b/src/gl/system/gl_swframebuffer.cpp index 3bb0fd8d8..6963b65ca 100644 --- a/src/gl/system/gl_swframebuffer.cpp +++ b/src/gl/system/gl_swframebuffer.cpp @@ -2410,7 +2410,7 @@ bool OpenGLSWFrameBuffer::OpenGLPal::Update() bool OpenGLSWFrameBuffer::Begin2D(bool copy3d) { - ClearClipRect(); + Super::Begin2D(copy3d); if (!Accel2D) { return false; From 4a9dffa70ddad445eb2f540adf1a4b2339eed17d Mon Sep 17 00:00:00 2001 From: Magnus Norddahl Date: Sat, 1 Apr 2017 23:21:06 +0200 Subject: [PATCH 5/6] - change 
subsector gbuffer to use a 8x8 block memory layout to avoid having to clip memory reads and writes --- src/polyrenderer/drawers/poly_buffer.cpp | 3 +- src/polyrenderer/drawers/poly_buffer.h | 2 + src/polyrenderer/drawers/screen_triangle.cpp | 165 ++++++++----------- 3 files changed, 70 insertions(+), 100 deletions(-) diff --git a/src/polyrenderer/drawers/poly_buffer.cpp b/src/polyrenderer/drawers/poly_buffer.cpp index e8d37af4c..143d16bbd 100644 --- a/src/polyrenderer/drawers/poly_buffer.cpp +++ b/src/polyrenderer/drawers/poly_buffer.cpp @@ -48,7 +48,8 @@ void PolySubsectorGBuffer::Resize(int newwidth, int newheight) { width = newwidth; height = newheight; - values.resize(width * height); + int count = BlockWidth() * BlockHeight(); + values.resize(count * 64); } ///////////////////////////////////////////////////////////////////////////// diff --git a/src/polyrenderer/drawers/poly_buffer.h b/src/polyrenderer/drawers/poly_buffer.h index c272a7710..a376cec46 100644 --- a/src/polyrenderer/drawers/poly_buffer.h +++ b/src/polyrenderer/drawers/poly_buffer.h @@ -33,6 +33,8 @@ public: void Resize(int newwidth, int newheight); int Width() const { return width; } int Height() const { return height; } + int BlockWidth() const { return (width + 7) / 8; } + int BlockHeight() const { return (height + 7) / 8; } uint32_t *Values() { return values.data(); } private: diff --git a/src/polyrenderer/drawers/screen_triangle.cpp b/src/polyrenderer/drawers/screen_triangle.cpp index 6b3a14098..f8c300dc7 100644 --- a/src/polyrenderer/drawers/screen_triangle.cpp +++ b/src/polyrenderer/drawers/screen_triangle.cpp @@ -133,7 +133,7 @@ TriangleBlock::TriangleBlock(const TriDrawTriangleArgs *args) subsectorGBuffer = args->subsectorGBuffer; subsectorDepth = args->uniforms->SubsectorDepth(); - subsectorPitch = args->pitch; + subsectorPitch = args->stencilPitch; // 28.4 fixed-point coordinates #ifdef NO_SSE @@ -288,29 +288,24 @@ void TriangleBlock::Loop(const TriDrawTriangleArgs *args, 
WorkerThreadData *thre void TriangleBlock::SubsectorTest() { - uint32_t *subsector = subsectorGBuffer + X + Y * subsectorPitch; + int block = (X >> 3) + (Y >> 3) * subsectorPitch; + uint32_t *subsector = subsectorGBuffer + block * 64; uint32_t mask0 = 0; uint32_t mask1 = 0; - for (int iy = 0; iy < 4; iy++) + for (int i = 0; i < 32; i++) { - for (int ix = 0; ix < q; ix++) - { - bool covered = subsector[ix] >= subsectorDepth; - mask0 <<= 1; - mask0 |= (uint32_t)covered; - } - subsector += subsectorPitch; + bool covered = *subsector >= subsectorDepth; + mask0 <<= 1; + mask0 |= (uint32_t)covered; + subsector++; } - for (int iy = 4; iy < q; iy++) + for (int i = 0; i < 32; i++) { - for (int ix = 0; ix < q; ix++) - { - bool covered = subsector[ix] >= subsectorDepth; - mask1 <<= 1; - mask1 |= (uint32_t)covered; - } - subsector += subsectorPitch; + bool covered = *subsector >= subsectorDepth; + mask1 <<= 1; + mask1 |= (uint32_t)covered; + subsector++; } Mask0 = Mask0 & mask0; @@ -321,27 +316,24 @@ void TriangleBlock::SubsectorTest() void TriangleBlock::SubsectorTest() { - uint32_t *subsector = subsectorGBuffer + X + Y * subsectorPitch; + int block = (X >> 3) + (Y >> 3) * subsectorPitch; + uint32_t *subsector = subsectorGBuffer + block * 64; uint32_t mask0 = 0; uint32_t mask1 = 0; __m128i msubsectorDepth = _mm_set1_epi32(subsectorDepth); __m128i mnotxor = _mm_set1_epi32(0xffffffff); - for (int iy = 0; iy < 4; iy++) + for (int iy = 0; iy < 8; iy++) { mask0 <<= 4; mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_xor_si128(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)subsector), msubsectorDepth), mnotxor), _MM_SHUFFLE(0, 1, 2, 3)))); - mask0 <<= 4; - mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_xor_si128(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(subsector + 4)), msubsectorDepth), mnotxor), _MM_SHUFFLE(0, 1, 2, 3)))); - subsector += subsectorPitch; + subsector += 4; } - for (int iy = 4; iy < q; iy++) + for (int iy = 0; iy < 8; iy++) 
{ mask1 <<= 4; mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_xor_si128(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)subsector), msubsectorDepth), mnotxor), _MM_SHUFFLE(0, 1, 2, 3)))); - mask1 <<= 4; - mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_xor_si128(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(subsector + 4)), msubsectorDepth), mnotxor), _MM_SHUFFLE(0, 1, 2, 3)))); - subsector += subsectorPitch; + subsector += 4; } Mask0 = Mask0 & mask0; @@ -805,41 +797,33 @@ void TriangleBlock::StencilWrite() void TriangleBlock::SubsectorWrite() { - auto pitch = subsectorPitch; - uint32_t *subsector = subsectorGBuffer + X + Y * pitch; + int block = (X >> 3) + (Y >> 3) * subsectorPitch; + uint32_t *subsector = subsectorGBuffer + block * 64; if (Mask0 == 0xffffffff && Mask1 == 0xffffffff) { - for (int y = 0; y < 8; y++) + for (int i = 0; i < 64; i++) { - for (int x = 0; x < 8; x++) - subsector[x] = subsectorDepth; - subsector += pitch; + *(subsector++) = subsectorDepth; } } else { uint32_t mask0 = Mask0; uint32_t mask1 = Mask1; - for (int y = 0; y < 4; y++) + for (int i = 0; i < 32; i++) { - for (int x = 0; x < 8; x++) - { - if (mask0 & (1 << 31)) - subsector[x] = subsectorDepth; - mask0 <<= 1; - } - subsector += pitch; + if (mask0 & (1 << 31)) + *subsector = subsectorDepth; + mask0 <<= 1; + subsector++; } - for (int y = 4; y < 8; y++) + for (int i = 0; i < 32; i++) { - for (int x = 0; x < 8; x++) - { - if (mask1 & (1 << 31)) - subsector[x] = subsectorDepth; - mask1 <<= 1; - } - subsector += pitch; + if (mask1 & (1 << 31)) + *subsector = subsectorDepth; + mask1 <<= 1; + subsector++; } } } @@ -848,71 +832,54 @@ void TriangleBlock::SubsectorWrite() void TriangleBlock::SubsectorWrite() { - auto pitch = subsectorPitch; - uint32_t *subsector = subsectorGBuffer + X + Y * pitch; + int block = (X >> 3) + (Y >> 3) * subsectorPitch; + uint32_t *subsector = subsectorGBuffer + block * 64; __m128i msubsectorDepth = 
_mm_set1_epi32(subsectorDepth); if (Mask0 == 0xffffffff && Mask1 == 0xffffffff) { + _mm_storeu_si128((__m128i*)subsector, msubsectorDepth); subsector += 4; + _mm_storeu_si128((__m128i*)subsector, msubsectorDepth); subsector += 4; + _mm_storeu_si128((__m128i*)subsector, msubsectorDepth); subsector += 4; + _mm_storeu_si128((__m128i*)subsector, msubsectorDepth); subsector += 4; + _mm_storeu_si128((__m128i*)subsector, msubsectorDepth); subsector += 4; + _mm_storeu_si128((__m128i*)subsector, msubsectorDepth); subsector += 4; + _mm_storeu_si128((__m128i*)subsector, msubsectorDepth); subsector += 4; + _mm_storeu_si128((__m128i*)subsector, msubsectorDepth); subsector += 4; + _mm_storeu_si128((__m128i*)subsector, msubsectorDepth); subsector += 4; + _mm_storeu_si128((__m128i*)subsector, msubsectorDepth); subsector += 4; + _mm_storeu_si128((__m128i*)subsector, msubsectorDepth); subsector += 4; + _mm_storeu_si128((__m128i*)subsector, msubsectorDepth); subsector += 4; + _mm_storeu_si128((__m128i*)subsector, msubsectorDepth); subsector += 4; + _mm_storeu_si128((__m128i*)subsector, msubsectorDepth); subsector += 4; + _mm_storeu_si128((__m128i*)subsector, msubsectorDepth); subsector += 4; _mm_storeu_si128((__m128i*)subsector, msubsectorDepth); - _mm_storeu_si128((__m128i*)(subsector + 4), msubsectorDepth); subsector += pitch; - _mm_storeu_si128((__m128i*)subsector, msubsectorDepth); - _mm_storeu_si128((__m128i*)(subsector + 4), msubsectorDepth); subsector += pitch; - _mm_storeu_si128((__m128i*)subsector, msubsectorDepth); - _mm_storeu_si128((__m128i*)(subsector + 4), msubsectorDepth); subsector += pitch; - _mm_storeu_si128((__m128i*)subsector, msubsectorDepth); - _mm_storeu_si128((__m128i*)(subsector + 4), msubsectorDepth); subsector += pitch; - _mm_storeu_si128((__m128i*)subsector, msubsectorDepth); - _mm_storeu_si128((__m128i*)(subsector + 4), msubsectorDepth); subsector += pitch; - _mm_storeu_si128((__m128i*)subsector, msubsectorDepth); - _mm_storeu_si128((__m128i*)(subsector + 
4), msubsectorDepth); subsector += pitch; - _mm_storeu_si128((__m128i*)subsector, msubsectorDepth); - _mm_storeu_si128((__m128i*)(subsector + 4), msubsectorDepth); subsector += pitch; - _mm_storeu_si128((__m128i*)subsector, msubsectorDepth); - _mm_storeu_si128((__m128i*)(subsector + 4), msubsectorDepth); } else { __m128i mxormask = _mm_set1_epi32(0xffffffff); __m128i topfour = _mm_setr_epi32(1 << 31, 1 << 30, 1 << 29, 1 << 28); - __m128i bottomfour = _mm_setr_epi32(1 << 27, 1 << 26, 1 << 25, 1 << 24); __m128i mmask0 = _mm_set1_epi32(Mask0); __m128i mmask1 = _mm_set1_epi32(Mask1); - _mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); - _mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, bottomfour), _mm_setzero_si128()), mxormask), (char*)(subsector + 4)); - mmask0 = _mm_slli_si128(mmask0, 1); - subsector += pitch; - _mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); - _mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, bottomfour), _mm_setzero_si128()), mxormask), (char*)(subsector + 4)); - mmask0 = _mm_slli_si128(mmask0, 1); - subsector += pitch; - _mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); - _mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, bottomfour), _mm_setzero_si128()), mxormask), (char*)(subsector + 4)); - mmask0 = _mm_slli_si128(mmask0, 1); - subsector += pitch; - _mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); - _mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, bottomfour), _mm_setzero_si128()), 
mxormask), (char*)(subsector + 4)); - mmask0 = _mm_slli_si128(mmask0, 1); - subsector += pitch; + _mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); mmask0 = _mm_slli_epi32(mmask0, 4); subsector += 4; + _mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); mmask0 = _mm_slli_epi32(mmask0, 4); subsector += 4; + _mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); mmask0 = _mm_slli_epi32(mmask0, 4); subsector += 4; + _mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); mmask0 = _mm_slli_epi32(mmask0, 4); subsector += 4; + _mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); mmask0 = _mm_slli_epi32(mmask0, 4); subsector += 4; + _mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); mmask0 = _mm_slli_epi32(mmask0, 4); subsector += 4; + _mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); mmask0 = _mm_slli_epi32(mmask0, 4); subsector += 4; + _mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); subsector += 4; - _mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); - _mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, bottomfour), _mm_setzero_si128()), mxormask), 
(char*)(subsector + 4)); - mmask1 = _mm_slli_si128(mmask1, 1); - subsector += pitch; - _mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); - _mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, bottomfour), _mm_setzero_si128()), mxormask), (char*)(subsector + 4)); - mmask1 = _mm_slli_si128(mmask1, 1); - subsector += pitch; - _mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); - _mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, bottomfour), _mm_setzero_si128()), mxormask), (char*)(subsector + 4)); - mmask1 = _mm_slli_si128(mmask1, 1); - subsector += pitch; - _mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); - _mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, bottomfour), _mm_setzero_si128()), mxormask), (char*)(subsector + 4)); - mmask1 = _mm_slli_si128(mmask1, 1); - subsector += pitch; + _mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); mmask1 = _mm_slli_epi32(mmask1, 4); subsector += 4; + _mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); mmask1 = _mm_slli_epi32(mmask1, 4); subsector += 4; + _mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); mmask1 = _mm_slli_epi32(mmask1, 4); subsector += 4; + _mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); mmask1 = 
_mm_slli_epi32(mmask1, 4); subsector += 4; + _mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); mmask1 = _mm_slli_epi32(mmask1, 4); subsector += 4; + _mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); mmask1 = _mm_slli_epi32(mmask1, 4); subsector += 4; + _mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); mmask1 = _mm_slli_epi32(mmask1, 4); subsector += 4; + _mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); subsector += 4; } } From 4268090738cae8e393820596da3d51e678a62512 Mon Sep 17 00:00:00 2001 From: Magnus Norddahl Date: Sun, 2 Apr 2017 00:47:18 +0200 Subject: [PATCH 6/6] - switch OpenGLSWFrameBuffer to use m_Lock instead of LockCount --- src/gl/system/gl_swframebuffer.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/gl/system/gl_swframebuffer.cpp b/src/gl/system/gl_swframebuffer.cpp index 6963b65ca..ee5c0d58d 100644 --- a/src/gl/system/gl_swframebuffer.cpp +++ b/src/gl/system/gl_swframebuffer.cpp @@ -1095,7 +1095,7 @@ bool OpenGLSWFrameBuffer::IsValid() bool OpenGLSWFrameBuffer::Lock(bool buffered) { - if (LockCount++ > 0) + if (m_Lock++ > 0) { return false; } @@ -1130,16 +1130,16 @@ bool OpenGLSWFrameBuffer::Lock(bool buffered) void OpenGLSWFrameBuffer::Unlock() { - if (LockCount == 0) + if (m_Lock == 0) { return; } - if (UpdatePending && LockCount == 1) + if (UpdatePending && m_Lock == 1) { Update(); } - else if (--LockCount == 0) + else if (--m_Lock == 0) { Buffer = nullptr; } @@ -1171,13 +1171,13 @@ void OpenGLSWFrameBuffer::Update() return; } - if (LockCount != 1) + if (m_Lock != 1) { I_FatalError("Framebuffer must 
have exactly 1 lock to be updated"); - if (LockCount > 0) + if (m_Lock > 0) { UpdatePending = true; - --LockCount; + --m_Lock; } return; } @@ -1220,7 +1220,7 @@ void OpenGLSWFrameBuffer::Update() BlitCycles.Clock(); #endif - LockCount = 0; + m_Lock = 0; Draw3DPart(In2D <= 1); if (In2D == 0) { @@ -1276,7 +1276,7 @@ void OpenGLSWFrameBuffer::Flip() bool OpenGLSWFrameBuffer::PaintToWindow() { - if (LockCount != 0) + if (m_Lock != 0) { return false; } @@ -1662,7 +1662,7 @@ void OpenGLSWFrameBuffer::GetScreenshotBuffer(const uint8_t *&buffer, int &pitch void OpenGLSWFrameBuffer::ReleaseScreenshotBuffer() { - if (LockCount > 0) + if (m_Lock > 0) { Super::ReleaseScreenshotBuffer(); }