- speed up triangle setup slightly

This commit is contained in:
Magnus Norddahl 2017-04-01 16:42:24 +02:00
parent 553906b186
commit 4a0b3c3bab

View file

@ -96,6 +96,15 @@ private:
__m128i mFDX12;
__m128i mFDX23;
__m128i mFDX31;
__m128i mC1;
__m128i mC2;
__m128i mC3;
__m128i mDX12;
__m128i mDY12;
__m128i mDX23;
__m128i mDY23;
__m128i mDX31;
__m128i mDY31;
#endif
void CoverageTest();
@ -203,6 +212,15 @@ TriangleBlock::TriangleBlock(const TriDrawTriangleArgs *args)
mFDX12 = _mm_set1_epi32(FDX12);
mFDX23 = _mm_set1_epi32(FDX23);
mFDX31 = _mm_set1_epi32(FDX31);
mC1 = _mm_set1_epi32(C1);
mC2 = _mm_set1_epi32(C2);
mC3 = _mm_set1_epi32(C3);
mDX12 = _mm_set1_epi32(DX12);
mDY12 = _mm_set1_epi32(DY12);
mDX23 = _mm_set1_epi32(DX23);
mDY23 = _mm_set1_epi32(DY23);
mDX31 = _mm_set1_epi32(DX31);
mDY31 = _mm_set1_epi32(DY31);
#endif
}
@ -432,30 +450,44 @@ void TriangleBlock::StencilEqualTest()
uint32_t mask0 = 0;
uint32_t mask1 = 0;
for (int iy = 0; iy < 4; iy++)
for (int iy = 0; iy < 2; iy++)
{
__m128i mstencilBlock = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(stencilBlock + iy * 8)), _mm_setzero_si128());
__m128i mstencilTest = _mm_cmpeq_epi16(mstencilBlock, mstencilTestValue);
__m128i mstencilBlock = _mm_loadu_si128((const __m128i *)stencilBlock);
__m128i mstencilTest = _mm_cmpeq_epi16(_mm_unpacklo_epi8(mstencilBlock, _mm_setzero_si128()), mstencilTestValue);
__m128i mstencilTest0 = _mm_unpacklo_epi16(mstencilTest, mstencilTest);
__m128i mstencilTest1 = _mm_unpackhi_epi16(mstencilTest, mstencilTest);
__m128i first = _mm_packs_epi32(_mm_shuffle_epi32(mstencilTest1, _MM_SHUFFLE(0, 1, 2, 3)), _mm_shuffle_epi32(mstencilTest0, _MM_SHUFFLE(0, 1, 2, 3)));
mask0 <<= 4;
mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mstencilTest0, _MM_SHUFFLE(0, 1, 2, 3))));
mask0 <<= 4;
mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mstencilTest1, _MM_SHUFFLE(0, 1, 2, 3))));
mstencilTest = _mm_cmpeq_epi16(_mm_unpackhi_epi8(mstencilBlock, _mm_setzero_si128()), mstencilTestValue);
mstencilTest0 = _mm_unpacklo_epi16(mstencilTest, mstencilTest);
mstencilTest1 = _mm_unpackhi_epi16(mstencilTest, mstencilTest);
__m128i second = _mm_packs_epi32(_mm_shuffle_epi32(mstencilTest1, _MM_SHUFFLE(0, 1, 2, 3)), _mm_shuffle_epi32(mstencilTest0, _MM_SHUFFLE(0, 1, 2, 3)));
mask0 <<= 16;
mask0 |= _mm_movemask_epi8(_mm_packs_epi16(second, first));
stencilBlock += 16;
}
for (int iy = 4; iy < q; iy++)
for (int iy = 0; iy < 2; iy++)
{
__m128i mstencilBlock = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(stencilBlock + iy * 8)), _mm_setzero_si128());
__m128i mstencilTest = _mm_cmpeq_epi16(mstencilBlock, mstencilTestValue);
__m128i mstencilBlock = _mm_loadu_si128((const __m128i *)stencilBlock);
__m128i mstencilTest = _mm_cmpeq_epi16(_mm_unpacklo_epi8(mstencilBlock, _mm_setzero_si128()), mstencilTestValue);
__m128i mstencilTest0 = _mm_unpacklo_epi16(mstencilTest, mstencilTest);
__m128i mstencilTest1 = _mm_unpackhi_epi16(mstencilTest, mstencilTest);
__m128i first = _mm_packs_epi32(_mm_shuffle_epi32(mstencilTest1, _MM_SHUFFLE(0, 1, 2, 3)), _mm_shuffle_epi32(mstencilTest0, _MM_SHUFFLE(0, 1, 2, 3)));
mask1 <<= 4;
mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mstencilTest0, _MM_SHUFFLE(0, 1, 2, 3))));
mask1 <<= 4;
mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mstencilTest1, _MM_SHUFFLE(0, 1, 2, 3))));
mstencilTest = _mm_cmpeq_epi16(_mm_unpackhi_epi8(mstencilBlock, _mm_setzero_si128()), mstencilTestValue);
mstencilTest0 = _mm_unpacklo_epi16(mstencilTest, mstencilTest);
mstencilTest1 = _mm_unpackhi_epi16(mstencilTest, mstencilTest);
__m128i second = _mm_packs_epi32(_mm_shuffle_epi32(mstencilTest1, _MM_SHUFFLE(0, 1, 2, 3)), _mm_shuffle_epi32(mstencilTest0, _MM_SHUFFLE(0, 1, 2, 3)));
mask1 <<= 16;
mask1 |= _mm_movemask_epi8(_mm_packs_epi16(second, first));
stencilBlock += 16;
}
Mask0 = Mask0 & mask0;
@ -617,50 +649,46 @@ void TriangleBlock::CoverageTest()
int y0 = Y << 4;
int y1 = (Y + q - 1) << 4;
__m128i mY = _mm_set_epi32(y0, y0, y1, y1);
__m128i mX = _mm_set_epi32(x0, x0, x1, x1);
// Evaluate half-space functions
bool a00 = C1 + DX12 * y0 - DY12 * x0 > 0;
bool a10 = C1 + DX12 * y0 - DY12 * x1 > 0;
bool a01 = C1 + DX12 * y1 - DY12 * x0 > 0;
bool a11 = C1 + DX12 * y1 - DY12 * x1 > 0;
int a = (a00 << 0) | (a10 << 1) | (a01 << 2) | (a11 << 3);
__m128i mCY1 = _mm_sub_epi32(
_mm_add_epi32(mC1, _mm_shuffle_epi32(_mm_mul_epu32(mDX12, mY), _MM_SHUFFLE(0, 0, 2, 2))),
_mm_shuffle_epi32(_mm_mul_epu32(mDY12, mX), _MM_SHUFFLE(0, 2, 0, 2)));
__m128i mA = _mm_cmpgt_epi32(mCY1, _mm_setzero_si128());
bool b00 = C2 + DX23 * y0 - DY23 * x0 > 0;
bool b10 = C2 + DX23 * y0 - DY23 * x1 > 0;
bool b01 = C2 + DX23 * y1 - DY23 * x0 > 0;
bool b11 = C2 + DX23 * y1 - DY23 * x1 > 0;
int b = (b00 << 0) | (b10 << 1) | (b01 << 2) | (b11 << 3);
__m128i mCY2 = _mm_sub_epi32(
_mm_add_epi32(mC2, _mm_shuffle_epi32(_mm_mul_epu32(mDX23, mY), _MM_SHUFFLE(0, 0, 2, 2))),
_mm_shuffle_epi32(_mm_mul_epu32(mDY23, mX), _MM_SHUFFLE(0, 2, 0, 2)));
__m128i mB = _mm_cmpgt_epi32(mCY2, _mm_setzero_si128());
bool c00 = C3 + DX31 * y0 - DY31 * x0 > 0;
bool c10 = C3 + DX31 * y0 - DY31 * x1 > 0;
bool c01 = C3 + DX31 * y1 - DY31 * x0 > 0;
bool c11 = C3 + DX31 * y1 - DY31 * x1 > 0;
int c = (c00 << 0) | (c10 << 1) | (c01 << 2) | (c11 << 3);
__m128i mCY3 = _mm_sub_epi32(
_mm_add_epi32(mC3, _mm_shuffle_epi32(_mm_mul_epu32(mDX31, mY), _MM_SHUFFLE(0, 0, 2, 2))),
_mm_shuffle_epi32(_mm_mul_epu32(mDY31, mX), _MM_SHUFFLE(0, 2, 0, 2)));
__m128i mC = _mm_cmpgt_epi32(mCY3, _mm_setzero_si128());
if (a == 0 || b == 0 || c == 0) // Skip block when outside an edge
int abc = _mm_movemask_epi8(_mm_packs_epi16(_mm_packs_epi32(mA, mB), _mm_packs_epi32(mC, _mm_setzero_si128())));
if ((abc & 0xf) == 0 || (abc & 0xf0) == 0 || (abc & 0xf00) == 0) // Skip block when outside an edge
{
Mask0 = 0;
Mask1 = 0;
}
else if (a == 0xf && b == 0xf && c == 0xf) // Accept whole block when totally covered
else if (abc == 0xfff) // Accept whole block when totally covered
{
Mask0 = 0xffffffff;
Mask1 = 0xffffffff;
}
else // Partially covered block
{
x0 = X << 4;
x1 = (X + q - 1) << 4;
int CY1 = C1 + DX12 * y0 - DY12 * x0;
int CY2 = C2 + DX23 * y0 - DY23 * x0;
int CY3 = C3 + DX31 * y0 - DY31 * x0;
uint32_t mask0 = 0;
uint32_t mask1 = 0;
__m128i mCY1 = _mm_sub_epi32(_mm_set1_epi32(CY1), mFDY12Offset);
__m128i mCY2 = _mm_sub_epi32(_mm_set1_epi32(CY2), mFDY23Offset);
__m128i mCY3 = _mm_sub_epi32(_mm_set1_epi32(CY3), mFDY31Offset);
for (int iy = 0; iy < 4; iy++)
mCY1 = _mm_sub_epi32(_mm_shuffle_epi32(mCY1, _MM_SHUFFLE(0, 0, 0, 0)), mFDY12Offset);
mCY2 = _mm_sub_epi32(_mm_shuffle_epi32(mCY2, _MM_SHUFFLE(0, 0, 0, 0)), mFDY23Offset);
mCY3 = _mm_sub_epi32(_mm_shuffle_epi32(mCY3, _MM_SHUFFLE(0, 0, 0, 0)), mFDY31Offset);
for (int iy = 0; iy < 2; iy++)
{
__m128i mtest0 = _mm_cmpgt_epi32(mCY1, _mm_setzero_si128());
mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY2, _mm_setzero_si128()), mtest0);
@ -668,18 +696,27 @@ void TriangleBlock::CoverageTest()
__m128i mtest1 = _mm_cmpgt_epi32(_mm_sub_epi32(mCY1, mFDY12x4), _mm_setzero_si128());
mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY2, mFDY23x4), _mm_setzero_si128()), mtest1);
mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY3, mFDY31x4), _mm_setzero_si128()), mtest1);
mCY1 = _mm_add_epi32(mCY1, mFDX12);
mCY2 = _mm_add_epi32(mCY2, mFDX23);
mCY3 = _mm_add_epi32(mCY3, mFDX31);
__m128i first = _mm_packs_epi32(_mm_shuffle_epi32(mtest1, _MM_SHUFFLE(0, 1, 2, 3)), _mm_shuffle_epi32(mtest0, _MM_SHUFFLE(0, 1, 2, 3)));
mask0 <<= 4;
mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mtest0, _MM_SHUFFLE(0, 1, 2, 3))));
mask0 <<= 4;
mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mtest1, _MM_SHUFFLE(0, 1, 2, 3))));
mtest0 = _mm_cmpgt_epi32(mCY1, _mm_setzero_si128());
mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY2, _mm_setzero_si128()), mtest0);
mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY3, _mm_setzero_si128()), mtest0);
mtest1 = _mm_cmpgt_epi32(_mm_sub_epi32(mCY1, mFDY12x4), _mm_setzero_si128());
mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY2, mFDY23x4), _mm_setzero_si128()), mtest1);
mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY3, mFDY31x4), _mm_setzero_si128()), mtest1);
mCY1 = _mm_add_epi32(mCY1, mFDX12);
mCY2 = _mm_add_epi32(mCY2, mFDX23);
mCY3 = _mm_add_epi32(mCY3, mFDX31);
__m128i second = _mm_packs_epi32(_mm_shuffle_epi32(mtest1, _MM_SHUFFLE(0, 1, 2, 3)), _mm_shuffle_epi32(mtest0, _MM_SHUFFLE(0, 1, 2, 3)));
mask0 <<= 16;
mask0 |= _mm_movemask_epi8(_mm_packs_epi16(second, first));
}
for (int iy = 4; iy < q; iy++)
for (int iy = 0; iy < 2; iy++)
{
__m128i mtest0 = _mm_cmpgt_epi32(mCY1, _mm_setzero_si128());
mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY2, _mm_setzero_si128()), mtest0);
@ -687,15 +724,24 @@ void TriangleBlock::CoverageTest()
__m128i mtest1 = _mm_cmpgt_epi32(_mm_sub_epi32(mCY1, mFDY12x4), _mm_setzero_si128());
mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY2, mFDY23x4), _mm_setzero_si128()), mtest1);
mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY3, mFDY31x4), _mm_setzero_si128()), mtest1);
mCY1 = _mm_add_epi32(mCY1, mFDX12);
mCY2 = _mm_add_epi32(mCY2, mFDX23);
mCY3 = _mm_add_epi32(mCY3, mFDX31);
__m128i first = _mm_packs_epi32(_mm_shuffle_epi32(mtest1, _MM_SHUFFLE(0, 1, 2, 3)), _mm_shuffle_epi32(mtest0, _MM_SHUFFLE(0, 1, 2, 3)));
mask1 <<= 4;
mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mtest0, _MM_SHUFFLE(0, 1, 2, 3))));
mask1 <<= 4;
mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mtest1, _MM_SHUFFLE(0, 1, 2, 3))));
mtest0 = _mm_cmpgt_epi32(mCY1, _mm_setzero_si128());
mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY2, _mm_setzero_si128()), mtest0);
mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY3, _mm_setzero_si128()), mtest0);
mtest1 = _mm_cmpgt_epi32(_mm_sub_epi32(mCY1, mFDY12x4), _mm_setzero_si128());
mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY2, mFDY23x4), _mm_setzero_si128()), mtest1);
mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY3, mFDY31x4), _mm_setzero_si128()), mtest1);
mCY1 = _mm_add_epi32(mCY1, mFDX12);
mCY2 = _mm_add_epi32(mCY2, mFDX23);
mCY3 = _mm_add_epi32(mCY3, mFDX31);
__m128i second = _mm_packs_epi32(_mm_shuffle_epi32(mtest1, _MM_SHUFFLE(0, 1, 2, 3)), _mm_shuffle_epi32(mtest0, _MM_SHUFFLE(0, 1, 2, 3)));
mask1 <<= 16;
mask1 |= _mm_movemask_epi8(_mm_packs_epi16(second, first));
}
Mask0 = mask0;
@ -755,6 +801,8 @@ void TriangleBlock::StencilWrite()
}
}
#ifdef NO_SSE
void TriangleBlock::SubsectorWrite()
{
auto pitch = subsectorPitch;
@ -796,6 +844,80 @@ void TriangleBlock::SubsectorWrite()
}
}
#else
void TriangleBlock::SubsectorWrite()
{
auto pitch = subsectorPitch;
uint32_t *subsector = subsectorGBuffer + X + Y * pitch;
__m128i msubsectorDepth = _mm_set1_epi32(subsectorDepth);
if (Mask0 == 0xffffffff && Mask1 == 0xffffffff)
{
_mm_storeu_si128((__m128i*)subsector, msubsectorDepth);
_mm_storeu_si128((__m128i*)(subsector + 4), msubsectorDepth); subsector += pitch;
_mm_storeu_si128((__m128i*)subsector, msubsectorDepth);
_mm_storeu_si128((__m128i*)(subsector + 4), msubsectorDepth); subsector += pitch;
_mm_storeu_si128((__m128i*)subsector, msubsectorDepth);
_mm_storeu_si128((__m128i*)(subsector + 4), msubsectorDepth); subsector += pitch;
_mm_storeu_si128((__m128i*)subsector, msubsectorDepth);
_mm_storeu_si128((__m128i*)(subsector + 4), msubsectorDepth); subsector += pitch;
_mm_storeu_si128((__m128i*)subsector, msubsectorDepth);
_mm_storeu_si128((__m128i*)(subsector + 4), msubsectorDepth); subsector += pitch;
_mm_storeu_si128((__m128i*)subsector, msubsectorDepth);
_mm_storeu_si128((__m128i*)(subsector + 4), msubsectorDepth); subsector += pitch;
_mm_storeu_si128((__m128i*)subsector, msubsectorDepth);
_mm_storeu_si128((__m128i*)(subsector + 4), msubsectorDepth); subsector += pitch;
_mm_storeu_si128((__m128i*)subsector, msubsectorDepth);
_mm_storeu_si128((__m128i*)(subsector + 4), msubsectorDepth);
}
else
{
__m128i mxormask = _mm_set1_epi32(0xffffffff);
__m128i topfour = _mm_setr_epi32(1 << 31, 1 << 30, 1 << 29, 1 << 28);
__m128i bottomfour = _mm_setr_epi32(1 << 27, 1 << 26, 1 << 25, 1 << 24);
__m128i mmask0 = _mm_set1_epi32(Mask0);
__m128i mmask1 = _mm_set1_epi32(Mask1);
_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, topfour), _mm_setzero_si128()), mxormask), (char*)subsector);
_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, bottomfour), _mm_setzero_si128()), mxormask), (char*)(subsector + 4));
mmask0 = _mm_slli_si128(mmask0, 1);
subsector += pitch;
_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, topfour), _mm_setzero_si128()), mxormask), (char*)subsector);
_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, bottomfour), _mm_setzero_si128()), mxormask), (char*)(subsector + 4));
mmask0 = _mm_slli_si128(mmask0, 1);
subsector += pitch;
_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, topfour), _mm_setzero_si128()), mxormask), (char*)subsector);
_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, bottomfour), _mm_setzero_si128()), mxormask), (char*)(subsector + 4));
mmask0 = _mm_slli_si128(mmask0, 1);
subsector += pitch;
_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, topfour), _mm_setzero_si128()), mxormask), (char*)subsector);
_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, bottomfour), _mm_setzero_si128()), mxormask), (char*)(subsector + 4));
mmask0 = _mm_slli_si128(mmask0, 1);
subsector += pitch;
_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, topfour), _mm_setzero_si128()), mxormask), (char*)subsector);
_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, bottomfour), _mm_setzero_si128()), mxormask), (char*)(subsector + 4));
mmask1 = _mm_slli_si128(mmask1, 1);
subsector += pitch;
_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, topfour), _mm_setzero_si128()), mxormask), (char*)subsector);
_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, bottomfour), _mm_setzero_si128()), mxormask), (char*)(subsector + 4));
mmask1 = _mm_slli_si128(mmask1, 1);
subsector += pitch;
_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, topfour), _mm_setzero_si128()), mxormask), (char*)subsector);
_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, bottomfour), _mm_setzero_si128()), mxormask), (char*)(subsector + 4));
mmask1 = _mm_slli_si128(mmask1, 1);
subsector += pitch;
_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, topfour), _mm_setzero_si128()), mxormask), (char*)subsector);
_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, bottomfour), _mm_setzero_si128()), mxormask), (char*)(subsector + 4));
mmask1 = _mm_slli_si128(mmask1, 1);
subsector += pitch;
}
}
#endif
void ScreenTriangle::Draw(const TriDrawTriangleArgs *args, WorkerThreadData *thread)
{
TriangleBlock block(args);