mirror of
https://github.com/ZDoom/qzdoom.git
synced 2024-11-10 23:02:08 +00:00
- speed up triangle setup slightly
This commit is contained in:
parent
553906b186
commit
4a0b3c3bab
1 changed file with 174 additions and 52 deletions
|
@ -96,6 +96,15 @@ private:
|
|||
__m128i mFDX12;
|
||||
__m128i mFDX23;
|
||||
__m128i mFDX31;
|
||||
__m128i mC1;
|
||||
__m128i mC2;
|
||||
__m128i mC3;
|
||||
__m128i mDX12;
|
||||
__m128i mDY12;
|
||||
__m128i mDX23;
|
||||
__m128i mDY23;
|
||||
__m128i mDX31;
|
||||
__m128i mDY31;
|
||||
#endif
|
||||
|
||||
void CoverageTest();
|
||||
|
@ -203,6 +212,15 @@ TriangleBlock::TriangleBlock(const TriDrawTriangleArgs *args)
|
|||
mFDX12 = _mm_set1_epi32(FDX12);
|
||||
mFDX23 = _mm_set1_epi32(FDX23);
|
||||
mFDX31 = _mm_set1_epi32(FDX31);
|
||||
mC1 = _mm_set1_epi32(C1);
|
||||
mC2 = _mm_set1_epi32(C2);
|
||||
mC3 = _mm_set1_epi32(C3);
|
||||
mDX12 = _mm_set1_epi32(DX12);
|
||||
mDY12 = _mm_set1_epi32(DY12);
|
||||
mDX23 = _mm_set1_epi32(DX23);
|
||||
mDY23 = _mm_set1_epi32(DY23);
|
||||
mDX31 = _mm_set1_epi32(DX31);
|
||||
mDY31 = _mm_set1_epi32(DY31);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -432,30 +450,44 @@ void TriangleBlock::StencilEqualTest()
|
|||
uint32_t mask0 = 0;
|
||||
uint32_t mask1 = 0;
|
||||
|
||||
for (int iy = 0; iy < 4; iy++)
|
||||
for (int iy = 0; iy < 2; iy++)
|
||||
{
|
||||
__m128i mstencilBlock = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(stencilBlock + iy * 8)), _mm_setzero_si128());
|
||||
__m128i mstencilTest = _mm_cmpeq_epi16(mstencilBlock, mstencilTestValue);
|
||||
__m128i mstencilBlock = _mm_loadu_si128((const __m128i *)stencilBlock);
|
||||
|
||||
__m128i mstencilTest = _mm_cmpeq_epi16(_mm_unpacklo_epi8(mstencilBlock, _mm_setzero_si128()), mstencilTestValue);
|
||||
__m128i mstencilTest0 = _mm_unpacklo_epi16(mstencilTest, mstencilTest);
|
||||
__m128i mstencilTest1 = _mm_unpackhi_epi16(mstencilTest, mstencilTest);
|
||||
__m128i first = _mm_packs_epi32(_mm_shuffle_epi32(mstencilTest1, _MM_SHUFFLE(0, 1, 2, 3)), _mm_shuffle_epi32(mstencilTest0, _MM_SHUFFLE(0, 1, 2, 3)));
|
||||
|
||||
mask0 <<= 4;
|
||||
mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mstencilTest0, _MM_SHUFFLE(0, 1, 2, 3))));
|
||||
mask0 <<= 4;
|
||||
mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mstencilTest1, _MM_SHUFFLE(0, 1, 2, 3))));
|
||||
mstencilTest = _mm_cmpeq_epi16(_mm_unpackhi_epi8(mstencilBlock, _mm_setzero_si128()), mstencilTestValue);
|
||||
mstencilTest0 = _mm_unpacklo_epi16(mstencilTest, mstencilTest);
|
||||
mstencilTest1 = _mm_unpackhi_epi16(mstencilTest, mstencilTest);
|
||||
__m128i second = _mm_packs_epi32(_mm_shuffle_epi32(mstencilTest1, _MM_SHUFFLE(0, 1, 2, 3)), _mm_shuffle_epi32(mstencilTest0, _MM_SHUFFLE(0, 1, 2, 3)));
|
||||
|
||||
mask0 <<= 16;
|
||||
mask0 |= _mm_movemask_epi8(_mm_packs_epi16(second, first));
|
||||
|
||||
stencilBlock += 16;
|
||||
}
|
||||
|
||||
for (int iy = 4; iy < q; iy++)
|
||||
for (int iy = 0; iy < 2; iy++)
|
||||
{
|
||||
__m128i mstencilBlock = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(stencilBlock + iy * 8)), _mm_setzero_si128());
|
||||
__m128i mstencilTest = _mm_cmpeq_epi16(mstencilBlock, mstencilTestValue);
|
||||
__m128i mstencilBlock = _mm_loadu_si128((const __m128i *)stencilBlock);
|
||||
|
||||
__m128i mstencilTest = _mm_cmpeq_epi16(_mm_unpacklo_epi8(mstencilBlock, _mm_setzero_si128()), mstencilTestValue);
|
||||
__m128i mstencilTest0 = _mm_unpacklo_epi16(mstencilTest, mstencilTest);
|
||||
__m128i mstencilTest1 = _mm_unpackhi_epi16(mstencilTest, mstencilTest);
|
||||
__m128i first = _mm_packs_epi32(_mm_shuffle_epi32(mstencilTest1, _MM_SHUFFLE(0, 1, 2, 3)), _mm_shuffle_epi32(mstencilTest0, _MM_SHUFFLE(0, 1, 2, 3)));
|
||||
|
||||
mask1 <<= 4;
|
||||
mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mstencilTest0, _MM_SHUFFLE(0, 1, 2, 3))));
|
||||
mask1 <<= 4;
|
||||
mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mstencilTest1, _MM_SHUFFLE(0, 1, 2, 3))));
|
||||
mstencilTest = _mm_cmpeq_epi16(_mm_unpackhi_epi8(mstencilBlock, _mm_setzero_si128()), mstencilTestValue);
|
||||
mstencilTest0 = _mm_unpacklo_epi16(mstencilTest, mstencilTest);
|
||||
mstencilTest1 = _mm_unpackhi_epi16(mstencilTest, mstencilTest);
|
||||
__m128i second = _mm_packs_epi32(_mm_shuffle_epi32(mstencilTest1, _MM_SHUFFLE(0, 1, 2, 3)), _mm_shuffle_epi32(mstencilTest0, _MM_SHUFFLE(0, 1, 2, 3)));
|
||||
|
||||
mask1 <<= 16;
|
||||
mask1 |= _mm_movemask_epi8(_mm_packs_epi16(second, first));
|
||||
|
||||
stencilBlock += 16;
|
||||
}
|
||||
|
||||
Mask0 = Mask0 & mask0;
|
||||
|
@ -617,50 +649,46 @@ void TriangleBlock::CoverageTest()
|
|||
int y0 = Y << 4;
|
||||
int y1 = (Y + q - 1) << 4;
|
||||
|
||||
__m128i mY = _mm_set_epi32(y0, y0, y1, y1);
|
||||
__m128i mX = _mm_set_epi32(x0, x0, x1, x1);
|
||||
|
||||
// Evaluate half-space functions
|
||||
bool a00 = C1 + DX12 * y0 - DY12 * x0 > 0;
|
||||
bool a10 = C1 + DX12 * y0 - DY12 * x1 > 0;
|
||||
bool a01 = C1 + DX12 * y1 - DY12 * x0 > 0;
|
||||
bool a11 = C1 + DX12 * y1 - DY12 * x1 > 0;
|
||||
int a = (a00 << 0) | (a10 << 1) | (a01 << 2) | (a11 << 3);
|
||||
__m128i mCY1 = _mm_sub_epi32(
|
||||
_mm_add_epi32(mC1, _mm_shuffle_epi32(_mm_mul_epu32(mDX12, mY), _MM_SHUFFLE(0, 0, 2, 2))),
|
||||
_mm_shuffle_epi32(_mm_mul_epu32(mDY12, mX), _MM_SHUFFLE(0, 2, 0, 2)));
|
||||
__m128i mA = _mm_cmpgt_epi32(mCY1, _mm_setzero_si128());
|
||||
|
||||
bool b00 = C2 + DX23 * y0 - DY23 * x0 > 0;
|
||||
bool b10 = C2 + DX23 * y0 - DY23 * x1 > 0;
|
||||
bool b01 = C2 + DX23 * y1 - DY23 * x0 > 0;
|
||||
bool b11 = C2 + DX23 * y1 - DY23 * x1 > 0;
|
||||
int b = (b00 << 0) | (b10 << 1) | (b01 << 2) | (b11 << 3);
|
||||
__m128i mCY2 = _mm_sub_epi32(
|
||||
_mm_add_epi32(mC2, _mm_shuffle_epi32(_mm_mul_epu32(mDX23, mY), _MM_SHUFFLE(0, 0, 2, 2))),
|
||||
_mm_shuffle_epi32(_mm_mul_epu32(mDY23, mX), _MM_SHUFFLE(0, 2, 0, 2)));
|
||||
__m128i mB = _mm_cmpgt_epi32(mCY2, _mm_setzero_si128());
|
||||
|
||||
bool c00 = C3 + DX31 * y0 - DY31 * x0 > 0;
|
||||
bool c10 = C3 + DX31 * y0 - DY31 * x1 > 0;
|
||||
bool c01 = C3 + DX31 * y1 - DY31 * x0 > 0;
|
||||
bool c11 = C3 + DX31 * y1 - DY31 * x1 > 0;
|
||||
int c = (c00 << 0) | (c10 << 1) | (c01 << 2) | (c11 << 3);
|
||||
__m128i mCY3 = _mm_sub_epi32(
|
||||
_mm_add_epi32(mC3, _mm_shuffle_epi32(_mm_mul_epu32(mDX31, mY), _MM_SHUFFLE(0, 0, 2, 2))),
|
||||
_mm_shuffle_epi32(_mm_mul_epu32(mDY31, mX), _MM_SHUFFLE(0, 2, 0, 2)));
|
||||
__m128i mC = _mm_cmpgt_epi32(mCY3, _mm_setzero_si128());
|
||||
|
||||
if (a == 0 || b == 0 || c == 0) // Skip block when outside an edge
|
||||
int abc = _mm_movemask_epi8(_mm_packs_epi16(_mm_packs_epi32(mA, mB), _mm_packs_epi32(mC, _mm_setzero_si128())));
|
||||
|
||||
if ((abc & 0xf) == 0 || (abc & 0xf0) == 0 || (abc & 0xf00) == 0) // Skip block when outside an edge
|
||||
{
|
||||
Mask0 = 0;
|
||||
Mask1 = 0;
|
||||
}
|
||||
else if (a == 0xf && b == 0xf && c == 0xf) // Accept whole block when totally covered
|
||||
else if (abc == 0xfff) // Accept whole block when totally covered
|
||||
{
|
||||
Mask0 = 0xffffffff;
|
||||
Mask1 = 0xffffffff;
|
||||
}
|
||||
else // Partially covered block
|
||||
{
|
||||
x0 = X << 4;
|
||||
x1 = (X + q - 1) << 4;
|
||||
int CY1 = C1 + DX12 * y0 - DY12 * x0;
|
||||
int CY2 = C2 + DX23 * y0 - DY23 * x0;
|
||||
int CY3 = C3 + DX31 * y0 - DY31 * x0;
|
||||
|
||||
uint32_t mask0 = 0;
|
||||
uint32_t mask1 = 0;
|
||||
|
||||
__m128i mCY1 = _mm_sub_epi32(_mm_set1_epi32(CY1), mFDY12Offset);
|
||||
__m128i mCY2 = _mm_sub_epi32(_mm_set1_epi32(CY2), mFDY23Offset);
|
||||
__m128i mCY3 = _mm_sub_epi32(_mm_set1_epi32(CY3), mFDY31Offset);
|
||||
for (int iy = 0; iy < 4; iy++)
|
||||
mCY1 = _mm_sub_epi32(_mm_shuffle_epi32(mCY1, _MM_SHUFFLE(0, 0, 0, 0)), mFDY12Offset);
|
||||
mCY2 = _mm_sub_epi32(_mm_shuffle_epi32(mCY2, _MM_SHUFFLE(0, 0, 0, 0)), mFDY23Offset);
|
||||
mCY3 = _mm_sub_epi32(_mm_shuffle_epi32(mCY3, _MM_SHUFFLE(0, 0, 0, 0)), mFDY31Offset);
|
||||
for (int iy = 0; iy < 2; iy++)
|
||||
{
|
||||
__m128i mtest0 = _mm_cmpgt_epi32(mCY1, _mm_setzero_si128());
|
||||
mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY2, _mm_setzero_si128()), mtest0);
|
||||
|
@ -668,18 +696,27 @@ void TriangleBlock::CoverageTest()
|
|||
__m128i mtest1 = _mm_cmpgt_epi32(_mm_sub_epi32(mCY1, mFDY12x4), _mm_setzero_si128());
|
||||
mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY2, mFDY23x4), _mm_setzero_si128()), mtest1);
|
||||
mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY3, mFDY31x4), _mm_setzero_si128()), mtest1);
|
||||
|
||||
mCY1 = _mm_add_epi32(mCY1, mFDX12);
|
||||
mCY2 = _mm_add_epi32(mCY2, mFDX23);
|
||||
mCY3 = _mm_add_epi32(mCY3, mFDX31);
|
||||
__m128i first = _mm_packs_epi32(_mm_shuffle_epi32(mtest1, _MM_SHUFFLE(0, 1, 2, 3)), _mm_shuffle_epi32(mtest0, _MM_SHUFFLE(0, 1, 2, 3)));
|
||||
|
||||
mask0 <<= 4;
|
||||
mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mtest0, _MM_SHUFFLE(0, 1, 2, 3))));
|
||||
mask0 <<= 4;
|
||||
mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mtest1, _MM_SHUFFLE(0, 1, 2, 3))));
|
||||
mtest0 = _mm_cmpgt_epi32(mCY1, _mm_setzero_si128());
|
||||
mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY2, _mm_setzero_si128()), mtest0);
|
||||
mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY3, _mm_setzero_si128()), mtest0);
|
||||
mtest1 = _mm_cmpgt_epi32(_mm_sub_epi32(mCY1, mFDY12x4), _mm_setzero_si128());
|
||||
mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY2, mFDY23x4), _mm_setzero_si128()), mtest1);
|
||||
mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY3, mFDY31x4), _mm_setzero_si128()), mtest1);
|
||||
mCY1 = _mm_add_epi32(mCY1, mFDX12);
|
||||
mCY2 = _mm_add_epi32(mCY2, mFDX23);
|
||||
mCY3 = _mm_add_epi32(mCY3, mFDX31);
|
||||
__m128i second = _mm_packs_epi32(_mm_shuffle_epi32(mtest1, _MM_SHUFFLE(0, 1, 2, 3)), _mm_shuffle_epi32(mtest0, _MM_SHUFFLE(0, 1, 2, 3)));
|
||||
|
||||
mask0 <<= 16;
|
||||
mask0 |= _mm_movemask_epi8(_mm_packs_epi16(second, first));
|
||||
}
|
||||
|
||||
for (int iy = 4; iy < q; iy++)
|
||||
for (int iy = 0; iy < 2; iy++)
|
||||
{
|
||||
__m128i mtest0 = _mm_cmpgt_epi32(mCY1, _mm_setzero_si128());
|
||||
mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY2, _mm_setzero_si128()), mtest0);
|
||||
|
@ -687,15 +724,24 @@ void TriangleBlock::CoverageTest()
|
|||
__m128i mtest1 = _mm_cmpgt_epi32(_mm_sub_epi32(mCY1, mFDY12x4), _mm_setzero_si128());
|
||||
mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY2, mFDY23x4), _mm_setzero_si128()), mtest1);
|
||||
mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY3, mFDY31x4), _mm_setzero_si128()), mtest1);
|
||||
|
||||
mCY1 = _mm_add_epi32(mCY1, mFDX12);
|
||||
mCY2 = _mm_add_epi32(mCY2, mFDX23);
|
||||
mCY3 = _mm_add_epi32(mCY3, mFDX31);
|
||||
__m128i first = _mm_packs_epi32(_mm_shuffle_epi32(mtest1, _MM_SHUFFLE(0, 1, 2, 3)), _mm_shuffle_epi32(mtest0, _MM_SHUFFLE(0, 1, 2, 3)));
|
||||
|
||||
mask1 <<= 4;
|
||||
mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mtest0, _MM_SHUFFLE(0, 1, 2, 3))));
|
||||
mask1 <<= 4;
|
||||
mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mtest1, _MM_SHUFFLE(0, 1, 2, 3))));
|
||||
mtest0 = _mm_cmpgt_epi32(mCY1, _mm_setzero_si128());
|
||||
mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY2, _mm_setzero_si128()), mtest0);
|
||||
mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY3, _mm_setzero_si128()), mtest0);
|
||||
mtest1 = _mm_cmpgt_epi32(_mm_sub_epi32(mCY1, mFDY12x4), _mm_setzero_si128());
|
||||
mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY2, mFDY23x4), _mm_setzero_si128()), mtest1);
|
||||
mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY3, mFDY31x4), _mm_setzero_si128()), mtest1);
|
||||
mCY1 = _mm_add_epi32(mCY1, mFDX12);
|
||||
mCY2 = _mm_add_epi32(mCY2, mFDX23);
|
||||
mCY3 = _mm_add_epi32(mCY3, mFDX31);
|
||||
__m128i second = _mm_packs_epi32(_mm_shuffle_epi32(mtest1, _MM_SHUFFLE(0, 1, 2, 3)), _mm_shuffle_epi32(mtest0, _MM_SHUFFLE(0, 1, 2, 3)));
|
||||
|
||||
mask1 <<= 16;
|
||||
mask1 |= _mm_movemask_epi8(_mm_packs_epi16(second, first));
|
||||
}
|
||||
|
||||
Mask0 = mask0;
|
||||
|
@ -755,6 +801,8 @@ void TriangleBlock::StencilWrite()
|
|||
}
|
||||
}
|
||||
|
||||
#ifdef NO_SSE
|
||||
|
||||
void TriangleBlock::SubsectorWrite()
|
||||
{
|
||||
auto pitch = subsectorPitch;
|
||||
|
@ -796,6 +844,80 @@ void TriangleBlock::SubsectorWrite()
|
|||
}
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
void TriangleBlock::SubsectorWrite()
|
||||
{
|
||||
auto pitch = subsectorPitch;
|
||||
uint32_t *subsector = subsectorGBuffer + X + Y * pitch;
|
||||
__m128i msubsectorDepth = _mm_set1_epi32(subsectorDepth);
|
||||
|
||||
if (Mask0 == 0xffffffff && Mask1 == 0xffffffff)
|
||||
{
|
||||
_mm_storeu_si128((__m128i*)subsector, msubsectorDepth);
|
||||
_mm_storeu_si128((__m128i*)(subsector + 4), msubsectorDepth); subsector += pitch;
|
||||
_mm_storeu_si128((__m128i*)subsector, msubsectorDepth);
|
||||
_mm_storeu_si128((__m128i*)(subsector + 4), msubsectorDepth); subsector += pitch;
|
||||
_mm_storeu_si128((__m128i*)subsector, msubsectorDepth);
|
||||
_mm_storeu_si128((__m128i*)(subsector + 4), msubsectorDepth); subsector += pitch;
|
||||
_mm_storeu_si128((__m128i*)subsector, msubsectorDepth);
|
||||
_mm_storeu_si128((__m128i*)(subsector + 4), msubsectorDepth); subsector += pitch;
|
||||
_mm_storeu_si128((__m128i*)subsector, msubsectorDepth);
|
||||
_mm_storeu_si128((__m128i*)(subsector + 4), msubsectorDepth); subsector += pitch;
|
||||
_mm_storeu_si128((__m128i*)subsector, msubsectorDepth);
|
||||
_mm_storeu_si128((__m128i*)(subsector + 4), msubsectorDepth); subsector += pitch;
|
||||
_mm_storeu_si128((__m128i*)subsector, msubsectorDepth);
|
||||
_mm_storeu_si128((__m128i*)(subsector + 4), msubsectorDepth); subsector += pitch;
|
||||
_mm_storeu_si128((__m128i*)subsector, msubsectorDepth);
|
||||
_mm_storeu_si128((__m128i*)(subsector + 4), msubsectorDepth);
|
||||
}
|
||||
else
|
||||
{
|
||||
__m128i mxormask = _mm_set1_epi32(0xffffffff);
|
||||
__m128i topfour = _mm_setr_epi32(1 << 31, 1 << 30, 1 << 29, 1 << 28);
|
||||
__m128i bottomfour = _mm_setr_epi32(1 << 27, 1 << 26, 1 << 25, 1 << 24);
|
||||
|
||||
__m128i mmask0 = _mm_set1_epi32(Mask0);
|
||||
__m128i mmask1 = _mm_set1_epi32(Mask1);
|
||||
|
||||
_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, topfour), _mm_setzero_si128()), mxormask), (char*)subsector);
|
||||
_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, bottomfour), _mm_setzero_si128()), mxormask), (char*)(subsector + 4));
|
||||
mmask0 = _mm_slli_si128(mmask0, 1);
|
||||
subsector += pitch;
|
||||
_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, topfour), _mm_setzero_si128()), mxormask), (char*)subsector);
|
||||
_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, bottomfour), _mm_setzero_si128()), mxormask), (char*)(subsector + 4));
|
||||
mmask0 = _mm_slli_si128(mmask0, 1);
|
||||
subsector += pitch;
|
||||
_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, topfour), _mm_setzero_si128()), mxormask), (char*)subsector);
|
||||
_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, bottomfour), _mm_setzero_si128()), mxormask), (char*)(subsector + 4));
|
||||
mmask0 = _mm_slli_si128(mmask0, 1);
|
||||
subsector += pitch;
|
||||
_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, topfour), _mm_setzero_si128()), mxormask), (char*)subsector);
|
||||
_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, bottomfour), _mm_setzero_si128()), mxormask), (char*)(subsector + 4));
|
||||
mmask0 = _mm_slli_si128(mmask0, 1);
|
||||
subsector += pitch;
|
||||
|
||||
_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, topfour), _mm_setzero_si128()), mxormask), (char*)subsector);
|
||||
_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, bottomfour), _mm_setzero_si128()), mxormask), (char*)(subsector + 4));
|
||||
mmask1 = _mm_slli_si128(mmask1, 1);
|
||||
subsector += pitch;
|
||||
_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, topfour), _mm_setzero_si128()), mxormask), (char*)subsector);
|
||||
_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, bottomfour), _mm_setzero_si128()), mxormask), (char*)(subsector + 4));
|
||||
mmask1 = _mm_slli_si128(mmask1, 1);
|
||||
subsector += pitch;
|
||||
_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, topfour), _mm_setzero_si128()), mxormask), (char*)subsector);
|
||||
_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, bottomfour), _mm_setzero_si128()), mxormask), (char*)(subsector + 4));
|
||||
mmask1 = _mm_slli_si128(mmask1, 1);
|
||||
subsector += pitch;
|
||||
_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, topfour), _mm_setzero_si128()), mxormask), (char*)subsector);
|
||||
_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, bottomfour), _mm_setzero_si128()), mxormask), (char*)(subsector + 4));
|
||||
mmask1 = _mm_slli_si128(mmask1, 1);
|
||||
subsector += pitch;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
void ScreenTriangle::Draw(const TriDrawTriangleArgs *args, WorkerThreadData *thread)
|
||||
{
|
||||
TriangleBlock block(args);
|
||||
|
|
Loading…
Reference in a new issue