Mirror of https://github.com/ZDoom/qzdoom.git (synced 2025-02-19 18:41:02 +00:00)
- change subsector gbuffer to use an 8x8 block memory layout to avoid having to clip memory reads and writes
commit 4a9dffa70d
parent 0ed60b8df6
3 changed files with 70 additions and 100 deletions
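
Note: a minimal sketch of the new layout, as I read it from the hunks below. The g-buffer is stored as consecutive 8x8 tiles of 64 uint32_t values, so one triangle block is a contiguous run of 64 entries and never has to be clipped against the framebuffer pitch. The helper name, its parameters, and the row-major ordering inside a tile are my own illustration inferred from the new write loops, not code from this commit.

#include <cstdint>

// Hypothetical helper illustrating the block-linear addressing used below.
// blockPitch is the number of 8x8 blocks per row (i.e. BlockWidth()).
inline uint32_t &SubsectorAt(uint32_t *gbuffer, int blockPitch, int x, int y)
{
	int block = (x >> 3) + (y >> 3) * blockPitch;  // which 8x8 tile
	int offset = ((y & 7) << 3) + (x & 7);         // row-major position inside the tile
	return gbuffer[block * 64 + offset];
}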
@@ -48,7 +48,8 @@ void PolySubsectorGBuffer::Resize(int newwidth, int newheight)
 {
 	width = newwidth;
 	height = newheight;
-	values.resize(width * height);
+	int count = BlockWidth() * BlockHeight();
+	values.resize(count * 64);
 }
 
 /////////////////////////////////////////////////////////////////////////////

@@ -33,6 +33,8 @@ public:
 	void Resize(int newwidth, int newheight);
 	int Width() const { return width; }
 	int Height() const { return height; }
+	int BlockWidth() const { return (width + 7) / 8; }
+	int BlockHeight() const { return (height + 7) / 8; }
 	uint32_t *Values() { return values.data(); }
 
 private:

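Note: BlockWidth() and BlockHeight() round up, which is what removes the edge clipping. For example (illustrative numbers, not from the commit): at 1366x768 the buffer is (1366 + 7) / 8 = 171 blocks wide and 768 / 8 = 96 blocks high, so Resize() allocates 171 * 96 * 64 = 1,050,624 entries, slightly more than 1366 * 768 = 1,049,088. The partial right-edge tiles get a full 8x8 backing store, so block reads and writes never have to be clipped.
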
@@ -133,7 +133,7 @@ TriangleBlock::TriangleBlock(const TriDrawTriangleArgs *args)
 
 	subsectorGBuffer = args->subsectorGBuffer;
 	subsectorDepth = args->uniforms->SubsectorDepth();
-	subsectorPitch = args->pitch;
+	subsectorPitch = args->stencilPitch;
 
 	// 28.4 fixed-point coordinates
 #ifdef NO_SSE

@@ -288,29 +288,24 @@ void TriangleBlock::Loop(const TriDrawTriangleArgs *args, WorkerThreadData *thre
 
 void TriangleBlock::SubsectorTest()
 {
-	uint32_t *subsector = subsectorGBuffer + X + Y * subsectorPitch;
+	int block = (X >> 3) + (Y >> 3) * subsectorPitch;
+	uint32_t *subsector = subsectorGBuffer + block * 64;
 	uint32_t mask0 = 0;
 	uint32_t mask1 = 0;
 
-	for (int iy = 0; iy < 4; iy++)
-	{
-		for (int ix = 0; ix < q; ix++)
-		{
-			bool covered = subsector[ix] >= subsectorDepth;
-			mask0 <<= 1;
-			mask0 |= (uint32_t)covered;
-		}
-		subsector += subsectorPitch;
-	}
-	for (int iy = 4; iy < q; iy++)
-	{
-		for (int ix = 0; ix < q; ix++)
-		{
-			bool covered = subsector[ix] >= subsectorDepth;
-			mask1 <<= 1;
-			mask1 |= (uint32_t)covered;
-		}
-		subsector += subsectorPitch;
-	}
+	for (int i = 0; i < 32; i++)
+	{
+		bool covered = *subsector >= subsectorDepth;
+		mask0 <<= 1;
+		mask0 |= (uint32_t)covered;
+		subsector++;
+	}
+	for (int i = 0; i < 32; i++)
+	{
+		bool covered = *subsector >= subsectorDepth;
+		mask1 <<= 1;
+		mask1 |= (uint32_t)covered;
+		subsector++;
+	}
 
 	Mask0 = Mask0 & mask0;

@@ -321,27 +316,24 @@ void TriangleBlock::SubsectorTest()
 
 void TriangleBlock::SubsectorTest()
 {
-	uint32_t *subsector = subsectorGBuffer + X + Y * subsectorPitch;
+	int block = (X >> 3) + (Y >> 3) * subsectorPitch;
+	uint32_t *subsector = subsectorGBuffer + block * 64;
 	uint32_t mask0 = 0;
 	uint32_t mask1 = 0;
 	__m128i msubsectorDepth = _mm_set1_epi32(subsectorDepth);
 	__m128i mnotxor = _mm_set1_epi32(0xffffffff);
 
-	for (int iy = 0; iy < 4; iy++)
+	for (int iy = 0; iy < 8; iy++)
 	{
 		mask0 <<= 4;
 		mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_xor_si128(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)subsector), msubsectorDepth), mnotxor), _MM_SHUFFLE(0, 1, 2, 3))));
-		mask0 <<= 4;
-		mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_xor_si128(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(subsector + 4)), msubsectorDepth), mnotxor), _MM_SHUFFLE(0, 1, 2, 3))));
-		subsector += subsectorPitch;
+		subsector += 4;
 	}
-	for (int iy = 4; iy < q; iy++)
+	for (int iy = 0; iy < 8; iy++)
 	{
 		mask1 <<= 4;
 		mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_xor_si128(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)subsector), msubsectorDepth), mnotxor), _MM_SHUFFLE(0, 1, 2, 3))));
-		mask1 <<= 4;
-		mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_xor_si128(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(subsector + 4)), msubsectorDepth), mnotxor), _MM_SHUFFLE(0, 1, 2, 3))));
-		subsector += subsectorPitch;
+		subsector += 4;
 	}
 
 	Mask0 = Mask0 & mask0;

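Note: reading the two SubsectorTest hunks above, the coverage masks map onto the block in storage order: bit 31 of mask0 corresponds to the first of the 64 entries, bit 0 of mask1 to the last. The sketch below expands the masks the same way the write loops consume them; the function name and the bool[64] output are my own illustration, not code from this commit.

#include <cstdint>

// Expand Mask0/Mask1 into one flag per entry of an 8x8 block,
// in the same order the block's 64 values are stored.
void ExpandCoverage(uint32_t mask0, uint32_t mask1, bool covered[64])
{
	for (int i = 0; i < 32; i++)
		covered[i] = (mask0 & (1u << (31 - i))) != 0;      // first four rows of the tile
	for (int i = 0; i < 32; i++)
		covered[32 + i] = (mask1 & (1u << (31 - i))) != 0; // last four rows of the tile
}
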
@@ -805,41 +797,33 @@ void TriangleBlock::StencilWrite()
 
 void TriangleBlock::SubsectorWrite()
 {
-	auto pitch = subsectorPitch;
-	uint32_t *subsector = subsectorGBuffer + X + Y * pitch;
+	int block = (X >> 3) + (Y >> 3) * subsectorPitch;
+	uint32_t *subsector = subsectorGBuffer + block * 64;
 
 	if (Mask0 == 0xffffffff && Mask1 == 0xffffffff)
 	{
-		for (int y = 0; y < 8; y++)
+		for (int i = 0; i < 64; i++)
 		{
-			for (int x = 0; x < 8; x++)
-				subsector[x] = subsectorDepth;
-			subsector += pitch;
+			*(subsector++) = subsectorDepth;
 		}
 	}
 	else
 	{
 		uint32_t mask0 = Mask0;
 		uint32_t mask1 = Mask1;
-		for (int y = 0; y < 4; y++)
-		{
-			for (int x = 0; x < 8; x++)
-			{
-				if (mask0 & (1 << 31))
-					subsector[x] = subsectorDepth;
-				mask0 <<= 1;
-			}
-			subsector += pitch;
-		}
-		for (int y = 4; y < 8; y++)
-		{
-			for (int x = 0; x < 8; x++)
-			{
-				if (mask1 & (1 << 31))
-					subsector[x] = subsectorDepth;
-				mask1 <<= 1;
-			}
-			subsector += pitch;
-		}
+		for (int i = 0; i < 32; i++)
+		{
+			if (mask0 & (1 << 31))
+				*subsector = subsectorDepth;
+			mask0 <<= 1;
+			subsector++;
+		}
+		for (int i = 0; i < 32; i++)
+		{
+			if (mask1 & (1 << 31))
+				*subsector = subsectorDepth;
+			mask1 <<= 1;
+			subsector++;
+		}
 	}
 }

@@ -848,71 +832,54 @@ void TriangleBlock::SubsectorWrite()
 
 void TriangleBlock::SubsectorWrite()
 {
-	auto pitch = subsectorPitch;
-	uint32_t *subsector = subsectorGBuffer + X + Y * pitch;
+	int block = (X >> 3) + (Y >> 3) * subsectorPitch;
+	uint32_t *subsector = subsectorGBuffer + block * 64;
 	__m128i msubsectorDepth = _mm_set1_epi32(subsectorDepth);
 
 	if (Mask0 == 0xffffffff && Mask1 == 0xffffffff)
 	{
+		_mm_storeu_si128((__m128i*)subsector, msubsectorDepth); subsector += 4;
+		_mm_storeu_si128((__m128i*)subsector, msubsectorDepth); subsector += 4;
+		_mm_storeu_si128((__m128i*)subsector, msubsectorDepth); subsector += 4;
+		_mm_storeu_si128((__m128i*)subsector, msubsectorDepth); subsector += 4;
+		_mm_storeu_si128((__m128i*)subsector, msubsectorDepth); subsector += 4;
+		_mm_storeu_si128((__m128i*)subsector, msubsectorDepth); subsector += 4;
+		_mm_storeu_si128((__m128i*)subsector, msubsectorDepth); subsector += 4;
+		_mm_storeu_si128((__m128i*)subsector, msubsectorDepth); subsector += 4;
+		_mm_storeu_si128((__m128i*)subsector, msubsectorDepth); subsector += 4;
+		_mm_storeu_si128((__m128i*)subsector, msubsectorDepth); subsector += 4;
+		_mm_storeu_si128((__m128i*)subsector, msubsectorDepth); subsector += 4;
+		_mm_storeu_si128((__m128i*)subsector, msubsectorDepth); subsector += 4;
+		_mm_storeu_si128((__m128i*)subsector, msubsectorDepth); subsector += 4;
+		_mm_storeu_si128((__m128i*)subsector, msubsectorDepth); subsector += 4;
+		_mm_storeu_si128((__m128i*)subsector, msubsectorDepth); subsector += 4;
 		_mm_storeu_si128((__m128i*)subsector, msubsectorDepth);
-		_mm_storeu_si128((__m128i*)(subsector + 4), msubsectorDepth); subsector += pitch;
-		_mm_storeu_si128((__m128i*)subsector, msubsectorDepth);
-		_mm_storeu_si128((__m128i*)(subsector + 4), msubsectorDepth); subsector += pitch;
-		_mm_storeu_si128((__m128i*)subsector, msubsectorDepth);
-		_mm_storeu_si128((__m128i*)(subsector + 4), msubsectorDepth); subsector += pitch;
-		_mm_storeu_si128((__m128i*)subsector, msubsectorDepth);
-		_mm_storeu_si128((__m128i*)(subsector + 4), msubsectorDepth); subsector += pitch;
-		_mm_storeu_si128((__m128i*)subsector, msubsectorDepth);
-		_mm_storeu_si128((__m128i*)(subsector + 4), msubsectorDepth); subsector += pitch;
-		_mm_storeu_si128((__m128i*)subsector, msubsectorDepth);
-		_mm_storeu_si128((__m128i*)(subsector + 4), msubsectorDepth); subsector += pitch;
-		_mm_storeu_si128((__m128i*)subsector, msubsectorDepth);
-		_mm_storeu_si128((__m128i*)(subsector + 4), msubsectorDepth); subsector += pitch;
-		_mm_storeu_si128((__m128i*)(subsector + 4), msubsectorDepth);
 	}
 	else
 	{
 		__m128i mxormask = _mm_set1_epi32(0xffffffff);
 		__m128i topfour = _mm_setr_epi32(1 << 31, 1 << 30, 1 << 29, 1 << 28);
-		__m128i bottomfour = _mm_setr_epi32(1 << 27, 1 << 26, 1 << 25, 1 << 24);
 
 		__m128i mmask0 = _mm_set1_epi32(Mask0);
 		__m128i mmask1 = _mm_set1_epi32(Mask1);
 
-		_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, topfour), _mm_setzero_si128()), mxormask), (char*)subsector);
-		_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, bottomfour), _mm_setzero_si128()), mxormask), (char*)(subsector + 4));
-		mmask0 = _mm_slli_si128(mmask0, 1);
-		subsector += pitch;
-		_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, topfour), _mm_setzero_si128()), mxormask), (char*)subsector);
-		_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, bottomfour), _mm_setzero_si128()), mxormask), (char*)(subsector + 4));
-		mmask0 = _mm_slli_si128(mmask0, 1);
-		subsector += pitch;
-		_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, topfour), _mm_setzero_si128()), mxormask), (char*)subsector);
-		_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, bottomfour), _mm_setzero_si128()), mxormask), (char*)(subsector + 4));
-		mmask0 = _mm_slli_si128(mmask0, 1);
-		subsector += pitch;
-		_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, topfour), _mm_setzero_si128()), mxormask), (char*)subsector);
-		_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, bottomfour), _mm_setzero_si128()), mxormask), (char*)(subsector + 4));
-		mmask0 = _mm_slli_si128(mmask0, 1);
-		subsector += pitch;
+		_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); mmask0 = _mm_slli_epi32(mmask0, 4); subsector += 4;
+		_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); mmask0 = _mm_slli_epi32(mmask0, 4); subsector += 4;
+		_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); mmask0 = _mm_slli_epi32(mmask0, 4); subsector += 4;
+		_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); mmask0 = _mm_slli_epi32(mmask0, 4); subsector += 4;
+		_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); mmask0 = _mm_slli_epi32(mmask0, 4); subsector += 4;
+		_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); mmask0 = _mm_slli_epi32(mmask0, 4); subsector += 4;
+		_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); mmask0 = _mm_slli_epi32(mmask0, 4); subsector += 4;
+		_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask0, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); subsector += 4;
 
-		_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, topfour), _mm_setzero_si128()), mxormask), (char*)subsector);
-		_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, bottomfour), _mm_setzero_si128()), mxormask), (char*)(subsector + 4));
-		mmask1 = _mm_slli_si128(mmask1, 1);
-		subsector += pitch;
-		_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, topfour), _mm_setzero_si128()), mxormask), (char*)subsector);
-		_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, bottomfour), _mm_setzero_si128()), mxormask), (char*)(subsector + 4));
-		mmask1 = _mm_slli_si128(mmask1, 1);
-		subsector += pitch;
-		_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, topfour), _mm_setzero_si128()), mxormask), (char*)subsector);
-		_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, bottomfour), _mm_setzero_si128()), mxormask), (char*)(subsector + 4));
-		mmask1 = _mm_slli_si128(mmask1, 1);
-		subsector += pitch;
-		_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, topfour), _mm_setzero_si128()), mxormask), (char*)subsector);
-		_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, bottomfour), _mm_setzero_si128()), mxormask), (char*)(subsector + 4));
-		mmask1 = _mm_slli_si128(mmask1, 1);
-		subsector += pitch;
+		_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); mmask1 = _mm_slli_epi32(mmask1, 4); subsector += 4;
+		_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); mmask1 = _mm_slli_epi32(mmask1, 4); subsector += 4;
+		_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); mmask1 = _mm_slli_epi32(mmask1, 4); subsector += 4;
+		_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); mmask1 = _mm_slli_epi32(mmask1, 4); subsector += 4;
+		_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); mmask1 = _mm_slli_epi32(mmask1, 4); subsector += 4;
+		_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); mmask1 = _mm_slli_epi32(mmask1, 4); subsector += 4;
+		_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); mmask1 = _mm_slli_epi32(mmask1, 4); subsector += 4;
+		_mm_maskmoveu_si128(msubsectorDepth, _mm_xor_si128(_mm_cmpeq_epi32(_mm_and_si128(mmask1, topfour), _mm_setzero_si128()), mxormask), (char*)subsector); subsector += 4;
 	}
 }
 