This commit is contained in:
Rachael Alexanderson 2017-03-24 15:06:05 -04:00
commit 2063813348
3 changed files with 237 additions and 1 deletions

View file

@ -190,6 +190,7 @@ void PolyTriangleDrawer::draw_shaded_triangle(const ShadedTriVertex *vert, bool
TriVertex clippedvert[max_additional_vertices];
int numclipvert = clipedge(vert, clippedvert);
#ifdef NO_SSE
// Map to 2D viewport:
for (int j = 0; j < numclipvert; j++)
{
@ -205,6 +206,39 @@ void PolyTriangleDrawer::draw_shaded_triangle(const ShadedTriVertex *vert, bool
v.x = viewport_x + viewport_width * (1.0f + v.x) * 0.5f;
v.y = viewport_y + viewport_height * (1.0f - v.y) * 0.5f;
}
#else
// Map to 2D viewport:
__m128 mviewport_x = _mm_set1_ps((float)viewport_x);
__m128 mviewport_y = _mm_set1_ps((float)viewport_y);
__m128 mviewport_halfwidth = _mm_set1_ps(viewport_width * 0.5f);
__m128 mviewport_halfheight = _mm_set1_ps(viewport_height * 0.5f);
__m128 mone = _mm_set1_ps(1.0f);
int sse_length = (numclipvert + 3) / 4 * 4;
for (int j = 0; j < sse_length; j += 4)
{
__m128 vx = _mm_loadu_ps(&clippedvert[j].x);
__m128 vy = _mm_loadu_ps(&clippedvert[j + 1].x);
__m128 vz = _mm_loadu_ps(&clippedvert[j + 2].x);
__m128 vw = _mm_loadu_ps(&clippedvert[j + 3].x);
_MM_TRANSPOSE4_PS(vx, vy, vz, vw);
// Calculate normalized device coordinates:
vw = _mm_div_ps(mone, vw);
vx = _mm_mul_ps(vx, vw);
vy = _mm_mul_ps(vy, vw);
vz = _mm_mul_ps(vz, vw);
// Apply viewport scale to get screen coordinates:
vx = _mm_add_ps(mviewport_x, _mm_mul_ps(mviewport_halfwidth, _mm_add_ps(mone, vx)));
vy = _mm_add_ps(mviewport_y, _mm_mul_ps(mviewport_halfheight, _mm_sub_ps(mone, vy)));
_MM_TRANSPOSE4_PS(vx, vy, vz, vw);
_mm_storeu_ps(&clippedvert[j].x, vx);
_mm_storeu_ps(&clippedvert[j + 1].x, vy);
_mm_storeu_ps(&clippedvert[j + 2].x, vz);
_mm_storeu_ps(&clippedvert[j + 3].x, vw);
}
#endif
// Keep varyings in -128 to 128 range if possible
if (numclipvert > 0)

View file

@ -138,6 +138,22 @@ void ScreenTriangle::SetupNormal(const TriDrawTriangleArgs *args, WorkerThreadDa
thread->StartY = miny;
span->Length = 0;
#ifndef NO_SSE
__m128i mnotxor = _mm_set1_epi32(0xffffffff);
__m128i mstencilTestValue = _mm_set1_epi16(stencilTestValue);
__m128i mFDY12Offset = _mm_setr_epi32(0, FDY12, FDY12 * 2, FDY12 * 3);
__m128i mFDY23Offset = _mm_setr_epi32(0, FDY23, FDY23 * 2, FDY23 * 3);
__m128i mFDY31Offset = _mm_setr_epi32(0, FDY31, FDY31 * 2, FDY31 * 3);
__m128i mFDY12x4 = _mm_set1_epi32(FDY12 * 4);
__m128i mFDY23x4 = _mm_set1_epi32(FDY23 * 4);
__m128i mFDY31x4 = _mm_set1_epi32(FDY31 * 4);
__m128i mFDX12 = _mm_set1_epi32(FDX12);
__m128i mFDX23 = _mm_set1_epi32(FDX23);
__m128i mFDX31 = _mm_set1_epi32(FDX31);
__m128i mClipCompare0 = _mm_setr_epi32(clipright, clipright - 1, clipright - 2, clipright - 3);
__m128i mClipCompare1 = _mm_setr_epi32(clipright - 4, clipright - 5, clipright - 6, clipright - 7);
#endif
// Loop through blocks
for (int y = miny; y < maxy; y += q * num_cores)
{
@ -211,6 +227,7 @@ void ScreenTriangle::SetupNormal(const TriDrawTriangleArgs *args, WorkerThreadDa
uint32_t mask0 = 0;
uint32_t mask1 = 0;
#if NO_SSE
for (int iy = 0; iy < 4; iy++)
{
int CX1 = CY1;
@ -256,6 +273,69 @@ void ScreenTriangle::SetupNormal(const TriDrawTriangleArgs *args, WorkerThreadDa
CY2 += FDX23;
CY3 += FDX31;
}
#else
__m128i mSingleStencilMask = _mm_set1_epi32(blockIsSingleStencil ? 0xffffffff : 0);
__m128i mCY1 = _mm_sub_epi32(_mm_set1_epi32(CY1), mFDY12Offset);
__m128i mCY2 = _mm_sub_epi32(_mm_set1_epi32(CY2), mFDY23Offset);
__m128i mCY3 = _mm_sub_epi32(_mm_set1_epi32(CY3), mFDY31Offset);
__m128i mx = _mm_set1_epi32(x);
__m128i mClipTest0 = _mm_cmplt_epi32(mx, mClipCompare0);
__m128i mClipTest1 = _mm_cmplt_epi32(mx, mClipCompare1);
int iy;
for (iy = 0; iy < 4 && iy < clipbottom - y; iy++)
{
__m128i mstencilBlock = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(stencilBlock + iy * 8)), _mm_setzero_si128());
__m128i mstencilTest = _mm_or_si128(_mm_cmpeq_epi16(mstencilBlock, mstencilTestValue), mSingleStencilMask);
__m128i mstencilTest0 = _mm_unpacklo_epi16(mstencilTest, mstencilTest);
__m128i mstencilTest1 = _mm_unpackhi_epi16(mstencilTest, mstencilTest);
__m128i mtest0 = _mm_and_si128(mstencilTest0, mClipTest0);
__m128i mtest1 = _mm_and_si128(mstencilTest1, mClipTest1);
mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY1, _mm_setzero_si128()), mtest0);
mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY2, _mm_setzero_si128()), mtest0);
mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY3, _mm_setzero_si128()), mtest0);
mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY1, mFDY12x4), _mm_setzero_si128()), mtest1);
mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY2, mFDY23x4), _mm_setzero_si128()), mtest1);
mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY3, mFDY31x4), _mm_setzero_si128()), mtest1);
mCY1 = _mm_add_epi32(mCY1, mFDX12);
mCY2 = _mm_add_epi32(mCY2, mFDX23);
mCY3 = _mm_add_epi32(mCY3, mFDX31);
mask0 <<= 4;
mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mtest0, _MM_SHUFFLE(0, 1, 2, 3))));
mask0 <<= 4;
mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mtest1, _MM_SHUFFLE(0, 1, 2, 3))));
}
mask0 <<= (4 - iy) * 8;
for (iy = 4; iy < q && iy < clipbottom - y; iy++)
{
__m128i mstencilBlock = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(stencilBlock + iy * 8)), _mm_setzero_si128());
__m128i mstencilTest = _mm_or_si128(_mm_cmpeq_epi16(mstencilBlock, mstencilTestValue), mSingleStencilMask);
__m128i mstencilTest0 = _mm_unpacklo_epi16(mstencilTest, mstencilTest);
__m128i mstencilTest1 = _mm_unpackhi_epi16(mstencilTest, mstencilTest);
__m128i mtest0 = _mm_and_si128(mstencilTest0, mClipTest0);
__m128i mtest1 = _mm_and_si128(mstencilTest1, mClipTest1);
mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY1, _mm_setzero_si128()), mtest0);
mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY2, _mm_setzero_si128()), mtest0);
mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY3, _mm_setzero_si128()), mtest0);
mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY1, mFDY12x4), _mm_setzero_si128()), mtest1);
mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY2, mFDY23x4), _mm_setzero_si128()), mtest1);
mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY3, mFDY31x4), _mm_setzero_si128()), mtest1);
mCY1 = _mm_add_epi32(mCY1, mFDX12);
mCY2 = _mm_add_epi32(mCY2, mFDX23);
mCY3 = _mm_add_epi32(mCY3, mFDX31);
mask1 <<= 4;
mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mtest0, _MM_SHUFFLE(0, 1, 2, 3))));
mask1 <<= 4;
mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mtest1, _MM_SHUFFLE(0, 1, 2, 3))));
}
mask1 <<= (q - iy) * 8;
#endif
if (mask0 != 0xffffffff || mask1 != 0xffffffff)
{
@ -399,6 +479,23 @@ void ScreenTriangle::SetupSubsector(const TriDrawTriangleArgs *args, WorkerThrea
thread->StartY = miny;
span->Length = 0;
#ifndef NO_SSE
__m128i msubsectorDepth = _mm_set1_epi32(subsectorDepth);
__m128i mnotxor = _mm_set1_epi32(0xffffffff);
__m128i mstencilTestValue = _mm_set1_epi16(stencilTestValue);
__m128i mFDY12Offset = _mm_setr_epi32(0, FDY12, FDY12 * 2, FDY12 * 3);
__m128i mFDY23Offset = _mm_setr_epi32(0, FDY23, FDY23 * 2, FDY23 * 3);
__m128i mFDY31Offset = _mm_setr_epi32(0, FDY31, FDY31 * 2, FDY31 * 3);
__m128i mFDY12x4 = _mm_set1_epi32(FDY12 * 4);
__m128i mFDY23x4 = _mm_set1_epi32(FDY23 * 4);
__m128i mFDY31x4 = _mm_set1_epi32(FDY31 * 4);
__m128i mFDX12 = _mm_set1_epi32(FDX12);
__m128i mFDX23 = _mm_set1_epi32(FDX23);
__m128i mFDX31 = _mm_set1_epi32(FDX31);
__m128i mClipCompare0 = _mm_setr_epi32(clipright, clipright - 1, clipright - 2, clipright - 3);
__m128i mClipCompare1 = _mm_setr_epi32(clipright - 4, clipright - 5, clipright - 6, clipright - 7);
#endif
// Loop through blocks
for (int y = miny; y < maxy; y += q * num_cores)
{
@ -457,6 +554,7 @@ void ScreenTriangle::SetupSubsector(const TriDrawTriangleArgs *args, WorkerThrea
uint32_t mask0 = 0;
uint32_t mask1 = 0;
#ifdef NO_SSE
for (int iy = 0; iy < 4; iy++)
{
for (int ix = 0; ix < q; ix++)
@ -467,7 +565,6 @@ void ScreenTriangle::SetupSubsector(const TriDrawTriangleArgs *args, WorkerThrea
}
subsector += pitch;
}
for (int iy = 4; iy < q; iy++)
{
for (int ix = 0; ix < q; ix++)
@ -478,6 +575,24 @@ void ScreenTriangle::SetupSubsector(const TriDrawTriangleArgs *args, WorkerThrea
}
subsector += pitch;
}
#else
for (int iy = 0; iy < 4; iy++)
{
mask0 <<= 4;
mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_xor_si128(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)subsector), msubsectorDepth), mnotxor), _MM_SHUFFLE(0, 1, 2, 3))));
mask0 <<= 4;
mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_xor_si128(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(subsector + 4)), msubsectorDepth), mnotxor), _MM_SHUFFLE(0, 1, 2, 3))));
subsector += pitch;
}
for (int iy = 4; iy < q; iy++)
{
mask1 <<= 4;
mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_xor_si128(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)subsector), msubsectorDepth), mnotxor), _MM_SHUFFLE(0, 1, 2, 3))));
mask1 <<= 4;
mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_xor_si128(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(subsector + 4)), msubsectorDepth), mnotxor), _MM_SHUFFLE(0, 1, 2, 3))));
subsector += pitch;
}
#endif
if (mask0 != 0xffffffff || mask1 != 0xffffffff)
{
@ -520,6 +635,7 @@ void ScreenTriangle::SetupSubsector(const TriDrawTriangleArgs *args, WorkerThrea
uint32_t mask0 = 0;
uint32_t mask1 = 0;
#ifdef NO_SSE
for (int iy = 0; iy < 4; iy++)
{
int CX1 = CY1;
@ -567,6 +683,77 @@ void ScreenTriangle::SetupSubsector(const TriDrawTriangleArgs *args, WorkerThrea
CY3 += FDX31;
subsector += pitch;
}
#else
__m128i mSingleStencilMask = _mm_set1_epi32(blockIsSingleStencil ? 0 : 0xffffffff);
__m128i mCY1 = _mm_sub_epi32(_mm_set1_epi32(CY1), mFDY12Offset);
__m128i mCY2 = _mm_sub_epi32(_mm_set1_epi32(CY2), mFDY23Offset);
__m128i mCY3 = _mm_sub_epi32(_mm_set1_epi32(CY3), mFDY31Offset);
__m128i mx = _mm_set1_epi32(x);
__m128i mClipTest0 = _mm_cmplt_epi32(mx, mClipCompare0);
__m128i mClipTest1 = _mm_cmplt_epi32(mx, mClipCompare1);
int iy;
for (iy = 0; iy < 4 && iy < clipbottom - y; iy++)
{
__m128i mstencilBlock = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(stencilBlock + iy * 8)), _mm_setzero_si128());
__m128i mstencilTest = _mm_and_si128(_mm_cmplt_epi16(mstencilBlock, mstencilTestValue), mSingleStencilMask);
__m128i mstencilTest0 = _mm_unpacklo_epi16(mstencilTest, mstencilTest);
__m128i mstencilTest1 = _mm_unpackhi_epi16(mstencilTest, mstencilTest);
__m128i msubsectorTest0 = _mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)subsector), msubsectorDepth);
__m128i msubsectorTest1 = _mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(subsector + 4)), msubsectorDepth);
__m128i mtest0 = _mm_and_si128(_mm_xor_si128(_mm_or_si128(mstencilTest0, msubsectorTest0), mnotxor), mClipTest0);
__m128i mtest1 = _mm_and_si128(_mm_xor_si128(_mm_or_si128(mstencilTest1, msubsectorTest1), mnotxor), mClipTest1);
mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY1, _mm_setzero_si128()), mtest0);
mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY2, _mm_setzero_si128()), mtest0);
mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY3, _mm_setzero_si128()), mtest0);
mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY1, mFDY12x4), _mm_setzero_si128()), mtest1);
mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY2, mFDY23x4), _mm_setzero_si128()), mtest1);
mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY3, mFDY31x4), _mm_setzero_si128()), mtest1);
mCY1 = _mm_add_epi32(mCY1, mFDX12);
mCY2 = _mm_add_epi32(mCY2, mFDX23);
mCY3 = _mm_add_epi32(mCY3, mFDX31);
mask0 <<= 4;
mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mtest0, _MM_SHUFFLE(0, 1, 2, 3))));
mask0 <<= 4;
mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mtest1, _MM_SHUFFLE(0, 1, 2, 3))));
subsector += pitch;
}
mask0 <<= (4 - iy) * 8;
for (iy = 4; iy < q && iy < clipbottom - y; iy++)
{
__m128i mstencilBlock = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(stencilBlock + iy * 8)), _mm_setzero_si128());
__m128i mstencilTest = _mm_and_si128(_mm_cmplt_epi16(mstencilBlock, mstencilTestValue), mSingleStencilMask);
__m128i mstencilTest0 = _mm_unpacklo_epi16(mstencilTest, mstencilTest);
__m128i mstencilTest1 = _mm_unpackhi_epi16(mstencilTest, mstencilTest);
__m128i msubsectorTest0 = _mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)subsector), msubsectorDepth);
__m128i msubsectorTest1 = _mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(subsector + 4)), msubsectorDepth);
__m128i mtest0 = _mm_and_si128(_mm_xor_si128(_mm_or_si128(mstencilTest0, msubsectorTest0), mnotxor), mClipTest0);
__m128i mtest1 = _mm_and_si128(_mm_xor_si128(_mm_or_si128(mstencilTest1, msubsectorTest1), mnotxor), mClipTest1);
mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY1, _mm_setzero_si128()), mtest0);
mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY2, _mm_setzero_si128()), mtest0);
mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY3, _mm_setzero_si128()), mtest0);
mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY1, mFDY12x4), _mm_setzero_si128()), mtest1);
mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY2, mFDY23x4), _mm_setzero_si128()), mtest1);
mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY3, mFDY31x4), _mm_setzero_si128()), mtest1);
mCY1 = _mm_add_epi32(mCY1, mFDX12);
mCY2 = _mm_add_epi32(mCY2, mFDX23);
mCY3 = _mm_add_epi32(mCY3, mFDX31);
mask1 <<= 4;
mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mtest0, _MM_SHUFFLE(0, 1, 2, 3))));
mask1 <<= 4;
mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mtest1, _MM_SHUFFLE(0, 1, 2, 3))));
subsector += pitch;
}
mask1 <<= (q - iy) * 8;
#endif
if (mask0 != 0xffffffff || mask1 != 0xffffffff)
{

View file

@ -175,6 +175,7 @@ TriMatrix TriMatrix::operator*(const TriMatrix &mult) const
ShadedTriVertex TriMatrix::operator*(TriVertex v) const
{
#ifdef NO_SSE
float vx = matrix[0 * 4 + 0] * v.x + matrix[1 * 4 + 0] * v.y + matrix[2 * 4 + 0] * v.z + matrix[3 * 4 + 0] * v.w;
float vy = matrix[0 * 4 + 1] * v.x + matrix[1 * 4 + 1] * v.y + matrix[2 * 4 + 1] * v.z + matrix[3 * 4 + 1] * v.w;
float vz = matrix[0 * 4 + 2] * v.x + matrix[1 * 4 + 2] * v.y + matrix[2 * 4 + 2] * v.z + matrix[3 * 4 + 2] * v.w;
@ -184,6 +185,20 @@ ShadedTriVertex TriMatrix::operator*(TriVertex v) const
sv.y = vy;
sv.z = vz;
sv.w = vw;
#else
__m128 m0 = _mm_loadu_ps(matrix);
__m128 m1 = _mm_loadu_ps(matrix + 4);
__m128 m2 = _mm_loadu_ps(matrix + 8);
__m128 m3 = _mm_loadu_ps(matrix + 12);
__m128 mv = _mm_loadu_ps(&v.x);
m0 = _mm_mul_ps(m0, _mm_shuffle_ps(mv, mv, _MM_SHUFFLE(0, 0, 0, 0)));
m1 = _mm_mul_ps(m1, _mm_shuffle_ps(mv, mv, _MM_SHUFFLE(1, 1, 1, 1)));
m2 = _mm_mul_ps(m2, _mm_shuffle_ps(mv, mv, _MM_SHUFFLE(2, 2, 2, 2)));
m3 = _mm_mul_ps(m3, _mm_shuffle_ps(mv, mv, _MM_SHUFFLE(3, 3, 3, 3)));
mv = _mm_add_ps(_mm_add_ps(_mm_add_ps(m0, m1), m2), m3);
ShadedTriVertex sv;
_mm_storeu_ps(&sv.x, mv);
#endif
for (int i = 0; i < TriVertex::NumVarying; i++)
sv.varying[i] = v.varying[i];
return sv;