mirror of
https://github.com/ZDoom/qzdoom.git
synced 2024-11-11 15:22:16 +00:00
- improve triangle setup performance a little bit
This commit is contained in:
parent
bc8a4474d5
commit
ac518e23bf
3 changed files with 237 additions and 1 deletions
|
@ -190,6 +190,7 @@ void PolyTriangleDrawer::draw_shaded_triangle(const ShadedTriVertex *vert, bool
|
|||
TriVertex clippedvert[max_additional_vertices];
|
||||
int numclipvert = clipedge(vert, clippedvert);
|
||||
|
||||
#ifdef NO_SSE
|
||||
// Map to 2D viewport:
|
||||
for (int j = 0; j < numclipvert; j++)
|
||||
{
|
||||
|
@ -205,6 +206,39 @@ void PolyTriangleDrawer::draw_shaded_triangle(const ShadedTriVertex *vert, bool
|
|||
v.x = viewport_x + viewport_width * (1.0f + v.x) * 0.5f;
|
||||
v.y = viewport_y + viewport_height * (1.0f - v.y) * 0.5f;
|
||||
}
|
||||
#else
|
||||
// Map to 2D viewport:
|
||||
__m128 mviewport_x = _mm_set1_ps((float)viewport_x);
|
||||
__m128 mviewport_y = _mm_set1_ps((float)viewport_y);
|
||||
__m128 mviewport_halfwidth = _mm_set1_ps(viewport_width * 0.5f);
|
||||
__m128 mviewport_halfheight = _mm_set1_ps(viewport_height * 0.5f);
|
||||
__m128 mone = _mm_set1_ps(1.0f);
|
||||
int sse_length = (numclipvert + 3) / 4 * 4;
|
||||
for (int j = 0; j < sse_length; j += 4)
|
||||
{
|
||||
__m128 vx = _mm_loadu_ps(&clippedvert[j].x);
|
||||
__m128 vy = _mm_loadu_ps(&clippedvert[j + 1].x);
|
||||
__m128 vz = _mm_loadu_ps(&clippedvert[j + 2].x);
|
||||
__m128 vw = _mm_loadu_ps(&clippedvert[j + 3].x);
|
||||
_MM_TRANSPOSE4_PS(vx, vy, vz, vw);
|
||||
|
||||
// Calculate normalized device coordinates:
|
||||
vw = _mm_div_ps(mone, vw);
|
||||
vx = _mm_mul_ps(vx, vw);
|
||||
vy = _mm_mul_ps(vy, vw);
|
||||
vz = _mm_mul_ps(vz, vw);
|
||||
|
||||
// Apply viewport scale to get screen coordinates:
|
||||
vx = _mm_add_ps(mviewport_x, _mm_mul_ps(mviewport_halfwidth, _mm_add_ps(mone, vx)));
|
||||
vy = _mm_add_ps(mviewport_y, _mm_mul_ps(mviewport_halfheight, _mm_sub_ps(mone, vy)));
|
||||
|
||||
_MM_TRANSPOSE4_PS(vx, vy, vz, vw);
|
||||
_mm_storeu_ps(&clippedvert[j].x, vx);
|
||||
_mm_storeu_ps(&clippedvert[j + 1].x, vy);
|
||||
_mm_storeu_ps(&clippedvert[j + 2].x, vz);
|
||||
_mm_storeu_ps(&clippedvert[j + 3].x, vw);
|
||||
}
|
||||
#endif
|
||||
|
||||
// Keep varyings in -128 to 128 range if possible
|
||||
if (numclipvert > 0)
|
||||
|
|
|
@ -138,6 +138,22 @@ void ScreenTriangle::SetupNormal(const TriDrawTriangleArgs *args, WorkerThreadDa
|
|||
thread->StartY = miny;
|
||||
span->Length = 0;
|
||||
|
||||
#ifndef NO_SSE
|
||||
__m128i mnotxor = _mm_set1_epi32(0xffffffff);
|
||||
__m128i mstencilTestValue = _mm_set1_epi16(stencilTestValue);
|
||||
__m128i mFDY12Offset = _mm_setr_epi32(0, FDY12, FDY12 * 2, FDY12 * 3);
|
||||
__m128i mFDY23Offset = _mm_setr_epi32(0, FDY23, FDY23 * 2, FDY23 * 3);
|
||||
__m128i mFDY31Offset = _mm_setr_epi32(0, FDY31, FDY31 * 2, FDY31 * 3);
|
||||
__m128i mFDY12x4 = _mm_set1_epi32(FDY12 * 4);
|
||||
__m128i mFDY23x4 = _mm_set1_epi32(FDY23 * 4);
|
||||
__m128i mFDY31x4 = _mm_set1_epi32(FDY31 * 4);
|
||||
__m128i mFDX12 = _mm_set1_epi32(FDX12);
|
||||
__m128i mFDX23 = _mm_set1_epi32(FDX23);
|
||||
__m128i mFDX31 = _mm_set1_epi32(FDX31);
|
||||
__m128i mClipCompare0 = _mm_setr_epi32(clipright, clipright - 1, clipright - 2, clipright - 3);
|
||||
__m128i mClipCompare1 = _mm_setr_epi32(clipright - 4, clipright - 5, clipright - 6, clipright - 7);
|
||||
#endif
|
||||
|
||||
// Loop through blocks
|
||||
for (int y = miny; y < maxy; y += q * num_cores)
|
||||
{
|
||||
|
@ -211,6 +227,7 @@ void ScreenTriangle::SetupNormal(const TriDrawTriangleArgs *args, WorkerThreadDa
|
|||
uint32_t mask0 = 0;
|
||||
uint32_t mask1 = 0;
|
||||
|
||||
#if NO_SSE
|
||||
for (int iy = 0; iy < 4; iy++)
|
||||
{
|
||||
int CX1 = CY1;
|
||||
|
@ -256,6 +273,69 @@ void ScreenTriangle::SetupNormal(const TriDrawTriangleArgs *args, WorkerThreadDa
|
|||
CY2 += FDX23;
|
||||
CY3 += FDX31;
|
||||
}
|
||||
#else
|
||||
__m128i mSingleStencilMask = _mm_set1_epi32(blockIsSingleStencil ? 0xffffffff : 0);
|
||||
__m128i mCY1 = _mm_sub_epi32(_mm_set1_epi32(CY1), mFDY12Offset);
|
||||
__m128i mCY2 = _mm_sub_epi32(_mm_set1_epi32(CY2), mFDY23Offset);
|
||||
__m128i mCY3 = _mm_sub_epi32(_mm_set1_epi32(CY3), mFDY31Offset);
|
||||
__m128i mx = _mm_set1_epi32(x);
|
||||
__m128i mClipTest0 = _mm_cmplt_epi32(mx, mClipCompare0);
|
||||
__m128i mClipTest1 = _mm_cmplt_epi32(mx, mClipCompare1);
|
||||
int iy;
|
||||
for (iy = 0; iy < 4 && iy < clipbottom - y; iy++)
|
||||
{
|
||||
__m128i mstencilBlock = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(stencilBlock + iy * 8)), _mm_setzero_si128());
|
||||
__m128i mstencilTest = _mm_or_si128(_mm_cmpeq_epi16(mstencilBlock, mstencilTestValue), mSingleStencilMask);
|
||||
__m128i mstencilTest0 = _mm_unpacklo_epi16(mstencilTest, mstencilTest);
|
||||
__m128i mstencilTest1 = _mm_unpackhi_epi16(mstencilTest, mstencilTest);
|
||||
__m128i mtest0 = _mm_and_si128(mstencilTest0, mClipTest0);
|
||||
__m128i mtest1 = _mm_and_si128(mstencilTest1, mClipTest1);
|
||||
|
||||
mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY1, _mm_setzero_si128()), mtest0);
|
||||
mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY2, _mm_setzero_si128()), mtest0);
|
||||
mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY3, _mm_setzero_si128()), mtest0);
|
||||
mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY1, mFDY12x4), _mm_setzero_si128()), mtest1);
|
||||
mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY2, mFDY23x4), _mm_setzero_si128()), mtest1);
|
||||
mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY3, mFDY31x4), _mm_setzero_si128()), mtest1);
|
||||
|
||||
mCY1 = _mm_add_epi32(mCY1, mFDX12);
|
||||
mCY2 = _mm_add_epi32(mCY2, mFDX23);
|
||||
mCY3 = _mm_add_epi32(mCY3, mFDX31);
|
||||
|
||||
mask0 <<= 4;
|
||||
mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mtest0, _MM_SHUFFLE(0, 1, 2, 3))));
|
||||
mask0 <<= 4;
|
||||
mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mtest1, _MM_SHUFFLE(0, 1, 2, 3))));
|
||||
}
|
||||
mask0 <<= (4 - iy) * 8;
|
||||
|
||||
for (iy = 4; iy < q && iy < clipbottom - y; iy++)
|
||||
{
|
||||
__m128i mstencilBlock = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(stencilBlock + iy * 8)), _mm_setzero_si128());
|
||||
__m128i mstencilTest = _mm_or_si128(_mm_cmpeq_epi16(mstencilBlock, mstencilTestValue), mSingleStencilMask);
|
||||
__m128i mstencilTest0 = _mm_unpacklo_epi16(mstencilTest, mstencilTest);
|
||||
__m128i mstencilTest1 = _mm_unpackhi_epi16(mstencilTest, mstencilTest);
|
||||
__m128i mtest0 = _mm_and_si128(mstencilTest0, mClipTest0);
|
||||
__m128i mtest1 = _mm_and_si128(mstencilTest1, mClipTest1);
|
||||
|
||||
mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY1, _mm_setzero_si128()), mtest0);
|
||||
mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY2, _mm_setzero_si128()), mtest0);
|
||||
mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY3, _mm_setzero_si128()), mtest0);
|
||||
mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY1, mFDY12x4), _mm_setzero_si128()), mtest1);
|
||||
mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY2, mFDY23x4), _mm_setzero_si128()), mtest1);
|
||||
mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY3, mFDY31x4), _mm_setzero_si128()), mtest1);
|
||||
|
||||
mCY1 = _mm_add_epi32(mCY1, mFDX12);
|
||||
mCY2 = _mm_add_epi32(mCY2, mFDX23);
|
||||
mCY3 = _mm_add_epi32(mCY3, mFDX31);
|
||||
|
||||
mask1 <<= 4;
|
||||
mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mtest0, _MM_SHUFFLE(0, 1, 2, 3))));
|
||||
mask1 <<= 4;
|
||||
mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mtest1, _MM_SHUFFLE(0, 1, 2, 3))));
|
||||
}
|
||||
mask1 <<= (q - iy) * 8;
|
||||
#endif
|
||||
|
||||
if (mask0 != 0xffffffff || mask1 != 0xffffffff)
|
||||
{
|
||||
|
@ -399,6 +479,23 @@ void ScreenTriangle::SetupSubsector(const TriDrawTriangleArgs *args, WorkerThrea
|
|||
thread->StartY = miny;
|
||||
span->Length = 0;
|
||||
|
||||
#ifndef NO_SSE
|
||||
__m128i msubsectorDepth = _mm_set1_epi32(subsectorDepth);
|
||||
__m128i mnotxor = _mm_set1_epi32(0xffffffff);
|
||||
__m128i mstencilTestValue = _mm_set1_epi16(stencilTestValue);
|
||||
__m128i mFDY12Offset = _mm_setr_epi32(0, FDY12, FDY12 * 2, FDY12 * 3);
|
||||
__m128i mFDY23Offset = _mm_setr_epi32(0, FDY23, FDY23 * 2, FDY23 * 3);
|
||||
__m128i mFDY31Offset = _mm_setr_epi32(0, FDY31, FDY31 * 2, FDY31 * 3);
|
||||
__m128i mFDY12x4 = _mm_set1_epi32(FDY12 * 4);
|
||||
__m128i mFDY23x4 = _mm_set1_epi32(FDY23 * 4);
|
||||
__m128i mFDY31x4 = _mm_set1_epi32(FDY31 * 4);
|
||||
__m128i mFDX12 = _mm_set1_epi32(FDX12);
|
||||
__m128i mFDX23 = _mm_set1_epi32(FDX23);
|
||||
__m128i mFDX31 = _mm_set1_epi32(FDX31);
|
||||
__m128i mClipCompare0 = _mm_setr_epi32(clipright, clipright - 1, clipright - 2, clipright - 3);
|
||||
__m128i mClipCompare1 = _mm_setr_epi32(clipright - 4, clipright - 5, clipright - 6, clipright - 7);
|
||||
#endif
|
||||
|
||||
// Loop through blocks
|
||||
for (int y = miny; y < maxy; y += q * num_cores)
|
||||
{
|
||||
|
@ -457,6 +554,7 @@ void ScreenTriangle::SetupSubsector(const TriDrawTriangleArgs *args, WorkerThrea
|
|||
uint32_t mask0 = 0;
|
||||
uint32_t mask1 = 0;
|
||||
|
||||
#ifdef NO_SSE
|
||||
for (int iy = 0; iy < 4; iy++)
|
||||
{
|
||||
for (int ix = 0; ix < q; ix++)
|
||||
|
@ -467,7 +565,6 @@ void ScreenTriangle::SetupSubsector(const TriDrawTriangleArgs *args, WorkerThrea
|
|||
}
|
||||
subsector += pitch;
|
||||
}
|
||||
|
||||
for (int iy = 4; iy < q; iy++)
|
||||
{
|
||||
for (int ix = 0; ix < q; ix++)
|
||||
|
@ -478,6 +575,24 @@ void ScreenTriangle::SetupSubsector(const TriDrawTriangleArgs *args, WorkerThrea
|
|||
}
|
||||
subsector += pitch;
|
||||
}
|
||||
#else
|
||||
for (int iy = 0; iy < 4; iy++)
|
||||
{
|
||||
mask0 <<= 4;
|
||||
mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_xor_si128(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)subsector), msubsectorDepth), mnotxor), _MM_SHUFFLE(0, 1, 2, 3))));
|
||||
mask0 <<= 4;
|
||||
mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_xor_si128(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(subsector + 4)), msubsectorDepth), mnotxor), _MM_SHUFFLE(0, 1, 2, 3))));
|
||||
subsector += pitch;
|
||||
}
|
||||
for (int iy = 4; iy < q; iy++)
|
||||
{
|
||||
mask1 <<= 4;
|
||||
mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_xor_si128(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)subsector), msubsectorDepth), mnotxor), _MM_SHUFFLE(0, 1, 2, 3))));
|
||||
mask1 <<= 4;
|
||||
mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_xor_si128(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(subsector + 4)), msubsectorDepth), mnotxor), _MM_SHUFFLE(0, 1, 2, 3))));
|
||||
subsector += pitch;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (mask0 != 0xffffffff || mask1 != 0xffffffff)
|
||||
{
|
||||
|
@ -520,6 +635,7 @@ void ScreenTriangle::SetupSubsector(const TriDrawTriangleArgs *args, WorkerThrea
|
|||
uint32_t mask0 = 0;
|
||||
uint32_t mask1 = 0;
|
||||
|
||||
#ifdef NO_SSE
|
||||
for (int iy = 0; iy < 4; iy++)
|
||||
{
|
||||
int CX1 = CY1;
|
||||
|
@ -567,6 +683,77 @@ void ScreenTriangle::SetupSubsector(const TriDrawTriangleArgs *args, WorkerThrea
|
|||
CY3 += FDX31;
|
||||
subsector += pitch;
|
||||
}
|
||||
#else
|
||||
__m128i mSingleStencilMask = _mm_set1_epi32(blockIsSingleStencil ? 0 : 0xffffffff);
|
||||
__m128i mCY1 = _mm_sub_epi32(_mm_set1_epi32(CY1), mFDY12Offset);
|
||||
__m128i mCY2 = _mm_sub_epi32(_mm_set1_epi32(CY2), mFDY23Offset);
|
||||
__m128i mCY3 = _mm_sub_epi32(_mm_set1_epi32(CY3), mFDY31Offset);
|
||||
__m128i mx = _mm_set1_epi32(x);
|
||||
__m128i mClipTest0 = _mm_cmplt_epi32(mx, mClipCompare0);
|
||||
__m128i mClipTest1 = _mm_cmplt_epi32(mx, mClipCompare1);
|
||||
int iy;
|
||||
for (iy = 0; iy < 4 && iy < clipbottom - y; iy++)
|
||||
{
|
||||
__m128i mstencilBlock = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(stencilBlock + iy * 8)), _mm_setzero_si128());
|
||||
__m128i mstencilTest = _mm_and_si128(_mm_cmplt_epi16(mstencilBlock, mstencilTestValue), mSingleStencilMask);
|
||||
__m128i mstencilTest0 = _mm_unpacklo_epi16(mstencilTest, mstencilTest);
|
||||
__m128i mstencilTest1 = _mm_unpackhi_epi16(mstencilTest, mstencilTest);
|
||||
__m128i msubsectorTest0 = _mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)subsector), msubsectorDepth);
|
||||
__m128i msubsectorTest1 = _mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(subsector + 4)), msubsectorDepth);
|
||||
__m128i mtest0 = _mm_and_si128(_mm_xor_si128(_mm_or_si128(mstencilTest0, msubsectorTest0), mnotxor), mClipTest0);
|
||||
__m128i mtest1 = _mm_and_si128(_mm_xor_si128(_mm_or_si128(mstencilTest1, msubsectorTest1), mnotxor), mClipTest1);
|
||||
|
||||
mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY1, _mm_setzero_si128()), mtest0);
|
||||
mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY2, _mm_setzero_si128()), mtest0);
|
||||
mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY3, _mm_setzero_si128()), mtest0);
|
||||
mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY1, mFDY12x4), _mm_setzero_si128()), mtest1);
|
||||
mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY2, mFDY23x4), _mm_setzero_si128()), mtest1);
|
||||
mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY3, mFDY31x4), _mm_setzero_si128()), mtest1);
|
||||
|
||||
mCY1 = _mm_add_epi32(mCY1, mFDX12);
|
||||
mCY2 = _mm_add_epi32(mCY2, mFDX23);
|
||||
mCY3 = _mm_add_epi32(mCY3, mFDX31);
|
||||
|
||||
mask0 <<= 4;
|
||||
mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mtest0, _MM_SHUFFLE(0, 1, 2, 3))));
|
||||
mask0 <<= 4;
|
||||
mask0 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mtest1, _MM_SHUFFLE(0, 1, 2, 3))));
|
||||
|
||||
subsector += pitch;
|
||||
}
|
||||
mask0 <<= (4 - iy) * 8;
|
||||
|
||||
for (iy = 4; iy < q && iy < clipbottom - y; iy++)
|
||||
{
|
||||
__m128i mstencilBlock = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(stencilBlock + iy * 8)), _mm_setzero_si128());
|
||||
__m128i mstencilTest = _mm_and_si128(_mm_cmplt_epi16(mstencilBlock, mstencilTestValue), mSingleStencilMask);
|
||||
__m128i mstencilTest0 = _mm_unpacklo_epi16(mstencilTest, mstencilTest);
|
||||
__m128i mstencilTest1 = _mm_unpackhi_epi16(mstencilTest, mstencilTest);
|
||||
__m128i msubsectorTest0 = _mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)subsector), msubsectorDepth);
|
||||
__m128i msubsectorTest1 = _mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(subsector + 4)), msubsectorDepth);
|
||||
__m128i mtest0 = _mm_and_si128(_mm_xor_si128(_mm_or_si128(mstencilTest0, msubsectorTest0), mnotxor), mClipTest0);
|
||||
__m128i mtest1 = _mm_and_si128(_mm_xor_si128(_mm_or_si128(mstencilTest1, msubsectorTest1), mnotxor), mClipTest1);
|
||||
|
||||
mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY1, _mm_setzero_si128()), mtest0);
|
||||
mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY2, _mm_setzero_si128()), mtest0);
|
||||
mtest0 = _mm_and_si128(_mm_cmpgt_epi32(mCY3, _mm_setzero_si128()), mtest0);
|
||||
mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY1, mFDY12x4), _mm_setzero_si128()), mtest1);
|
||||
mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY2, mFDY23x4), _mm_setzero_si128()), mtest1);
|
||||
mtest1 = _mm_and_si128(_mm_cmpgt_epi32(_mm_sub_epi32(mCY3, mFDY31x4), _mm_setzero_si128()), mtest1);
|
||||
|
||||
mCY1 = _mm_add_epi32(mCY1, mFDX12);
|
||||
mCY2 = _mm_add_epi32(mCY2, mFDX23);
|
||||
mCY3 = _mm_add_epi32(mCY3, mFDX31);
|
||||
|
||||
mask1 <<= 4;
|
||||
mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mtest0, _MM_SHUFFLE(0, 1, 2, 3))));
|
||||
mask1 <<= 4;
|
||||
mask1 |= _mm_movemask_ps(_mm_castsi128_ps(_mm_shuffle_epi32(mtest1, _MM_SHUFFLE(0, 1, 2, 3))));
|
||||
|
||||
subsector += pitch;
|
||||
}
|
||||
mask1 <<= (q - iy) * 8;
|
||||
#endif
|
||||
|
||||
if (mask0 != 0xffffffff || mask1 != 0xffffffff)
|
||||
{
|
||||
|
|
|
@ -175,6 +175,7 @@ TriMatrix TriMatrix::operator*(const TriMatrix &mult) const
|
|||
|
||||
ShadedTriVertex TriMatrix::operator*(TriVertex v) const
|
||||
{
|
||||
#ifdef NO_SSE
|
||||
float vx = matrix[0 * 4 + 0] * v.x + matrix[1 * 4 + 0] * v.y + matrix[2 * 4 + 0] * v.z + matrix[3 * 4 + 0] * v.w;
|
||||
float vy = matrix[0 * 4 + 1] * v.x + matrix[1 * 4 + 1] * v.y + matrix[2 * 4 + 1] * v.z + matrix[3 * 4 + 1] * v.w;
|
||||
float vz = matrix[0 * 4 + 2] * v.x + matrix[1 * 4 + 2] * v.y + matrix[2 * 4 + 2] * v.z + matrix[3 * 4 + 2] * v.w;
|
||||
|
@ -184,6 +185,20 @@ ShadedTriVertex TriMatrix::operator*(TriVertex v) const
|
|||
sv.y = vy;
|
||||
sv.z = vz;
|
||||
sv.w = vw;
|
||||
#else
|
||||
__m128 m0 = _mm_loadu_ps(matrix);
|
||||
__m128 m1 = _mm_loadu_ps(matrix + 4);
|
||||
__m128 m2 = _mm_loadu_ps(matrix + 8);
|
||||
__m128 m3 = _mm_loadu_ps(matrix + 12);
|
||||
__m128 mv = _mm_loadu_ps(&v.x);
|
||||
m0 = _mm_mul_ps(m0, _mm_shuffle_ps(mv, mv, _MM_SHUFFLE(0, 0, 0, 0)));
|
||||
m1 = _mm_mul_ps(m1, _mm_shuffle_ps(mv, mv, _MM_SHUFFLE(1, 1, 1, 1)));
|
||||
m2 = _mm_mul_ps(m2, _mm_shuffle_ps(mv, mv, _MM_SHUFFLE(2, 2, 2, 2)));
|
||||
m3 = _mm_mul_ps(m3, _mm_shuffle_ps(mv, mv, _MM_SHUFFLE(3, 3, 3, 3)));
|
||||
mv = _mm_add_ps(_mm_add_ps(_mm_add_ps(m0, m1), m2), m3);
|
||||
ShadedTriVertex sv;
|
||||
_mm_storeu_ps(&sv.x, mv);
|
||||
#endif
|
||||
for (int i = 0; i < TriVertex::NumVarying; i++)
|
||||
sv.varying[i] = v.varying[i];
|
||||
return sv;
|
||||
|
|
Loading…
Reference in a new issue