- Add SSE code to OpenGLSWFrameBuffer::OpenGLPal::Update to work around a broken auto-vectorizer in GCC

This commit is contained in:
Magnus Norddahl 2017-05-06 22:03:32 +02:00
parent 7f7be9e393
commit f083109b51

View file

@ -2383,6 +2383,41 @@ bool OpenGLSWFrameBuffer::OpenGLPal::Update()
// See explanation in UploadPalette() for skipat rationale.
skipat = MIN(numEntries, DoColorSkip ? 256 - 8 : 256);
#ifndef NO_SSE
// Manual SSE vectorized version here to work around a bug in GCC's auto-vectorizer.
//
// Phase 1 copies pal[0 .. skipat-1] into buff unchanged. Index skipat itself is
// stepped over without being written. Phase 2 fills buff[skipat+1 .. numEntries-1]
// with the alpha byte of pal[i] combined with the remaining bits of pal[i - 1].
// NOTE(review): this treats pal[] and buff[] elements as 32-bit values with alpha
// in the top byte (see alphamask) - confirm against PalEntry / ColorARGB.

// Phase 1: straight copy, four entries per iteration using unaligned loads/stores.
int sse_count = skipat / 4 * 4;
for (i = 0; i < sse_count; i += 4)
{
	_mm_storeu_si128((__m128i*)(&buff[i]), _mm_loadu_si128((__m128i*)(&pal[i])));
}
// Copy the 0-3 entries still below skipat, then advance once more so that
// i ends up at skipat + 1.
switch (skipat - i)
{
// fall through is intentional
case 3: buff[i] = pal[i].d; i++;
case 2: buff[i] = pal[i].d; i++;
case 1: buff[i] = pal[i].d; i++;
default: i++;
}
// Phase 2: i is skipat + 1 here, which is generally NOT a multiple of four.
// The loop therefore has to be bounded by "four whole entries remain"
// (i + 4 <= numEntries). Bounding it by numEntries rounded down to a multiple
// of four would let the last iteration touch pal[numEntries] and
// buff[numEntries] whenever numEntries is itself a multiple of four.
__m128i alphamask = _mm_set1_epi32(0xff000000);
while (i + 4 <= numEntries)
{
	__m128i lastcolor = _mm_loadu_si128((__m128i*)(&pal[i - 1]));
	__m128i color = _mm_loadu_si128((__m128i*)(&pal[i]));
	// Keep the masked (alpha) bits of pal[i], take everything else from pal[i - 1].
	_mm_storeu_si128((__m128i*)(&buff[i]), _mm_or_si128(_mm_and_si128(alphamask, color), _mm_andnot_si128(alphamask, lastcolor)));
	i += 4;
}
// Scalar tail: at most three entries are left. A zero or negative difference
// (i already at or past numEntries) lands in default and writes nothing.
switch (numEntries - i)
{
// fall through is intentional
case 3: buff[i] = ColorARGB(pal[i].a, pal[i - 1].r, pal[i - 1].g, pal[i - 1].b); i++;
case 2: buff[i] = ColorARGB(pal[i].a, pal[i - 1].r, pal[i - 1].g, pal[i - 1].b); i++;
case 1: buff[i] = ColorARGB(pal[i].a, pal[i - 1].r, pal[i - 1].g, pal[i - 1].b); i++;
default: break;
}
#else
for (i = 0; i < skipat; ++i)
{
buff[i] = ColorARGB(pal[i].a, pal[i].r, pal[i].g, pal[i].b);
@ -2391,6 +2426,7 @@ bool OpenGLSWFrameBuffer::OpenGLPal::Update()
{
buff[i] = ColorARGB(pal[i].a, pal[i - 1].r, pal[i - 1].g, pal[i - 1].b);
}
#endif
if (numEntries > 1)
{
i = numEntries - 1;