diff --git a/src/gl/system/gl_swframebuffer.cpp b/src/gl/system/gl_swframebuffer.cpp
index 2bf3cf2ee..39b3307b6 100644
--- a/src/gl/system/gl_swframebuffer.cpp
+++ b/src/gl/system/gl_swframebuffer.cpp
@@ -2383,6 +2383,46 @@ bool OpenGLSWFrameBuffer::OpenGLPal::Update()
 	// See explanation in UploadPalette() for skipat rationale.
 	skipat = MIN(numEntries, DoColorSkip ? 256 - 8 : 256);
 
+#ifndef NO_SSE
+	// Manual SSE vectorized version here to workaround a bug in GCC's auto-vectorizer
+
+	// Entries below skipat are copied unchanged, four 32-bit values per iteration.
+	int sse_count = skipat / 4 * 4;
+	for (i = 0; i < sse_count; i += 4)
+	{
+		_mm_storeu_si128((__m128i*)(&buff[i]), _mm_loadu_si128((__m128i*)(&pal[i])));
+	}
+	switch (skipat - i)
+	{
+		// fall through is intentional
+		case 3: buff[i] = pal[i].d; i++;
+		case 2: buff[i] = pal[i].d; i++;
+		case 1: buff[i] = pal[i].d; i++;
+		default: i++;	// advance past index skipat without writing it
+	}
+	// i is now skipat + 1 and generally not a multiple of four, so the vector
+	// loop has to stop as soon as a full group of four no longer fits below
+	// numEntries. Guarding on a rounded-down numEntries instead would let the
+	// last iteration read pal[] and write buff[] past index numEntries - 1.
+	__m128i alphamask = _mm_set1_epi32(0xff000000);
+	while (i + 4 <= numEntries)
+	{
+		// Keep the top byte of pal[i] and take the low 24 bits from pal[i - 1].
+		__m128i lastcolor = _mm_loadu_si128((__m128i*)(&pal[i - 1]));
+		__m128i color = _mm_loadu_si128((__m128i*)(&pal[i]));
+		_mm_storeu_si128((__m128i*)(&buff[i]), _mm_or_si128(_mm_and_si128(alphamask, color), _mm_andnot_si128(alphamask, lastcolor)));
+		i += 4;
+	}
+	switch (numEntries - i)
+	{
+		// fall through is intentional
+		case 3: buff[i] = ColorARGB(pal[i].a, pal[i - 1].r, pal[i - 1].g, pal[i - 1].b); i++;
+		case 2: buff[i] = ColorARGB(pal[i].a, pal[i - 1].r, pal[i - 1].g, pal[i - 1].b); i++;
+		case 1: buff[i] = ColorARGB(pal[i].a, pal[i - 1].r, pal[i - 1].g, pal[i - 1].b); i++;
+		default: break;
+	}
+
+#else
 	for (i = 0; i < skipat; ++i)
 	{
 		buff[i] = ColorARGB(pal[i].a, pal[i].r, pal[i].g, pal[i].b);
@@ -2391,6 +2431,7 @@ bool OpenGLSWFrameBuffer::OpenGLPal::Update()
 	{
 		buff[i] = ColorARGB(pal[i].a, pal[i - 1].r, pal[i - 1].g, pal[i - 1].b);
 	}
+#endif
 	if (numEntries > 1)
 	{
 		i = numEntries - 1;