- discontinue using the MMX assembly version of DoBlending.

Some benchmarking shows that on SSE systems it only harms performance, and compared to the intrinsics version the gains are too marginal for something that is called this infrequently.
Doing 100000 calls to DoBlending with the assembly version is only 5 ms faster than with the intrinsics on a 3.4 GHz Core i7, meaning that even on a computer that is 10x slower you can still do 1000 or so blends per frame without a speed hit.
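A rough back-of-the-envelope check of those numbers (a sketch only, not part of the commit; it assumes the measured 5 ms difference over 100000 calls scales linearly with call count and clock speed):

    #include <cstdio>

    int main()
    {
        const double diff_ms = 5.0;        // assembly vs. intrinsics over 100000 calls (figure from above)
        const double calls = 100000.0;
        const double ns_per_call = diff_ms * 1e6 / calls;              // ~50 ns saved per call
        const double slowdown = 10.0;                                  // hypothetical 10x slower CPU
        const double blends_per_frame = 1000.0;
        const double cost_ms = ns_per_call * slowdown * blends_per_frame / 1e6;  // ~0.5 ms per frame
        std::printf("%.0f ns per call, %.1f ms for %d blends on a 10x slower CPU\n",
                    ns_per_call, cost_ms, (int)blends_per_frame);
    }

Even in that worst case the difference comes to roughly 0.5 ms per frame, a small fraction of the 16.7 ms frame budget at 60 fps.
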
Christoph Oelckers 2016-12-07 14:26:26 +01:00
parent 42346c58d3
commit 5910067c44
2 changed files with 8 additions and 23 deletions

@@ -384,8 +384,8 @@ void InitPalette ()
     R_InitColormaps ();
 }
-extern "C" void DoBlending_MMX (const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a);
-extern void DoBlending_SSE2 (const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a);
+void DoBlending_MMX (const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a);
+void DoBlending_SSE2 (const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a);
 void DoBlending (const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a)
 {
@@ -395,6 +395,7 @@ void DoBlending (const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a)
         {
             memcpy (to, from, count * sizeof(DWORD));
         }
+        return;
     }
     else if (a == 256)
     {
@@ -405,6 +406,7 @@ void DoBlending (const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a)
         {
             to[i] = t;
         }
+        return;
     }
 #if defined(_M_X64) || defined(_M_IX86) || defined(__i386__) || defined(__amd64__)
     else if (CPU.bSSE2)
@@ -423,7 +425,7 @@ void DoBlending (const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a)
         }
     }
 #endif
-#ifdef X86_ASM
+#if defined(_M_IX86) || defined(__i386__)
     else if (CPU.bMMX)
     {
         if (count >= 4)

@@ -227,10 +227,9 @@ void DumpCPUInfo(const CPUInfo *cpu)
     }
 }
-#if 0
-// Compiler output for this function is crap compared to the assembly
-// version, which is why it isn't used.
-void DoBlending_MMX2(const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a)
+#if !defined(__amd64__) && !defined(_M_X64)
+void DoBlending_MMX(const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a)
 {
     __m64 blendcolor;
     __m64 blendalpha;
@@ -272,9 +271,6 @@ void DoBlending_MMX2(const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a)
 }
 #endif
-#ifdef X86_ASM
-extern "C" void DoBlending_MMX(const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a);
-#endif
 void DoBlending_SSE2(const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a)
 {
@@ -288,17 +284,6 @@ void DoBlending_SSE2(const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a)
     unaligned = ((size_t)from | (size_t)to) & 0xF;
-#ifdef X86_ASM
-    // For unaligned accesses, the assembly MMX version is slightly faster.
-    // Note that using unaligned SSE loads and stores is still faster than
-    // the compiler-generated MMX version.
-    if (unaligned)
-    {
-        DoBlending_MMX(from, to, count, r, g, b, a);
-        return;
-    }
-#endif
 #if defined(__amd64__) || defined(_M_X64)
     long long color;
@@ -326,7 +311,6 @@ void DoBlending_SSE2(const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a)
     zero = _mm_setzero_si128();
-#ifndef X86_ASM
     if (unaligned)
     {
         for (count >>= 2; count > 0; --count)
@@ -346,7 +330,6 @@ void DoBlending_SSE2(const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a)
         }
     }
     else
-#endif
     {
         for (count >>= 2; count > 0; --count)
         {