diff --git a/src/v_palette.cpp b/src/v_palette.cpp index 934a57dd3c..49fbd6cb6f 100644 --- a/src/v_palette.cpp +++ b/src/v_palette.cpp @@ -384,8 +384,8 @@ void InitPalette () R_InitColormaps (); } -extern "C" void DoBlending_MMX (const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a); -extern void DoBlending_SSE2 (const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a); +void DoBlending_MMX (const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a); +void DoBlending_SSE2 (const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a); void DoBlending (const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a) { @@ -395,6 +395,7 @@ void DoBlending (const PalEntry *from, PalEntry *to, int count, int r, int g, in { memcpy (to, from, count * sizeof(DWORD)); } + return; } else if (a == 256) { @@ -405,6 +406,7 @@ void DoBlending (const PalEntry *from, PalEntry *to, int count, int r, int g, in { to[i] = t; } + return; } #if defined(_M_X64) || defined(_M_IX86) || defined(__i386__) || defined(__amd64__) else if (CPU.bSSE2) @@ -423,7 +425,7 @@ void DoBlending (const PalEntry *from, PalEntry *to, int count, int r, int g, in } } #endif -#ifdef X86_ASM +#if defined(_M_IX86) || defined(__i386__) else if (CPU.bMMX) { if (count >= 4) diff --git a/src/x86.cpp b/src/x86.cpp index f6c878da61..17c946ac0f 100644 --- a/src/x86.cpp +++ b/src/x86.cpp @@ -227,10 +227,9 @@ void DumpCPUInfo(const CPUInfo *cpu) } } -#if 0 -// Compiler output for this function is crap compared to the assembly -// version, which is why it isn't used. -void DoBlending_MMX2(const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a) +#if !defined(__amd64__) && !defined(_M_X64) + +void DoBlending_MMX(const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a) { __m64 blendcolor; __m64 blendalpha; @@ -272,9 +271,6 @@ void DoBlending_MMX2(const PalEntry *from, PalEntry *to, int count, int r, int g } #endif -#ifdef X86_ASM -extern "C" void DoBlending_MMX(const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a); -#endif void DoBlending_SSE2(const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a) { @@ -288,17 +284,6 @@ void DoBlending_SSE2(const PalEntry *from, PalEntry *to, int count, int r, int g unaligned = ((size_t)from | (size_t)to) & 0xF; -#ifdef X86_ASM - // For unaligned accesses, the assembly MMX version is slightly faster. - // Note that using unaligned SSE loads and stores is still faster than - // the compiler-generated MMX version. - if (unaligned) - { - DoBlending_MMX(from, to, count, r, g, b, a); - return; - } -#endif - #if defined(__amd64__) || defined(_M_X64) long long color; @@ -326,7 +311,6 @@ void DoBlending_SSE2(const PalEntry *from, PalEntry *to, int count, int r, int g zero = _mm_setzero_si128(); -#ifndef X86_ASM if (unaligned) { for (count >>= 2; count > 0; --count) @@ -346,7 +330,6 @@ void DoBlending_SSE2(const PalEntry *from, PalEntry *to, int count, int r, int g } } else -#endif { for (count >>= 2; count > 0; --count) {