From 5910067c4473a682727d8e1e7cdd92f0ea060260 Mon Sep 17 00:00:00 2001 From: Christoph Oelckers Date: Wed, 7 Dec 2016 14:26:26 +0100 Subject: [PATCH] - discontinue using the MMX assembly version of DoBlending. Some benchmarking shows that on SSE systems it only harms performance, and compared to the intrinsics version the gains are too marginal for something this infrequently called. Doing 100000 calls of DoBlending is 5 ms faster with the assembly version than with the intrinsics version on a 3.4 GHz Core i7, meaning that even on a computer that is 10x slower you can still do 1000 or so blends per frame without a speed hit. --- src/v_palette.cpp | 8 +++++--- src/x86.cpp | 23 +++-------------------- 2 files changed, 8 insertions(+), 23 deletions(-) diff --git a/src/v_palette.cpp b/src/v_palette.cpp index 934a57dd3..49fbd6cb6 100644 --- a/src/v_palette.cpp +++ b/src/v_palette.cpp @@ -384,8 +384,8 @@ void InitPalette () R_InitColormaps (); } -extern "C" void DoBlending_MMX (const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a); -extern void DoBlending_SSE2 (const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a); +void DoBlending_MMX (const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a); +void DoBlending_SSE2 (const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a); void DoBlending (const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a) { @@ -395,6 +395,7 @@ void DoBlending (const PalEntry *from, PalEntry *to, int count, int r, int g, in { memcpy (to, from, count * sizeof(DWORD)); } + return; } else if (a == 256) { @@ -405,6 +406,7 @@ void DoBlending (const PalEntry *from, PalEntry *to, int count, int r, int g, in { to[i] = t; } + return; } #if defined(_M_X64) || defined(_M_IX86) || defined(__i386__) || defined(__amd64__) else if (CPU.bSSE2) @@ -423,7 +425,7 @@ void DoBlending (const PalEntry *from, PalEntry *to, int count, int r, int g, in } } #endif -#ifdef X86_ASM +#if 
defined(_M_IX86) || defined(__i386__) else if (CPU.bMMX) { if (count >= 4) diff --git a/src/x86.cpp b/src/x86.cpp index f6c878da6..17c946ac0 100644 --- a/src/x86.cpp +++ b/src/x86.cpp @@ -227,10 +227,9 @@ void DumpCPUInfo(const CPUInfo *cpu) } } -#if 0 -// Compiler output for this function is crap compared to the assembly -// version, which is why it isn't used. -void DoBlending_MMX2(const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a) +#if !defined(__amd64__) && !defined(_M_X64) + +void DoBlending_MMX(const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a) { __m64 blendcolor; __m64 blendalpha; @@ -272,9 +271,6 @@ void DoBlending_MMX2(const PalEntry *from, PalEntry *to, int count, int r, int g } #endif -#ifdef X86_ASM -extern "C" void DoBlending_MMX(const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a); -#endif void DoBlending_SSE2(const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a) { @@ -288,17 +284,6 @@ void DoBlending_SSE2(const PalEntry *from, PalEntry *to, int count, int r, int g unaligned = ((size_t)from | (size_t)to) & 0xF; -#ifdef X86_ASM - // For unaligned accesses, the assembly MMX version is slightly faster. - // Note that using unaligned SSE loads and stores is still faster than - // the compiler-generated MMX version. - if (unaligned) - { - DoBlending_MMX(from, to, count, r, g, b, a); - return; - } -#endif - #if defined(__amd64__) || defined(_M_X64) long long color; @@ -326,7 +311,6 @@ void DoBlending_SSE2(const PalEntry *from, PalEntry *to, int count, int r, int g zero = _mm_setzero_si128(); -#ifndef X86_ASM if (unaligned) { for (count >>= 2; count > 0; --count) @@ -346,7 +330,6 @@ void DoBlending_SSE2(const PalEntry *from, PalEntry *to, int count, int r, int g } } else -#endif { for (count >>= 2; count > 0; --count) {