From 86fcc3fd21ca4b66daa3f3e0d7287de6908e2ef1 Mon Sep 17 00:00:00 2001 From: Christoph Oelckers Date: Sun, 4 Dec 2016 23:53:36 +0100 Subject: [PATCH] - added a heavily optimized version of vlinec4 for x64. The original loaded everything from the global variables. While this is acceptable in 32 bit code because it has an immediate register load instruction, for 64 bit this does not exist. Accessing these variables from the stack or a register doubles the execution speed of this function and on a Core i7-3770 from 2012 is even faster than the assembly version. Right now the assembly version is still there, pending a benchmark run on an older 64 bit system. --- src/r_draw.cpp | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/src/r_draw.cpp b/src/r_draw.cpp index fba01cbfe2..0e217c2d75 100644 --- a/src/r_draw.cpp +++ b/src/r_draw.cpp @@ -1673,6 +1673,7 @@ DWORD vlinec1 () return frac; } +#ifndef _M_X64 void vlinec4 () { BYTE *dest = dc_dest; @@ -1689,6 +1690,43 @@ void vlinec4 () dest += dc_pitch; } while (--count); } +#else +// Optimized version for 64 bit. In 64 bit mode, accessing global variables is very expensive so even though +// this exceeds the register count, loading all those values into a local variable is faster than not loading all of them. +void vlinec4() +{ + BYTE *dest = dc_dest; + int count = dc_count; + int bits = vlinebits; + DWORD place; + auto pal0 = palookupoffse[0]; + auto pal1 = palookupoffse[1]; + auto pal2 = palookupoffse[2]; + auto pal3 = palookupoffse[3]; + auto buf0 = bufplce[0]; + auto buf1 = bufplce[1]; + auto buf2 = bufplce[2]; + auto buf3 = bufplce[3]; + const auto vince0 = vince[0]; + const auto vince1 = vince[1]; + const auto vince2 = vince[2]; + const auto vince3 = vince[3]; + auto vplce0 = vplce[0]; + auto vplce1 = vplce[1]; + auto vplce2 = vplce[2]; + auto vplce3 = vplce[3]; + + do + { + dest[0] = pal0[buf0[(place = vplce0) >> bits]]; vplce0 = place + vince0; + dest[1] = pal1[buf1[(place = vplce1) >> bits]]; vplce1 = place + vince1; + dest[2] = pal2[buf2[(place = vplce2) >> bits]]; vplce2 = place + vince2; + dest[3] = pal3[buf3[(place = vplce3) >> bits]]; vplce3 = place + vince3; + dest += dc_pitch; + } while (--count); +} +#endif + #endif void setupmvline (int fracbits)