- Added a heavily optimized version of vlinec4 for x64. The original loaded everything from global variables. While this is acceptable in 32-bit code, which has an immediate register load instruction, no such instruction exists in 64-bit code. Accessing these variables from the stack or a register doubles the execution speed of this function and, on a Core i7-3770 from 2012, is even faster than the assembly version. For now the assembly version is still there, pending a benchmark run on an older 64-bit system.

This commit is contained in:
Christoph Oelckers 2016-12-04 23:53:36 +01:00
parent f4454d2e00
commit 86fcc3fd21

View file

@ -1673,6 +1673,7 @@ DWORD vlinec1 ()
return frac;
}
#ifndef _M_X64
void vlinec4 ()
{
BYTE *dest = dc_dest;
@ -1689,6 +1690,43 @@ void vlinec4 ()
dest += dc_pitch;
} while (--count);
}
#else
// Optimized version for 64 bit. In 64 bit mode, accessing global variables is very expensive so even though
// this exceeds the register count, loading all those values into a local variable is faster than not loading all of them.
void vlinec4()
{
BYTE *dest = dc_dest;
int count = dc_count;
int bits = vlinebits;
DWORD place;
auto pal0 = palookupoffse[0];
auto pal1 = palookupoffse[1];
auto pal2 = palookupoffse[2];
auto pal3 = palookupoffse[3];
auto buf0 = bufplce[0];
auto buf1 = bufplce[1];
auto buf2 = bufplce[2];
auto buf3 = bufplce[3];
const auto vince0 = vince[0];
const auto vince1 = vince[1];
const auto vince2 = vince[2];
const auto vince3 = vince[3];
auto vplce0 = vplce[0];
auto vplce1 = vplce[1];
auto vplce2 = vplce[2];
auto vplce3 = vplce[3];
do
{
dest[0] = pal0[buf0[(place = vplce0) >> bits]]; vplce0 = place + vince0;
dest[1] = pal1[buf1[(place = vplce1) >> bits]]; vplce1 = place + vince1;
dest[2] = pal2[buf2[(place = vplce2) >> bits]]; vplce2 = place + vince2;
dest[3] = pal3[buf3[(place = vplce3) >> bits]]; vplce3 = place + vince3;
dest += dc_pitch;
} while (--count);
}
#endif
#endif
void setupmvline (int fracbits)