a-c.c: in {v,mv}lineasm4, use vector types for vplc/vinc with GCC >=4.7.

When a solid or masked wall fills a 1680x1050 screen, the FPS increases
from 118 to 133 (solid) and from 114 to 116 (masked) on my machine.
Guarded by the macro USE_VECTOR_EXT in the source.

git-svn-id: https://svn.eduke32.com/eduke32@4160 1a8010ca-5511-0410-912e-c29ae57300e0
This commit is contained in:
helixhorned 2013-11-16 18:47:07 +00:00
parent 02507d76e4
commit ca22bf8185

View file

@ -153,30 +153,29 @@ extern uint32_t vplce[4];
extern int32_t vince[4]; extern int32_t vince[4];
extern intptr_t bufplce[4]; extern intptr_t bufplce[4];
// Enable the vector-extension code path only on GCC 4.7 or newer.
// The major and minor version must be tested together: checking
// __GNUC_MINOR__ alone would reject e.g. GCC 5.1 (minor 1 < 7) and
// accept e.g. GCC 3.7.
#if defined __GNUC__ && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7))
# define USE_VECTOR_EXT
#endif

#ifdef USE_VECTOR_EXT
// Four uint32_t lanes packed into one 16-byte GCC vector, so that a single
// `vplc += vinc` advances all four lanes at once.
typedef uint32_t uint32_vec4 __attribute__ ((vector_size (16)));
#endif
// cnt >= 1 // cnt >= 1
void vlineasm4(int32_t cnt, char *p) void vlineasm4(int32_t cnt, char *p)
{ {
char ch; char ch;
int32_t i; int32_t i;
#if 1
// this gives slightly more stuff in registers in the loop
// (on x86_64 at least)
char *const pal[4] = {(char *)palookupoffse[0], (char *)palookupoffse[1], (char *)palookupoffse[2], (char *)palookupoffse[3]}; char *const pal[4] = {(char *)palookupoffse[0], (char *)palookupoffse[1], (char *)palookupoffse[2], (char *)palookupoffse[3]};
char *const buf[4] = {(char *)bufplce[0], (char *)bufplce[1], (char *)bufplce[2], (char *)bufplce[3]}; char *const buf[4] = {(char *)bufplce[0], (char *)bufplce[1], (char *)bufplce[2], (char *)bufplce[3]};
#ifdef USE_VECTOR_EXT
uint32_vec4 vinc = {vince[0], vince[1], vince[2], vince[3]};
uint32_vec4 vplc = {vplce[0], vplce[1], vplce[2], vplce[3]};
#else
const int32_t vinc[4] = {vince[0], vince[1], vince[2], vince[3]}; const int32_t vinc[4] = {vince[0], vince[1], vince[2], vince[3]};
uint32_t vplc[4] = {vplce[0], vplce[1], vplce[2], vplce[3]}; uint32_t vplc[4] = {vplce[0], vplce[1], vplce[2], vplce[3]};
#else
char *pal[4];
char *buf[4];
int32_t vinc[4];
uint32_t vplc[4];
Bmemcpy(pal, palookupoffse, sizeof(pal));
Bmemcpy(buf, bufplce, sizeof(buf));
Bmemcpy(vinc, vince, sizeof(vinc));
Bmemcpy(vplc, vplce, sizeof(vplc));
#endif #endif
const int32_t logy = glogy, ourbpl = bpl; const int32_t logy = glogy, ourbpl = bpl;
do do
@ -185,13 +184,19 @@ void vlineasm4(int32_t cnt, char *p)
{ {
ch = getpix(logy, buf[i], vplc[i]); ch = getpix(logy, buf[i], vplc[i]);
p[i] = pal[i][ch]; p[i] = pal[i][ch];
#if !defined USE_VECTOR_EXT
vplc[i] += vinc[i]; vplc[i] += vinc[i];
#endif
} }
#ifdef USE_VECTOR_EXT
vplc += vinc;
#endif
p += ourbpl; p += ourbpl;
} }
while (--cnt); while (--cnt);
Bmemcpy(vplce, vplc, sizeof(vplce)); for (i=0; i<4; i++)
vplce[i] = vplc[i];
} }
void setupmvlineasm(int32_t neglogy) { glogy = neglogy; } void setupmvlineasm(int32_t neglogy) { glogy = neglogy; }
@ -227,9 +232,13 @@ void mvlineasm4(int32_t cnt, char *p)
char *const pal[4] = {(char *)palookupoffse[0], (char *)palookupoffse[1], (char *)palookupoffse[2], (char *)palookupoffse[3]}; char *const pal[4] = {(char *)palookupoffse[0], (char *)palookupoffse[1], (char *)palookupoffse[2], (char *)palookupoffse[3]};
char *const buf[4] = {(char *)bufplce[0], (char *)bufplce[1], (char *)bufplce[2], (char *)bufplce[3]}; char *const buf[4] = {(char *)bufplce[0], (char *)bufplce[1], (char *)bufplce[2], (char *)bufplce[3]};
#ifdef USE_VECTOR_EXT
uint32_vec4 vinc = {vince[0], vince[1], vince[2], vince[3]};
uint32_vec4 vplc = {vplce[0], vplce[1], vplce[2], vplce[3]};
#else
const int32_t vinc[4] = {vince[0], vince[1], vince[2], vince[3]}; const int32_t vinc[4] = {vince[0], vince[1], vince[2], vince[3]};
uint32_t vplc[4] = {vplce[0], vplce[1], vplce[2], vplce[3]}; uint32_t vplc[4] = {vplce[0], vplce[1], vplce[2], vplce[3]};
#endif
const int32_t logy = glogy, ourbpl = bpl; const int32_t logy = glogy, ourbpl = bpl;
do do
@ -238,13 +247,19 @@ void mvlineasm4(int32_t cnt, char *p)
{ {
ch = getpix(logy, buf[i], vplc[i]); ch = getpix(logy, buf[i], vplc[i]);
if (ch != 255) p[i] = pal[i][ch]; if (ch != 255) p[i] = pal[i][ch];
#if !defined USE_VECTOR_EXT
vplc[i] += vinc[i]; vplc[i] += vinc[i];
#endif
} }
#ifdef USE_VECTOR_EXT
vplc += vinc;
#endif
p += ourbpl; p += ourbpl;
} }
while (--cnt); while (--cnt);
Bmemcpy(vplce, vplc, sizeof(vplce)); for (i=0; i<4; i++)
vplce[i] = vplc[i];
} }
#ifdef USE_ASM64 #ifdef USE_ASM64