From e191a915f7e427c3eebf3192c1df7b386def1966 Mon Sep 17 00:00:00 2001 From: helixhorned Date: Sun, 18 Mar 2012 23:16:57 +0000 Subject: [PATCH] Port [m]vlineasm4 to C replacements and enable for solid and masked walls. These two functions draw a vertical line 4 neighboring pixels at a time. This gives a significant speed boost for a full screen solid and masked wall scene for x86_64 (where we have plenty of registers), about 60 --> 76 fps. git-svn-id: https://svn.eduke32.com/eduke32@2497 1a8010ca-5511-0410-912e-c29ae57300e0 --- polymer/eduke32/build/include/a.h | 18 ++++--- polymer/eduke32/build/src/a-c.c | 81 +++++++++++++++++++++++++++++- polymer/eduke32/build/src/engine.c | 44 ++++++++-------- 3 files changed, 113 insertions(+), 30 deletions(-) diff --git a/polymer/eduke32/build/include/a.h b/polymer/eduke32/build/include/a.h index 0a8af5826..9f7411c59 100644 --- a/polymer/eduke32/build/include/a.h +++ b/polymer/eduke32/build/include/a.h @@ -35,9 +35,9 @@ extern int32_t __cdecl setuptvlineasm2(int32_t,int32_t,int32_t); extern int32_t __cdecl tvlineasm2(int32_t,int32_t,int32_t,int32_t,int32_t,int32_t); extern int32_t __cdecl mvlineasm1(int32_t,int32_t,int32_t,int32_t,int32_t,int32_t); extern int32_t __cdecl setupvlineasm(int32_t); -extern int32_t __cdecl vlineasm4(int32_t,int32_t); +extern int32_t __cdecl vlineasm4(int32_t,char *); extern int32_t __cdecl setupmvlineasm(int32_t); -extern int32_t __cdecl mvlineasm4(int32_t,int32_t); +extern int32_t __cdecl mvlineasm4(int32_t,char *); extern int32_t __cdecl setupspritevline(int32_t,int32_t,int32_t,int32_t,int32_t,int32_t); extern int32_t __cdecl spritevline(int32_t,int32_t,int32_t,int32_t,int32_t,int32_t); extern int32_t __cdecl msetupspritevline(int32_t,int32_t,int32_t,int32_t,int32_t,int32_t); @@ -81,9 +81,9 @@ extern int32_t _cdecl setuptvlineasm2(int32_t,int32_t,int32_t); extern int32_t _cdecl tvlineasm2(int32_t,int32_t,int32_t,int32_t,int32_t,int32_t); extern int32_t _cdecl mvlineasm1(int32_t,int32_t,int32_t,int32_t,int32_t,int32_t); extern int32_t _cdecl setupvlineasm(int32_t); -extern int32_t _cdecl vlineasm4(int32_t,int32_t); +extern int32_t _cdecl vlineasm4(int32_t,char *); extern int32_t _cdecl setupmvlineasm(int32_t); -extern int32_t _cdecl mvlineasm4(int32_t,int32_t); +extern int32_t _cdecl mvlineasm4(int32_t,char *); extern int32_t _cdecl setupspritevline(int32_t,int32_t,int32_t,int32_t,int32_t,int32_t); extern int32_t _cdecl spritevline(int32_t,int32_t,int32_t,int32_t,int32_t,int32_t); extern int32_t _cdecl msetupspritevline(int32_t,int32_t,int32_t,int32_t,int32_t,int32_t); @@ -109,6 +109,8 @@ extern void _cdecl stretchhline(int32_t,int32_t,int32_t,int32_t,int32_t,int32_t) #define ENGINE_USING_A_C #include +#define prevlineasm1 vlineasm1 + void setvlinebpl(int32_t dabpl); void fixtransluscence(intptr_t datransoff); void settransnormal(void); @@ -123,13 +125,15 @@ void setupslopevlin(int32_t logylogx, intptr_t bufplc, int32_t pinc); void slopevlin(intptr_t p, int32_t i, intptr_t slopaloffs, int32_t cnt, int32_t bx, int32_t by); void setupvlineasm(int32_t neglogy); -void vlineasm1(int32_t vinc, intptr_t paloffs, int32_t cnt, uint32_t vplc, intptr_t bufplc, intptr_t p); +int32_t vlineasm1(int32_t vinc, intptr_t paloffs, int32_t cnt, uint32_t vplc, intptr_t bufplc, intptr_t p); +void vlineasm4(int32_t cnt, char *p); void setupmvlineasm(int32_t neglogy); -void mvlineasm1(int32_t vinc, intptr_t paloffs, int32_t cnt, uint32_t vplc, intptr_t bufplc, intptr_t p); +int32_t mvlineasm1(int32_t vinc, intptr_t paloffs, int32_t cnt, uint32_t vplc, intptr_t bufplc, intptr_t p); +void mvlineasm4(int32_t cnt, char *p); void setuptvlineasm(int32_t neglogy); -void tvlineasm1(int32_t vinc, intptr_t paloffs, int32_t cnt, uint32_t vplc, intptr_t bufplc, intptr_t p); +int32_t tvlineasm1(int32_t vinc, intptr_t paloffs, int32_t cnt, uint32_t vplc, intptr_t bufplc, intptr_t p); void msethlineshift(int32_t logx, int32_t logy); void mhline(intptr_t bufplc, uint32_t bx, int32_t cntup16, int32_t junk, uint32_t by, intptr_t p); diff --git a/polymer/eduke32/build/src/a-c.c b/polymer/eduke32/build/src/a-c.c index 4b70b2947..3c7af40e5 100644 --- a/polymer/eduke32/build/src/a-c.c +++ b/polymer/eduke32/build/src/a-c.c @@ -89,7 +89,7 @@ void slopevlin(intptr_t p, int32_t i, intptr_t slopaloffs, int32_t cnt, int32_t void setupvlineasm(int32_t neglogy) { glogy = neglogy; } // cnt+1 loop iterations! -void vlineasm1(int32_t vinc, intptr_t paloffs, int32_t cnt, uint32_t vplc, intptr_t bufplc, intptr_t p) +int32_t vlineasm1(int32_t vinc, intptr_t paloffs, int32_t cnt, uint32_t vplc, intptr_t bufplc, intptr_t p) { const char *const buf = (char *)bufplc; const char *const pal = (char *)paloffs; @@ -105,11 +105,58 @@ void vlineasm1(int32_t vinc, intptr_t paloffs, int32_t cnt, uint32_t vplc, intpt vplc += vinc; } while (--cnt); + + return vplc; +} + + +extern intptr_t palookupoffse[4]; +extern int32_t vplce[4], vince[4]; +extern intptr_t bufplce[4]; + +// cnt >= 1 +void vlineasm4(int32_t cnt, char *p) +{ + char ch; + int32_t i; +#if 1 + // this gives slightly more stuff in registers in the loop + // (on x86_64 at least) + char *const pal[4] = {(char *)palookupoffse[0], (char *)palookupoffse[1], (char *)palookupoffse[2], (char *)palookupoffse[3]}; + char *const buf[4] = {(char *)bufplce[0], (char *)bufplce[1], (char *)bufplce[2], (char *)bufplce[3]}; + const int32_t vinc[4] = {vince[0], vince[1], vince[2], vince[3]}; + uint32_t vplc[4] = {vplce[0], vplce[1], vplce[2], vplce[3]}; +#else + char *pal[4]; + char *buf[4]; + int32_t vinc[4]; + uint32_t vplc[4]; + + Bmemcpy(pal, palookupoffse, sizeof(pal)); + Bmemcpy(buf, bufplce, sizeof(buf)); + Bmemcpy(vinc, vince, sizeof(vinc)); + Bmemcpy(vplc, vplce, sizeof(vplc)); +#endif + + const int32_t logy = glogy, ourbpl = bpl; + + do + { + for (i=0; i<4; i++) + { + ch = buf[i][vplc[i]>>logy]; p[i] = pal[i][ch]; + vplc[i] += vinc[i]; + } + p += ourbpl; + } + while (--cnt); + + Bmemcpy(vplce, vplc, sizeof(vplce)); } void setupmvlineasm(int32_t neglogy) { glogy = neglogy; } // cnt+1 loop iterations! -void mvlineasm1(int32_t vinc, intptr_t paloffs, int32_t cnt, uint32_t vplc, intptr_t bufplc, intptr_t p) +int32_t mvlineasm1(int32_t vinc, intptr_t paloffs, int32_t cnt, uint32_t vplc, intptr_t bufplc, intptr_t p) { char ch; @@ -127,8 +174,38 @@ void mvlineasm1(int32_t vinc, intptr_t paloffs, int32_t cnt, uint32_t vplc, intp vplc += vinc; } while (--cnt); + + return vplc; } +// cnt >= 1 +void mvlineasm4(int32_t cnt, char *p) +{ + char ch; + int32_t i; + + char *const pal[4] = {(char *)palookupoffse[0], (char *)palookupoffse[1], (char *)palookupoffse[2], (char *)palookupoffse[3]}; + char *const buf[4] = {(char *)bufplce[0], (char *)bufplce[1], (char *)bufplce[2], (char *)bufplce[3]}; + const int32_t vinc[4] = {vince[0], vince[1], vince[2], vince[3]}; + uint32_t vplc[4] = {vplce[0], vplce[1], vplce[2], vplce[3]}; + + const int32_t logy = glogy, ourbpl = bpl; + + do + { + for (i=0; i<4; i++) + { + ch = buf[i][vplc[i]>>logy]; if (ch != 255) p[i] = pal[i][ch]; + vplc[i] += vinc[i]; + } + p += ourbpl; + } + while (--cnt); + + Bmemcpy(vplce, vplc, sizeof(vplce)); +} + + void setuptvlineasm(int32_t neglogy) { glogy = neglogy; } // cnt+1 loop iterations! void tvlineasm1(int32_t vinc, intptr_t paloffs, int32_t cnt, uint32_t vplc, intptr_t bufplc, intptr_t p) diff --git a/polymer/eduke32/build/src/engine.c b/polymer/eduke32/build/src/engine.c index f0b6b82db..fe8a84d9e 100644 --- a/polymer/eduke32/build/src/engine.c +++ b/polymer/eduke32/build/src/engine.c @@ -2240,7 +2240,8 @@ int32_t xyaspect; static int32_t viewingrangerecip; intptr_t asm1, asm2, asm3, asm4, palookupoffse[4]; -int32_t vplce[4], vince[4], bufplce[4]; +int32_t vplce[4], vince[4]; +intptr_t bufplce[4]; static char globalxshift, globalyshift; static int32_t globalxpanning, globalypanning; int32_t globalshade, globalorientation; @@ -2637,9 +2638,9 @@ skipitaddwall: static void maskwallscan(int32_t x1, int32_t x2, int16_t *uwal, int16_t *dwal, int32_t *swal, int32_t *lwal) { int32_t x,/* startx,*/ xnice, ynice; - intptr_t startx, p, fpalookup; - int32_t y1ve[4], y2ve[4], /* p,*/ tsizx, tsizy; -#ifndef ENGINE_USING_A_C + intptr_t startx, p, pp, fpalookup; + int32_t y1ve[4], y2ve[4], tsizx, tsizy; +#if 1 //ndef ENGINE_USING_A_C char bad; int32_t i, u4, d4, dax, z; #endif @@ -2667,7 +2668,7 @@ static void maskwallscan(int32_t x1, int32_t x2, int16_t *uwal, int16_t *dwal, i setupmvlineasm(globalshiftval); -#ifndef ENGINE_USING_A_C +#if 1 //ndef ENGINE_USING_A_C x = startx; while ((startumost[x+windowx1] > startdmost[x+windowx1]) && (x <= x2)) x++; @@ -2741,13 +2742,13 @@ static void maskwallscan(int32_t x1, int32_t x2, int16_t *uwal, int16_t *dwal, i if (u4 > y1ve[2]) vplce[2] = mvlineasm1(vince[2],palookupoffse[2],u4-y1ve[2]-1,vplce[2],bufplce[2],ylookup[y1ve[2]]+p+2); if (u4 > y1ve[3]) vplce[3] = mvlineasm1(vince[3],palookupoffse[3],u4-y1ve[3]-1,vplce[3],bufplce[3],ylookup[y1ve[3]]+p+3); - if (d4 >= u4) mvlineasm4(d4-u4+1,ylookup[u4]+p); + if (d4 >= u4) mvlineasm4(d4-u4+1, (char *)ylookup[u4]+p); - i = p+ylookup[d4+1]; - if (y2ve[0] > d4) mvlineasm1(vince[0],palookupoffse[0],y2ve[0]-d4-1,vplce[0],bufplce[0],i+0); - if (y2ve[1] > d4) mvlineasm1(vince[1],palookupoffse[1],y2ve[1]-d4-1,vplce[1],bufplce[1],i+1); - if (y2ve[2] > d4) mvlineasm1(vince[2],palookupoffse[2],y2ve[2]-d4-1,vplce[2],bufplce[2],i+2); - if (y2ve[3] > d4) mvlineasm1(vince[3],palookupoffse[3],y2ve[3]-d4-1,vplce[3],bufplce[3],i+3); + pp = p+ylookup[d4+1]; + if (y2ve[0] > d4) mvlineasm1(vince[0],palookupoffse[0],y2ve[0]-d4-1,vplce[0],bufplce[0],pp+0); + if (y2ve[1] > d4) mvlineasm1(vince[1],palookupoffse[1],y2ve[1]-d4-1,vplce[1],bufplce[1],pp+1); + if (y2ve[2] > d4) mvlineasm1(vince[2],palookupoffse[2],y2ve[2]-d4-1,vplce[2],bufplce[2],pp+2); + if (y2ve[3] > d4) mvlineasm1(vince[3],palookupoffse[3],y2ve[3]-d4-1,vplce[3],bufplce[3],pp+3); } for (; x<=x2; x++,p++) { @@ -3686,9 +3687,10 @@ static void wallscan(int32_t x1, int32_t x2, int16_t *uwal, int16_t *dwal, int32 int32_t x, xnice, ynice; intptr_t fpalookup; int32_t y1ve[4], y2ve[4], tsizx, tsizy; -#ifndef ENGINE_USING_A_C +#if 1 //ndef ENGINE_USING_A_C char bad; int32_t i, u4, d4, z; + uintptr_t p; #endif #ifdef YAX_ENABLE if (g_nodraw) @@ -3714,7 +3716,7 @@ static void wallscan(int32_t x1, int32_t x2, int16_t *uwal, int16_t *dwal, int32 setupvlineasm(globalshiftval); -#ifndef ENGINE_USING_A_C +#if 1 //ndef ENGINE_USING_A_C x = x1; while ((umost[x] > dmost[x]) && (x <= x2)) x++; @@ -3786,13 +3788,13 @@ static void wallscan(int32_t x1, int32_t x2, int16_t *uwal, int16_t *dwal, int32 if (u4 > y1ve[2]) vplce[2] = prevlineasm1(vince[2],palookupoffse[2],u4-y1ve[2]-1,vplce[2],bufplce[2],ylookup[y1ve[2]]+x+frameoffset+2); if (u4 > y1ve[3]) vplce[3] = prevlineasm1(vince[3],palookupoffse[3],u4-y1ve[3]-1,vplce[3],bufplce[3],ylookup[y1ve[3]]+x+frameoffset+3); - if (d4 >= u4) vlineasm4(d4-u4+1,ylookup[u4]+x+frameoffset); + if (d4 >= u4) vlineasm4(d4-u4+1, (char *)ylookup[u4]+x+frameoffset); - i = x+frameoffset+ylookup[d4+1]; - if (y2ve[0] > d4) prevlineasm1(vince[0],palookupoffse[0],y2ve[0]-d4-1,vplce[0],bufplce[0],i+0); - if (y2ve[1] > d4) prevlineasm1(vince[1],palookupoffse[1],y2ve[1]-d4-1,vplce[1],bufplce[1],i+1); - if (y2ve[2] > d4) prevlineasm1(vince[2],palookupoffse[2],y2ve[2]-d4-1,vplce[2],bufplce[2],i+2); - if (y2ve[3] > d4) prevlineasm1(vince[3],palookupoffse[3],y2ve[3]-d4-1,vplce[3],bufplce[3],i+3); + p = x+frameoffset+ylookup[d4+1]; + if (y2ve[0] > d4) prevlineasm1(vince[0],palookupoffse[0],y2ve[0]-d4-1,vplce[0],bufplce[0],p+0); + if (y2ve[1] > d4) prevlineasm1(vince[1],palookupoffse[1],y2ve[1]-d4-1,vplce[1],bufplce[1],p+1); + if (y2ve[2] > d4) prevlineasm1(vince[2],palookupoffse[2],y2ve[2]-d4-1,vplce[2],bufplce[2],p+2); + if (y2ve[3] > d4) prevlineasm1(vince[3],palookupoffse[3],y2ve[3]-d4-1,vplce[3],bufplce[3],p+3); } for (; x<=x2; x++) { @@ -7045,7 +7047,7 @@ static void dorotatesprite(int32_t sx, int32_t sy, int32_t z, int16_t a, int16_t if (u4 > y1ve[2]) vplce[2] = prevlineasm1(vince[2],palookupoffse[2],u4-y1ve[2]-1,vplce[2],bufplce[2],ylookup[y1ve[2]]+p+2); if (u4 > y1ve[3]) vplce[3] = prevlineasm1(vince[3],palookupoffse[3],u4-y1ve[3]-1,vplce[3],bufplce[3],ylookup[y1ve[3]]+p+3); - if (d4 >= u4) vlineasm4(d4-u4+1,ylookup[u4]+p); + if (d4 >= u4) vlineasm4(d4-u4+1, (char *)ylookup[u4]+p); i = p+ylookup[d4+1]; if (y2ve[0] > d4) prevlineasm1(vince[0],palookupoffse[0],y2ve[0]-d4-1,vplce[0],bufplce[0],i+0); @@ -7069,7 +7071,7 @@ static void dorotatesprite(int32_t sx, int32_t sy, int32_t z, int16_t a, int16_t if (u4 > y1ve[2]) vplce[2] = mvlineasm1(vince[2],palookupoffse[2],u4-y1ve[2]-1,vplce[2],bufplce[2],ylookup[y1ve[2]]+p+2); if (u4 > y1ve[3]) vplce[3] = mvlineasm1(vince[3],palookupoffse[3],u4-y1ve[3]-1,vplce[3],bufplce[3],ylookup[y1ve[3]]+p+3); - if (d4 >= u4) mvlineasm4(d4-u4+1,ylookup[u4]+p); + if (d4 >= u4) mvlineasm4(d4-u4+1, (char *)ylookup[u4]+p); i = p+ylookup[d4+1]; if (y2ve[0] > d4) mvlineasm1(vince[0],palookupoffse[0],y2ve[0]-d4-1,vplce[0],bufplce[0],i+0);