Port [m]vlineasm4 to C replacements and enable for solid and masked walls.

These two functions draw a vertical line 4 neighboring pixels at a time.
This gives a significant speed boost for a full screen solid and masked wall
scene for x86_64 (where we have plenty of registers), about 60 --> 76 fps.

git-svn-id: https://svn.eduke32.com/eduke32@2497 1a8010ca-5511-0410-912e-c29ae57300e0
This commit is contained in:
helixhorned 2012-03-18 23:16:57 +00:00
parent 68603b0a28
commit e191a915f7
3 changed files with 113 additions and 30 deletions

View file

@ -35,9 +35,9 @@ extern int32_t __cdecl setuptvlineasm2(int32_t,int32_t,int32_t);
extern int32_t __cdecl tvlineasm2(int32_t,int32_t,int32_t,int32_t,int32_t,int32_t);
extern int32_t __cdecl mvlineasm1(int32_t,int32_t,int32_t,int32_t,int32_t,int32_t);
extern int32_t __cdecl setupvlineasm(int32_t);
extern int32_t __cdecl vlineasm4(int32_t,int32_t);
extern int32_t __cdecl vlineasm4(int32_t,char *);
extern int32_t __cdecl setupmvlineasm(int32_t);
extern int32_t __cdecl mvlineasm4(int32_t,int32_t);
extern int32_t __cdecl mvlineasm4(int32_t,char *);
extern int32_t __cdecl setupspritevline(int32_t,int32_t,int32_t,int32_t,int32_t,int32_t);
extern int32_t __cdecl spritevline(int32_t,int32_t,int32_t,int32_t,int32_t,int32_t);
extern int32_t __cdecl msetupspritevline(int32_t,int32_t,int32_t,int32_t,int32_t,int32_t);
@ -81,9 +81,9 @@ extern int32_t _cdecl setuptvlineasm2(int32_t,int32_t,int32_t);
extern int32_t _cdecl tvlineasm2(int32_t,int32_t,int32_t,int32_t,int32_t,int32_t);
extern int32_t _cdecl mvlineasm1(int32_t,int32_t,int32_t,int32_t,int32_t,int32_t);
extern int32_t _cdecl setupvlineasm(int32_t);
extern int32_t _cdecl vlineasm4(int32_t,int32_t);
extern int32_t _cdecl vlineasm4(int32_t,char *);
extern int32_t _cdecl setupmvlineasm(int32_t);
extern int32_t _cdecl mvlineasm4(int32_t,int32_t);
extern int32_t _cdecl mvlineasm4(int32_t,char *);
extern int32_t _cdecl setupspritevline(int32_t,int32_t,int32_t,int32_t,int32_t,int32_t);
extern int32_t _cdecl spritevline(int32_t,int32_t,int32_t,int32_t,int32_t,int32_t);
extern int32_t _cdecl msetupspritevline(int32_t,int32_t,int32_t,int32_t,int32_t,int32_t);
@ -109,6 +109,8 @@ extern void _cdecl stretchhline(int32_t,int32_t,int32_t,int32_t,int32_t,int32_t)
#define ENGINE_USING_A_C
#include <stdint.h>
#define prevlineasm1 vlineasm1
void setvlinebpl(int32_t dabpl);
void fixtransluscence(intptr_t datransoff);
void settransnormal(void);
@ -123,13 +125,15 @@ void setupslopevlin(int32_t logylogx, intptr_t bufplc, int32_t pinc);
void slopevlin(intptr_t p, int32_t i, intptr_t slopaloffs, int32_t cnt, int32_t bx, int32_t by);
void setupvlineasm(int32_t neglogy);
void vlineasm1(int32_t vinc, intptr_t paloffs, int32_t cnt, uint32_t vplc, intptr_t bufplc, intptr_t p);
int32_t vlineasm1(int32_t vinc, intptr_t paloffs, int32_t cnt, uint32_t vplc, intptr_t bufplc, intptr_t p);
void vlineasm4(int32_t cnt, char *p);
void setupmvlineasm(int32_t neglogy);
void mvlineasm1(int32_t vinc, intptr_t paloffs, int32_t cnt, uint32_t vplc, intptr_t bufplc, intptr_t p);
int32_t mvlineasm1(int32_t vinc, intptr_t paloffs, int32_t cnt, uint32_t vplc, intptr_t bufplc, intptr_t p);
void mvlineasm4(int32_t cnt, char *p);
void setuptvlineasm(int32_t neglogy);
void tvlineasm1(int32_t vinc, intptr_t paloffs, int32_t cnt, uint32_t vplc, intptr_t bufplc, intptr_t p);
int32_t tvlineasm1(int32_t vinc, intptr_t paloffs, int32_t cnt, uint32_t vplc, intptr_t bufplc, intptr_t p);
void msethlineshift(int32_t logx, int32_t logy);
void mhline(intptr_t bufplc, uint32_t bx, int32_t cntup16, int32_t junk, uint32_t by, intptr_t p);

View file

@ -89,7 +89,7 @@ void slopevlin(intptr_t p, int32_t i, intptr_t slopaloffs, int32_t cnt, int32_t
void setupvlineasm(int32_t neglogy) { glogy = neglogy; }
// cnt+1 loop iterations!
void vlineasm1(int32_t vinc, intptr_t paloffs, int32_t cnt, uint32_t vplc, intptr_t bufplc, intptr_t p)
int32_t vlineasm1(int32_t vinc, intptr_t paloffs, int32_t cnt, uint32_t vplc, intptr_t bufplc, intptr_t p)
{
const char *const buf = (char *)bufplc;
const char *const pal = (char *)paloffs;
@ -105,11 +105,58 @@ void vlineasm1(int32_t vinc, intptr_t paloffs, int32_t cnt, uint32_t vplc, intpt
vplc += vinc;
}
while (--cnt);
return vplc;
}
extern intptr_t palookupoffse[4];
extern int32_t vplce[4], vince[4];
extern intptr_t bufplce[4];
// cnt >= 1
void vlineasm4(int32_t cnt, char *p)
{
char ch;
int32_t i;
#if 1
// this gives slightly more stuff in registers in the loop
// (on x86_64 at least)
char *const pal[4] = {(char *)palookupoffse[0], (char *)palookupoffse[1], (char *)palookupoffse[2], (char *)palookupoffse[3]};
char *const buf[4] = {(char *)bufplce[0], (char *)bufplce[1], (char *)bufplce[2], (char *)bufplce[3]};
const int32_t vinc[4] = {vince[0], vince[1], vince[2], vince[3]};
uint32_t vplc[4] = {vplce[0], vplce[1], vplce[2], vplce[3]};
#else
char *pal[4];
char *buf[4];
int32_t vinc[4];
uint32_t vplc[4];
Bmemcpy(pal, palookupoffse, sizeof(pal));
Bmemcpy(buf, bufplce, sizeof(buf));
Bmemcpy(vinc, vince, sizeof(vinc));
Bmemcpy(vplc, vplce, sizeof(vplc));
#endif
const int32_t logy = glogy, ourbpl = bpl;
do
{
for (i=0; i<4; i++)
{
ch = buf[i][vplc[i]>>logy]; p[i] = pal[i][ch];
vplc[i] += vinc[i];
}
p += ourbpl;
}
while (--cnt);
Bmemcpy(vplce, vplc, sizeof(vplce));
}
void setupmvlineasm(int32_t neglogy) { glogy = neglogy; }
// cnt+1 loop iterations!
void mvlineasm1(int32_t vinc, intptr_t paloffs, int32_t cnt, uint32_t vplc, intptr_t bufplc, intptr_t p)
int32_t mvlineasm1(int32_t vinc, intptr_t paloffs, int32_t cnt, uint32_t vplc, intptr_t bufplc, intptr_t p)
{
char ch;
@ -127,8 +174,38 @@ void mvlineasm1(int32_t vinc, intptr_t paloffs, int32_t cnt, uint32_t vplc, intp
vplc += vinc;
}
while (--cnt);
return vplc;
}
// cnt >= 1
void mvlineasm4(int32_t cnt, char *p)
{
char ch;
int32_t i;
char *const pal[4] = {(char *)palookupoffse[0], (char *)palookupoffse[1], (char *)palookupoffse[2], (char *)palookupoffse[3]};
char *const buf[4] = {(char *)bufplce[0], (char *)bufplce[1], (char *)bufplce[2], (char *)bufplce[3]};
const int32_t vinc[4] = {vince[0], vince[1], vince[2], vince[3]};
uint32_t vplc[4] = {vplce[0], vplce[1], vplce[2], vplce[3]};
const int32_t logy = glogy, ourbpl = bpl;
do
{
for (i=0; i<4; i++)
{
ch = buf[i][vplc[i]>>logy]; if (ch != 255) p[i] = pal[i][ch];
vplc[i] += vinc[i];
}
p += ourbpl;
}
while (--cnt);
Bmemcpy(vplce, vplc, sizeof(vplce));
}
void setuptvlineasm(int32_t neglogy) { glogy = neglogy; }
// cnt+1 loop iterations!
void tvlineasm1(int32_t vinc, intptr_t paloffs, int32_t cnt, uint32_t vplc, intptr_t bufplc, intptr_t p)

View file

@ -2240,7 +2240,8 @@ int32_t xyaspect;
static int32_t viewingrangerecip;
intptr_t asm1, asm2, asm3, asm4, palookupoffse[4];
int32_t vplce[4], vince[4], bufplce[4];
int32_t vplce[4], vince[4];
intptr_t bufplce[4];
static char globalxshift, globalyshift;
static int32_t globalxpanning, globalypanning;
int32_t globalshade, globalorientation;
@ -2637,9 +2638,9 @@ skipitaddwall:
static void maskwallscan(int32_t x1, int32_t x2, int16_t *uwal, int16_t *dwal, int32_t *swal, int32_t *lwal)
{
int32_t x,/* startx,*/ xnice, ynice;
intptr_t startx, p, fpalookup;
int32_t y1ve[4], y2ve[4], /* p,*/ tsizx, tsizy;
#ifndef ENGINE_USING_A_C
intptr_t startx, p, pp, fpalookup;
int32_t y1ve[4], y2ve[4], tsizx, tsizy;
#if 1 //ndef ENGINE_USING_A_C
char bad;
int32_t i, u4, d4, dax, z;
#endif
@ -2667,7 +2668,7 @@ static void maskwallscan(int32_t x1, int32_t x2, int16_t *uwal, int16_t *dwal, i
setupmvlineasm(globalshiftval);
#ifndef ENGINE_USING_A_C
#if 1 //ndef ENGINE_USING_A_C
x = startx;
while ((startumost[x+windowx1] > startdmost[x+windowx1]) && (x <= x2)) x++;
@ -2741,13 +2742,13 @@ static void maskwallscan(int32_t x1, int32_t x2, int16_t *uwal, int16_t *dwal, i
if (u4 > y1ve[2]) vplce[2] = mvlineasm1(vince[2],palookupoffse[2],u4-y1ve[2]-1,vplce[2],bufplce[2],ylookup[y1ve[2]]+p+2);
if (u4 > y1ve[3]) vplce[3] = mvlineasm1(vince[3],palookupoffse[3],u4-y1ve[3]-1,vplce[3],bufplce[3],ylookup[y1ve[3]]+p+3);
if (d4 >= u4) mvlineasm4(d4-u4+1,ylookup[u4]+p);
if (d4 >= u4) mvlineasm4(d4-u4+1, (char *)ylookup[u4]+p);
i = p+ylookup[d4+1];
if (y2ve[0] > d4) mvlineasm1(vince[0],palookupoffse[0],y2ve[0]-d4-1,vplce[0],bufplce[0],i+0);
if (y2ve[1] > d4) mvlineasm1(vince[1],palookupoffse[1],y2ve[1]-d4-1,vplce[1],bufplce[1],i+1);
if (y2ve[2] > d4) mvlineasm1(vince[2],palookupoffse[2],y2ve[2]-d4-1,vplce[2],bufplce[2],i+2);
if (y2ve[3] > d4) mvlineasm1(vince[3],palookupoffse[3],y2ve[3]-d4-1,vplce[3],bufplce[3],i+3);
pp = p+ylookup[d4+1];
if (y2ve[0] > d4) mvlineasm1(vince[0],palookupoffse[0],y2ve[0]-d4-1,vplce[0],bufplce[0],pp+0);
if (y2ve[1] > d4) mvlineasm1(vince[1],palookupoffse[1],y2ve[1]-d4-1,vplce[1],bufplce[1],pp+1);
if (y2ve[2] > d4) mvlineasm1(vince[2],palookupoffse[2],y2ve[2]-d4-1,vplce[2],bufplce[2],pp+2);
if (y2ve[3] > d4) mvlineasm1(vince[3],palookupoffse[3],y2ve[3]-d4-1,vplce[3],bufplce[3],pp+3);
}
for (; x<=x2; x++,p++)
{
@ -3686,9 +3687,10 @@ static void wallscan(int32_t x1, int32_t x2, int16_t *uwal, int16_t *dwal, int32
int32_t x, xnice, ynice;
intptr_t fpalookup;
int32_t y1ve[4], y2ve[4], tsizx, tsizy;
#ifndef ENGINE_USING_A_C
#if 1 //ndef ENGINE_USING_A_C
char bad;
int32_t i, u4, d4, z;
uintptr_t p;
#endif
#ifdef YAX_ENABLE
if (g_nodraw)
@ -3714,7 +3716,7 @@ static void wallscan(int32_t x1, int32_t x2, int16_t *uwal, int16_t *dwal, int32
setupvlineasm(globalshiftval);
#ifndef ENGINE_USING_A_C
#if 1 //ndef ENGINE_USING_A_C
x = x1;
while ((umost[x] > dmost[x]) && (x <= x2)) x++;
@ -3786,13 +3788,13 @@ static void wallscan(int32_t x1, int32_t x2, int16_t *uwal, int16_t *dwal, int32
if (u4 > y1ve[2]) vplce[2] = prevlineasm1(vince[2],palookupoffse[2],u4-y1ve[2]-1,vplce[2],bufplce[2],ylookup[y1ve[2]]+x+frameoffset+2);
if (u4 > y1ve[3]) vplce[3] = prevlineasm1(vince[3],palookupoffse[3],u4-y1ve[3]-1,vplce[3],bufplce[3],ylookup[y1ve[3]]+x+frameoffset+3);
if (d4 >= u4) vlineasm4(d4-u4+1,ylookup[u4]+x+frameoffset);
if (d4 >= u4) vlineasm4(d4-u4+1, (char *)ylookup[u4]+x+frameoffset);
i = x+frameoffset+ylookup[d4+1];
if (y2ve[0] > d4) prevlineasm1(vince[0],palookupoffse[0],y2ve[0]-d4-1,vplce[0],bufplce[0],i+0);
if (y2ve[1] > d4) prevlineasm1(vince[1],palookupoffse[1],y2ve[1]-d4-1,vplce[1],bufplce[1],i+1);
if (y2ve[2] > d4) prevlineasm1(vince[2],palookupoffse[2],y2ve[2]-d4-1,vplce[2],bufplce[2],i+2);
if (y2ve[3] > d4) prevlineasm1(vince[3],palookupoffse[3],y2ve[3]-d4-1,vplce[3],bufplce[3],i+3);
p = x+frameoffset+ylookup[d4+1];
if (y2ve[0] > d4) prevlineasm1(vince[0],palookupoffse[0],y2ve[0]-d4-1,vplce[0],bufplce[0],p+0);
if (y2ve[1] > d4) prevlineasm1(vince[1],palookupoffse[1],y2ve[1]-d4-1,vplce[1],bufplce[1],p+1);
if (y2ve[2] > d4) prevlineasm1(vince[2],palookupoffse[2],y2ve[2]-d4-1,vplce[2],bufplce[2],p+2);
if (y2ve[3] > d4) prevlineasm1(vince[3],palookupoffse[3],y2ve[3]-d4-1,vplce[3],bufplce[3],p+3);
}
for (; x<=x2; x++)
{
@ -7045,7 +7047,7 @@ static void dorotatesprite(int32_t sx, int32_t sy, int32_t z, int16_t a, int16_t
if (u4 > y1ve[2]) vplce[2] = prevlineasm1(vince[2],palookupoffse[2],u4-y1ve[2]-1,vplce[2],bufplce[2],ylookup[y1ve[2]]+p+2);
if (u4 > y1ve[3]) vplce[3] = prevlineasm1(vince[3],palookupoffse[3],u4-y1ve[3]-1,vplce[3],bufplce[3],ylookup[y1ve[3]]+p+3);
if (d4 >= u4) vlineasm4(d4-u4+1,ylookup[u4]+p);
if (d4 >= u4) vlineasm4(d4-u4+1, (char *)ylookup[u4]+p);
i = p+ylookup[d4+1];
if (y2ve[0] > d4) prevlineasm1(vince[0],palookupoffse[0],y2ve[0]-d4-1,vplce[0],bufplce[0],i+0);
@ -7069,7 +7071,7 @@ static void dorotatesprite(int32_t sx, int32_t sy, int32_t z, int16_t a, int16_t
if (u4 > y1ve[2]) vplce[2] = mvlineasm1(vince[2],palookupoffse[2],u4-y1ve[2]-1,vplce[2],bufplce[2],ylookup[y1ve[2]]+p+2);
if (u4 > y1ve[3]) vplce[3] = mvlineasm1(vince[3],palookupoffse[3],u4-y1ve[3]-1,vplce[3],bufplce[3],ylookup[y1ve[3]]+p+3);
if (d4 >= u4) mvlineasm4(d4-u4+1,ylookup[u4]+p);
if (d4 >= u4) mvlineasm4(d4-u4+1, (char *)ylookup[u4]+p);
i = p+ylookup[d4+1];
if (y2ve[0] > d4) mvlineasm1(vince[0],palookupoffse[0],y2ve[0]-d4-1,vplce[0],bufplce[0],i+0);