mirror of
https://github.com/ZDoom/raze-gles.git
synced 2025-01-26 00:40:56 +00:00
Port [m]vlineasm4 to C replacements and enable for solid and masked walls.
These two functions draw four horizontally neighboring vertical lines (pixel columns) at a time. This gives a significant speed boost for a full-screen scene of solid and masked walls on x86_64 (where we have plenty of registers): about 60 --> 76 fps. git-svn-id: https://svn.eduke32.com/eduke32@2497 1a8010ca-5511-0410-912e-c29ae57300e0
This commit is contained in:
parent
68603b0a28
commit
e191a915f7
3 changed files with 113 additions and 30 deletions
|
@ -35,9 +35,9 @@ extern int32_t __cdecl setuptvlineasm2(int32_t,int32_t,int32_t);
|
|||
extern int32_t __cdecl tvlineasm2(int32_t,int32_t,int32_t,int32_t,int32_t,int32_t);
|
||||
extern int32_t __cdecl mvlineasm1(int32_t,int32_t,int32_t,int32_t,int32_t,int32_t);
|
||||
extern int32_t __cdecl setupvlineasm(int32_t);
|
||||
extern int32_t __cdecl vlineasm4(int32_t,int32_t);
|
||||
extern int32_t __cdecl vlineasm4(int32_t,char *);
|
||||
extern int32_t __cdecl setupmvlineasm(int32_t);
|
||||
extern int32_t __cdecl mvlineasm4(int32_t,int32_t);
|
||||
extern int32_t __cdecl mvlineasm4(int32_t,char *);
|
||||
extern int32_t __cdecl setupspritevline(int32_t,int32_t,int32_t,int32_t,int32_t,int32_t);
|
||||
extern int32_t __cdecl spritevline(int32_t,int32_t,int32_t,int32_t,int32_t,int32_t);
|
||||
extern int32_t __cdecl msetupspritevline(int32_t,int32_t,int32_t,int32_t,int32_t,int32_t);
|
||||
|
@ -81,9 +81,9 @@ extern int32_t _cdecl setuptvlineasm2(int32_t,int32_t,int32_t);
|
|||
extern int32_t _cdecl tvlineasm2(int32_t,int32_t,int32_t,int32_t,int32_t,int32_t);
|
||||
extern int32_t _cdecl mvlineasm1(int32_t,int32_t,int32_t,int32_t,int32_t,int32_t);
|
||||
extern int32_t _cdecl setupvlineasm(int32_t);
|
||||
extern int32_t _cdecl vlineasm4(int32_t,int32_t);
|
||||
extern int32_t _cdecl vlineasm4(int32_t,char *);
|
||||
extern int32_t _cdecl setupmvlineasm(int32_t);
|
||||
extern int32_t _cdecl mvlineasm4(int32_t,int32_t);
|
||||
extern int32_t _cdecl mvlineasm4(int32_t,char *);
|
||||
extern int32_t _cdecl setupspritevline(int32_t,int32_t,int32_t,int32_t,int32_t,int32_t);
|
||||
extern int32_t _cdecl spritevline(int32_t,int32_t,int32_t,int32_t,int32_t,int32_t);
|
||||
extern int32_t _cdecl msetupspritevline(int32_t,int32_t,int32_t,int32_t,int32_t,int32_t);
|
||||
|
@ -109,6 +109,8 @@ extern void _cdecl stretchhline(int32_t,int32_t,int32_t,int32_t,int32_t,int32_t)
|
|||
#define ENGINE_USING_A_C
|
||||
#include <stdint.h>
|
||||
|
||||
#define prevlineasm1 vlineasm1
|
||||
|
||||
void setvlinebpl(int32_t dabpl);
|
||||
void fixtransluscence(intptr_t datransoff);
|
||||
void settransnormal(void);
|
||||
|
@ -123,13 +125,15 @@ void setupslopevlin(int32_t logylogx, intptr_t bufplc, int32_t pinc);
|
|||
void slopevlin(intptr_t p, int32_t i, intptr_t slopaloffs, int32_t cnt, int32_t bx, int32_t by);
|
||||
|
||||
void setupvlineasm(int32_t neglogy);
|
||||
void vlineasm1(int32_t vinc, intptr_t paloffs, int32_t cnt, uint32_t vplc, intptr_t bufplc, intptr_t p);
|
||||
int32_t vlineasm1(int32_t vinc, intptr_t paloffs, int32_t cnt, uint32_t vplc, intptr_t bufplc, intptr_t p);
|
||||
void vlineasm4(int32_t cnt, char *p);
|
||||
|
||||
void setupmvlineasm(int32_t neglogy);
|
||||
void mvlineasm1(int32_t vinc, intptr_t paloffs, int32_t cnt, uint32_t vplc, intptr_t bufplc, intptr_t p);
|
||||
int32_t mvlineasm1(int32_t vinc, intptr_t paloffs, int32_t cnt, uint32_t vplc, intptr_t bufplc, intptr_t p);
|
||||
void mvlineasm4(int32_t cnt, char *p);
|
||||
|
||||
void setuptvlineasm(int32_t neglogy);
|
||||
void tvlineasm1(int32_t vinc, intptr_t paloffs, int32_t cnt, uint32_t vplc, intptr_t bufplc, intptr_t p);
|
||||
int32_t tvlineasm1(int32_t vinc, intptr_t paloffs, int32_t cnt, uint32_t vplc, intptr_t bufplc, intptr_t p);
|
||||
|
||||
void msethlineshift(int32_t logx, int32_t logy);
|
||||
void mhline(intptr_t bufplc, uint32_t bx, int32_t cntup16, int32_t junk, uint32_t by, intptr_t p);
|
||||
|
|
|
@ -89,7 +89,7 @@ void slopevlin(intptr_t p, int32_t i, intptr_t slopaloffs, int32_t cnt, int32_t
|
|||
|
||||
// Stores the shift amount for the solid vertical-line drawers in glogy;
// vlineasm4() right-shifts each texture position by this value.
void setupvlineasm(int32_t neglogy)
{
    glogy = neglogy;
}
|
||||
// cnt+1 loop iterations!
|
||||
void vlineasm1(int32_t vinc, intptr_t paloffs, int32_t cnt, uint32_t vplc, intptr_t bufplc, intptr_t p)
|
||||
int32_t vlineasm1(int32_t vinc, intptr_t paloffs, int32_t cnt, uint32_t vplc, intptr_t bufplc, intptr_t p)
|
||||
{
|
||||
const char *const buf = (char *)bufplc;
|
||||
const char *const pal = (char *)paloffs;
|
||||
|
@ -105,11 +105,58 @@ void vlineasm1(int32_t vinc, intptr_t paloffs, int32_t cnt, uint32_t vplc, intpt
|
|||
vplc += vinc;
|
||||
}
|
||||
while (--cnt);
|
||||
|
||||
return vplc;
|
||||
}
|
||||
|
||||
|
||||
extern intptr_t palookupoffse[4];
|
||||
extern int32_t vplce[4], vince[4];
|
||||
extern intptr_t bufplce[4];
|
||||
|
||||
// cnt >= 1
|
||||
void vlineasm4(int32_t cnt, char *p)
|
||||
{
|
||||
char ch;
|
||||
int32_t i;
|
||||
#if 1
|
||||
// this gives slightly more stuff in registers in the loop
|
||||
// (on x86_64 at least)
|
||||
char *const pal[4] = {(char *)palookupoffse[0], (char *)palookupoffse[1], (char *)palookupoffse[2], (char *)palookupoffse[3]};
|
||||
char *const buf[4] = {(char *)bufplce[0], (char *)bufplce[1], (char *)bufplce[2], (char *)bufplce[3]};
|
||||
const int32_t vinc[4] = {vince[0], vince[1], vince[2], vince[3]};
|
||||
uint32_t vplc[4] = {vplce[0], vplce[1], vplce[2], vplce[3]};
|
||||
#else
|
||||
char *pal[4];
|
||||
char *buf[4];
|
||||
int32_t vinc[4];
|
||||
uint32_t vplc[4];
|
||||
|
||||
Bmemcpy(pal, palookupoffse, sizeof(pal));
|
||||
Bmemcpy(buf, bufplce, sizeof(buf));
|
||||
Bmemcpy(vinc, vince, sizeof(vinc));
|
||||
Bmemcpy(vplc, vplce, sizeof(vplc));
|
||||
#endif
|
||||
|
||||
const int32_t logy = glogy, ourbpl = bpl;
|
||||
|
||||
do
|
||||
{
|
||||
for (i=0; i<4; i++)
|
||||
{
|
||||
ch = buf[i][vplc[i]>>logy]; p[i] = pal[i][ch];
|
||||
vplc[i] += vinc[i];
|
||||
}
|
||||
p += ourbpl;
|
||||
}
|
||||
while (--cnt);
|
||||
|
||||
Bmemcpy(vplce, vplc, sizeof(vplce));
|
||||
}
|
||||
|
||||
// Stores the shift amount for the masked vertical-line drawers in glogy;
// mvlineasm4() right-shifts each texture position by this value.
void setupmvlineasm(int32_t neglogy)
{
    glogy = neglogy;
}
|
||||
// cnt+1 loop iterations!
|
||||
void mvlineasm1(int32_t vinc, intptr_t paloffs, int32_t cnt, uint32_t vplc, intptr_t bufplc, intptr_t p)
|
||||
int32_t mvlineasm1(int32_t vinc, intptr_t paloffs, int32_t cnt, uint32_t vplc, intptr_t bufplc, intptr_t p)
|
||||
{
|
||||
char ch;
|
||||
|
||||
|
@ -127,8 +174,38 @@ void mvlineasm1(int32_t vinc, intptr_t paloffs, int32_t cnt, uint32_t vplc, intp
|
|||
vplc += vinc;
|
||||
}
|
||||
while (--cnt);
|
||||
|
||||
return vplc;
|
||||
}
|
||||
|
||||
// cnt >= 1
|
||||
void mvlineasm4(int32_t cnt, char *p)
|
||||
{
|
||||
char ch;
|
||||
int32_t i;
|
||||
|
||||
char *const pal[4] = {(char *)palookupoffse[0], (char *)palookupoffse[1], (char *)palookupoffse[2], (char *)palookupoffse[3]};
|
||||
char *const buf[4] = {(char *)bufplce[0], (char *)bufplce[1], (char *)bufplce[2], (char *)bufplce[3]};
|
||||
const int32_t vinc[4] = {vince[0], vince[1], vince[2], vince[3]};
|
||||
uint32_t vplc[4] = {vplce[0], vplce[1], vplce[2], vplce[3]};
|
||||
|
||||
const int32_t logy = glogy, ourbpl = bpl;
|
||||
|
||||
do
|
||||
{
|
||||
for (i=0; i<4; i++)
|
||||
{
|
||||
ch = buf[i][vplc[i]>>logy]; if (ch != 255) p[i] = pal[i][ch];
|
||||
vplc[i] += vinc[i];
|
||||
}
|
||||
p += ourbpl;
|
||||
}
|
||||
while (--cnt);
|
||||
|
||||
Bmemcpy(vplce, vplc, sizeof(vplce));
|
||||
}
|
||||
|
||||
|
||||
// Stores the shift amount in glogy -- the same variable that
// setupvlineasm() and setupmvlineasm() write.
void setuptvlineasm(int32_t neglogy)
{
    glogy = neglogy;
}
|
||||
// cnt+1 loop iterations!
|
||||
void tvlineasm1(int32_t vinc, intptr_t paloffs, int32_t cnt, uint32_t vplc, intptr_t bufplc, intptr_t p)
|
||||
|
|
|
@ -2240,7 +2240,8 @@ int32_t xyaspect;
|
|||
static int32_t viewingrangerecip;
|
||||
|
||||
intptr_t asm1, asm2, asm3, asm4, palookupoffse[4];
|
||||
int32_t vplce[4], vince[4], bufplce[4];
|
||||
int32_t vplce[4], vince[4];
|
||||
intptr_t bufplce[4];
|
||||
static char globalxshift, globalyshift;
|
||||
static int32_t globalxpanning, globalypanning;
|
||||
int32_t globalshade, globalorientation;
|
||||
|
@ -2637,9 +2638,9 @@ skipitaddwall:
|
|||
static void maskwallscan(int32_t x1, int32_t x2, int16_t *uwal, int16_t *dwal, int32_t *swal, int32_t *lwal)
|
||||
{
|
||||
int32_t x,/* startx,*/ xnice, ynice;
|
||||
intptr_t startx, p, fpalookup;
|
||||
int32_t y1ve[4], y2ve[4], /* p,*/ tsizx, tsizy;
|
||||
#ifndef ENGINE_USING_A_C
|
||||
intptr_t startx, p, pp, fpalookup;
|
||||
int32_t y1ve[4], y2ve[4], tsizx, tsizy;
|
||||
#if 1 //ndef ENGINE_USING_A_C
|
||||
char bad;
|
||||
int32_t i, u4, d4, dax, z;
|
||||
#endif
|
||||
|
@ -2667,7 +2668,7 @@ static void maskwallscan(int32_t x1, int32_t x2, int16_t *uwal, int16_t *dwal, i
|
|||
|
||||
setupmvlineasm(globalshiftval);
|
||||
|
||||
#ifndef ENGINE_USING_A_C
|
||||
#if 1 //ndef ENGINE_USING_A_C
|
||||
|
||||
x = startx;
|
||||
while ((startumost[x+windowx1] > startdmost[x+windowx1]) && (x <= x2)) x++;
|
||||
|
@ -2741,13 +2742,13 @@ static void maskwallscan(int32_t x1, int32_t x2, int16_t *uwal, int16_t *dwal, i
|
|||
if (u4 > y1ve[2]) vplce[2] = mvlineasm1(vince[2],palookupoffse[2],u4-y1ve[2]-1,vplce[2],bufplce[2],ylookup[y1ve[2]]+p+2);
|
||||
if (u4 > y1ve[3]) vplce[3] = mvlineasm1(vince[3],palookupoffse[3],u4-y1ve[3]-1,vplce[3],bufplce[3],ylookup[y1ve[3]]+p+3);
|
||||
|
||||
if (d4 >= u4) mvlineasm4(d4-u4+1,ylookup[u4]+p);
|
||||
if (d4 >= u4) mvlineasm4(d4-u4+1, (char *)ylookup[u4]+p);
|
||||
|
||||
i = p+ylookup[d4+1];
|
||||
if (y2ve[0] > d4) mvlineasm1(vince[0],palookupoffse[0],y2ve[0]-d4-1,vplce[0],bufplce[0],i+0);
|
||||
if (y2ve[1] > d4) mvlineasm1(vince[1],palookupoffse[1],y2ve[1]-d4-1,vplce[1],bufplce[1],i+1);
|
||||
if (y2ve[2] > d4) mvlineasm1(vince[2],palookupoffse[2],y2ve[2]-d4-1,vplce[2],bufplce[2],i+2);
|
||||
if (y2ve[3] > d4) mvlineasm1(vince[3],palookupoffse[3],y2ve[3]-d4-1,vplce[3],bufplce[3],i+3);
|
||||
pp = p+ylookup[d4+1];
|
||||
if (y2ve[0] > d4) mvlineasm1(vince[0],palookupoffse[0],y2ve[0]-d4-1,vplce[0],bufplce[0],pp+0);
|
||||
if (y2ve[1] > d4) mvlineasm1(vince[1],palookupoffse[1],y2ve[1]-d4-1,vplce[1],bufplce[1],pp+1);
|
||||
if (y2ve[2] > d4) mvlineasm1(vince[2],palookupoffse[2],y2ve[2]-d4-1,vplce[2],bufplce[2],pp+2);
|
||||
if (y2ve[3] > d4) mvlineasm1(vince[3],palookupoffse[3],y2ve[3]-d4-1,vplce[3],bufplce[3],pp+3);
|
||||
}
|
||||
for (; x<=x2; x++,p++)
|
||||
{
|
||||
|
@ -3686,9 +3687,10 @@ static void wallscan(int32_t x1, int32_t x2, int16_t *uwal, int16_t *dwal, int32
|
|||
int32_t x, xnice, ynice;
|
||||
intptr_t fpalookup;
|
||||
int32_t y1ve[4], y2ve[4], tsizx, tsizy;
|
||||
#ifndef ENGINE_USING_A_C
|
||||
#if 1 //ndef ENGINE_USING_A_C
|
||||
char bad;
|
||||
int32_t i, u4, d4, z;
|
||||
uintptr_t p;
|
||||
#endif
|
||||
#ifdef YAX_ENABLE
|
||||
if (g_nodraw)
|
||||
|
@ -3714,7 +3716,7 @@ static void wallscan(int32_t x1, int32_t x2, int16_t *uwal, int16_t *dwal, int32
|
|||
|
||||
setupvlineasm(globalshiftval);
|
||||
|
||||
#ifndef ENGINE_USING_A_C
|
||||
#if 1 //ndef ENGINE_USING_A_C
|
||||
|
||||
x = x1;
|
||||
while ((umost[x] > dmost[x]) && (x <= x2)) x++;
|
||||
|
@ -3786,13 +3788,13 @@ static void wallscan(int32_t x1, int32_t x2, int16_t *uwal, int16_t *dwal, int32
|
|||
if (u4 > y1ve[2]) vplce[2] = prevlineasm1(vince[2],palookupoffse[2],u4-y1ve[2]-1,vplce[2],bufplce[2],ylookup[y1ve[2]]+x+frameoffset+2);
|
||||
if (u4 > y1ve[3]) vplce[3] = prevlineasm1(vince[3],palookupoffse[3],u4-y1ve[3]-1,vplce[3],bufplce[3],ylookup[y1ve[3]]+x+frameoffset+3);
|
||||
|
||||
if (d4 >= u4) vlineasm4(d4-u4+1,ylookup[u4]+x+frameoffset);
|
||||
if (d4 >= u4) vlineasm4(d4-u4+1, (char *)ylookup[u4]+x+frameoffset);
|
||||
|
||||
i = x+frameoffset+ylookup[d4+1];
|
||||
if (y2ve[0] > d4) prevlineasm1(vince[0],palookupoffse[0],y2ve[0]-d4-1,vplce[0],bufplce[0],i+0);
|
||||
if (y2ve[1] > d4) prevlineasm1(vince[1],palookupoffse[1],y2ve[1]-d4-1,vplce[1],bufplce[1],i+1);
|
||||
if (y2ve[2] > d4) prevlineasm1(vince[2],palookupoffse[2],y2ve[2]-d4-1,vplce[2],bufplce[2],i+2);
|
||||
if (y2ve[3] > d4) prevlineasm1(vince[3],palookupoffse[3],y2ve[3]-d4-1,vplce[3],bufplce[3],i+3);
|
||||
p = x+frameoffset+ylookup[d4+1];
|
||||
if (y2ve[0] > d4) prevlineasm1(vince[0],palookupoffse[0],y2ve[0]-d4-1,vplce[0],bufplce[0],p+0);
|
||||
if (y2ve[1] > d4) prevlineasm1(vince[1],palookupoffse[1],y2ve[1]-d4-1,vplce[1],bufplce[1],p+1);
|
||||
if (y2ve[2] > d4) prevlineasm1(vince[2],palookupoffse[2],y2ve[2]-d4-1,vplce[2],bufplce[2],p+2);
|
||||
if (y2ve[3] > d4) prevlineasm1(vince[3],palookupoffse[3],y2ve[3]-d4-1,vplce[3],bufplce[3],p+3);
|
||||
}
|
||||
for (; x<=x2; x++)
|
||||
{
|
||||
|
@ -7045,7 +7047,7 @@ static void dorotatesprite(int32_t sx, int32_t sy, int32_t z, int16_t a, int16_t
|
|||
if (u4 > y1ve[2]) vplce[2] = prevlineasm1(vince[2],palookupoffse[2],u4-y1ve[2]-1,vplce[2],bufplce[2],ylookup[y1ve[2]]+p+2);
|
||||
if (u4 > y1ve[3]) vplce[3] = prevlineasm1(vince[3],palookupoffse[3],u4-y1ve[3]-1,vplce[3],bufplce[3],ylookup[y1ve[3]]+p+3);
|
||||
|
||||
if (d4 >= u4) vlineasm4(d4-u4+1,ylookup[u4]+p);
|
||||
if (d4 >= u4) vlineasm4(d4-u4+1, (char *)ylookup[u4]+p);
|
||||
|
||||
i = p+ylookup[d4+1];
|
||||
if (y2ve[0] > d4) prevlineasm1(vince[0],palookupoffse[0],y2ve[0]-d4-1,vplce[0],bufplce[0],i+0);
|
||||
|
@ -7069,7 +7071,7 @@ static void dorotatesprite(int32_t sx, int32_t sy, int32_t z, int16_t a, int16_t
|
|||
if (u4 > y1ve[2]) vplce[2] = mvlineasm1(vince[2],palookupoffse[2],u4-y1ve[2]-1,vplce[2],bufplce[2],ylookup[y1ve[2]]+p+2);
|
||||
if (u4 > y1ve[3]) vplce[3] = mvlineasm1(vince[3],palookupoffse[3],u4-y1ve[3]-1,vplce[3],bufplce[3],ylookup[y1ve[3]]+p+3);
|
||||
|
||||
if (d4 >= u4) mvlineasm4(d4-u4+1,ylookup[u4]+p);
|
||||
if (d4 >= u4) mvlineasm4(d4-u4+1, (char *)ylookup[u4]+p);
|
||||
|
||||
i = p+ylookup[d4+1];
|
||||
if (y2ve[0] > d4) mvlineasm1(vince[0],palookupoffse[0],y2ve[0]-d4-1,vplce[0],bufplce[0],i+0);
|
||||
|
|
Loading…
Reference in a new issue