- Added an assembly version of rt_shaded4cols, since that's the main decal
  drawing function. The most improvement came from being able to turn some
  constant variables into immediate values with self-modifying code, but I
  also managed to reorder it to make it a little faster. It's about 9% faster
  than VC++'s code and 19% faster than GCC's code. That's not a huge
  improvement (for VC++), but at least it's measurable.
- Removed the solid fill "optimization" from rt_shaded4cols(), because in my
  testing, it didn't help any and in fact, hurt just a little bit.
- In the name of simplification, all the rt_tlate* drawers were changed to do
  the translation in one step and the drawing in another. This lets me call
  the untranslated drawer to do the real drawing instead of mostly duplicating
  them. Performance wise, there is practically no difference from before.


SVN r771 (trunk)
This commit is contained in:
Randy Heit 2008-02-27 03:11:35 +00:00
parent 24100c25a9
commit 9cb674c60c
8 changed files with 368 additions and 500 deletions

View File

@ -7,7 +7,7 @@
# Where did you install the FMOD API to? Change this line so that the build process can find it.
FMODDIR = "c:/program files/fmodapi375win"
ifeq ($(findstring msys,$(shell sh --version 2>nul)),msys)
ifeq ($(findstring msys,$(shell sh --version 2>nul)),msys)
WINCMD=0
else
WINCMD=1

View File

@ -1,3 +1,17 @@
February 26, 2008
- Added an assembly version of rt_shaded4cols, since that's the main decal
drawing function. The most improvement came from being able to turn some
constant variables into immediate values with self-modifying code, but I
also managed to reorder it to make it a little faster. It's about 9% faster
than VC++'s code and 19% faster than GCC's code. That's not a huge
improvement (for VC++), but at least it's measurable.
- Removed the solid fill "optimization" from rt_shaded4cols(), because in my
testing, it didn't help any and in fact, hurt just a little bit.
- In the name of simplification, all the rt_tlate* drawers were changed to do
the translation in one step and the drawing in another. This lets me call
the untranslated drawer to do the real drawing instead of mostly duplicating
them. Performance wise, there is practically no difference from before.
February 25, 2008 (Changes by Graf Zahl)
- Fixed: The DECORATE expression evaluator's random function could produce
incorrect results for ranges > 255. Changed so that FRandom's default

View File

@ -80,6 +80,7 @@ int detailyshift; // [RH] Y shift for vertical detail level
extern "C" void STACK_ARGS DoubleHoriz_MMX (int height, int width, BYTE *dest, int pitch);
extern "C" void STACK_ARGS DoubleHorizVert_MMX (int height, int width, BYTE *dest, int pitch);
extern "C" void STACK_ARGS DoubleVert_ASM (int height, int width, BYTE *dest, int pitch);
extern "C" void R_SetupShadedCol();
#endif
// [RH] Pointers to the different column drawers.
@ -94,7 +95,7 @@ void (*R_DrawSpan)(void);
void (*R_DrawSpanMasked)(void);
void (*R_DrawSpanTranslucent)(void);
void (*R_DrawSpanMaskedTranslucent)(void);
void (*rt_map4cols)(int,int,int);
void (STACK_ARGS *rt_map4cols)(int,int,int);
//
// R_DrawColumn
@ -2270,6 +2271,7 @@ ESPSResult R_SetPatchStyle (FRenderStyle style, fixed_t alpha, int translation,
{
dc_colormap += fixedlightlev;
}
R_SetupShadedCol();
return r_columnmethod ? DoDraw1 : DoDraw0;
}

View File

@ -106,47 +106,59 @@ void R_DrawMaskedColumnHoriz (const BYTE *column, const FTexture::Span *spans);
void R_InitColumnDrawers ();
// [RH] Moves data from the temporary buffer to the screen.
extern "C"
{
void rt_copy1col_c (int hx, int sx, int yl, int yh);
void rt_copy4cols_c (int sx, int yl, int yh);
void rt_map1col_c (int hx, int sx, int yl, int yh);
void rt_map4cols_c (int sx, int yl, int yh);
void rt_add1col (int hx, int sx, int yl, int yh);
void rt_add4cols (int sx, int yl, int yh);
void rt_tlate1col (int hx, int sx, int yl, int yh);
void rt_tlate4cols (int sx, int yl, int yh);
void rt_tlateadd1col (int hx, int sx, int yl, int yh);
void rt_tlateadd4cols (int sx, int yl, int yh);
void STACK_ARGS rt_copy4cols_c (int sx, int yl, int yh);
void rt_shaded1col (int hx, int sx, int yl, int yh);
void rt_shaded4cols (int sx, int yl, int yh);
void STACK_ARGS rt_shaded4cols_c (int sx, int yl, int yh);
void STACK_ARGS rt_shaded4cols_asm (int sx, int yl, int yh);
void rt_map1col_c (int hx, int sx, int yl, int yh);
void rt_add1col (int hx, int sx, int yl, int yh);
void rt_addclamp1col (int hx, int sx, int yl, int yh);
void rt_addclamp4cols (int sx, int yl, int yh);
void rt_tlateaddclamp1col (int hx, int sx, int yl, int yh);
void rt_tlateaddclamp4cols (int sx, int yl, int yh);
void rt_subclamp1col (int hx, int sx, int yl, int yh);
void rt_subclamp4cols (int sx, int yl, int yh);
void rt_tlatesubclamp1col (int hx, int sx, int yl, int yh);
void rt_tlatesubclamp4cols (int sx, int yl, int yh);
void rt_revsubclamp1col (int hx, int sx, int yl, int yh);
void rt_revsubclamp4cols (int sx, int yl, int yh);
void rt_tlate1col (int hx, int sx, int yl, int yh);
void rt_tlateadd1col (int hx, int sx, int yl, int yh);
void rt_tlateaddclamp1col (int hx, int sx, int yl, int yh);
void rt_tlatesubclamp1col (int hx, int sx, int yl, int yh);
void rt_tlaterevsubclamp1col (int hx, int sx, int yl, int yh);
void rt_tlaterevsubclamp4cols (int sx, int yl, int yh);
extern "C" void rt_copy1col_asm (int hx, int sx, int yl, int yh);
extern "C" void rt_copy4cols_asm (int sx, int yl, int yh);
extern "C" void rt_map1col_asm (int hx, int sx, int yl, int yh);
extern "C" void rt_map4cols_asm1 (int sx, int yl, int yh);
extern "C" void rt_map4cols_asm2 (int sx, int yl, int yh);
void STACK_ARGS rt_map4cols_c (int sx, int yl, int yh);
void STACK_ARGS rt_add4cols (int sx, int yl, int yh);
void STACK_ARGS rt_addclamp4cols (int sx, int yl, int yh);
void STACK_ARGS rt_subclamp4cols (int sx, int yl, int yh);
void STACK_ARGS rt_revsubclamp4cols (int sx, int yl, int yh);
extern void (*rt_map4cols)(int sx, int yl, int yh);
void STACK_ARGS rt_tlate4cols (int sx, int yl, int yh);
void STACK_ARGS rt_tlateadd4cols (int sx, int yl, int yh);
void STACK_ARGS rt_tlateaddclamp4cols (int sx, int yl, int yh);
void STACK_ARGS rt_tlatesubclamp4cols (int sx, int yl, int yh);
void STACK_ARGS rt_tlaterevsubclamp4cols (int sx, int yl, int yh);
void rt_copy1col_asm (int hx, int sx, int yl, int yh);
void rt_map1col_asm (int hx, int sx, int yl, int yh);
void STACK_ARGS rt_copy4cols_asm (int sx, int yl, int yh);
void STACK_ARGS rt_map4cols_asm1 (int sx, int yl, int yh);
void STACK_ARGS rt_map4cols_asm2 (int sx, int yl, int yh);
}
extern void (STACK_ARGS *rt_map4cols)(int sx, int yl, int yh);
#ifdef USEASM
#define rt_copy1col rt_copy1col_asm
#define rt_copy4cols rt_copy4cols_asm
#define rt_map1col rt_map1col_asm
#define rt_shaded4cols rt_shaded4cols_asm
#else
#define rt_copy1col rt_copy1col_c
#define rt_copy4cols rt_copy4cols_c
#define rt_map1col rt_map1col_c
#define rt_shaded4cols rt_shaded4cols_c
#endif
void rt_draw4cols (int sx);

View File

@ -102,7 +102,7 @@ void rt_copy1col_c (int hx, int sx, int yl, int yh)
}
// Copies all four spans to the screen starting at sx.
void rt_copy4cols_c (int sx, int yl, int yh)
void STACK_ARGS rt_copy4cols_c (int sx, int yl, int yh)
{
int *source;
int *dest;
@ -170,7 +170,7 @@ void rt_map1col_c (int hx, int sx, int yl, int yh)
}
// Maps all four spans to the screen starting at sx.
void rt_map4cols_c (int sx, int yl, int yh)
void STACK_ARGS rt_map4cols_c (int sx, int yl, int yh)
{
BYTE *colormap;
BYTE *source;
@ -214,63 +214,106 @@ void rt_map4cols_c (int sx, int yl, int yh)
}
#endif /* !USEASM */
// Runs one column of dc_temp through a translation table, in place.
// Nothing is written to the screen; a drawer is expected to be called
// afterwards to move the translated column out of dc_temp.
//
//   translation  lookup table indexed by the BYTE values held in dc_temp
//   hx           byte offset of the column inside each 4-byte row of dc_temp
//   yl, yh       first and last row to translate (inclusive)
void rt_Translate1col(const BYTE *translation, int hx, int yl, int yh)
{
	int count = yh - yl + 1;

	// An empty or inverted span must be a no-op. Without this check a
	// negative count would become a huge loop count below.
	if (count <= 0)
		return;

	BYTE *source = &dc_temp[yl*4 + hx];

	// Things we do to hit the compiler's optimizer with a clue bat:
	// 1. Parallelism is explicitly spelled out by using a separate
	//    C instruction for each assembly instruction. GCC lets me
	//    have four temporaries, but VC++ spills to the stack with
	//    more than two. Two is probably optimal, anyway.
	// 2. The results of the translation lookups are explicitly
	//    stored in byte-sized variables. This causes the VC++ code
	//    to use byte mov instructions in most cases; for apparently
	//    random reasons, it will use movzx for some places. GCC
	//    ignores this and uses movzx always.

	// Do 8 rows at a time. Successive rows of the same column are
	// 4 bytes apart, so 8 rows cover offsets 0, 4, ..., 28.
	for (int count8 = count >> 3; count8; --count8)
	{
		int c0, c1;
		BYTE b0, b1;

		c0 = source[0];			c1 = source[4];
		b0 = translation[c0];	b1 = translation[c1];
		source[0] = b0;			source[4] = b1;

		c0 = source[8];			c1 = source[12];
		b0 = translation[c0];	b1 = translation[c1];
		source[8] = b0;			source[12] = b1;

		c0 = source[16];		c1 = source[20];
		b0 = translation[c0];	b1 = translation[c1];
		source[16] = b0;		source[20] = b1;

		c0 = source[24];		c1 = source[28];
		b0 = translation[c0];	b1 = translation[c1];
		source[24] = b0;		source[28] = b1;

		source += 32;
	}
	// Finish by doing 1 row at a time, advancing a whole row (4 bytes)
	// so that only this column is touched.
	for (count &= 7; count; --count, source += 4)
	{
		source[0] = translation[source[0]];
	}
}
// Runs all four columns of dc_temp through a translation table, in place.
// Nothing is written to the screen; a drawer is expected to be called
// afterwards to move the translated columns out of dc_temp.
//
//   translation  lookup table indexed by the BYTE values held in dc_temp
//   yl, yh       first and last row to translate (inclusive)
void rt_Translate4cols(const BYTE *translation, int yl, int yh)
{
	int count = yh - yl + 1;

	// An empty or inverted span must be a no-op. Without this check a
	// negative count would become a huge loop count below.
	if (count <= 0)
		return;

	BYTE *source = &dc_temp[yl*4];
	int c0, c1;
	BYTE b0, b1;

	// Do 2 rows (8 bytes) at a time, two lookups per step so the
	// compiler can pair them.
	for (int pairs = count >> 1; pairs; --pairs)
	{
		c0 = source[0];			c1 = source[1];
		b0 = translation[c0];	b1 = translation[c1];
		source[0] = b0;			source[1] = b1;

		c0 = source[2];			c1 = source[3];
		b0 = translation[c0];	b1 = translation[c1];
		source[2] = b0;			source[3] = b1;

		c0 = source[4];			c1 = source[5];
		b0 = translation[c0];	b1 = translation[c1];
		source[4] = b0;			source[5] = b1;

		c0 = source[6];			c1 = source[7];
		b0 = translation[c0];	b1 = translation[c1];
		source[6] = b0;			source[7] = b1;

		source += 8;
	}
	// Do the final row (4 bytes) if count was odd.
	if (count & 1)
	{
		c0 = source[0];			c1 = source[1];
		b0 = translation[c0];	b1 = translation[c1];
		source[0] = b0;			source[1] = b1;

		c0 = source[2];			c1 = source[3];
		b0 = translation[c0];	b1 = translation[c1];
		source[2] = b0;			source[3] = b1;
	}
}
// Translates one span at hx to the screen at sx.
void rt_tlate1col (int hx, int sx, int yl, int yh)
{
BYTE *translation;
BYTE *colormap;
BYTE *source;
BYTE *dest;
int count;
int pitch;
count = yh-yl;
if (count < 0)
return;
count++;
translation = dc_translation;
colormap = dc_colormap;
dest = ylookup[yl] + sx + dc_destorg;
source = &dc_temp[yl*4 + hx];
pitch = dc_pitch;
do {
*dest = colormap[translation[*source]];
source += 4;
dest += pitch;
} while (--count);
rt_Translate1col(dc_translation, hx, yl, yh);
rt_map1col(hx, sx, yl, yh);
}
// Translates all four spans to the screen starting at sx.
void rt_tlate4cols (int sx, int yl, int yh)
void STACK_ARGS rt_tlate4cols (int sx, int yl, int yh)
{
BYTE *translation;
BYTE *colormap;
BYTE *source;
BYTE *dest;
int count;
int pitch;
translation = dc_translation;
count = yh-yl;
if (count < 0)
return;
count++;
colormap = dc_colormap;
dest = ylookup[yl] + sx + dc_destorg;
source = &dc_temp[yl*4];
pitch = dc_pitch;
do {
dest[0] = colormap[translation[source[0]]];
dest[1] = colormap[translation[source[1]]];
dest[2] = colormap[translation[source[2]]];
dest[3] = colormap[translation[source[3]]];
source += 4;
dest += pitch;
} while (--count);
rt_Translate4cols(dc_translation, yl, yh);
rt_map4cols(sx, yl, yh);
}
// Adds one span at hx to the screen at sx without clamping.
@ -308,7 +351,7 @@ void rt_add1col (int hx, int sx, int yl, int yh)
}
// Adds all four spans to the screen starting at sx without clamping.
void rt_add4cols (int sx, int yl, int yh)
void STACK_ARGS rt_add4cols (int sx, int yl, int yh)
{
BYTE *colormap;
BYTE *source;
@ -366,95 +409,15 @@ void rt_add4cols (int sx, int yl, int yh)
// Translates and adds one span at hx to the screen at sx without clamping.
void rt_tlateadd1col (int hx, int sx, int yl, int yh)
{
BYTE *translation;
BYTE *colormap;
BYTE *source;
BYTE *dest;
int count;
int pitch;
count = yh-yl;
if (count < 0)
return;
count++;
DWORD *fg2rgb = dc_srcblend;
DWORD *bg2rgb = dc_destblend;
translation = dc_translation;
colormap = dc_colormap;
dest = ylookup[yl] + sx + dc_destorg;
source = &dc_temp[yl*4 + hx];
pitch = dc_pitch;
do {
DWORD fg = colormap[translation[*source]];
DWORD bg = *dest;
fg = fg2rgb[fg];
bg = bg2rgb[bg];
fg = (fg+bg) | 0x1f07c1f;
*dest = RGB32k[0][0][fg & (fg>>15)];
source += 4;
dest += pitch;
} while (--count);
rt_Translate1col(dc_translation, hx, yl, yh);
rt_add1col(hx, sx, yl, yh);
}
// Translates and adds all four spans to the screen starting at sx without clamping.
void rt_tlateadd4cols (int sx, int yl, int yh)
void STACK_ARGS rt_tlateadd4cols (int sx, int yl, int yh)
{
BYTE *translation;
BYTE *colormap;
BYTE *source;
BYTE *dest;
int count;
int pitch;
count = yh-yl;
if (count < 0)
return;
count++;
DWORD *fg2rgb = dc_srcblend;
DWORD *bg2rgb = dc_destblend;
translation = dc_translation;
colormap = dc_colormap;
dest = ylookup[yl] + sx + dc_destorg;
source = &dc_temp[yl*4];
pitch = dc_pitch;
do {
DWORD fg = colormap[translation[source[0]]];
DWORD bg = dest[0];
fg = fg2rgb[fg];
bg = bg2rgb[bg];
fg = (fg+bg) | 0x1f07c1f;
dest[0] = RGB32k[0][0][fg & (fg>>15)];
fg = colormap[translation[source[1]]];
bg = dest[1];
fg = fg2rgb[fg];
bg = bg2rgb[bg];
fg = (fg+bg) | 0x1f07c1f;
dest[1] = RGB32k[0][0][fg & (fg>>15)];
fg = colormap[translation[source[2]]];
bg = dest[2];
fg = fg2rgb[fg];
bg = bg2rgb[bg];
fg = (fg+bg) | 0x1f07c1f;
dest[2] = RGB32k[0][0][fg & (fg>>15)];
fg = colormap[translation[source[3]]];
bg = dest[3];
fg = fg2rgb[fg];
bg = bg2rgb[bg];
fg = (fg+bg) | 0x1f07c1f;
dest[3] = RGB32k[0][0][fg & (fg>>15)];
source += 4;
dest += pitch;
} while (--count);
rt_Translate4cols(dc_translation, yl, yh);
rt_add4cols(sx, yl, yh);
}
// Shades one span at hx to the screen at sx.
@ -489,9 +452,8 @@ void rt_shaded1col (int hx, int sx, int yl, int yh)
}
// Shades all four spans to the screen starting at sx.
void rt_shaded4cols (int sx, int yl, int yh)
void STACK_ARGS rt_shaded4cols_c (int sx, int yl, int yh)
{
BYTE fill;
DWORD *fgstart;
BYTE *colormap;
BYTE *source;
@ -509,66 +471,73 @@ void rt_shaded4cols (int sx, int yl, int yh)
dest = ylookup[yl] + sx + dc_destorg;
source = &dc_temp[yl*4];
pitch = dc_pitch;
{
DWORD val = fgstart[64<<8] | 0x1f07c1f;
fill = RGB32k[0][0][val & (val>>15)];
}
// 107.1, 108.4, 118.2/117.7, 119.4
do {
DWORD val = colormap[source[0]];
DWORD fg;
if (val < 64)
{
fg = fgstart[val<<8];
val = (Col2RGB8[64-val][dest[0]] + fg) | 0x1f07c1f;
dest[0] = RGB32k[0][0][val & (val>>15)];
}
else
{
dest[0] = fill;
}
DWORD val;
val = colormap[source[0]];
val = (Col2RGB8[64-val][dest[0]] + fgstart[val<<8]) | 0x1f07c1f;
dest[0] = RGB32k[0][0][val & (val>>15)];
val = colormap[source[1]];
if (val < 64)
{
fg = fgstart[val<<8];
val = (Col2RGB8[64-val][dest[1]] + fg) | 0x1f07c1f;
dest[1] = RGB32k[0][0][val & (val>>15)];
}
else
{
dest[1] = fill;
}
val = (Col2RGB8[64-val][dest[1]] + fgstart[val<<8]) | 0x1f07c1f;
dest[1] = RGB32k[0][0][val & (val>>15)];
val = colormap[source[2]];
if (val < 64)
{
fg = fgstart[val<<8];
val = (Col2RGB8[64-val][dest[2]] + fg) | 0x1f07c1f;
dest[2] = RGB32k[0][0][val & (val>>15)];
}
else
{
dest[2] = fill;
}
val = (Col2RGB8[64-val][dest[2]] + fgstart[val<<8]) | 0x1f07c1f;
dest[2] = RGB32k[0][0][val & (val>>15)];
val = colormap[source[3]];
if (val < 64)
{
fg = fgstart[val<<8];
val = (Col2RGB8[64-val][dest[3]] + fg) | 0x1f07c1f;
dest[3] = RGB32k[0][0][val & (val>>15)];
}
else
{
dest[3] = fill;
}
val = (Col2RGB8[64-val][dest[3]] + fgstart[val<<8]) | 0x1f07c1f;
dest[3] = RGB32k[0][0][val & (val>>15)];
source += 4;
dest += pitch;
} while (--count);
}
#if 0
// Compiled out. A variant of the shaded 4-column drawer that handles the
// four pixels of a row as two interleaved pairs (val/val2).
// It cannot be enabled as-is: t_fgstart and t_colormap are one-element
// stand-ins rather than real tables, and the row step is the literal 320
// instead of dc_pitch.
// NOTE(review): this looks like a C sketch of the instruction pairing used
// by rt_shaded4cols_asm (fixed addresses and pitch that the assembly
// patches in at run time) -- confirm before enabling or deleting it.
static DWORD t_fgstart[1];
static BYTE t_colormap[1];
void STACK_ARGS rt_shaded4cols_t (int sx, int yl, int yh)
{
BYTE *source;
BYTE *dest;
int count;
// Nothing to draw for an inverted span; otherwise count = rows to draw.
count = yh-yl;
if (count < 0)
return;
count++;
dest = ylookup[yl] + sx + dc_destorg;
source = &dc_temp[yl*4];
do {
DWORD val, val2;
// Pixels 0 and 1: look up shade, blend with the screen, then map the
// packed result back to a palette index through RGB32k.
val = t_colormap[source[0]];
val2 = t_colormap[source[1]];
val = (Col2RGB8[64-val][dest[0]] + t_fgstart[val<<8]) | 0x1f07c1f;
val2 = (Col2RGB8[64-val2][dest[1]] + t_fgstart[val2<<8]) | 0x1f07c1f;
dest[0] = RGB32k[0][0][val & (val>>15)];
dest[1] = RGB32k[0][0][val2 & (val2>>15)];
// Pixels 2 and 3: same steps.
val = t_colormap[source[2]];
val2 = t_colormap[source[3]];
val = (Col2RGB8[64-val][dest[2]] + t_fgstart[val<<8]) | 0x1f07c1f;
val2 = (Col2RGB8[64-val2][dest[3]] + t_fgstart[val2<<8]) | 0x1f07c1f;
dest[2] = RGB32k[0][0][val & (val>>15)];
dest[3] = RGB32k[0][0][val2 & (val2>>15)];
source += 4;
// Hard-coded pitch (see note at the top of this block).
dest += 320;
} while (--count);
}
#endif
// Adds one span at hx to the screen at sx with clamping.
void rt_addclamp1col (int hx, int sx, int yl, int yh)
{
@ -606,7 +575,7 @@ void rt_addclamp1col (int hx, int sx, int yl, int yh)
}
// Adds all four spans to the screen starting at sx with clamping.
void rt_addclamp4cols (int sx, int yl, int yh)
void STACK_ARGS rt_addclamp4cols (int sx, int yl, int yh)
{
BYTE *colormap;
BYTE *source;
@ -672,105 +641,15 @@ void rt_addclamp4cols (int sx, int yl, int yh)
// Translates and adds one span at hx to the screen at sx with clamping.
void rt_tlateaddclamp1col (int hx, int sx, int yl, int yh)
{
BYTE *translation;
BYTE *colormap;
BYTE *source;
BYTE *dest;
int count;
int pitch;
count = yh-yl;
if (count < 0)
return;
count++;
DWORD *fg2rgb = dc_srcblend;
DWORD *bg2rgb = dc_destblend;
dest = ylookup[yl] + sx + dc_destorg;
source = &dc_temp[yl*4 + hx];
pitch = dc_pitch;
colormap = dc_colormap;
translation = dc_translation;
do {
DWORD a = fg2rgb[colormap[translation[*source]]] + bg2rgb[*dest];
DWORD b = a;
a |= 0x01f07c1f;
b &= 0x40100400;
a &= 0x3fffffff;
b = b - (b >> 5);
a |= b;
*dest = RGB32k[0][0][(a>>15) & a];
source += 4;
dest += pitch;
} while (--count);
rt_Translate1col(dc_translation, hx, yl, yh);
rt_addclamp1col(hx, sx, yl, yh);
}
// Translates and adds all four spans to the screen starting at sx with clamping.
void rt_tlateaddclamp4cols (int sx, int yl, int yh)
void STACK_ARGS rt_tlateaddclamp4cols (int sx, int yl, int yh)
{
BYTE *translation;
BYTE *colormap;
BYTE *source;
BYTE *dest;
int count;
int pitch;
count = yh-yl;
if (count < 0)
return;
count++;
DWORD *fg2rgb = dc_srcblend;
DWORD *bg2rgb = dc_destblend;
dest = ylookup[yl] + sx + dc_destorg;
source = &dc_temp[yl*4];
pitch = dc_pitch;
colormap = dc_colormap;
translation = dc_translation;
do {
DWORD a = fg2rgb[colormap[translation[source[0]]]] + bg2rgb[dest[0]];
DWORD b = a;
a |= 0x01f07c1f;
b &= 0x40100400;
a &= 0x3fffffff;
b = b - (b >> 5);
a |= b;
dest[0] = RGB32k[0][0][(a>>15) & a];
a = fg2rgb[colormap[translation[source[1]]]] + bg2rgb[dest[1]];
b = a;
a |= 0x01f07c1f;
b &= 0x40100400;
a &= 0x3fffffff;
b = b - (b >> 5);
a |= b;
dest[1] = RGB32k[0][0][(a>>15) & a];
a = fg2rgb[colormap[translation[source[2]]]] + bg2rgb[dest[2]];
b = a;
a |= 0x01f07c1f;
b &= 0x40100400;
a &= 0x3fffffff;
b = b - (b >> 5);
a |= b;
dest[2] = RGB32k[0][0][(a>>15) & a];
a = fg2rgb[colormap[translation[source[3]]]] + bg2rgb[dest[3]];
b = a;
a |= 0x01f07c1f;
b &= 0x40100400;
a &= 0x3fffffff;
b = b - (b >> 5);
a |= b;
dest[3] = RGB32k[0][0][(a>>15) & a];
source += 4;
dest += pitch;
} while (--count);
rt_Translate4cols(dc_translation, yl, yh);
rt_addclamp4cols(sx, yl, yh);
}
// Subtracts one span at hx to the screen at sx with clamping.
@ -809,7 +688,7 @@ void rt_subclamp1col (int hx, int sx, int yl, int yh)
}
// Subtracts all four spans to the screen starting at sx with clamping.
void rt_subclamp4cols (int sx, int yl, int yh)
void STACK_ARGS rt_subclamp4cols (int sx, int yl, int yh)
{
BYTE *colormap;
BYTE *source;
@ -871,100 +750,15 @@ void rt_subclamp4cols (int sx, int yl, int yh)
// Translates and subtracts one span at hx to the screen at sx with clamping.
void rt_tlatesubclamp1col (int hx, int sx, int yl, int yh)
{
BYTE *translation;
BYTE *colormap;
BYTE *source;
BYTE *dest;
int count;
int pitch;
count = yh-yl;
if (count < 0)
return;
count++;
DWORD *fg2rgb = dc_srcblend;
DWORD *bg2rgb = dc_destblend;
dest = ylookup[yl] + sx + dc_destorg;
source = &dc_temp[yl*4 + hx];
pitch = dc_pitch;
colormap = dc_colormap;
translation = dc_translation;
do {
DWORD a = (fg2rgb[colormap[translation[*source]]] | 0x40100400) - bg2rgb[*dest];
DWORD b = a;
b &= 0x40100400;
b = b - (b >> 5);
a &= b;
a |= 0x01f07c1f;
*dest = RGB32k[0][0][(a>>15) & a];
source += 4;
dest += pitch;
} while (--count);
rt_Translate1col(dc_translation, hx, yl, yh);
rt_subclamp1col(hx, sx, yl, yh);
}
// Translates and subtracts all four spans to the screen starting at sx with clamping.
void rt_tlatesubclamp4cols (int sx, int yl, int yh)
void STACK_ARGS rt_tlatesubclamp4cols (int sx, int yl, int yh)
{
BYTE *translation;
BYTE *colormap;
BYTE *source;
BYTE *dest;
int count;
int pitch;
count = yh-yl;
if (count < 0)
return;
count++;
DWORD *fg2rgb = dc_srcblend;
DWORD *bg2rgb = dc_destblend;
dest = ylookup[yl] + sx + dc_destorg;
source = &dc_temp[yl*4];
pitch = dc_pitch;
colormap = dc_colormap;
translation = dc_translation;
do {
DWORD a = (fg2rgb[colormap[translation[source[0]]]] | 0x40100400) - bg2rgb[dest[0]];
DWORD b = a;
b &= 0x40100400;
b = b - (b >> 5);
a &= b;
a |= 0x01f07c1f;
dest[0] = RGB32k[0][0][(a>>15) & a];
a = (fg2rgb[colormap[translation[source[1]]]] | 0x40100400) - bg2rgb[dest[1]];
b = a;
b &= 0x40100400;
b = b - (b >> 5);
a &= b;
a |= 0x01f07c1f;
dest[1] = RGB32k[0][0][(a>>15) & a];
a = (fg2rgb[colormap[translation[source[2]]]] | 0x40100400) - bg2rgb[dest[2]];
b = a;
b &= 0x40100400;
b = b - (b >> 5);
a &= b;
a |= 0x01f07c1f;
dest[2] = RGB32k[0][0][(a>>15) & a];
a = (fg2rgb[colormap[translation[source[3]]]] | 0x40100400) - bg2rgb[dest[3]];
b = a;
b &= 0x40100400;
b = b - (b >> 5);
a &= b;
a |= 0x01f07c1f;
dest[3] = RGB32k[0][0][(a>>15) & a];
source += 4;
dest += pitch;
} while (--count);
rt_Translate4cols(dc_translation, yl, yh);
rt_subclamp4cols(sx, yl, yh);
}
// Subtracts one span at hx from the screen at sx with clamping.
@ -1003,7 +797,7 @@ void rt_revsubclamp1col (int hx, int sx, int yl, int yh)
}
// Subtracts all four spans from the screen starting at sx with clamping.
void rt_revsubclamp4cols (int sx, int yl, int yh)
void STACK_ARGS rt_revsubclamp4cols (int sx, int yl, int yh)
{
BYTE *colormap;
BYTE *source;
@ -1065,104 +859,19 @@ void rt_revsubclamp4cols (int sx, int yl, int yh)
// Translates and subtracts one span at hx from the screen at sx with clamping.
void rt_tlaterevsubclamp1col (int hx, int sx, int yl, int yh)
{
BYTE *translation;
BYTE *colormap;
BYTE *source;
BYTE *dest;
int count;
int pitch;
count = yh-yl;
if (count < 0)
return;
count++;
DWORD *fg2rgb = dc_srcblend;
DWORD *bg2rgb = dc_destblend;
dest = ylookup[yl] + sx + dc_destorg;
source = &dc_temp[yl*4 + hx];
pitch = dc_pitch;
colormap = dc_colormap;
translation = dc_translation;
do {
DWORD a = (bg2rgb[dest[0]] | 0x40100400) - fg2rgb[colormap[translation[source[0]]]];
DWORD b = a;
b &= 0x40100400;
b = b - (b >> 5);
a &= b;
a |= 0x01f07c1f;
*dest = RGB32k[0][0][(a>>15) & a];
source += 4;
dest += pitch;
} while (--count);
rt_Translate1col(dc_translation, hx, yl, yh);
rt_revsubclamp1col(hx, sx, yl, yh);
}
// Translates and subtracts all four spans from the screen starting at sx with clamping.
void rt_tlaterevsubclamp4cols (int sx, int yl, int yh)
void STACK_ARGS rt_tlaterevsubclamp4cols (int sx, int yl, int yh)
{
BYTE *translation;
BYTE *colormap;
BYTE *source;
BYTE *dest;
int count;
int pitch;
count = yh-yl;
if (count < 0)
return;
count++;
DWORD *fg2rgb = dc_srcblend;
DWORD *bg2rgb = dc_destblend;
dest = ylookup[yl] + sx + dc_destorg;
source = &dc_temp[yl*4];
pitch = dc_pitch;
colormap = dc_colormap;
translation = dc_translation;
do {
DWORD a = (bg2rgb[dest[0]] | 0x40100400) - fg2rgb[colormap[translation[source[0]]]];
DWORD b = a;
b &= 0x40100400;
b = b - (b >> 5);
a &= b;
a |= 0x01f07c1f;
dest[0] = RGB32k[0][0][(a>>15) & a];
a = (bg2rgb[dest[1]] | 0x40100400) - fg2rgb[colormap[translation[source[1]]]];
b = a;
b &= 0x40100400;
b = b - (b >> 5);
a &= b;
a |= 0x01f07c1f;
dest[1] = RGB32k[0][0][(a>>15) & a];
a = (bg2rgb[dest[2]] | 0x40100400) - fg2rgb[colormap[translation[source[2]]]];
b = a;
b &= 0x40100400;
b = b - (b >> 5);
a &= b;
a |= 0x01f07c1f;
dest[2] = RGB32k[0][0][(a>>15) & a];
a = (bg2rgb[dest[3]] | 0x40100400) - fg2rgb[colormap[translation[source[3]]]];
b = a;
b &= 0x40100400;
b = b - (b >> 5);
a &= b;
a |= 0x01f07c1f;
dest[3] = RGB32k[0][0][(a>>15) & a];
source += 4;
dest += pitch;
} while (--count);
rt_Translate4cols(dc_translation, yl, yh);
rt_revsubclamp4cols(sx, yl, yh);
}
// Copies all spans in all four columns to the screen starting at sx.
// sx should be longword-aligned.
// sx should be dword-aligned.
void rt_draw4cols (int sx)
{
int x, bad;

View File

@ -205,7 +205,7 @@ void (*spanfunc) (void);
void (*hcolfunc_pre) (void);
void (*hcolfunc_post1) (int hx, int sx, int yl, int yh);
void (*hcolfunc_post2) (int hx, int sx, int yl, int yh);
void (*hcolfunc_post4) (int sx, int yl, int yh);
void (STACK_ARGS *hcolfunc_post4) (int sx, int yl, int yh);
cycle_t WallCycles, PlaneCycles, MaskedCycles, WallScanCycles;

View File

@ -147,7 +147,7 @@ extern void (*spanfunc) (void);
extern void (*hcolfunc_pre) (void);
extern void (*hcolfunc_post1) (int hx, int sx, int yl, int yh);
extern void (*hcolfunc_post2) (int hx, int sx, int yl, int yh);
extern void (*hcolfunc_post4) (int sx, int yl, int yh);
extern void (STACK_ARGS *hcolfunc_post4) (int sx, int yl, int yh);
//

View File

@ -55,6 +55,7 @@ EXTERN CPU
EXTERN dc_pitch
EXTERN dc_colormap
EXTERN dc_color
EXTERN dc_iscale
EXTERN dc_texturefrac
EXTERN dc_source
@ -68,6 +69,9 @@ EXTERN dc_destorg
EXTERN dc_ctspan
EXTERN dc_temp
EXTERN Col2RGB8
EXTERN RGB32k
EXTERN ds_xstep
EXTERN ds_ystep
EXTERN ds_colormap
@ -94,6 +98,7 @@ EXTERN _CPU
EXTERN _dc_pitch
EXTERN _dc_colormap
EXTERN _dc_color
EXTERN _dc_iscale
EXTERN _dc_texturefrac
EXTERN _dc_source
@ -107,6 +112,9 @@ EXTERN _dc_destorg
EXTERN _dc_ctspan
EXTERN _dc_temp
EXTERN _Col2RGB8
EXTERN _RGB32k
EXTERN _ds_xstep
EXTERN _ds_ystep
EXTERN _ds_colormap
@ -131,6 +139,7 @@ GLOBAL _ds_curcolormap
%define dc_pitch _dc_pitch
%define dc_colormap _dc_colormap
%define dc_color _dc_color
%define dc_iscale _dc_iscale
%define dc_texturefrac _dc_texturefrac
%define dc_source _dc_source
@ -141,6 +150,9 @@ GLOBAL _ds_curcolormap
%define dc_dest _dc_dest
%define dc_destorg _dc_destorg
%define Col2RGB8 _Col2RGB8
%define RGB32k _RGB32k
%define dc_ctspan _dc_ctspan
%define dc_temp _dc_temp
@ -1476,10 +1488,121 @@ _rt_map4cols_asm2:
pop ebx
ret 4
align 16
;-----------------------------------------------------------------------
; void STACK_ARGS rt_shaded4cols_asm (int sx, int yl, int yh)
;
; Shades all four columns held in dc_temp onto the screen, rows yl..yh.
; Syntax: NASM, 32-bit x86. Arguments are read from the stack and the
; routine returns with a plain RET, so the caller removes them.
;
; In:    [esp+4] = sx, [esp+8] = yl, [esp+12] = yh (on entry)
; Regs:  eax = destination, esi = source (dc_temp), ebp = rows left
; Saves: ebp, ebx, esi, edi.  Clobbers: eax, ecx, edx, flags.
;
; Self-modifying: every SPACEFILLER4 operand and the "320" pitch values
; are placeholders. R_SetupShadedCol overwrites the operands at
; s4cm1-4 and s4fg1-4, and ASM_PatchPitch overwrites those at s4p,
; s4p2 and s4p3, so the instructions at these labels must keep their
; exact encodings.
;-----------------------------------------------------------------------
GLOBAL rt_shaded4cols_asm
GLOBAL _rt_shaded4cols_asm
rt_shaded4cols_asm:
_rt_shaded4cols_asm:
; ecx = yl
mov ecx,[esp+8]
push ebp
; ebp = yh (argument offsets are shifted by 4 after the push above)
mov ebp,[esp+16]
sub ebp,ecx
; yh < yl: nothing to draw. Only ebp has been pushed at this point.
js near s4nil
mov eax,[ylookup+ecx*4]
add eax,[dc_destorg] ; eax = destination
push ebx
push esi
inc ebp ; ebp = count
; three registers pushed so far, so [esp+16] is sx
add eax,[esp+16]
push edi
lea esi,[dc_temp+ecx*4] ; esi = source
align 16
; ---- pixels 0 and 1, processed as an interleaved pair ----
s4loop: movzx edx,byte [esi]
movzx ecx,byte [esi+1]
s4cm1: movzx edx,byte [SPACEFILLER4+edx] ; colormap
s4cm2: movzx edi,byte [SPACEFILLER4+ecx] ; colormap
; edx/edi = shade value << 8
shl edx,8
movzx ebx,byte [eax]
shl edi,8
movzx ecx,byte [eax+1]
; ebx/ecx = screen pixel - shade*256. Together with the +0x10000 base
; below this matches Col2RGB8[64-val][dest[n]] in rt_shaded4cols_c
; (assumes Col2RGB8 rows are 256 dwords -- TODO confirm).
sub ebx,edx
sub ecx,edi
mov ebx,[Col2RGB8+0x10000+ebx*4]
mov ecx,[Col2RGB8+0x10000+ecx*4]
; add the foreground term (fgstart[val<<8] in the C version)
s4fg1: add ebx,[SPACEFILLER4+edx*4]
s4fg2: add ecx,[SPACEFILLER4+edi*4]
or ebx,0x1f07c1f
or ecx,0x1f07c1f
; index = val & (val>>15)
mov edx,ebx
shr ebx,15
mov edi,ecx
shr ecx,15
and edx,ebx
and ecx,edi
; fetch the two result pixels into bl/bh while starting the loads
; for pixels 2 and 3
mov bl,[RGB32k+edx]
movzx edx,byte [esi+2]
mov bh,[RGB32k+ecx]
movzx ecx,byte [esi+3]
mov [eax],bl
mov [eax+1],bh
; ---- pixels 2 and 3, same sequence ----
s4cm3: movzx edx,byte [SPACEFILLER4+edx] ; colormap
s4cm4: movzx edi,byte [SPACEFILLER4+ecx] ; colormap
shl edx,8
movzx ebx,byte [eax+2]
shl edi,8
movzx ecx,byte [eax+3]
sub ebx,edx
sub ecx,edi
mov ebx,[Col2RGB8+0x10000+ebx*4]
mov ecx,[Col2RGB8+0x10000+ecx*4]
s4fg3: add ebx,[SPACEFILLER4+edx*4]
s4fg4: add ecx,[SPACEFILLER4+edi*4]
or ebx,0x1f07c1f
or ecx,0x1f07c1f
mov edx,ebx
shr ebx,15
mov edi,ecx
shr ecx,15
and edx,ebx
and ecx,edi
; advance to the next row before the last two stores; the stores
; therefore address the row just finished as [eax - pitch + 2/3]
s4p: add eax,320 ; pitch
add esi,4
mov bl,[RGB32k+edx]
mov bh,[RGB32k+ecx]
s4p2: mov [eax-320+2],bl
s4p3: mov [eax-320+3],bh
dec ebp
jne s4loop
pop edi
pop esi
pop ebx
s4nil: pop ebp
ret
align 16
;************************
SECTION .text
GLOBAL R_SetupShadedCol
GLOBAL _R_SetupShadedCol
GLOBAL @R_SetupShadedCol@0
;-----------------------------------------------------------------------
; void R_SetupShadedCol (void)
;
; Patch the values of dc_colormap and dc_color into the shaded column drawer.
;
; Syntax: NASM, 32-bit x86. Takes no arguments and returns nothing.
; Reads:  dc_colormap, dc_color
; Writes: the 32-bit displacement fields of the instructions at
;         s4cm1-s4cm4 and s4fg1-s4fg4 in rt_shaded4cols_asm
; Clobb:  eax
;
; The "+3" skips the three bytes that precede the displacement in each
; patched instruction (two opcode bytes + ModRM for the movzx forms,
; opcode + ModRM + SIB for the add forms), so those instructions must
; not be re-encoded.
;-----------------------------------------------------------------------
R_SetupShadedCol:
_R_SetupShadedCol:
@R_SetupShadedCol@0:
		; colormap base address -> the four colormap lookups
		mov	eax,[dc_colormap]
		mov	[s4cm1+3],eax
		mov	[s4cm2+3],eax
		mov	[s4cm3+3],eax
		mov	[s4cm4+3],eax
		; eax = address of dword number dc_color inside Col2RGB8;
		; the drawer indexes from there with (shade << 8) * 4
		mov	eax,[dc_color]
		lea	eax,[Col2RGB8+eax*4]
		mov	[s4fg1+3],eax
		mov	[s4fg2+3],eax
		mov	[s4fg3+3],eax
		mov	[s4fg4+3],eax
		ret
EXTERN setvlinebpl_
EXTERN setpitch3
@ -1490,11 +1613,19 @@ GLOBAL ASM_PatchPitch
;-----------------------------------------------------------------------
; ASM_PatchPitch
;
; Copies the current dc_pitch into the operand fields of instructions
; at rdcp1-3 and s4p/s4p2/s4p3, then calls setpitch3 and tail-jumps to
; setvlinebpl_ (both external).
; Clobb: eax, ecx, plus whatever setpitch3 / setvlinebpl_ clobber.
;
; NOTE(review): the first six instructions below appear twice in this
; listing. That looks like the old and new sides of a diff shown
; together; confirm which copy is live before editing.
;-----------------------------------------------------------------------
ASM_PatchPitch:
_ASM_PatchPitch:
@ASM_PatchPitch@0:
mov eax,[dc_pitch]
mov [rdcp1+2],eax
mov [rdcp2+2],eax
mov [rdcp3+2],eax
call setpitch3
jmp setvlinebpl_
mov eax,[dc_pitch]
mov [rdcp1+2],eax
mov [rdcp2+2],eax
mov [rdcp3+2],eax
; s4p is "add eax,320": presumably a one-byte opcode followed by the
; 32-bit immediate, hence +1
mov [s4p+1],eax
; s4p2/s4p3 store to [eax-320+2] and [eax-320+3]; their displacement
; fields need -pitch+2 and -pitch+3
mov ecx,eax
neg ecx
inc ecx
inc ecx
; ecx = -pitch + 2
mov [s4p2+2],ecx
inc ecx
; ecx = -pitch + 3
mov [s4p3+2],ecx
call setpitch3
jmp setvlinebpl_