From 9cb674c60c06e1252b46ee73acdadbc879f6a0e2 Mon Sep 17 00:00:00 2001 From: Randy Heit Date: Wed, 27 Feb 2008 03:11:35 +0000 Subject: [PATCH] - Added an assembly version of rt_shaded4cols, since that's the main decal drawing function. The most improvement came from being able to turn some constant variables into immediate values with self-modifying code, but I also managed to reorder it to make it a little faster. It's about 9% faster than VC++'s code and 19% faster than GCC's code. That's not a huge improvement (for VC++), but at least it's measurable. - Removed the solid fill "optimization" from rt_shaded4cols(), because in my testing, it didn't help any and in fact, hurt just a little bit. - In the name of simplification, all the rt_tlate* drawers were changed to do the translation in one step and the drawing in another. This lets me call the untranslated drawer to do the real drawing instead of mostly duplicating them. Performance wise, there is practically no difference from before. SVN r771 (trunk) --- Makefile.mingw | 2 +- docs/rh-log.txt | 14 ++ src/r_draw.cpp | 4 +- src/r_draw.h | 60 +++-- src/r_drawt.cpp | 641 +++++++++++++----------------------------------- src/r_main.cpp | 2 +- src/r_main.h | 2 +- src/tmap.nas | 143 ++++++++++- 8 files changed, 368 insertions(+), 500 deletions(-) diff --git a/Makefile.mingw b/Makefile.mingw index 8ea1c471ba..9200f09499 100644 --- a/Makefile.mingw +++ b/Makefile.mingw @@ -7,7 +7,7 @@ # Where did you install the FMOD API to? Change this line so that the build process can find it. FMODDIR = "c:/program files/fmodapi375win" -ifeq ($(findstring msys,$(shell sh --version 2>nul)),msys) +ifeq ($(findstring msys,$(shell sh --version 2>nul)),msys) WINCMD=0 else WINCMD=1 diff --git a/docs/rh-log.txt b/docs/rh-log.txt index 69cdf4b4e0..af25942240 100644 --- a/docs/rh-log.txt +++ b/docs/rh-log.txt @@ -1,3 +1,17 @@ +February 26, 2008 +- Added an assembly version of rt_shaded4cols, since that's the main decal + drawing function. 
The most improvement came from being able to turn some + constant variables into immediate values with self-modifying code, but I + also managed to reorder it to make it a little faster. It's about 9% faster + than VC++'s code and 19% faster than GCC's code. That's not a huge + improvement (for VC++), but at least it's measurable. +- Removed the solid fill "optimization" from rt_shaded4cols(), because in my + testing, it didn't help any and in fact, hurt just a little bit. +- In the name of simplification, all the rt_tlate* drawers were changed to do + the translation in one step and the drawing in another. This lets me call + the untranslated drawer to do the real drawing instead of mostly duplicating + them. Performance wise, there is practically no difference from before. + February 25, 2008 (Changes by Graf Zahl) - Fixed: The DECORATE expression evaluator's random function could produce incorrect results for ranges > 255. Changed so that FRandom's default diff --git a/src/r_draw.cpp b/src/r_draw.cpp index 7c61b1f8c1..acb5413a92 100644 --- a/src/r_draw.cpp +++ b/src/r_draw.cpp @@ -80,6 +80,7 @@ int detailyshift; // [RH] Y shift for vertical detail level extern "C" void STACK_ARGS DoubleHoriz_MMX (int height, int width, BYTE *dest, int pitch); extern "C" void STACK_ARGS DoubleHorizVert_MMX (int height, int width, BYTE *dest, int pitch); extern "C" void STACK_ARGS DoubleVert_ASM (int height, int width, BYTE *dest, int pitch); +extern "C" void R_SetupShadedCol(); #endif // [RH] Pointers to the different column drawers. @@ -94,7 +95,7 @@ void (*R_DrawSpan)(void); void (*R_DrawSpanMasked)(void); void (*R_DrawSpanTranslucent)(void); void (*R_DrawSpanMaskedTranslucent)(void); -void (*rt_map4cols)(int,int,int); +void (STACK_ARGS *rt_map4cols)(int,int,int); // // R_DrawColumn @@ -2270,6 +2271,7 @@ ESPSResult R_SetPatchStyle (FRenderStyle style, fixed_t alpha, int translation, { dc_colormap += fixedlightlev; } + R_SetupShadedCol(); return r_columnmethod ? 
DoDraw1 : DoDraw0; } diff --git a/src/r_draw.h b/src/r_draw.h index 7193bfe300..23ec1b74b2 100644 --- a/src/r_draw.h +++ b/src/r_draw.h @@ -106,47 +106,59 @@ void R_DrawMaskedColumnHoriz (const BYTE *column, const FTexture::Span *spans); void R_InitColumnDrawers (); // [RH] Moves data from the temporary buffer to the screen. +extern "C" +{ void rt_copy1col_c (int hx, int sx, int yl, int yh); -void rt_copy4cols_c (int sx, int yl, int yh); -void rt_map1col_c (int hx, int sx, int yl, int yh); -void rt_map4cols_c (int sx, int yl, int yh); -void rt_add1col (int hx, int sx, int yl, int yh); -void rt_add4cols (int sx, int yl, int yh); -void rt_tlate1col (int hx, int sx, int yl, int yh); -void rt_tlate4cols (int sx, int yl, int yh); -void rt_tlateadd1col (int hx, int sx, int yl, int yh); -void rt_tlateadd4cols (int sx, int yl, int yh); +void STACK_ARGS rt_copy4cols_c (int sx, int yl, int yh); + void rt_shaded1col (int hx, int sx, int yl, int yh); -void rt_shaded4cols (int sx, int yl, int yh); +void STACK_ARGS rt_shaded4cols_c (int sx, int yl, int yh); +void STACK_ARGS rt_shaded4cols_asm (int sx, int yl, int yh); + +void rt_map1col_c (int hx, int sx, int yl, int yh); +void rt_add1col (int hx, int sx, int yl, int yh); void rt_addclamp1col (int hx, int sx, int yl, int yh); -void rt_addclamp4cols (int sx, int yl, int yh); -void rt_tlateaddclamp1col (int hx, int sx, int yl, int yh); -void rt_tlateaddclamp4cols (int sx, int yl, int yh); void rt_subclamp1col (int hx, int sx, int yl, int yh); -void rt_subclamp4cols (int sx, int yl, int yh); -void rt_tlatesubclamp1col (int hx, int sx, int yl, int yh); -void rt_tlatesubclamp4cols (int sx, int yl, int yh); void rt_revsubclamp1col (int hx, int sx, int yl, int yh); -void rt_revsubclamp4cols (int sx, int yl, int yh); + +void rt_tlate1col (int hx, int sx, int yl, int yh); +void rt_tlateadd1col (int hx, int sx, int yl, int yh); +void rt_tlateaddclamp1col (int hx, int sx, int yl, int yh); +void rt_tlatesubclamp1col (int hx, int sx, int yl, 
int yh); void rt_tlaterevsubclamp1col (int hx, int sx, int yl, int yh); -void rt_tlaterevsubclamp4cols (int sx, int yl, int yh); -extern "C" void rt_copy1col_asm (int hx, int sx, int yl, int yh); -extern "C" void rt_copy4cols_asm (int sx, int yl, int yh); -extern "C" void rt_map1col_asm (int hx, int sx, int yl, int yh); -extern "C" void rt_map4cols_asm1 (int sx, int yl, int yh); -extern "C" void rt_map4cols_asm2 (int sx, int yl, int yh); +void STACK_ARGS rt_map4cols_c (int sx, int yl, int yh); +void STACK_ARGS rt_add4cols (int sx, int yl, int yh); +void STACK_ARGS rt_addclamp4cols (int sx, int yl, int yh); +void STACK_ARGS rt_subclamp4cols (int sx, int yl, int yh); +void STACK_ARGS rt_revsubclamp4cols (int sx, int yl, int yh); -extern void (*rt_map4cols)(int sx, int yl, int yh); +void STACK_ARGS rt_tlate4cols (int sx, int yl, int yh); +void STACK_ARGS rt_tlateadd4cols (int sx, int yl, int yh); +void STACK_ARGS rt_tlateaddclamp4cols (int sx, int yl, int yh); +void STACK_ARGS rt_tlatesubclamp4cols (int sx, int yl, int yh); +void STACK_ARGS rt_tlaterevsubclamp4cols (int sx, int yl, int yh); + +void rt_copy1col_asm (int hx, int sx, int yl, int yh); +void rt_map1col_asm (int hx, int sx, int yl, int yh); + +void STACK_ARGS rt_copy4cols_asm (int sx, int yl, int yh); +void STACK_ARGS rt_map4cols_asm1 (int sx, int yl, int yh); +void STACK_ARGS rt_map4cols_asm2 (int sx, int yl, int yh); +} + +extern void (STACK_ARGS *rt_map4cols)(int sx, int yl, int yh); #ifdef USEASM #define rt_copy1col rt_copy1col_asm #define rt_copy4cols rt_copy4cols_asm #define rt_map1col rt_map1col_asm +#define rt_shaded4cols rt_shaded4cols_asm #else #define rt_copy1col rt_copy1col_c #define rt_copy4cols rt_copy4cols_c #define rt_map1col rt_map1col_c +#define rt_shaded4cols rt_shaded4cols_c #endif void rt_draw4cols (int sx); diff --git a/src/r_drawt.cpp b/src/r_drawt.cpp index 5880fa7ce9..49e30016c1 100644 --- a/src/r_drawt.cpp +++ b/src/r_drawt.cpp @@ -102,7 +102,7 @@ void rt_copy1col_c (int hx, int 
sx, int yl, int yh)
 }
 
 // Copies all four spans to the screen starting at sx.
-void rt_copy4cols_c (int sx, int yl, int yh)
+void STACK_ARGS rt_copy4cols_c (int sx, int yl, int yh)
 {
 	int *source;
 	int *dest;
@@ -170,7 +170,7 @@ void rt_map1col_c (int hx, int sx, int yl, int yh)
 }
 
 // Maps all four spans to the screen starting at sx.
-void rt_map4cols_c (int sx, int yl, int yh)
+void STACK_ARGS rt_map4cols_c (int sx, int yl, int yh)
 {
 	BYTE *colormap;
 	BYTE *source;
@@ -214,63 +214,106 @@ void rt_map4cols_c (int sx, int yl, int yh)
 }
 #endif /* !USEASM */
 
+void rt_Translate1col(const BYTE *translation, int hx, int yl, int yh)
+{
+	int count = yh < yl ? 0 : yh - yl + 1;	// rows to remap; an inverted range is empty
+	BYTE *source = &dc_temp[yl*4 + hx];	// column hx; consecutive rows are 4 bytes apart
+
+	// Remaps rows yl..yh (inclusive) of column hx in dc_temp through the
+	// translation table, in place. Things we do to clue the optimizer in:
+	// 1. Parallelism is explicitly spelled out by using a separate
+	//    C instruction for each assembly instruction. GCC lets me
+	//    have four temporaries, but VC++ spills to the stack with
+	//    more than two. Two is probably optimal, anyway.
+	// 2. The results of the translation lookups are explicitly
+	//    stored in byte-sized variables. This causes the VC++ code
+	//    to use byte mov instructions in most cases (movzx in some
+	//    places); GCC ignores this and uses movzx always.
+
+	// Do 8 rows at a time.
+	for (int count8 = count >> 3; count8; --count8)
+	{
+		int c0, c1;
+		BYTE b0, b1;
+
+		c0 = source[0];			c1 = source[4];
+		b0 = translation[c0];		b1 = translation[c1];
+		source[0] = b0;			source[4] = b1;
+
+		c0 = source[8];			c1 = source[12];
+		b0 = translation[c0];		b1 = translation[c1];
+		source[8] = b0;			source[12] = b1;
+
+		c0 = source[16];		c1 = source[20];
+		b0 = translation[c0];		b1 = translation[c1];
+		source[16] = b0;		source[20] = b1;
+
+		c0 = source[24];		c1 = source[28];
+		b0 = translation[c0];		b1 = translation[c1];
+		source[24] = b0;		source[28] = b1;	// b0 belongs to row 6 ([24]), b1 to row 7 ([28])
+
+		source += 32;
+	}
+	// Finish by doing 1 row at a time.
+	for (count &= 7; count; --count, source += 4)	// step to the next row of the same column
+	{
+		source[0] = translation[source[0]];
+	}
+}
+
+void rt_Translate4cols(const BYTE *translation, int yl, int yh)
+{
+	int count = yh < yl ? 0 : yh - yl + 1;	// rows to remap; an inverted range is empty
+	BYTE *source = &dc_temp[yl*4];		// start of row yl; each row holds 4 bytes
+	int c0, c1;
+	BYTE b0, b1;
+
+	// Remap every byte in place through translation, 2 rows (8 bytes) at a time.
+	for (int count2 = count >> 1; count2; --count2)
+	{
+		c0 = source[0];			c1 = source[1];
+		b0 = translation[c0];		b1 = translation[c1];
+		source[0] = b0;			source[1] = b1;
+
+		c0 = source[2];			c1 = source[3];
+		b0 = translation[c0];		b1 = translation[c1];
+		source[2] = b0;			source[3] = b1;
+
+		c0 = source[4];			c1 = source[5];
+		b0 = translation[c0];		b1 = translation[c1];
+		source[4] = b0;			source[5] = b1;
+
+		c0 = source[6];			c1 = source[7];
+		b0 = translation[c0];		b1 = translation[c1];
+		source[6] = b0;			source[7] = b1;
+
+		source += 8;
+	}
+	// Do the final row if count was odd.
+	if (count & 1)
+	{
+		c0 = source[0];			c1 = source[1];
+		b0 = translation[c0];		b1 = translation[c1];
+		source[0] = b0;			source[1] = b1;
+
+		c0 = source[2];			c1 = source[3];
+		b0 = translation[c0];		b1 = translation[c1];
+		source[2] = b0;			source[3] = b1;
+	}
+}
+
 // Translates one span at hx to the screen at sx.
 void rt_tlate1col (int hx, int sx, int yl, int yh)
 {
-	BYTE *translation;
-	BYTE *colormap;
-	BYTE *source;
-	BYTE *dest;
-	int count;
-	int pitch;
-
-	count = yh-yl;
-	if (count < 0)
-		return;
-	count++;
-
-	translation = dc_translation;
-	colormap = dc_colormap;
-	dest = ylookup[yl] + sx + dc_destorg;
-	source = &dc_temp[yl*4 + hx];
-	pitch = dc_pitch;
-
-	do {
-		*dest = colormap[translation[*source]];
-		source += 4;
-		dest += pitch;
-	} while (--count);
+	rt_Translate1col(dc_translation, hx, yl, yh);
+	rt_map1col(hx, sx, yl, yh);
 }
 
 // Translates all four spans to the screen starting at sx.
-void rt_tlate4cols (int sx, int yl, int yh) +void STACK_ARGS rt_tlate4cols (int sx, int yl, int yh) { - BYTE *translation; - BYTE *colormap; - BYTE *source; - BYTE *dest; - int count; - int pitch; - - translation = dc_translation; - count = yh-yl; - if (count < 0) - return; - count++; - - colormap = dc_colormap; - dest = ylookup[yl] + sx + dc_destorg; - source = &dc_temp[yl*4]; - pitch = dc_pitch; - - do { - dest[0] = colormap[translation[source[0]]]; - dest[1] = colormap[translation[source[1]]]; - dest[2] = colormap[translation[source[2]]]; - dest[3] = colormap[translation[source[3]]]; - source += 4; - dest += pitch; - } while (--count); + rt_Translate4cols(dc_translation, yl, yh); + rt_map4cols(sx, yl, yh); } // Adds one span at hx to the screen at sx without clamping. @@ -308,7 +351,7 @@ void rt_add1col (int hx, int sx, int yl, int yh) } // Adds all four spans to the screen starting at sx without clamping. -void rt_add4cols (int sx, int yl, int yh) +void STACK_ARGS rt_add4cols (int sx, int yl, int yh) { BYTE *colormap; BYTE *source; @@ -366,95 +409,15 @@ void rt_add4cols (int sx, int yl, int yh) // Translates and adds one span at hx to the screen at sx without clamping. 
void rt_tlateadd1col (int hx, int sx, int yl, int yh) { - BYTE *translation; - BYTE *colormap; - BYTE *source; - BYTE *dest; - int count; - int pitch; - - count = yh-yl; - if (count < 0) - return; - count++; - - DWORD *fg2rgb = dc_srcblend; - DWORD *bg2rgb = dc_destblend; - translation = dc_translation; - colormap = dc_colormap; - dest = ylookup[yl] + sx + dc_destorg; - source = &dc_temp[yl*4 + hx]; - pitch = dc_pitch; - - do { - DWORD fg = colormap[translation[*source]]; - DWORD bg = *dest; - - fg = fg2rgb[fg]; - bg = bg2rgb[bg]; - fg = (fg+bg) | 0x1f07c1f; - *dest = RGB32k[0][0][fg & (fg>>15)]; - source += 4; - dest += pitch; - } while (--count); + rt_Translate1col(dc_translation, hx, yl, yh); + rt_add1col(hx, sx, yl, yh); } // Translates and adds all four spans to the screen starting at sx without clamping. -void rt_tlateadd4cols (int sx, int yl, int yh) +void STACK_ARGS rt_tlateadd4cols (int sx, int yl, int yh) { - BYTE *translation; - BYTE *colormap; - BYTE *source; - BYTE *dest; - int count; - int pitch; - - count = yh-yl; - if (count < 0) - return; - count++; - - DWORD *fg2rgb = dc_srcblend; - DWORD *bg2rgb = dc_destblend; - translation = dc_translation; - colormap = dc_colormap; - dest = ylookup[yl] + sx + dc_destorg; - source = &dc_temp[yl*4]; - pitch = dc_pitch; - - do { - DWORD fg = colormap[translation[source[0]]]; - DWORD bg = dest[0]; - fg = fg2rgb[fg]; - bg = bg2rgb[bg]; - fg = (fg+bg) | 0x1f07c1f; - dest[0] = RGB32k[0][0][fg & (fg>>15)]; - - fg = colormap[translation[source[1]]]; - bg = dest[1]; - fg = fg2rgb[fg]; - bg = bg2rgb[bg]; - fg = (fg+bg) | 0x1f07c1f; - dest[1] = RGB32k[0][0][fg & (fg>>15)]; - - - fg = colormap[translation[source[2]]]; - bg = dest[2]; - fg = fg2rgb[fg]; - bg = bg2rgb[bg]; - fg = (fg+bg) | 0x1f07c1f; - dest[2] = RGB32k[0][0][fg & (fg>>15)]; - - fg = colormap[translation[source[3]]]; - bg = dest[3]; - fg = fg2rgb[fg]; - bg = bg2rgb[bg]; - fg = (fg+bg) | 0x1f07c1f; - dest[3] = RGB32k[0][0][fg & (fg>>15)]; - - source += 4; - 
dest += pitch; - } while (--count); + rt_Translate4cols(dc_translation, yl, yh); + rt_add4cols(sx, yl, yh); } // Shades one span at hx to the screen at sx. @@ -489,9 +452,8 @@ void rt_shaded1col (int hx, int sx, int yl, int yh) } // Shades all four spans to the screen starting at sx. -void rt_shaded4cols (int sx, int yl, int yh) +void STACK_ARGS rt_shaded4cols_c (int sx, int yl, int yh) { - BYTE fill; DWORD *fgstart; BYTE *colormap; BYTE *source; @@ -509,66 +471,73 @@ void rt_shaded4cols (int sx, int yl, int yh) dest = ylookup[yl] + sx + dc_destorg; source = &dc_temp[yl*4]; pitch = dc_pitch; - { - DWORD val = fgstart[64<<8] | 0x1f07c1f; - fill = RGB32k[0][0][val & (val>>15)]; - } - + + // 107.1, 108.4, 118.2/117.7, 119.4 do { - DWORD val = colormap[source[0]]; - DWORD fg; - if (val < 64) - { - fg = fgstart[val<<8]; - val = (Col2RGB8[64-val][dest[0]] + fg) | 0x1f07c1f; - dest[0] = RGB32k[0][0][val & (val>>15)]; - } - else - { - dest[0] = fill; - } + DWORD val; + + val = colormap[source[0]]; + val = (Col2RGB8[64-val][dest[0]] + fgstart[val<<8]) | 0x1f07c1f; + dest[0] = RGB32k[0][0][val & (val>>15)]; val = colormap[source[1]]; - if (val < 64) - { - fg = fgstart[val<<8]; - val = (Col2RGB8[64-val][dest[1]] + fg) | 0x1f07c1f; - dest[1] = RGB32k[0][0][val & (val>>15)]; - } - else - { - dest[1] = fill; - } + val = (Col2RGB8[64-val][dest[1]] + fgstart[val<<8]) | 0x1f07c1f; + dest[1] = RGB32k[0][0][val & (val>>15)]; val = colormap[source[2]]; - if (val < 64) - { - fg = fgstart[val<<8]; - val = (Col2RGB8[64-val][dest[2]] + fg) | 0x1f07c1f; - dest[2] = RGB32k[0][0][val & (val>>15)]; - } - else - { - dest[2] = fill; - } + val = (Col2RGB8[64-val][dest[2]] + fgstart[val<<8]) | 0x1f07c1f; + dest[2] = RGB32k[0][0][val & (val>>15)]; val = colormap[source[3]]; - if (val < 64) - { - fg = fgstart[val<<8]; - val = (Col2RGB8[64-val][dest[3]] + fg) | 0x1f07c1f; - dest[3] = RGB32k[0][0][val & (val>>15)]; - } - else - { - dest[3] = fill; - } + val = (Col2RGB8[64-val][dest[3]] + 
fgstart[val<<8]) | 0x1f07c1f; + dest[3] = RGB32k[0][0][val & (val>>15)]; source += 4; dest += pitch; } while (--count); } +#if 0 +static DWORD t_fgstart[1]; +static BYTE t_colormap[1]; + +void STACK_ARGS rt_shaded4cols_t (int sx, int yl, int yh) +{ + BYTE *source; + BYTE *dest; + int count; + + count = yh-yl; + if (count < 0) + return; + count++; + + dest = ylookup[yl] + sx + dc_destorg; + source = &dc_temp[yl*4]; + + do { + DWORD val, val2; + + val = t_colormap[source[0]]; + val2 = t_colormap[source[1]]; + val = (Col2RGB8[64-val][dest[0]] + t_fgstart[val<<8]) | 0x1f07c1f; + val2 = (Col2RGB8[64-val2][dest[1]] + t_fgstart[val2<<8]) | 0x1f07c1f; + dest[0] = RGB32k[0][0][val & (val>>15)]; + dest[1] = RGB32k[0][0][val2 & (val2>>15)]; + + val = t_colormap[source[2]]; + val2 = t_colormap[source[3]]; + val = (Col2RGB8[64-val][dest[2]] + t_fgstart[val<<8]) | 0x1f07c1f; + val2 = (Col2RGB8[64-val2][dest[3]] + t_fgstart[val2<<8]) | 0x1f07c1f; + dest[2] = RGB32k[0][0][val & (val>>15)]; + dest[3] = RGB32k[0][0][val2 & (val2>>15)]; + + source += 4; + dest += 320; + } while (--count); +} +#endif + // Adds one span at hx to the screen at sx with clamping. void rt_addclamp1col (int hx, int sx, int yl, int yh) { @@ -606,7 +575,7 @@ void rt_addclamp1col (int hx, int sx, int yl, int yh) } // Adds all four spans to the screen starting at sx with clamping. -void rt_addclamp4cols (int sx, int yl, int yh) +void STACK_ARGS rt_addclamp4cols (int sx, int yl, int yh) { BYTE *colormap; BYTE *source; @@ -672,105 +641,15 @@ void rt_addclamp4cols (int sx, int yl, int yh) // Translates and adds one span at hx to the screen at sx with clamping. 
void rt_tlateaddclamp1col (int hx, int sx, int yl, int yh) { - BYTE *translation; - BYTE *colormap; - BYTE *source; - BYTE *dest; - int count; - int pitch; - - count = yh-yl; - if (count < 0) - return; - count++; - - DWORD *fg2rgb = dc_srcblend; - DWORD *bg2rgb = dc_destblend; - dest = ylookup[yl] + sx + dc_destorg; - source = &dc_temp[yl*4 + hx]; - pitch = dc_pitch; - colormap = dc_colormap; - translation = dc_translation; - - do { - DWORD a = fg2rgb[colormap[translation[*source]]] + bg2rgb[*dest]; - DWORD b = a; - - a |= 0x01f07c1f; - b &= 0x40100400; - a &= 0x3fffffff; - b = b - (b >> 5); - a |= b; - *dest = RGB32k[0][0][(a>>15) & a]; - source += 4; - dest += pitch; - } while (--count); + rt_Translate1col(dc_translation, hx, yl, yh); + rt_addclamp1col(hx, sx, yl, yh); } // Translates and adds all four spans to the screen starting at sx with clamping. -void rt_tlateaddclamp4cols (int sx, int yl, int yh) +void STACK_ARGS rt_tlateaddclamp4cols (int sx, int yl, int yh) { - BYTE *translation; - BYTE *colormap; - BYTE *source; - BYTE *dest; - int count; - int pitch; - - count = yh-yl; - if (count < 0) - return; - count++; - - DWORD *fg2rgb = dc_srcblend; - DWORD *bg2rgb = dc_destblend; - dest = ylookup[yl] + sx + dc_destorg; - source = &dc_temp[yl*4]; - pitch = dc_pitch; - colormap = dc_colormap; - translation = dc_translation; - - do { - DWORD a = fg2rgb[colormap[translation[source[0]]]] + bg2rgb[dest[0]]; - DWORD b = a; - - a |= 0x01f07c1f; - b &= 0x40100400; - a &= 0x3fffffff; - b = b - (b >> 5); - a |= b; - dest[0] = RGB32k[0][0][(a>>15) & a]; - - a = fg2rgb[colormap[translation[source[1]]]] + bg2rgb[dest[1]]; - b = a; - a |= 0x01f07c1f; - b &= 0x40100400; - a &= 0x3fffffff; - b = b - (b >> 5); - a |= b; - dest[1] = RGB32k[0][0][(a>>15) & a]; - - a = fg2rgb[colormap[translation[source[2]]]] + bg2rgb[dest[2]]; - b = a; - a |= 0x01f07c1f; - b &= 0x40100400; - a &= 0x3fffffff; - b = b - (b >> 5); - a |= b; - dest[2] = RGB32k[0][0][(a>>15) & a]; - - a = 
fg2rgb[colormap[translation[source[3]]]] + bg2rgb[dest[3]]; - b = a; - a |= 0x01f07c1f; - b &= 0x40100400; - a &= 0x3fffffff; - b = b - (b >> 5); - a |= b; - dest[3] = RGB32k[0][0][(a>>15) & a]; - - source += 4; - dest += pitch; - } while (--count); + rt_Translate4cols(dc_translation, yl, yh); + rt_addclamp4cols(sx, yl, yh); } // Subtracts one span at hx to the screen at sx with clamping. @@ -809,7 +688,7 @@ void rt_subclamp1col (int hx, int sx, int yl, int yh) } // Subtracts all four spans to the screen starting at sx with clamping. -void rt_subclamp4cols (int sx, int yl, int yh) +void STACK_ARGS rt_subclamp4cols (int sx, int yl, int yh) { BYTE *colormap; BYTE *source; @@ -871,100 +750,15 @@ void rt_subclamp4cols (int sx, int yl, int yh) // Translates and subtracts one span at hx to the screen at sx with clamping. void rt_tlatesubclamp1col (int hx, int sx, int yl, int yh) { - BYTE *translation; - BYTE *colormap; - BYTE *source; - BYTE *dest; - int count; - int pitch; - - count = yh-yl; - if (count < 0) - return; - count++; - - DWORD *fg2rgb = dc_srcblend; - DWORD *bg2rgb = dc_destblend; - dest = ylookup[yl] + sx + dc_destorg; - source = &dc_temp[yl*4 + hx]; - pitch = dc_pitch; - colormap = dc_colormap; - translation = dc_translation; - - do { - DWORD a = (fg2rgb[colormap[translation[*source]]] | 0x40100400) - bg2rgb[*dest]; - DWORD b = a; - - b &= 0x40100400; - b = b - (b >> 5); - a &= b; - a |= 0x01f07c1f; - *dest = RGB32k[0][0][(a>>15) & a]; - source += 4; - dest += pitch; - } while (--count); + rt_Translate1col(dc_translation, hx, yl, yh); + rt_subclamp1col(hx, sx, yl, yh); } // Translates and subtracts all four spans to the screen starting at sx with clamping. 
-void rt_tlatesubclamp4cols (int sx, int yl, int yh) +void STACK_ARGS rt_tlatesubclamp4cols (int sx, int yl, int yh) { - BYTE *translation; - BYTE *colormap; - BYTE *source; - BYTE *dest; - int count; - int pitch; - - count = yh-yl; - if (count < 0) - return; - count++; - - DWORD *fg2rgb = dc_srcblend; - DWORD *bg2rgb = dc_destblend; - dest = ylookup[yl] + sx + dc_destorg; - source = &dc_temp[yl*4]; - pitch = dc_pitch; - colormap = dc_colormap; - translation = dc_translation; - - do { - DWORD a = (fg2rgb[colormap[translation[source[0]]]] | 0x40100400) - bg2rgb[dest[0]]; - DWORD b = a; - - b &= 0x40100400; - b = b - (b >> 5); - a &= b; - a |= 0x01f07c1f; - dest[0] = RGB32k[0][0][(a>>15) & a]; - - a = (fg2rgb[colormap[translation[source[1]]]] | 0x40100400) - bg2rgb[dest[1]]; - b = a; - b &= 0x40100400; - b = b - (b >> 5); - a &= b; - a |= 0x01f07c1f; - dest[1] = RGB32k[0][0][(a>>15) & a]; - - a = (fg2rgb[colormap[translation[source[2]]]] | 0x40100400) - bg2rgb[dest[2]]; - b = a; - b &= 0x40100400; - b = b - (b >> 5); - a &= b; - a |= 0x01f07c1f; - dest[2] = RGB32k[0][0][(a>>15) & a]; - - a = (fg2rgb[colormap[translation[source[3]]]] | 0x40100400) - bg2rgb[dest[3]]; - b = a; - b &= 0x40100400; - b = b - (b >> 5); - a &= b; - a |= 0x01f07c1f; - dest[3] = RGB32k[0][0][(a>>15) & a]; - - source += 4; - dest += pitch; - } while (--count); + rt_Translate4cols(dc_translation, yl, yh); + rt_subclamp4cols(sx, yl, yh); } // Subtracts one span at hx from the screen at sx with clamping. @@ -1003,7 +797,7 @@ void rt_revsubclamp1col (int hx, int sx, int yl, int yh) } // Subtracts all four spans from the screen starting at sx with clamping. -void rt_revsubclamp4cols (int sx, int yl, int yh) +void STACK_ARGS rt_revsubclamp4cols (int sx, int yl, int yh) { BYTE *colormap; BYTE *source; @@ -1065,104 +859,19 @@ void rt_revsubclamp4cols (int sx, int yl, int yh) // Translates and subtracts one span at hx from the screen at sx with clamping. 
void rt_tlaterevsubclamp1col (int hx, int sx, int yl, int yh) { - BYTE *translation; - BYTE *colormap; - BYTE *source; - BYTE *dest; - int count; - int pitch; - - count = yh-yl; - if (count < 0) - return; - count++; - - DWORD *fg2rgb = dc_srcblend; - DWORD *bg2rgb = dc_destblend; - dest = ylookup[yl] + sx + dc_destorg; - source = &dc_temp[yl*4 + hx]; - pitch = dc_pitch; - colormap = dc_colormap; - translation = dc_translation; - - do { - DWORD a = (bg2rgb[dest[0]] | 0x40100400) - fg2rgb[colormap[translation[source[0]]]]; - DWORD b = a; - - b &= 0x40100400; - b = b - (b >> 5); - a &= b; - a |= 0x01f07c1f; - *dest = RGB32k[0][0][(a>>15) & a]; - source += 4; - dest += pitch; - } while (--count); + rt_Translate1col(dc_translation, hx, yl, yh); + rt_revsubclamp1col(hx, sx, yl, yh); } // Translates and subtracts all four spans from the screen starting at sx with clamping. -void rt_tlaterevsubclamp4cols (int sx, int yl, int yh) +void STACK_ARGS rt_tlaterevsubclamp4cols (int sx, int yl, int yh) { - BYTE *translation; - BYTE *colormap; - BYTE *source; - BYTE *dest; - int count; - int pitch; - - count = yh-yl; - if (count < 0) - return; - count++; - - DWORD *fg2rgb = dc_srcblend; - DWORD *bg2rgb = dc_destblend; - dest = ylookup[yl] + sx + dc_destorg; - source = &dc_temp[yl*4]; - pitch = dc_pitch; - colormap = dc_colormap; - translation = dc_translation; - - do { - DWORD a = (bg2rgb[dest[0]] | 0x40100400) - fg2rgb[colormap[translation[source[0]]]]; - DWORD b = a; - - b &= 0x40100400; - b = b - (b >> 5); - a &= b; - a |= 0x01f07c1f; - dest[0] = RGB32k[0][0][(a>>15) & a]; - - a = (bg2rgb[dest[1]] | 0x40100400) - fg2rgb[colormap[translation[source[1]]]]; - b = a; - b &= 0x40100400; - b = b - (b >> 5); - a &= b; - a |= 0x01f07c1f; - dest[1] = RGB32k[0][0][(a>>15) & a]; - - a = (bg2rgb[dest[2]] | 0x40100400) - fg2rgb[colormap[translation[source[2]]]]; - b = a; - b &= 0x40100400; - b = b - (b >> 5); - a &= b; - a |= 0x01f07c1f; - dest[2] = RGB32k[0][0][(a>>15) & a]; - - a = 
(bg2rgb[dest[3]] | 0x40100400) - fg2rgb[colormap[translation[source[3]]]]; - b = a; - b &= 0x40100400; - b = b - (b >> 5); - a &= b; - a |= 0x01f07c1f; - dest[3] = RGB32k[0][0][(a>>15) & a]; - - source += 4; - dest += pitch; - } while (--count); + rt_Translate4cols(dc_translation, yl, yh); + rt_revsubclamp4cols(sx, yl, yh); } // Copies all spans in all four columns to the screen starting at sx. -// sx should be longword-aligned. +// sx should be dword-aligned. void rt_draw4cols (int sx) { int x, bad; diff --git a/src/r_main.cpp b/src/r_main.cpp index bd15a101aa..9ca9c42313 100644 --- a/src/r_main.cpp +++ b/src/r_main.cpp @@ -205,7 +205,7 @@ void (*spanfunc) (void); void (*hcolfunc_pre) (void); void (*hcolfunc_post1) (int hx, int sx, int yl, int yh); void (*hcolfunc_post2) (int hx, int sx, int yl, int yh); -void (*hcolfunc_post4) (int sx, int yl, int yh); +void (STACK_ARGS *hcolfunc_post4) (int sx, int yl, int yh); cycle_t WallCycles, PlaneCycles, MaskedCycles, WallScanCycles; diff --git a/src/r_main.h b/src/r_main.h index abecbd971f..fdb3b636ff 100644 --- a/src/r_main.h +++ b/src/r_main.h @@ -147,7 +147,7 @@ extern void (*spanfunc) (void); extern void (*hcolfunc_pre) (void); extern void (*hcolfunc_post1) (int hx, int sx, int yl, int yh); extern void (*hcolfunc_post2) (int hx, int sx, int yl, int yh); -extern void (*hcolfunc_post4) (int sx, int yl, int yh); +extern void (STACK_ARGS *hcolfunc_post4) (int sx, int yl, int yh); // diff --git a/src/tmap.nas b/src/tmap.nas index e03df7163a..259f5e6a61 100644 --- a/src/tmap.nas +++ b/src/tmap.nas @@ -55,6 +55,7 @@ EXTERN CPU EXTERN dc_pitch EXTERN dc_colormap +EXTERN dc_color EXTERN dc_iscale EXTERN dc_texturefrac EXTERN dc_source @@ -68,6 +69,9 @@ EXTERN dc_destorg EXTERN dc_ctspan EXTERN dc_temp +EXTERN Col2RGB8 +EXTERN RGB32k + EXTERN ds_xstep EXTERN ds_ystep EXTERN ds_colormap @@ -94,6 +98,7 @@ EXTERN _CPU EXTERN _dc_pitch EXTERN _dc_colormap +EXTERN _dc_color EXTERN _dc_iscale EXTERN _dc_texturefrac EXTERN _dc_source 
@@ -107,6 +112,9 @@ EXTERN _dc_destorg EXTERN _dc_ctspan EXTERN _dc_temp +EXTERN _Col2RGB8 +EXTERN _RGB32k + EXTERN _ds_xstep EXTERN _ds_ystep EXTERN _ds_colormap @@ -131,6 +139,7 @@ GLOBAL _ds_curcolormap %define dc_pitch _dc_pitch %define dc_colormap _dc_colormap +%define dc_color _dc_color %define dc_iscale _dc_iscale %define dc_texturefrac _dc_texturefrac %define dc_source _dc_source @@ -141,6 +150,9 @@ GLOBAL _ds_curcolormap %define dc_dest _dc_dest %define dc_destorg _dc_destorg +%define Col2RGB8 _Col2RGB8 +%define RGB32k _RGB32k + %define dc_ctspan _dc_ctspan %define dc_temp _dc_temp @@ -1476,10 +1488,121 @@ _rt_map4cols_asm2: pop ebx ret 4 + align 16 + +GLOBAL rt_shaded4cols_asm +GLOBAL _rt_shaded4cols_asm + +rt_shaded4cols_asm: +_rt_shaded4cols_asm: + mov ecx,[esp+8] + push ebp + mov ebp,[esp+16] + sub ebp,ecx + js near s4nil + mov eax,[ylookup+ecx*4] + add eax,[dc_destorg] ; eax = destination + push ebx + push esi + inc ebp ; ebp = count + add eax,[esp+16] + push edi + lea esi,[dc_temp+ecx*4] ; esi = source + + align 16 + +s4loop: movzx edx,byte [esi] + movzx ecx,byte [esi+1] +s4cm1: movzx edx,byte [SPACEFILLER4+edx] ; colormap +s4cm2: movzx edi,byte [SPACEFILLER4+ecx] ; colormap + shl edx,8 + movzx ebx,byte [eax] + shl edi,8 + movzx ecx,byte [eax+1] + sub ebx,edx + sub ecx,edi + mov ebx,[Col2RGB8+0x10000+ebx*4] + mov ecx,[Col2RGB8+0x10000+ecx*4] +s4fg1: add ebx,[SPACEFILLER4+edx*4] +s4fg2: add ecx,[SPACEFILLER4+edi*4] + or ebx,0x1f07c1f + or ecx,0x1f07c1f + mov edx,ebx + shr ebx,15 + mov edi,ecx + shr ecx,15 + and edx,ebx + and ecx,edi + mov bl,[RGB32k+edx] + movzx edx,byte [esi+2] + mov bh,[RGB32k+ecx] + movzx ecx,byte [esi+3] + mov [eax],bl + mov [eax+1],bh + +s4cm3: movzx edx,byte [SPACEFILLER4+edx] ; colormap +s4cm4: movzx edi,byte [SPACEFILLER4+ecx] ; colormap + shl edx,8 + movzx ebx,byte [eax+2] + shl edi,8 + movzx ecx,byte [eax+3] + sub ebx,edx + sub ecx,edi + mov ebx,[Col2RGB8+0x10000+ebx*4] + mov ecx,[Col2RGB8+0x10000+ecx*4] +s4fg3: add 
ebx,[SPACEFILLER4+edx*4]
+s4fg4:	add	ecx,[SPACEFILLER4+edi*4]
+	or	ebx,0x1f07c1f
+	or	ecx,0x1f07c1f
+	mov	edx,ebx
+	shr	ebx,15
+	mov	edi,ecx
+	shr	ecx,15
+	and	edx,ebx
+	and	ecx,edi
+s4p:	add	eax,320			; pitch
+	add	esi,4
+	mov	bl,[RGB32k+edx]
+	mov	bh,[RGB32k+ecx]
+s4p2:	mov	[eax-320+2],bl
+s4p3:	mov	[eax-320+3],bh
+	dec	ebp
+	jne	s4loop
+
+	pop	edi
+	pop	esi
+	pop	ebx
+s4nil:	pop	ebp
+	ret
+
+	align 16
+
 ;************************
 
 SECTION .text
 
+GLOBAL	R_SetupShadedCol
+GLOBAL	_R_SetupShadedCol
+GLOBAL	@R_SetupShadedCol@0
+
+; Patch the values of dc_colormap and dc_color into the shaded column drawer.
+
+R_SetupShadedCol:
+_R_SetupShadedCol:
+@R_SetupShadedCol@0:
+	mov	eax,[dc_colormap]
+	mov	[s4cm1+3],eax		; overwrite the SPACEFILLER4 placeholder of each s4cm* load
+	mov	[s4cm2+3],eax		; (assumes the placeholder sits 3 bytes into the instruction)
+	mov	[s4cm3+3],eax
+	mov	[s4cm4+3],eax
+	mov	eax,[dc_color]
+	lea	eax,[Col2RGB8+eax*4]	; eax = Col2RGB8 + dc_color*4
+	mov	[s4fg1+3],eax		; overwrite the SPACEFILLER4 placeholder of each s4fg* add
+	mov	[s4fg2+3],eax
+	mov	[s4fg3+3],eax
+	mov	[s4fg4+3],eax
+	ret
+
 EXTERN setvlinebpl_
 EXTERN setpitch3
 
@@ -1490,11 +1613,19 @@ GLOBAL ASM_PatchPitch
 ASM_PatchPitch:
 _ASM_PatchPitch:
 @ASM_PatchPitch@0:
-	mov eax,[dc_pitch]
-	mov [rdcp1+2],eax
-	mov [rdcp2+2],eax
-	mov [rdcp3+2],eax
-	call setpitch3
-	jmp setvlinebpl_
+	mov	eax,[dc_pitch]
+	mov	[rdcp1+2],eax
+	mov	[rdcp2+2],eax
+	mov	[rdcp3+2],eax
+	mov	[s4p+1],eax		; replaces the 320 in "add eax,320" at s4p
+	mov	ecx,eax
+	neg	ecx
+	inc	ecx
+	inc	ecx			; ecx = 2 - pitch
+	mov	[s4p2+2],ecx		; replaces the -320+2 displacement at s4p2
+	inc	ecx			; ecx = 3 - pitch
+	mov	[s4p3+2],ecx		; replaces the -320+3 displacement at s4p3
+	call	setpitch3
+	jmp	setvlinebpl_
 
 