diff --git a/Makefile.mingw b/Makefile.mingw index 8ea1c471b..9200f0949 100644 --- a/Makefile.mingw +++ b/Makefile.mingw @@ -7,7 +7,7 @@ # Where did you install the FMOD API to? Change this line so that the build process can find it. FMODDIR = "c:/program files/fmodapi375win" -ifeq ($(findstring msys,$(shell sh --version 2>nul)),msys) +ifeq ($(findstring msys,$(shell sh --version 2>nul)),msys) WINCMD=0 else WINCMD=1 diff --git a/docs/rh-log.txt b/docs/rh-log.txt index 69cdf4b4e..af2594224 100644 --- a/docs/rh-log.txt +++ b/docs/rh-log.txt @@ -1,3 +1,17 @@ +February 26, 2008 +- Added an assembly version of rt_shaded4cols, since that's the main decal + drawing function. The most improvement came from being able to turn some + constant variables into immediate values with self-modifying code, but I + also managed to reorder it to make it a little faster. It's about 9% faster + than VC++'s code and 19% faster than GCC's code. That's not a huge + improvement (for VC++), but at least it's measurable. +- Removed the solid fill "optimization" from rt_shaded4cols(), because in my + testing, it didn't help any and in fact, hurt just a little bit. +- In the name of simplification, all the rt_tlate* drawers were changed to do + the translation in one step and the drawing in another. This lets me call + the untranslated drawer to do the real drawing instead of mostly duplicating + them. Performance-wise, there is practically no difference from before. + February 25, 2008 (Changes by Graf Zahl) - Fixed: The DECORATE expression evaluator's random function could produce incorrect results for ranges > 255. 
Changed so that FRandom's default diff --git a/src/r_draw.cpp b/src/r_draw.cpp index 7c61b1f8c..acb5413a9 100644 --- a/src/r_draw.cpp +++ b/src/r_draw.cpp @@ -80,6 +80,7 @@ int detailyshift; // [RH] Y shift for vertical detail level extern "C" void STACK_ARGS DoubleHoriz_MMX (int height, int width, BYTE *dest, int pitch); extern "C" void STACK_ARGS DoubleHorizVert_MMX (int height, int width, BYTE *dest, int pitch); extern "C" void STACK_ARGS DoubleVert_ASM (int height, int width, BYTE *dest, int pitch); +extern "C" void R_SetupShadedCol(); #endif // [RH] Pointers to the different column drawers. @@ -94,7 +95,7 @@ void (*R_DrawSpan)(void); void (*R_DrawSpanMasked)(void); void (*R_DrawSpanTranslucent)(void); void (*R_DrawSpanMaskedTranslucent)(void); -void (*rt_map4cols)(int,int,int); +void (STACK_ARGS *rt_map4cols)(int,int,int); // // R_DrawColumn @@ -2270,6 +2271,7 @@ ESPSResult R_SetPatchStyle (FRenderStyle style, fixed_t alpha, int translation, { dc_colormap += fixedlightlev; } + R_SetupShadedCol(); return r_columnmethod ? DoDraw1 : DoDraw0; } diff --git a/src/r_draw.h b/src/r_draw.h index 7193bfe30..23ec1b74b 100644 --- a/src/r_draw.h +++ b/src/r_draw.h @@ -106,47 +106,59 @@ void R_DrawMaskedColumnHoriz (const BYTE *column, const FTexture::Span *spans); void R_InitColumnDrawers (); // [RH] Moves data from the temporary buffer to the screen. 
+extern "C" +{ void rt_copy1col_c (int hx, int sx, int yl, int yh); -void rt_copy4cols_c (int sx, int yl, int yh); -void rt_map1col_c (int hx, int sx, int yl, int yh); -void rt_map4cols_c (int sx, int yl, int yh); -void rt_add1col (int hx, int sx, int yl, int yh); -void rt_add4cols (int sx, int yl, int yh); -void rt_tlate1col (int hx, int sx, int yl, int yh); -void rt_tlate4cols (int sx, int yl, int yh); -void rt_tlateadd1col (int hx, int sx, int yl, int yh); -void rt_tlateadd4cols (int sx, int yl, int yh); +void STACK_ARGS rt_copy4cols_c (int sx, int yl, int yh); + void rt_shaded1col (int hx, int sx, int yl, int yh); -void rt_shaded4cols (int sx, int yl, int yh); +void STACK_ARGS rt_shaded4cols_c (int sx, int yl, int yh); +void STACK_ARGS rt_shaded4cols_asm (int sx, int yl, int yh); + +void rt_map1col_c (int hx, int sx, int yl, int yh); +void rt_add1col (int hx, int sx, int yl, int yh); void rt_addclamp1col (int hx, int sx, int yl, int yh); -void rt_addclamp4cols (int sx, int yl, int yh); -void rt_tlateaddclamp1col (int hx, int sx, int yl, int yh); -void rt_tlateaddclamp4cols (int sx, int yl, int yh); void rt_subclamp1col (int hx, int sx, int yl, int yh); -void rt_subclamp4cols (int sx, int yl, int yh); -void rt_tlatesubclamp1col (int hx, int sx, int yl, int yh); -void rt_tlatesubclamp4cols (int sx, int yl, int yh); void rt_revsubclamp1col (int hx, int sx, int yl, int yh); -void rt_revsubclamp4cols (int sx, int yl, int yh); + +void rt_tlate1col (int hx, int sx, int yl, int yh); +void rt_tlateadd1col (int hx, int sx, int yl, int yh); +void rt_tlateaddclamp1col (int hx, int sx, int yl, int yh); +void rt_tlatesubclamp1col (int hx, int sx, int yl, int yh); void rt_tlaterevsubclamp1col (int hx, int sx, int yl, int yh); -void rt_tlaterevsubclamp4cols (int sx, int yl, int yh); -extern "C" void rt_copy1col_asm (int hx, int sx, int yl, int yh); -extern "C" void rt_copy4cols_asm (int sx, int yl, int yh); -extern "C" void rt_map1col_asm (int hx, int sx, int yl, int yh); 
-extern "C" void rt_map4cols_asm1 (int sx, int yl, int yh); -extern "C" void rt_map4cols_asm2 (int sx, int yl, int yh); +void STACK_ARGS rt_map4cols_c (int sx, int yl, int yh); +void STACK_ARGS rt_add4cols (int sx, int yl, int yh); +void STACK_ARGS rt_addclamp4cols (int sx, int yl, int yh); +void STACK_ARGS rt_subclamp4cols (int sx, int yl, int yh); +void STACK_ARGS rt_revsubclamp4cols (int sx, int yl, int yh); -extern void (*rt_map4cols)(int sx, int yl, int yh); +void STACK_ARGS rt_tlate4cols (int sx, int yl, int yh); +void STACK_ARGS rt_tlateadd4cols (int sx, int yl, int yh); +void STACK_ARGS rt_tlateaddclamp4cols (int sx, int yl, int yh); +void STACK_ARGS rt_tlatesubclamp4cols (int sx, int yl, int yh); +void STACK_ARGS rt_tlaterevsubclamp4cols (int sx, int yl, int yh); + +void rt_copy1col_asm (int hx, int sx, int yl, int yh); +void rt_map1col_asm (int hx, int sx, int yl, int yh); + +void STACK_ARGS rt_copy4cols_asm (int sx, int yl, int yh); +void STACK_ARGS rt_map4cols_asm1 (int sx, int yl, int yh); +void STACK_ARGS rt_map4cols_asm2 (int sx, int yl, int yh); +} + +extern void (STACK_ARGS *rt_map4cols)(int sx, int yl, int yh); #ifdef USEASM #define rt_copy1col rt_copy1col_asm #define rt_copy4cols rt_copy4cols_asm #define rt_map1col rt_map1col_asm +#define rt_shaded4cols rt_shaded4cols_asm #else #define rt_copy1col rt_copy1col_c #define rt_copy4cols rt_copy4cols_c #define rt_map1col rt_map1col_c +#define rt_shaded4cols rt_shaded4cols_c #endif void rt_draw4cols (int sx); diff --git a/src/r_drawt.cpp b/src/r_drawt.cpp index 5880fa7ce..49e30016c 100644 --- a/src/r_drawt.cpp +++ b/src/r_drawt.cpp @@ -102,7 +102,7 @@ void rt_copy1col_c (int hx, int sx, int yl, int yh) } // Copies all four spans to the screen starting at sx. 
-void rt_copy4cols_c (int sx, int yl, int yh)
+void STACK_ARGS rt_copy4cols_c (int sx, int yl, int yh)
 {
 	int *source;
 	int *dest;
@@ -170,7 +170,7 @@ void rt_map1col_c (int hx, int sx, int yl, int yh)
 }
 
 // Maps all four spans to the screen starting at sx.
-void rt_map4cols_c (int sx, int yl, int yh)
+void STACK_ARGS rt_map4cols_c (int sx, int yl, int yh)
 {
 	BYTE *colormap;
 	BYTE *source;
@@ -214,63 +214,106 @@ void rt_map4cols_c (int sx, int yl, int yh)
 }
 #endif /* !USEASM */
 
+void rt_Translate1col(const BYTE *translation, int hx, int yl, int yh)	// Remaps rows yl..yh of column hx in dc_temp through translation, in place.
+{
+	int count = (yh >= yl) ? yh - yl + 1 : 0;	// rows yl..yh inclusive; an empty range does no work
+	BYTE *source = &dc_temp[yl*4 + hx];		// consecutive rows of one column are 4 bytes apart
+
+	// Things we do to hit the compiler's optimizer with a clue bat:
+	// 1. Parallelism is explicitly spelled out by using a separate
+	//    C instruction for each assembly instruction. GCC lets me
+	//    have four temporaries, but VC++ spills to the stack with
+	//    more than two. Two is probably optimal, anyway.
+	// 2. The results of the translation lookups are explicitly
+	//    stored in byte-sized variables. This causes the VC++ code
+	//    to use byte mov instructions in most cases; for apparently
+	//    random reasons, it will use movzx for some places. GCC
+	//    ignores this and uses movzx always.
+
+	// Do 8 rows at a time.
+	for (int count8 = count >> 3; count8; --count8)
+	{
+		int c0, c1;
+		BYTE b0, b1;
+
+		c0 = source[0];			c1 = source[4];
+		b0 = translation[c0];	b1 = translation[c1];
+		source[0] = b0;			source[4] = b1;
+
+		c0 = source[8];			c1 = source[12];
+		b0 = translation[c0];	b1 = translation[c1];
+		source[8] = b0;			source[12] = b1;
+
+		c0 = source[16];		c1 = source[20];
+		b0 = translation[c0];	b1 = translation[c1];
+		source[16] = b0;		source[20] = b1;
+
+		c0 = source[24];		c1 = source[28];
+		b0 = translation[c0];	b1 = translation[c1];
+		source[24] = b0;		source[28] = b1;	// row 6 goes to offset 24, row 7 to offset 28
+
+		source += 32;
+	}
+	// Finish by doing 1 row at a time, stepping by the 4-byte row stride.
+	for (count &= 7; count; --count, source += 4)
+	{
+		source[0] = translation[source[0]];
+	}
+}
+
+void rt_Translate4cols(const BYTE *translation, int yl, int yh)	// Remaps rows yl..yh of all four columns in dc_temp through translation, in place.
+{
+	int count = (yh >= yl) ? yh - yl + 1 : 0;	// rows yl..yh inclusive; an empty range does no work
+	BYTE *source = &dc_temp[yl*4];			// each row holds four adjacent bytes, one per column
+	int c0, c1;
+	BYTE b0, b1;
+
+	// Do 2 rows at a time.
+	for (int count2 = count >> 1; count2; --count2)
+	{
+		c0 = source[0];			c1 = source[1];
+		b0 = translation[c0];	b1 = translation[c1];
+		source[0] = b0;			source[1] = b1;
+
+		c0 = source[2];			c1 = source[3];
+		b0 = translation[c0];	b1 = translation[c1];
+		source[2] = b0;			source[3] = b1;
+
+		c0 = source[4];			c1 = source[5];
+		b0 = translation[c0];	b1 = translation[c1];
+		source[4] = b0;			source[5] = b1;
+
+		c0 = source[6];			c1 = source[7];
+		b0 = translation[c0];	b1 = translation[c1];
+		source[6] = b0;			source[7] = b1;
+
+		source += 8;
+	}
+	// Do the final row if count was odd.
+	if (count & 1)
+	{
+		c0 = source[0];			c1 = source[1];
+		b0 = translation[c0];	b1 = translation[c1];
+		source[0] = b0;			source[1] = b1;
+
+		c0 = source[2];			c1 = source[3];
+		b0 = translation[c0];	b1 = translation[c1];
+		source[2] = b0;			source[3] = b1;
+	}
+}
+
 // Translates one span at hx to the screen at sx.
 void rt_tlate1col (int hx, int sx, int yl, int yh)
 {
-	BYTE *translation;
-	BYTE *colormap;
-	BYTE *source;
-	BYTE *dest;
-	int count;
-	int pitch;
-
-	count = yh-yl;
-	if (count < 0)
-		return;
-	count++;
-
-	translation = dc_translation;
-	colormap = dc_colormap;
-	dest = ylookup[yl] + sx + dc_destorg;
-	source = &dc_temp[yl*4 + hx];
-	pitch = dc_pitch;
-
-	do {
-		*dest = colormap[translation[*source]];
-		source += 4;
-		dest += pitch;
-	} while (--count);
+	rt_Translate1col(dc_translation, hx, yl, yh);
+	rt_map1col(hx, sx, yl, yh);
 }
 
 // Translates all four spans to the screen starting at sx.
-void rt_tlate4cols (int sx, int yl, int yh) +void STACK_ARGS rt_tlate4cols (int sx, int yl, int yh) { - BYTE *translation; - BYTE *colormap; - BYTE *source; - BYTE *dest; - int count; - int pitch; - - translation = dc_translation; - count = yh-yl; - if (count < 0) - return; - count++; - - colormap = dc_colormap; - dest = ylookup[yl] + sx + dc_destorg; - source = &dc_temp[yl*4]; - pitch = dc_pitch; - - do { - dest[0] = colormap[translation[source[0]]]; - dest[1] = colormap[translation[source[1]]]; - dest[2] = colormap[translation[source[2]]]; - dest[3] = colormap[translation[source[3]]]; - source += 4; - dest += pitch; - } while (--count); + rt_Translate4cols(dc_translation, yl, yh); + rt_map4cols(sx, yl, yh); } // Adds one span at hx to the screen at sx without clamping. @@ -308,7 +351,7 @@ void rt_add1col (int hx, int sx, int yl, int yh) } // Adds all four spans to the screen starting at sx without clamping. -void rt_add4cols (int sx, int yl, int yh) +void STACK_ARGS rt_add4cols (int sx, int yl, int yh) { BYTE *colormap; BYTE *source; @@ -366,95 +409,15 @@ void rt_add4cols (int sx, int yl, int yh) // Translates and adds one span at hx to the screen at sx without clamping. 
void rt_tlateadd1col (int hx, int sx, int yl, int yh) { - BYTE *translation; - BYTE *colormap; - BYTE *source; - BYTE *dest; - int count; - int pitch; - - count = yh-yl; - if (count < 0) - return; - count++; - - DWORD *fg2rgb = dc_srcblend; - DWORD *bg2rgb = dc_destblend; - translation = dc_translation; - colormap = dc_colormap; - dest = ylookup[yl] + sx + dc_destorg; - source = &dc_temp[yl*4 + hx]; - pitch = dc_pitch; - - do { - DWORD fg = colormap[translation[*source]]; - DWORD bg = *dest; - - fg = fg2rgb[fg]; - bg = bg2rgb[bg]; - fg = (fg+bg) | 0x1f07c1f; - *dest = RGB32k[0][0][fg & (fg>>15)]; - source += 4; - dest += pitch; - } while (--count); + rt_Translate1col(dc_translation, hx, yl, yh); + rt_add1col(hx, sx, yl, yh); } // Translates and adds all four spans to the screen starting at sx without clamping. -void rt_tlateadd4cols (int sx, int yl, int yh) +void STACK_ARGS rt_tlateadd4cols (int sx, int yl, int yh) { - BYTE *translation; - BYTE *colormap; - BYTE *source; - BYTE *dest; - int count; - int pitch; - - count = yh-yl; - if (count < 0) - return; - count++; - - DWORD *fg2rgb = dc_srcblend; - DWORD *bg2rgb = dc_destblend; - translation = dc_translation; - colormap = dc_colormap; - dest = ylookup[yl] + sx + dc_destorg; - source = &dc_temp[yl*4]; - pitch = dc_pitch; - - do { - DWORD fg = colormap[translation[source[0]]]; - DWORD bg = dest[0]; - fg = fg2rgb[fg]; - bg = bg2rgb[bg]; - fg = (fg+bg) | 0x1f07c1f; - dest[0] = RGB32k[0][0][fg & (fg>>15)]; - - fg = colormap[translation[source[1]]]; - bg = dest[1]; - fg = fg2rgb[fg]; - bg = bg2rgb[bg]; - fg = (fg+bg) | 0x1f07c1f; - dest[1] = RGB32k[0][0][fg & (fg>>15)]; - - - fg = colormap[translation[source[2]]]; - bg = dest[2]; - fg = fg2rgb[fg]; - bg = bg2rgb[bg]; - fg = (fg+bg) | 0x1f07c1f; - dest[2] = RGB32k[0][0][fg & (fg>>15)]; - - fg = colormap[translation[source[3]]]; - bg = dest[3]; - fg = fg2rgb[fg]; - bg = bg2rgb[bg]; - fg = (fg+bg) | 0x1f07c1f; - dest[3] = RGB32k[0][0][fg & (fg>>15)]; - - source += 4; - 
dest += pitch; - } while (--count); + rt_Translate4cols(dc_translation, yl, yh); + rt_add4cols(sx, yl, yh); } // Shades one span at hx to the screen at sx. @@ -489,9 +452,8 @@ void rt_shaded1col (int hx, int sx, int yl, int yh) } // Shades all four spans to the screen starting at sx. -void rt_shaded4cols (int sx, int yl, int yh) +void STACK_ARGS rt_shaded4cols_c (int sx, int yl, int yh) { - BYTE fill; DWORD *fgstart; BYTE *colormap; BYTE *source; @@ -509,66 +471,73 @@ void rt_shaded4cols (int sx, int yl, int yh) dest = ylookup[yl] + sx + dc_destorg; source = &dc_temp[yl*4]; pitch = dc_pitch; - { - DWORD val = fgstart[64<<8] | 0x1f07c1f; - fill = RGB32k[0][0][val & (val>>15)]; - } - + + // 107.1, 108.4, 118.2/117.7, 119.4 do { - DWORD val = colormap[source[0]]; - DWORD fg; - if (val < 64) - { - fg = fgstart[val<<8]; - val = (Col2RGB8[64-val][dest[0]] + fg) | 0x1f07c1f; - dest[0] = RGB32k[0][0][val & (val>>15)]; - } - else - { - dest[0] = fill; - } + DWORD val; + + val = colormap[source[0]]; + val = (Col2RGB8[64-val][dest[0]] + fgstart[val<<8]) | 0x1f07c1f; + dest[0] = RGB32k[0][0][val & (val>>15)]; val = colormap[source[1]]; - if (val < 64) - { - fg = fgstart[val<<8]; - val = (Col2RGB8[64-val][dest[1]] + fg) | 0x1f07c1f; - dest[1] = RGB32k[0][0][val & (val>>15)]; - } - else - { - dest[1] = fill; - } + val = (Col2RGB8[64-val][dest[1]] + fgstart[val<<8]) | 0x1f07c1f; + dest[1] = RGB32k[0][0][val & (val>>15)]; val = colormap[source[2]]; - if (val < 64) - { - fg = fgstart[val<<8]; - val = (Col2RGB8[64-val][dest[2]] + fg) | 0x1f07c1f; - dest[2] = RGB32k[0][0][val & (val>>15)]; - } - else - { - dest[2] = fill; - } + val = (Col2RGB8[64-val][dest[2]] + fgstart[val<<8]) | 0x1f07c1f; + dest[2] = RGB32k[0][0][val & (val>>15)]; val = colormap[source[3]]; - if (val < 64) - { - fg = fgstart[val<<8]; - val = (Col2RGB8[64-val][dest[3]] + fg) | 0x1f07c1f; - dest[3] = RGB32k[0][0][val & (val>>15)]; - } - else - { - dest[3] = fill; - } + val = (Col2RGB8[64-val][dest[3]] + 
fgstart[val<<8]) | 0x1f07c1f; + dest[3] = RGB32k[0][0][val & (val>>15)]; source += 4; dest += pitch; } while (--count); } +#if 0 +static DWORD t_fgstart[1]; +static BYTE t_colormap[1]; + +void STACK_ARGS rt_shaded4cols_t (int sx, int yl, int yh) +{ + BYTE *source; + BYTE *dest; + int count; + + count = yh-yl; + if (count < 0) + return; + count++; + + dest = ylookup[yl] + sx + dc_destorg; + source = &dc_temp[yl*4]; + + do { + DWORD val, val2; + + val = t_colormap[source[0]]; + val2 = t_colormap[source[1]]; + val = (Col2RGB8[64-val][dest[0]] + t_fgstart[val<<8]) | 0x1f07c1f; + val2 = (Col2RGB8[64-val2][dest[1]] + t_fgstart[val2<<8]) | 0x1f07c1f; + dest[0] = RGB32k[0][0][val & (val>>15)]; + dest[1] = RGB32k[0][0][val2 & (val2>>15)]; + + val = t_colormap[source[2]]; + val2 = t_colormap[source[3]]; + val = (Col2RGB8[64-val][dest[2]] + t_fgstart[val<<8]) | 0x1f07c1f; + val2 = (Col2RGB8[64-val2][dest[3]] + t_fgstart[val2<<8]) | 0x1f07c1f; + dest[2] = RGB32k[0][0][val & (val>>15)]; + dest[3] = RGB32k[0][0][val2 & (val2>>15)]; + + source += 4; + dest += 320; + } while (--count); +} +#endif + // Adds one span at hx to the screen at sx with clamping. void rt_addclamp1col (int hx, int sx, int yl, int yh) { @@ -606,7 +575,7 @@ void rt_addclamp1col (int hx, int sx, int yl, int yh) } // Adds all four spans to the screen starting at sx with clamping. -void rt_addclamp4cols (int sx, int yl, int yh) +void STACK_ARGS rt_addclamp4cols (int sx, int yl, int yh) { BYTE *colormap; BYTE *source; @@ -672,105 +641,15 @@ void rt_addclamp4cols (int sx, int yl, int yh) // Translates and adds one span at hx to the screen at sx with clamping. 
void rt_tlateaddclamp1col (int hx, int sx, int yl, int yh) { - BYTE *translation; - BYTE *colormap; - BYTE *source; - BYTE *dest; - int count; - int pitch; - - count = yh-yl; - if (count < 0) - return; - count++; - - DWORD *fg2rgb = dc_srcblend; - DWORD *bg2rgb = dc_destblend; - dest = ylookup[yl] + sx + dc_destorg; - source = &dc_temp[yl*4 + hx]; - pitch = dc_pitch; - colormap = dc_colormap; - translation = dc_translation; - - do { - DWORD a = fg2rgb[colormap[translation[*source]]] + bg2rgb[*dest]; - DWORD b = a; - - a |= 0x01f07c1f; - b &= 0x40100400; - a &= 0x3fffffff; - b = b - (b >> 5); - a |= b; - *dest = RGB32k[0][0][(a>>15) & a]; - source += 4; - dest += pitch; - } while (--count); + rt_Translate1col(dc_translation, hx, yl, yh); + rt_addclamp1col(hx, sx, yl, yh); } // Translates and adds all four spans to the screen starting at sx with clamping. -void rt_tlateaddclamp4cols (int sx, int yl, int yh) +void STACK_ARGS rt_tlateaddclamp4cols (int sx, int yl, int yh) { - BYTE *translation; - BYTE *colormap; - BYTE *source; - BYTE *dest; - int count; - int pitch; - - count = yh-yl; - if (count < 0) - return; - count++; - - DWORD *fg2rgb = dc_srcblend; - DWORD *bg2rgb = dc_destblend; - dest = ylookup[yl] + sx + dc_destorg; - source = &dc_temp[yl*4]; - pitch = dc_pitch; - colormap = dc_colormap; - translation = dc_translation; - - do { - DWORD a = fg2rgb[colormap[translation[source[0]]]] + bg2rgb[dest[0]]; - DWORD b = a; - - a |= 0x01f07c1f; - b &= 0x40100400; - a &= 0x3fffffff; - b = b - (b >> 5); - a |= b; - dest[0] = RGB32k[0][0][(a>>15) & a]; - - a = fg2rgb[colormap[translation[source[1]]]] + bg2rgb[dest[1]]; - b = a; - a |= 0x01f07c1f; - b &= 0x40100400; - a &= 0x3fffffff; - b = b - (b >> 5); - a |= b; - dest[1] = RGB32k[0][0][(a>>15) & a]; - - a = fg2rgb[colormap[translation[source[2]]]] + bg2rgb[dest[2]]; - b = a; - a |= 0x01f07c1f; - b &= 0x40100400; - a &= 0x3fffffff; - b = b - (b >> 5); - a |= b; - dest[2] = RGB32k[0][0][(a>>15) & a]; - - a = 
fg2rgb[colormap[translation[source[3]]]] + bg2rgb[dest[3]]; - b = a; - a |= 0x01f07c1f; - b &= 0x40100400; - a &= 0x3fffffff; - b = b - (b >> 5); - a |= b; - dest[3] = RGB32k[0][0][(a>>15) & a]; - - source += 4; - dest += pitch; - } while (--count); + rt_Translate4cols(dc_translation, yl, yh); + rt_addclamp4cols(sx, yl, yh); } // Subtracts one span at hx to the screen at sx with clamping. @@ -809,7 +688,7 @@ void rt_subclamp1col (int hx, int sx, int yl, int yh) } // Subtracts all four spans to the screen starting at sx with clamping. -void rt_subclamp4cols (int sx, int yl, int yh) +void STACK_ARGS rt_subclamp4cols (int sx, int yl, int yh) { BYTE *colormap; BYTE *source; @@ -871,100 +750,15 @@ void rt_subclamp4cols (int sx, int yl, int yh) // Translates and subtracts one span at hx to the screen at sx with clamping. void rt_tlatesubclamp1col (int hx, int sx, int yl, int yh) { - BYTE *translation; - BYTE *colormap; - BYTE *source; - BYTE *dest; - int count; - int pitch; - - count = yh-yl; - if (count < 0) - return; - count++; - - DWORD *fg2rgb = dc_srcblend; - DWORD *bg2rgb = dc_destblend; - dest = ylookup[yl] + sx + dc_destorg; - source = &dc_temp[yl*4 + hx]; - pitch = dc_pitch; - colormap = dc_colormap; - translation = dc_translation; - - do { - DWORD a = (fg2rgb[colormap[translation[*source]]] | 0x40100400) - bg2rgb[*dest]; - DWORD b = a; - - b &= 0x40100400; - b = b - (b >> 5); - a &= b; - a |= 0x01f07c1f; - *dest = RGB32k[0][0][(a>>15) & a]; - source += 4; - dest += pitch; - } while (--count); + rt_Translate1col(dc_translation, hx, yl, yh); + rt_subclamp1col(hx, sx, yl, yh); } // Translates and subtracts all four spans to the screen starting at sx with clamping. 
-void rt_tlatesubclamp4cols (int sx, int yl, int yh) +void STACK_ARGS rt_tlatesubclamp4cols (int sx, int yl, int yh) { - BYTE *translation; - BYTE *colormap; - BYTE *source; - BYTE *dest; - int count; - int pitch; - - count = yh-yl; - if (count < 0) - return; - count++; - - DWORD *fg2rgb = dc_srcblend; - DWORD *bg2rgb = dc_destblend; - dest = ylookup[yl] + sx + dc_destorg; - source = &dc_temp[yl*4]; - pitch = dc_pitch; - colormap = dc_colormap; - translation = dc_translation; - - do { - DWORD a = (fg2rgb[colormap[translation[source[0]]]] | 0x40100400) - bg2rgb[dest[0]]; - DWORD b = a; - - b &= 0x40100400; - b = b - (b >> 5); - a &= b; - a |= 0x01f07c1f; - dest[0] = RGB32k[0][0][(a>>15) & a]; - - a = (fg2rgb[colormap[translation[source[1]]]] | 0x40100400) - bg2rgb[dest[1]]; - b = a; - b &= 0x40100400; - b = b - (b >> 5); - a &= b; - a |= 0x01f07c1f; - dest[1] = RGB32k[0][0][(a>>15) & a]; - - a = (fg2rgb[colormap[translation[source[2]]]] | 0x40100400) - bg2rgb[dest[2]]; - b = a; - b &= 0x40100400; - b = b - (b >> 5); - a &= b; - a |= 0x01f07c1f; - dest[2] = RGB32k[0][0][(a>>15) & a]; - - a = (fg2rgb[colormap[translation[source[3]]]] | 0x40100400) - bg2rgb[dest[3]]; - b = a; - b &= 0x40100400; - b = b - (b >> 5); - a &= b; - a |= 0x01f07c1f; - dest[3] = RGB32k[0][0][(a>>15) & a]; - - source += 4; - dest += pitch; - } while (--count); + rt_Translate4cols(dc_translation, yl, yh); + rt_subclamp4cols(sx, yl, yh); } // Subtracts one span at hx from the screen at sx with clamping. @@ -1003,7 +797,7 @@ void rt_revsubclamp1col (int hx, int sx, int yl, int yh) } // Subtracts all four spans from the screen starting at sx with clamping. -void rt_revsubclamp4cols (int sx, int yl, int yh) +void STACK_ARGS rt_revsubclamp4cols (int sx, int yl, int yh) { BYTE *colormap; BYTE *source; @@ -1065,104 +859,19 @@ void rt_revsubclamp4cols (int sx, int yl, int yh) // Translates and subtracts one span at hx from the screen at sx with clamping. 
void rt_tlaterevsubclamp1col (int hx, int sx, int yl, int yh) { - BYTE *translation; - BYTE *colormap; - BYTE *source; - BYTE *dest; - int count; - int pitch; - - count = yh-yl; - if (count < 0) - return; - count++; - - DWORD *fg2rgb = dc_srcblend; - DWORD *bg2rgb = dc_destblend; - dest = ylookup[yl] + sx + dc_destorg; - source = &dc_temp[yl*4 + hx]; - pitch = dc_pitch; - colormap = dc_colormap; - translation = dc_translation; - - do { - DWORD a = (bg2rgb[dest[0]] | 0x40100400) - fg2rgb[colormap[translation[source[0]]]]; - DWORD b = a; - - b &= 0x40100400; - b = b - (b >> 5); - a &= b; - a |= 0x01f07c1f; - *dest = RGB32k[0][0][(a>>15) & a]; - source += 4; - dest += pitch; - } while (--count); + rt_Translate1col(dc_translation, hx, yl, yh); + rt_revsubclamp1col(hx, sx, yl, yh); } // Translates and subtracts all four spans from the screen starting at sx with clamping. -void rt_tlaterevsubclamp4cols (int sx, int yl, int yh) +void STACK_ARGS rt_tlaterevsubclamp4cols (int sx, int yl, int yh) { - BYTE *translation; - BYTE *colormap; - BYTE *source; - BYTE *dest; - int count; - int pitch; - - count = yh-yl; - if (count < 0) - return; - count++; - - DWORD *fg2rgb = dc_srcblend; - DWORD *bg2rgb = dc_destblend; - dest = ylookup[yl] + sx + dc_destorg; - source = &dc_temp[yl*4]; - pitch = dc_pitch; - colormap = dc_colormap; - translation = dc_translation; - - do { - DWORD a = (bg2rgb[dest[0]] | 0x40100400) - fg2rgb[colormap[translation[source[0]]]]; - DWORD b = a; - - b &= 0x40100400; - b = b - (b >> 5); - a &= b; - a |= 0x01f07c1f; - dest[0] = RGB32k[0][0][(a>>15) & a]; - - a = (bg2rgb[dest[1]] | 0x40100400) - fg2rgb[colormap[translation[source[1]]]]; - b = a; - b &= 0x40100400; - b = b - (b >> 5); - a &= b; - a |= 0x01f07c1f; - dest[1] = RGB32k[0][0][(a>>15) & a]; - - a = (bg2rgb[dest[2]] | 0x40100400) - fg2rgb[colormap[translation[source[2]]]]; - b = a; - b &= 0x40100400; - b = b - (b >> 5); - a &= b; - a |= 0x01f07c1f; - dest[2] = RGB32k[0][0][(a>>15) & a]; - - a = 
(bg2rgb[dest[3]] | 0x40100400) - fg2rgb[colormap[translation[source[3]]]]; - b = a; - b &= 0x40100400; - b = b - (b >> 5); - a &= b; - a |= 0x01f07c1f; - dest[3] = RGB32k[0][0][(a>>15) & a]; - - source += 4; - dest += pitch; - } while (--count); + rt_Translate4cols(dc_translation, yl, yh); + rt_revsubclamp4cols(sx, yl, yh); } // Copies all spans in all four columns to the screen starting at sx. -// sx should be longword-aligned. +// sx should be dword-aligned. void rt_draw4cols (int sx) { int x, bad; diff --git a/src/r_main.cpp b/src/r_main.cpp index bd15a101a..9ca9c4231 100644 --- a/src/r_main.cpp +++ b/src/r_main.cpp @@ -205,7 +205,7 @@ void (*spanfunc) (void); void (*hcolfunc_pre) (void); void (*hcolfunc_post1) (int hx, int sx, int yl, int yh); void (*hcolfunc_post2) (int hx, int sx, int yl, int yh); -void (*hcolfunc_post4) (int sx, int yl, int yh); +void (STACK_ARGS *hcolfunc_post4) (int sx, int yl, int yh); cycle_t WallCycles, PlaneCycles, MaskedCycles, WallScanCycles; diff --git a/src/r_main.h b/src/r_main.h index abecbd971..fdb3b636f 100644 --- a/src/r_main.h +++ b/src/r_main.h @@ -147,7 +147,7 @@ extern void (*spanfunc) (void); extern void (*hcolfunc_pre) (void); extern void (*hcolfunc_post1) (int hx, int sx, int yl, int yh); extern void (*hcolfunc_post2) (int hx, int sx, int yl, int yh); -extern void (*hcolfunc_post4) (int sx, int yl, int yh); +extern void (STACK_ARGS *hcolfunc_post4) (int sx, int yl, int yh); // diff --git a/src/tmap.nas b/src/tmap.nas index e03df7163..259f5e6a6 100644 --- a/src/tmap.nas +++ b/src/tmap.nas @@ -55,6 +55,7 @@ EXTERN CPU EXTERN dc_pitch EXTERN dc_colormap +EXTERN dc_color EXTERN dc_iscale EXTERN dc_texturefrac EXTERN dc_source @@ -68,6 +69,9 @@ EXTERN dc_destorg EXTERN dc_ctspan EXTERN dc_temp +EXTERN Col2RGB8 +EXTERN RGB32k + EXTERN ds_xstep EXTERN ds_ystep EXTERN ds_colormap @@ -94,6 +98,7 @@ EXTERN _CPU EXTERN _dc_pitch EXTERN _dc_colormap +EXTERN _dc_color EXTERN _dc_iscale EXTERN _dc_texturefrac EXTERN _dc_source @@ 
-107,6 +112,9 @@ EXTERN _dc_destorg
 EXTERN _dc_ctspan
 EXTERN _dc_temp
 
+EXTERN _Col2RGB8
+EXTERN _RGB32k
+
 EXTERN _ds_xstep
 EXTERN _ds_ystep
 EXTERN _ds_colormap
@@ -131,6 +139,7 @@ GLOBAL _ds_curcolormap
 
 %define dc_pitch _dc_pitch
 %define dc_colormap _dc_colormap
+%define dc_color _dc_color
 %define dc_iscale _dc_iscale
 %define dc_texturefrac _dc_texturefrac
 %define dc_source _dc_source
@@ -141,6 +150,9 @@ GLOBAL _ds_curcolormap
 %define dc_dest _dc_dest
 %define dc_destorg _dc_destorg
 
+%define Col2RGB8 _Col2RGB8
+%define RGB32k _RGB32k
+
 %define dc_ctspan _dc_ctspan
 %define dc_temp _dc_temp
 
@@ -1476,10 +1488,121 @@ _rt_map4cols_asm2:
 		pop		ebx
 		ret		4
 
+	align 16
+
+GLOBAL rt_shaded4cols_asm
+GLOBAL _rt_shaded4cols_asm
+
+rt_shaded4cols_asm:
+_rt_shaded4cols_asm:
+		mov		ecx,[esp+8]				; ecx = yl (2nd stack argument)
+		push	ebp
+		mov		ebp,[esp+16]			; ebp = yh (3rd stack argument, one push deeper)
+		sub		ebp,ecx					; ebp = yh - yl
+		js		near s4nil				; nothing to draw when yh < yl
+		mov		eax,[ylookup+ecx*4]
+		add		eax,[dc_destorg]		; eax = destination
+		push	ebx
+		push	esi
+		inc		ebp						; ebp = count
+		add		eax,[esp+16]			; + sx (1st stack argument, three pushes deeper)
+		push	edi
+		lea		esi,[dc_temp+ecx*4]		; esi = source
+
+		align 16
+
+s4loop:	movzx	edx,byte [esi]
+		movzx	ecx,byte [esi+1]
+s4cm1:	movzx	edx,byte [SPACEFILLER4+edx]		; colormap
+s4cm2:	movzx	edi,byte [SPACEFILLER4+ecx]		; colormap
+		shl		edx,8
+		movzx	ebx,byte [eax]
+		shl		edi,8
+		movzx	ecx,byte [eax+1]
+		sub		ebx,edx
+		sub		ecx,edi
+		mov		ebx,[Col2RGB8+0x10000+ebx*4]	; C version: Col2RGB8[64-val][dest[0]]
+		mov		ecx,[Col2RGB8+0x10000+ecx*4]	; C version: Col2RGB8[64-val][dest[1]]
+s4fg1:	add		ebx,[SPACEFILLER4+edx*4]		; C version: + fgstart[val<<8]; base patched by R_SetupShadedCol
+s4fg2:	add		ecx,[SPACEFILLER4+edi*4]		; C version: + fgstart[val<<8]; base patched by R_SetupShadedCol
+		or		ebx,0x1f07c1f
+		or		ecx,0x1f07c1f
+		mov		edx,ebx
+		shr		ebx,15
+		mov		edi,ecx
+		shr		ecx,15
+		and		edx,ebx
+		and		ecx,edi
+		mov		bl,[RGB32k+edx]
+		movzx	edx,byte [esi+2]
+		mov		bh,[RGB32k+ecx]
+		movzx	ecx,byte [esi+3]
+		mov		[eax],bl
+		mov		[eax+1],bh
+
+s4cm3:	movzx	edx,byte [SPACEFILLER4+edx]		; colormap
+s4cm4:	movzx	edi,byte [SPACEFILLER4+ecx]		; colormap
+		shl		edx,8
+		movzx	ebx,byte [eax+2]
+		shl		edi,8
+		movzx	ecx,byte [eax+3]
+		sub		ebx,edx
+		sub		ecx,edi
+		mov		ebx,[Col2RGB8+0x10000+ebx*4]	; C version: Col2RGB8[64-val][dest[2]]
+		mov		ecx,[Col2RGB8+0x10000+ecx*4]	; C version: Col2RGB8[64-val][dest[3]]
+s4fg3:	add		ebx,[SPACEFILLER4+edx*4]		; C version: + fgstart[val<<8]; base patched by R_SetupShadedCol
+s4fg4:	add		ecx,[SPACEFILLER4+edi*4]		; C version: + fgstart[val<<8]; base patched by R_SetupShadedCol
+		or		ebx,0x1f07c1f
+		or		ecx,0x1f07c1f
+		mov		edx,ebx
+		shr		ebx,15
+		mov		edi,ecx
+		shr		ecx,15
+		and		edx,ebx
+		and		ecx,edi
+s4p:	add		eax,320					; pitch
+		add		esi,4
+		mov		bl,[RGB32k+edx]
+		mov		bh,[RGB32k+ecx]
+s4p2:	mov		[eax-320+2],bl			; displacement patched to 2 - pitch by ASM_PatchPitch
+s4p3:	mov		[eax-320+3],bh			; displacement patched to 3 - pitch by ASM_PatchPitch
+		dec		ebp
+		jne		s4loop
+
+		pop		edi
+		pop		esi
+		pop		ebx
+s4nil:	pop		ebp
+		ret
+
+		align 16
+
 ;************************
 
 SECTION .text
 
+GLOBAL R_SetupShadedCol
+GLOBAL _R_SetupShadedCol
+GLOBAL @R_SetupShadedCol@0
+
+; Patch the values of dc_colormap and dc_color into the shaded column drawer.
+
+R_SetupShadedCol:
+_R_SetupShadedCol:
+@R_SetupShadedCol@0:
+		mov		eax,[dc_colormap]
+		mov		[s4cm1+3],eax			; overwrite the SPACEFILLER4 operand of each colormap lookup
+		mov		[s4cm2+3],eax
+		mov		[s4cm3+3],eax
+		mov		[s4cm4+3],eax
+		mov		eax,[dc_color]
+		lea		eax,[Col2RGB8+eax*4]	; eax = Col2RGB8 + dc_color*4
+		mov		[s4fg1+3],eax			; overwrite the SPACEFILLER4 operand of each foreground add
+		mov		[s4fg2+3],eax
+		mov		[s4fg3+3],eax
+		mov		[s4fg4+3],eax
+		ret
+
 EXTERN setvlinebpl_
 EXTERN setpitch3
 
@@ -1490,11 +1613,19 @@ GLOBAL ASM_PatchPitch
 ASM_PatchPitch:
 _ASM_PatchPitch:
 @ASM_PatchPitch@0:
-		mov		eax,[dc_pitch]
-		mov		[rdcp1+2],eax
-		mov		[rdcp2+2],eax
-		mov		[rdcp3+2],eax
-		call	setpitch3
-		jmp		setvlinebpl_
+		mov		eax,[dc_pitch]
+		mov		[rdcp1+2],eax
+		mov		[rdcp2+2],eax
+		mov		[rdcp3+2],eax
+		mov		[s4p+1],eax				; pitch operand of "add eax,320" in rt_shaded4cols_asm
+		mov		ecx,eax
+		neg		ecx
+		inc		ecx
+		inc		ecx						; ecx = 2 - pitch
+		mov		[s4p2+2],ecx
+		inc		ecx						; ecx = 3 - pitch
+		mov		[s4p3+2],ecx
+		call	setpitch3
+		jmp		setvlinebpl_