gzdoom/src/r_draw.h

286 lines
9.4 KiB
C
Raw Normal View History

// Emacs style mode select -*- C++ -*-
//-----------------------------------------------------------------------------
//
// $Id:$
//
// Copyright (C) 1993-1996 by id Software, Inc.
//
// This source is available for distribution and/or modification
// only under the terms of the DOOM Source Code License as
// published by id Software. All rights reserved.
//
// The source is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// FITNESS FOR A PARTICULAR PURPOSE. See the DOOM Source Code License
// for more details.
//
// DESCRIPTION:
// System specific interface stuff.
//
//-----------------------------------------------------------------------------
#ifndef __R_DRAW__
#define __R_DRAW__
#include "r_defs.h"
extern "C" int ylookup[MAXHEIGHT];
extern "C" int dc_pitch; // [RH] Distance between rows
extern "C" lighttable_t*dc_colormap;
extern "C" int dc_x;
extern "C" int dc_yl;
extern "C" int dc_yh;
extern "C" fixed_t dc_iscale;
extern "C" fixed_t dc_texturemid;
extern "C" fixed_t dc_texturefrac;
extern "C" int dc_color; // [RH] For flat colors (no texturing)
- Updated lempar.c to v1.31. - Added .txt files to the list of types (wad, zip, and pk3) that can be loaded without listing them after -file. - Fonts that are created by the ACS setfont command to wrap a texture now support animated textures. - FON2 fonts can now use their full palette for CR_UNTRANSLATED when drawn with the hardware 2D path instead of being restricted to the game palette. - Fixed: Toggling vid_vsync would reset the displayed fullscreen gamma to 1 on a Radeon 9000. - Added back the off-by-one palette handling, but in a much more limited scope than before. The skipped entry is assumed to always be at 248, and it is assumed that all Shader Model 1.4 cards suffer from this. That's because all SM1.4 cards are based on variants of the ATI R200 core, and the RV250 in a Radeon 9000 craps up like this. I see no reason to assume that other flavors of the R200 are any different. (Interesting note: With the Radeon 9000, D3DTADDRESS_CLAMP is an invalid address mode when using the debug Direct3D 9 runtime, but it works perfectly fine with the retail Direct3D 9 runtime.) (Insight: The R200 probably uses bytes for all its math inside pixel shaders. That would explain perfectly why I can't use constants greater than 1 with PS1.4 and why it can't do an exact mapping to every entry in the color palette. - Fixed: The software shaded drawer did not work for 2D, because its selected "color"map was replaced with the identitymap before being used. - Fixed: I cannot use Printf to output messages before the framebuffer was completely setup, meaning that Shader Model 1.4 cards could not change resolution. - I have decided to let remap palettes specify variable alpha values for their colors. D3DFB no longer forces them to 255. - Updated re2c to version 0.12.3. - Fixed: A_Wander used threshold as a timer, when it should have used reactiontime. - Fixed: A_CustomRailgun would not fire at all for actors without a target when the aim parameter was disabled. - Made the warp command work in multiplayer, again courtesy of Karate Chris. - Fixed: Trying to spawn a bot while not in a game made for a crashing time. (Patch courtesy of Karate Chris.) - Removed some floating point math from hu_scores.cpp that somebody's GCC gave warnings for (not mine, though). - Fixed: The SBarInfo drawbar command crashed if the sprite image was unavailable. - Fixed: FString::operator=(const char *) did not release its old buffer when being assigned to the null string. - The scanner no longer has an upper limit on the length of strings it accepts, though short strings will be faster than long ones. - Moved all the text scanning functions into a class. Mainly, this means that multiple script scanner states can be stored without being forced to do so recursively. I think I might be taking advantage of that in the near future. Possibly. Maybe. - Removed some potential buffer overflows from the decal parser. - Applied Blzut3's SBARINFO update #9: * Fixed: When using even length values in drawnumber it would cap to a 98 value instead of a 99 as intended. * The SBarInfo parser can now accept negatives for coordinates. This doesn't allow much right now, but later I plan to add better fullscreen hud support in which the negatives will be more useful. This also cleans up the source a bit since all calls for (x, y) coordinates are with the function getCoordinates(). - Added support for stencilling actors. - Added support for non-black colors specified with DTA_ColorOverlay to the software renderer. - Fixed: The inverse, gold, red, and green fixed colormaps each allocated space for 32 different colormaps, even though each only used the first one. - Added two new blending flags to make reverse subtract blending more useful: STYLEF_InvertSource and STYLEF_InvertOverlay. These invert the color that gets blended with the background, since that seems like a good idea for reverse subtraction. They also work with the other two blending operations. - Added subtract and reverse subtract blending operations to the renderer. Since the ERenderStyle enumeration was getting rather unwieldy, I converted it into a new FRenderStyle structure that lets each parameter of the blending equation be set separately. This simplified the set up for the blend quite a bit, and it means a number of new combinations are available by setting the parameters properly. SVN r710 (trunk)
2008-01-25 23:57:44 +00:00
extern "C" DWORD dc_srccolor;
extern "C" DWORD *dc_srcblend;
extern "C" DWORD *dc_destblend;
// first pixel in a column
extern "C" const BYTE* dc_source;
extern "C" BYTE *dc_dest, *dc_destorg;
extern "C" int dc_count;
extern "C" DWORD vplce[4];
extern "C" DWORD vince[4];
extern "C" BYTE* palookupoffse[4];
extern "C" const BYTE* bufplce[4];
// [RH] Temporary buffer for column drawing
extern "C" BYTE *dc_temp;
extern "C" unsigned int dc_tspans[4][MAXHEIGHT];
extern "C" unsigned int *dc_ctspan[4];
extern "C" unsigned int horizspans[4];
// [RH] Pointers to the different column and span drawers...
// The span blitting interface.
// Hook in assembler or system specific BLT here.
extern void (*R_DrawColumn)(void);
extern DWORD (STACK_ARGS *dovline1) ();
extern DWORD (STACK_ARGS *doprevline1) ();
- Ported vlinetallasm4 to AMD64 assembly. Even with the increased number of registers AMD64 provides, this routine still needs to be written as self- modifying code for maximum performance. The additional registers do allow for further optimization over the x86 version by allowing all four pixels to be in flight at the same time. The end result is that AMD64 ASM is about 2.18 times faster than AMD64 C and about 1.06 times faster than x86 ASM. (For further comparison, AMD64 C and x86 C are practically the same for this function.) Should I port any more assembly to AMD64, mvlineasm4 is the most likely candidate, but it's not used enough at this point to bother. Also, this may or may not work with Linux at the moment, since it doesn't have the eh_handler metadata. Win64 is easier, since I just need to structure the function prologue and epilogue properly and use some assembler directives/macros to automatically generate the metadata. And that brings up another point: You need YASM to assemble the AMD64 code, because NASM doesn't support the Win64 metadata directives. - Added an SSE version of DoBlending. This is strictly C intrinsics. VC++ still throws around unneccessary register moves. GCC seems to be pretty close to optimal, requiring only about 2 cycles/color. They're both faster than my hand-written MMX routine, so I don't need to feel bad about not hand-optimizing this for x64 builds. - Removed an extra instruction from DoBlending_MMX, transposed two instructions, and unrolled it once, shaving off about 80 cycles from the time required to blend 256 palette entries. Why? Because I tried writing a C version of the routine using compiler intrinsics and was appalled by all the extra movq's VC++ added to the code. GCC was better, but still generated extra instructions. I only wanted a C version because I can't use inline assembly with VC++'s x64 compiler, and x64 assembly is a bit of a pain. (It's a pain because Linux and Windows have different calling conventions, and you need to maintain extra metadata for functions.) So, the assembly version stays and the C version stays out. - Removed all the pixel doubling r_detail modes, since the one platform they were intended to assist (486) actually sees very little benefit from them. - Rewrote CheckMMX in C and renamed it to CheckCPU. - Fixed: CPUID function 0x80000005 is specified to return detailed L1 cache only for AMD processors, so we must not use it on other architectures, or we end up overwriting the L1 cache line size with 0 or some other number we don't actually understand. SVN r1134 (trunk)
2008-08-09 03:13:43 +00:00
#ifdef X64_ASM
#define dovline4 vlinetallasm4
extern "C" void vlinetallasm4();
#else
extern void (STACK_ARGS *dovline4) ();
- Ported vlinetallasm4 to AMD64 assembly. Even with the increased number of registers AMD64 provides, this routine still needs to be written as self- modifying code for maximum performance. The additional registers do allow for further optimization over the x86 version by allowing all four pixels to be in flight at the same time. The end result is that AMD64 ASM is about 2.18 times faster than AMD64 C and about 1.06 times faster than x86 ASM. (For further comparison, AMD64 C and x86 C are practically the same for this function.) Should I port any more assembly to AMD64, mvlineasm4 is the most likely candidate, but it's not used enough at this point to bother. Also, this may or may not work with Linux at the moment, since it doesn't have the eh_handler metadata. Win64 is easier, since I just need to structure the function prologue and epilogue properly and use some assembler directives/macros to automatically generate the metadata. And that brings up another point: You need YASM to assemble the AMD64 code, because NASM doesn't support the Win64 metadata directives. - Added an SSE version of DoBlending. This is strictly C intrinsics. VC++ still throws around unneccessary register moves. GCC seems to be pretty close to optimal, requiring only about 2 cycles/color. They're both faster than my hand-written MMX routine, so I don't need to feel bad about not hand-optimizing this for x64 builds. - Removed an extra instruction from DoBlending_MMX, transposed two instructions, and unrolled it once, shaving off about 80 cycles from the time required to blend 256 palette entries. Why? Because I tried writing a C version of the routine using compiler intrinsics and was appalled by all the extra movq's VC++ added to the code. GCC was better, but still generated extra instructions. I only wanted a C version because I can't use inline assembly with VC++'s x64 compiler, and x64 assembly is a bit of a pain. (It's a pain because Linux and Windows have different calling conventions, and you need to maintain extra metadata for functions.) So, the assembly version stays and the C version stays out. - Removed all the pixel doubling r_detail modes, since the one platform they were intended to assist (486) actually sees very little benefit from them. - Rewrote CheckMMX in C and renamed it to CheckCPU. - Fixed: CPUID function 0x80000005 is specified to return detailed L1 cache only for AMD processors, so we must not use it on other architectures, or we end up overwriting the L1 cache line size with 0 or some other number we don't actually understand. SVN r1134 (trunk)
2008-08-09 03:13:43 +00:00
#endif
extern void setupvline (int);
extern DWORD (STACK_ARGS *domvline1) ();
extern void (STACK_ARGS *domvline4) ();
extern void setupmvline (int);
extern void setuptmvline (int);
// The Spectre/Invisibility effect.
extern void (*R_DrawFuzzColumn)(void);
// [RH] Draw shaded column
extern void (*R_DrawShadedColumn)(void);
// Draw with color translation tables, for player sprite rendering,
// Green/Red/Blue/Indigo shirts.
extern void (*R_DrawTranslatedColumn)(void);
// Span drawing for rows, floor/ceiling. No Spectre effect needed.
extern void (*R_DrawSpan)(void);
void R_SetupSpanBits(FTexture *tex);
void R_SetSpanColormap(BYTE *colormap);
void R_SetSpanSource(const BYTE *pixels);
// Span drawing for masked textures.
extern void (*R_DrawSpanMasked)(void);
// Span drawing for translucent textures.
extern void (*R_DrawSpanTranslucent)(void);
// Span drawing for masked, translucent textures.
extern void (*R_DrawSpanMaskedTranslucent)(void);
// [RH] Span blit into an interleaved intermediate buffer
extern void (*R_DrawColumnHoriz)(void);
void R_DrawMaskedColumnHoriz (const BYTE *column, const FTexture::Span *spans);
// [RH] Initialize the above pointers
void R_InitColumnDrawers ();
// [RH] Moves data from the temporary buffer to the screen.
extern "C"
{
void rt_copy1col_c (int hx, int sx, int yl, int yh);
void STACK_ARGS rt_copy4cols_c (int sx, int yl, int yh);
void rt_shaded1col (int hx, int sx, int yl, int yh);
void STACK_ARGS rt_shaded4cols_c (int sx, int yl, int yh);
void STACK_ARGS rt_shaded4cols_asm (int sx, int yl, int yh);
void rt_map1col_c (int hx, int sx, int yl, int yh);
void rt_add1col (int hx, int sx, int yl, int yh);
void rt_addclamp1col (int hx, int sx, int yl, int yh);
void rt_subclamp1col (int hx, int sx, int yl, int yh);
void rt_revsubclamp1col (int hx, int sx, int yl, int yh);
void rt_tlate1col (int hx, int sx, int yl, int yh);
void rt_tlateadd1col (int hx, int sx, int yl, int yh);
void rt_tlateaddclamp1col (int hx, int sx, int yl, int yh);
- Updated lempar.c to v1.31. - Added .txt files to the list of types (wad, zip, and pk3) that can be loaded without listing them after -file. - Fonts that are created by the ACS setfont command to wrap a texture now support animated textures. - FON2 fonts can now use their full palette for CR_UNTRANSLATED when drawn with the hardware 2D path instead of being restricted to the game palette. - Fixed: Toggling vid_vsync would reset the displayed fullscreen gamma to 1 on a Radeon 9000. - Added back the off-by-one palette handling, but in a much more limited scope than before. The skipped entry is assumed to always be at 248, and it is assumed that all Shader Model 1.4 cards suffer from this. That's because all SM1.4 cards are based on variants of the ATI R200 core, and the RV250 in a Radeon 9000 craps up like this. I see no reason to assume that other flavors of the R200 are any different. (Interesting note: With the Radeon 9000, D3DTADDRESS_CLAMP is an invalid address mode when using the debug Direct3D 9 runtime, but it works perfectly fine with the retail Direct3D 9 runtime.) (Insight: The R200 probably uses bytes for all its math inside pixel shaders. That would explain perfectly why I can't use constants greater than 1 with PS1.4 and why it can't do an exact mapping to every entry in the color palette. - Fixed: The software shaded drawer did not work for 2D, because its selected "color"map was replaced with the identitymap before being used. - Fixed: I cannot use Printf to output messages before the framebuffer was completely setup, meaning that Shader Model 1.4 cards could not change resolution. - I have decided to let remap palettes specify variable alpha values for their colors. D3DFB no longer forces them to 255. - Updated re2c to version 0.12.3. - Fixed: A_Wander used threshold as a timer, when it should have used reactiontime. - Fixed: A_CustomRailgun would not fire at all for actors without a target when the aim parameter was disabled. - Made the warp command work in multiplayer, again courtesy of Karate Chris. - Fixed: Trying to spawn a bot while not in a game made for a crashing time. (Patch courtesy of Karate Chris.) - Removed some floating point math from hu_scores.cpp that somebody's GCC gave warnings for (not mine, though). - Fixed: The SBarInfo drawbar command crashed if the sprite image was unavailable. - Fixed: FString::operator=(const char *) did not release its old buffer when being assigned to the null string. - The scanner no longer has an upper limit on the length of strings it accepts, though short strings will be faster than long ones. - Moved all the text scanning functions into a class. Mainly, this means that multiple script scanner states can be stored without being forced to do so recursively. I think I might be taking advantage of that in the near future. Possibly. Maybe. - Removed some potential buffer overflows from the decal parser. - Applied Blzut3's SBARINFO update #9: * Fixed: When using even length values in drawnumber it would cap to a 98 value instead of a 99 as intended. * The SBarInfo parser can now accept negatives for coordinates. This doesn't allow much right now, but later I plan to add better fullscreen hud support in which the negatives will be more useful. This also cleans up the source a bit since all calls for (x, y) coordinates are with the function getCoordinates(). - Added support for stencilling actors. - Added support for non-black colors specified with DTA_ColorOverlay to the software renderer. - Fixed: The inverse, gold, red, and green fixed colormaps each allocated space for 32 different colormaps, even though each only used the first one. - Added two new blending flags to make reverse subtract blending more useful: STYLEF_InvertSource and STYLEF_InvertOverlay. These invert the color that gets blended with the background, since that seems like a good idea for reverse subtraction. They also work with the other two blending operations. - Added subtract and reverse subtract blending operations to the renderer. Since the ERenderStyle enumeration was getting rather unwieldy, I converted it into a new FRenderStyle structure that lets each parameter of the blending equation be set separately. This simplified the set up for the blend quite a bit, and it means a number of new combinations are available by setting the parameters properly. SVN r710 (trunk)
2008-01-25 23:57:44 +00:00
void rt_tlatesubclamp1col (int hx, int sx, int yl, int yh);
void rt_tlaterevsubclamp1col (int hx, int sx, int yl, int yh);
void STACK_ARGS rt_map4cols_c (int sx, int yl, int yh);
void STACK_ARGS rt_add4cols_c (int sx, int yl, int yh);
void STACK_ARGS rt_addclamp4cols_c (int sx, int yl, int yh);
void STACK_ARGS rt_subclamp4cols (int sx, int yl, int yh);
void STACK_ARGS rt_revsubclamp4cols (int sx, int yl, int yh);
void STACK_ARGS rt_tlate4cols (int sx, int yl, int yh);
void STACK_ARGS rt_tlateadd4cols (int sx, int yl, int yh);
void STACK_ARGS rt_tlateaddclamp4cols (int sx, int yl, int yh);
void STACK_ARGS rt_tlatesubclamp4cols (int sx, int yl, int yh);
void STACK_ARGS rt_tlaterevsubclamp4cols (int sx, int yl, int yh);
void rt_copy1col_asm (int hx, int sx, int yl, int yh);
void rt_map1col_asm (int hx, int sx, int yl, int yh);
void STACK_ARGS rt_copy4cols_asm (int sx, int yl, int yh);
void STACK_ARGS rt_map4cols_asm1 (int sx, int yl, int yh);
void STACK_ARGS rt_map4cols_asm2 (int sx, int yl, int yh);
void STACK_ARGS rt_add4cols_asm (int sx, int yl, int yh);
void STACK_ARGS rt_addclamp4cols_asm (int sx, int yl, int yh);
}
extern void (STACK_ARGS *rt_map4cols)(int sx, int yl, int yh);
- Ported vlinetallasm4 to AMD64 assembly. Even with the increased number of registers AMD64 provides, this routine still needs to be written as self- modifying code for maximum performance. The additional registers do allow for further optimization over the x86 version by allowing all four pixels to be in flight at the same time. The end result is that AMD64 ASM is about 2.18 times faster than AMD64 C and about 1.06 times faster than x86 ASM. (For further comparison, AMD64 C and x86 C are practically the same for this function.) Should I port any more assembly to AMD64, mvlineasm4 is the most likely candidate, but it's not used enough at this point to bother. Also, this may or may not work with Linux at the moment, since it doesn't have the eh_handler metadata. Win64 is easier, since I just need to structure the function prologue and epilogue properly and use some assembler directives/macros to automatically generate the metadata. And that brings up another point: You need YASM to assemble the AMD64 code, because NASM doesn't support the Win64 metadata directives. - Added an SSE version of DoBlending. This is strictly C intrinsics. VC++ still throws around unneccessary register moves. GCC seems to be pretty close to optimal, requiring only about 2 cycles/color. They're both faster than my hand-written MMX routine, so I don't need to feel bad about not hand-optimizing this for x64 builds. - Removed an extra instruction from DoBlending_MMX, transposed two instructions, and unrolled it once, shaving off about 80 cycles from the time required to blend 256 palette entries. Why? Because I tried writing a C version of the routine using compiler intrinsics and was appalled by all the extra movq's VC++ added to the code. GCC was better, but still generated extra instructions. I only wanted a C version because I can't use inline assembly with VC++'s x64 compiler, and x64 assembly is a bit of a pain. (It's a pain because Linux and Windows have different calling conventions, and you need to maintain extra metadata for functions.) So, the assembly version stays and the C version stays out. - Removed all the pixel doubling r_detail modes, since the one platform they were intended to assist (486) actually sees very little benefit from them. - Rewrote CheckMMX in C and renamed it to CheckCPU. - Fixed: CPUID function 0x80000005 is specified to return detailed L1 cache only for AMD processors, so we must not use it on other architectures, or we end up overwriting the L1 cache line size with 0 or some other number we don't actually understand. SVN r1134 (trunk)
2008-08-09 03:13:43 +00:00
#ifdef X86_ASM
#define rt_copy1col rt_copy1col_asm
#define rt_copy4cols rt_copy4cols_asm
#define rt_map1col rt_map1col_asm
#define rt_shaded4cols rt_shaded4cols_asm
#define rt_add4cols rt_add4cols_asm
#define rt_addclamp4cols rt_addclamp4cols_asm
#else
#define rt_copy1col rt_copy1col_c
#define rt_copy4cols rt_copy4cols_c
#define rt_map1col rt_map1col_c
#define rt_shaded4cols rt_shaded4cols_c
#define rt_add4cols rt_add4cols_c
#define rt_addclamp4cols rt_addclamp4cols_c
#endif
void rt_draw4cols (int sx);
// [RH] Preps the temporary horizontal buffer.
void rt_initcols (BYTE *buffer=NULL);
void R_DrawFogBoundary (int x1, int x2, short *uclip, short *dclip);
- Ported vlinetallasm4 to AMD64 assembly. Even with the increased number of registers AMD64 provides, this routine still needs to be written as self- modifying code for maximum performance. The additional registers do allow for further optimization over the x86 version by allowing all four pixels to be in flight at the same time. The end result is that AMD64 ASM is about 2.18 times faster than AMD64 C and about 1.06 times faster than x86 ASM. (For further comparison, AMD64 C and x86 C are practically the same for this function.) Should I port any more assembly to AMD64, mvlineasm4 is the most likely candidate, but it's not used enough at this point to bother. Also, this may or may not work with Linux at the moment, since it doesn't have the eh_handler metadata. Win64 is easier, since I just need to structure the function prologue and epilogue properly and use some assembler directives/macros to automatically generate the metadata. And that brings up another point: You need YASM to assemble the AMD64 code, because NASM doesn't support the Win64 metadata directives. - Added an SSE version of DoBlending. This is strictly C intrinsics. VC++ still throws around unneccessary register moves. GCC seems to be pretty close to optimal, requiring only about 2 cycles/color. They're both faster than my hand-written MMX routine, so I don't need to feel bad about not hand-optimizing this for x64 builds. - Removed an extra instruction from DoBlending_MMX, transposed two instructions, and unrolled it once, shaving off about 80 cycles from the time required to blend 256 palette entries. Why? Because I tried writing a C version of the routine using compiler intrinsics and was appalled by all the extra movq's VC++ added to the code. GCC was better, but still generated extra instructions. I only wanted a C version because I can't use inline assembly with VC++'s x64 compiler, and x64 assembly is a bit of a pain. (It's a pain because Linux and Windows have different calling conventions, and you need to maintain extra metadata for functions.) So, the assembly version stays and the C version stays out. - Removed all the pixel doubling r_detail modes, since the one platform they were intended to assist (486) actually sees very little benefit from them. - Rewrote CheckMMX in C and renamed it to CheckCPU. - Fixed: CPUID function 0x80000005 is specified to return detailed L1 cache only for AMD processors, so we must not use it on other architectures, or we end up overwriting the L1 cache line size with 0 or some other number we don't actually understand. SVN r1134 (trunk)
2008-08-09 03:13:43 +00:00
#ifdef X86_ASM
extern "C" void R_DrawColumnP_Unrolled (void);
extern "C" void R_DrawColumnHorizP_ASM (void);
extern "C" void R_DrawColumnP_ASM (void);
extern "C" void R_DrawFuzzColumnP_ASM (void);
- Ported vlinetallasm4 to AMD64 assembly. Even with the increased number of registers AMD64 provides, this routine still needs to be written as self- modifying code for maximum performance. The additional registers do allow for further optimization over the x86 version by allowing all four pixels to be in flight at the same time. The end result is that AMD64 ASM is about 2.18 times faster than AMD64 C and about 1.06 times faster than x86 ASM. (For further comparison, AMD64 C and x86 C are practically the same for this function.) Should I port any more assembly to AMD64, mvlineasm4 is the most likely candidate, but it's not used enough at this point to bother. Also, this may or may not work with Linux at the moment, since it doesn't have the eh_handler metadata. Win64 is easier, since I just need to structure the function prologue and epilogue properly and use some assembler directives/macros to automatically generate the metadata. And that brings up another point: You need YASM to assemble the AMD64 code, because NASM doesn't support the Win64 metadata directives. - Added an SSE version of DoBlending. This is strictly C intrinsics. VC++ still throws around unneccessary register moves. GCC seems to be pretty close to optimal, requiring only about 2 cycles/color. They're both faster than my hand-written MMX routine, so I don't need to feel bad about not hand-optimizing this for x64 builds. - Removed an extra instruction from DoBlending_MMX, transposed two instructions, and unrolled it once, shaving off about 80 cycles from the time required to blend 256 palette entries. Why? Because I tried writing a C version of the routine using compiler intrinsics and was appalled by all the extra movq's VC++ added to the code. GCC was better, but still generated extra instructions. I only wanted a C version because I can't use inline assembly with VC++'s x64 compiler, and x64 assembly is a bit of a pain. (It's a pain because Linux and Windows have different calling conventions, and you need to maintain extra metadata for functions.) So, the assembly version stays and the C version stays out. - Removed all the pixel doubling r_detail modes, since the one platform they were intended to assist (486) actually sees very little benefit from them. - Rewrote CheckMMX in C and renamed it to CheckCPU. - Fixed: CPUID function 0x80000005 is specified to return detailed L1 cache only for AMD processors, so we must not use it on other architectures, or we end up overwriting the L1 cache line size with 0 or some other number we don't actually understand. SVN r1134 (trunk)
2008-08-09 03:13:43 +00:00
void R_DrawTranslatedColumnP_C (void);
void R_DrawShadedColumnP_C (void);
extern "C" void R_DrawSpanP_ASM (void);
extern "C" void R_DrawSpanMaskedP_ASM (void);
- Ported vlinetallasm4 to AMD64 assembly. Even with the increased number of registers AMD64 provides, this routine still needs to be written as self- modifying code for maximum performance. The additional registers do allow for further optimization over the x86 version by allowing all four pixels to be in flight at the same time. The end result is that AMD64 ASM is about 2.18 times faster than AMD64 C and about 1.06 times faster than x86 ASM. (For further comparison, AMD64 C and x86 C are practically the same for this function.) Should I port any more assembly to AMD64, mvlineasm4 is the most likely candidate, but it's not used enough at this point to bother. Also, this may or may not work with Linux at the moment, since it doesn't have the eh_handler metadata. Win64 is easier, since I just need to structure the function prologue and epilogue properly and use some assembler directives/macros to automatically generate the metadata. And that brings up another point: You need YASM to assemble the AMD64 code, because NASM doesn't support the Win64 metadata directives. - Added an SSE version of DoBlending. This is strictly C intrinsics. VC++ still throws around unneccessary register moves. GCC seems to be pretty close to optimal, requiring only about 2 cycles/color. They're both faster than my hand-written MMX routine, so I don't need to feel bad about not hand-optimizing this for x64 builds. - Removed an extra instruction from DoBlending_MMX, transposed two instructions, and unrolled it once, shaving off about 80 cycles from the time required to blend 256 palette entries. Why? Because I tried writing a C version of the routine using compiler intrinsics and was appalled by all the extra movq's VC++ added to the code. GCC was better, but still generated extra instructions. I only wanted a C version because I can't use inline assembly with VC++'s x64 compiler, and x64 assembly is a bit of a pain. (It's a pain because Linux and Windows have different calling conventions, and you need to maintain extra metadata for functions.) So, the assembly version stays and the C version stays out. - Removed all the pixel doubling r_detail modes, since the one platform they were intended to assist (486) actually sees very little benefit from them. - Rewrote CheckMMX in C and renamed it to CheckCPU. - Fixed: CPUID function 0x80000005 is specified to return detailed L1 cache only for AMD processors, so we must not use it on other architectures, or we end up overwriting the L1 cache line size with 0 or some other number we don't actually understand. SVN r1134 (trunk)
2008-08-09 03:13:43 +00:00
#else
void R_DrawColumnHorizP_C (void);
void R_DrawColumnP_C (void);
void R_DrawFuzzColumnP_C (void);
void R_DrawTranslatedColumnP_C (void);
void R_DrawShadedColumnP_C (void);
void R_DrawSpanP_C (void);
void R_DrawSpanMaskedP_C (void);
#endif
void R_DrawSpanTranslucentP_C (void);
void R_DrawSpanMaskedTranslucentP_C (void);
void R_DrawTlatedLucentColumnP_C (void);
#define R_DrawTlatedLucentColumn R_DrawTlatedLucentColumnP_C
void R_FillColumnP (void);
void R_FillColumnHorizP (void);
void R_FillSpan (void);
#ifdef X86_ASM
#define R_SetupDrawSlab R_SetupDrawSlabA
#define R_DrawSlab R_DrawSlabA
#else
#define R_SetupDrawSlab R_SetupDrawSlabC
#define R_DrawSlab R_DrawSlabC
#endif
extern "C" void R_SetupDrawSlab(const BYTE *colormap);
extern "C" void STACK_ARGS R_DrawSlab(int dx, fixed_t v, int dy, fixed_t vi, const BYTE *vptr, BYTE *p);
extern "C" int ds_y;
extern "C" int ds_x1;
extern "C" int ds_x2;
extern "C" lighttable_t* ds_colormap;
extern "C" dsfixed_t ds_xfrac;
extern "C" dsfixed_t ds_yfrac;
extern "C" dsfixed_t ds_xstep;
extern "C" dsfixed_t ds_ystep;
extern "C" int ds_xbits;
extern "C" int ds_ybits;
extern "C" fixed_t ds_alpha;
// start of a 64*64 tile image
extern "C" const BYTE* ds_source;
extern "C" int ds_color; // [RH] For flat color (no texturing)
extern BYTE shadetables[/*NUMCOLORMAPS*16*256*/];
- Updated lempar.c to v1.31. - Added .txt files to the list of types (wad, zip, and pk3) that can be loaded without listing them after -file. - Fonts that are created by the ACS setfont command to wrap a texture now support animated textures. - FON2 fonts can now use their full palette for CR_UNTRANSLATED when drawn with the hardware 2D path instead of being restricted to the game palette. - Fixed: Toggling vid_vsync would reset the displayed fullscreen gamma to 1 on a Radeon 9000. - Added back the off-by-one palette handling, but in a much more limited scope than before. The skipped entry is assumed to always be at 248, and it is assumed that all Shader Model 1.4 cards suffer from this. That's because all SM1.4 cards are based on variants of the ATI R200 core, and the RV250 in a Radeon 9000 craps up like this. I see no reason to assume that other flavors of the R200 are any different. (Interesting note: With the Radeon 9000, D3DTADDRESS_CLAMP is an invalid address mode when using the debug Direct3D 9 runtime, but it works perfectly fine with the retail Direct3D 9 runtime.) (Insight: The R200 probably uses bytes for all its math inside pixel shaders. That would explain perfectly why I can't use constants greater than 1 with PS1.4 and why it can't do an exact mapping to every entry in the color palette. - Fixed: The software shaded drawer did not work for 2D, because its selected "color"map was replaced with the identitymap before being used. - Fixed: I cannot use Printf to output messages before the framebuffer was completely setup, meaning that Shader Model 1.4 cards could not change resolution. - I have decided to let remap palettes specify variable alpha values for their colors. D3DFB no longer forces them to 255. - Updated re2c to version 0.12.3. - Fixed: A_Wander used threshold as a timer, when it should have used reactiontime. - Fixed: A_CustomRailgun would not fire at all for actors without a target when the aim parameter was disabled. - Made the warp command work in multiplayer, again courtesy of Karate Chris. - Fixed: Trying to spawn a bot while not in a game made for a crashing time. (Patch courtesy of Karate Chris.) - Removed some floating point math from hu_scores.cpp that somebody's GCC gave warnings for (not mine, though). - Fixed: The SBarInfo drawbar command crashed if the sprite image was unavailable. - Fixed: FString::operator=(const char *) did not release its old buffer when being assigned to the null string. - The scanner no longer has an upper limit on the length of strings it accepts, though short strings will be faster than long ones. - Moved all the text scanning functions into a class. Mainly, this means that multiple script scanner states can be stored without being forced to do so recursively. I think I might be taking advantage of that in the near future. Possibly. Maybe. - Removed some potential buffer overflows from the decal parser. - Applied Blzut3's SBARINFO update #9: * Fixed: When using even length values in drawnumber it would cap to a 98 value instead of a 99 as intended. * The SBarInfo parser can now accept negatives for coordinates. This doesn't allow much right now, but later I plan to add better fullscreen hud support in which the negatives will be more useful. This also cleans up the source a bit since all calls for (x, y) coordinates are with the function getCoordinates(). - Added support for stencilling actors. - Added support for non-black colors specified with DTA_ColorOverlay to the software renderer. - Fixed: The inverse, gold, red, and green fixed colormaps each allocated space for 32 different colormaps, even though each only used the first one. - Added two new blending flags to make reverse subtract blending more useful: STYLEF_InvertSource and STYLEF_InvertOverlay. These invert the color that gets blended with the background, since that seems like a good idea for reverse subtraction. They also work with the other two blending operations. - Added subtract and reverse subtract blending operations to the renderer. Since the ERenderStyle enumeration was getting rather unwieldy, I converted it into a new FRenderStyle structure that lets each parameter of the blending equation be set separately. This simplified the set up for the blend quite a bit, and it means a number of new combinations are available by setting the parameters properly. SVN r710 (trunk)
2008-01-25 23:57:44 +00:00
extern FDynamicColormap ShadeFakeColormap[16];
extern BYTE identitymap[256];
- Discovered that Shader Model 1.4 clamps my constants, so I can't use palettes smaller than 256 entries with the shader I wrote for it. Is there a list of gotchas like this listed some where? I'd really like to see it. Well, when compiled with SM2.0, the PalTex shader seems to be every-so- slightly faster on my GF7950GT than the SM1.4 version, so I guess it's a minor win for cards that support it. - Fixed: ST_Endoom() failed to free the bitmap it used. - Added the DTA_ColorOverlay attribute to blend a color with the texture being drawn. For software, this (currently) only works with black. For hardware, it works with any color. The motiviation for this was so I could rewrite the status bar calls that passed DIM_MAP to DTA_Translation to draw darker icons into something that didn't require making a whole new remap table. - After having an "OMG! How could I have been so stupid?" moment, I have removed the off-by-one check from D3DFB. I had thought the off-by-one error was caused by rounding errors by the shader hardware. Not so. Rather, I wasn't sampling what I thought I was sampling. A texture that uses palette index 255 passes the value 1.0 to the shader. The shader needs to adjust the range of its palette indexes, or it will end up trying to read color 256 from the palette texture when it should be reading color 255. Doh! - The TranslationToTable() function has been added to map from translation numbers used by actors to the tables those numbers represent. This function performs validation for the input and returns NULL if the input value is invalid. - Major changes to the way translation tables work: No longer are they each a 256-byte array. Instead, the FRemapTable structure is used to represent each one. It includes a remap array for the software renderer, a palette array for a hardware renderer, and a native texture pointer for D3DFB. The translationtables array itself is now an array of TArrays that point to the real tables. The DTA_Translation attribute must also be passed a pointer to a FRemapTable, not a byte array as previously. - Modified DFrameBuffer::DrawRateStuff() so that it can do its thing properly for D3DFB's 2D mode. Before, any fullscreen graphics (like help images) covered it up. SVN r640 (trunk)
2007-12-26 04:42:15 +00:00
extern BYTE *dc_translation;
// [RH] Added for muliresolution support
void R_InitFuzzTable (int fuzzoff);
// [RH] Consolidate column drawer selection
enum ESPSResult
{
DontDraw, // not useful to draw this
DoDraw0, // draw this as if r_columnmethod is 0
DoDraw1, // draw this as if r_columnmethod is 1
};
- Updated lempar.c to v1.31. - Added .txt files to the list of types (wad, zip, and pk3) that can be loaded without listing them after -file. - Fonts that are created by the ACS setfont command to wrap a texture now support animated textures. - FON2 fonts can now use their full palette for CR_UNTRANSLATED when drawn with the hardware 2D path instead of being restricted to the game palette. - Fixed: Toggling vid_vsync would reset the displayed fullscreen gamma to 1 on a Radeon 9000. - Added back the off-by-one palette handling, but in a much more limited scope than before. The skipped entry is assumed to always be at 248, and it is assumed that all Shader Model 1.4 cards suffer from this. That's because all SM1.4 cards are based on variants of the ATI R200 core, and the RV250 in a Radeon 9000 craps up like this. I see no reason to assume that other flavors of the R200 are any different. (Interesting note: With the Radeon 9000, D3DTADDRESS_CLAMP is an invalid address mode when using the debug Direct3D 9 runtime, but it works perfectly fine with the retail Direct3D 9 runtime.) (Insight: The R200 probably uses bytes for all its math inside pixel shaders. That would explain perfectly why I can't use constants greater than 1 with PS1.4 and why it can't do an exact mapping to every entry in the color palette. - Fixed: The software shaded drawer did not work for 2D, because its selected "color"map was replaced with the identitymap before being used. - Fixed: I cannot use Printf to output messages before the framebuffer was completely setup, meaning that Shader Model 1.4 cards could not change resolution. - I have decided to let remap palettes specify variable alpha values for their colors. D3DFB no longer forces them to 255. - Updated re2c to version 0.12.3. - Fixed: A_Wander used threshold as a timer, when it should have used reactiontime. - Fixed: A_CustomRailgun would not fire at all for actors without a target when the aim parameter was disabled. - Made the warp command work in multiplayer, again courtesy of Karate Chris. - Fixed: Trying to spawn a bot while not in a game made for a crashing time. (Patch courtesy of Karate Chris.) - Removed some floating point math from hu_scores.cpp that somebody's GCC gave warnings for (not mine, though). - Fixed: The SBarInfo drawbar command crashed if the sprite image was unavailable. - Fixed: FString::operator=(const char *) did not release its old buffer when being assigned to the null string. - The scanner no longer has an upper limit on the length of strings it accepts, though short strings will be faster than long ones. - Moved all the text scanning functions into a class. Mainly, this means that multiple script scanner states can be stored without being forced to do so recursively. I think I might be taking advantage of that in the near future. Possibly. Maybe. - Removed some potential buffer overflows from the decal parser. - Applied Blzut3's SBARINFO update #9: * Fixed: When using even length values in drawnumber it would cap to a 98 value instead of a 99 as intended. * The SBarInfo parser can now accept negatives for coordinates. This doesn't allow much right now, but later I plan to add better fullscreen hud support in which the negatives will be more useful. This also cleans up the source a bit since all calls for (x, y) coordinates are with the function getCoordinates(). - Added support for stencilling actors. - Added support for non-black colors specified with DTA_ColorOverlay to the software renderer. - Fixed: The inverse, gold, red, and green fixed colormaps each allocated space for 32 different colormaps, even though each only used the first one. - Added two new blending flags to make reverse subtract blending more useful: STYLEF_InvertSource and STYLEF_InvertOverlay. These invert the color that gets blended with the background, since that seems like a good idea for reverse subtraction. They also work with the other two blending operations. - Added subtract and reverse subtract blending operations to the renderer. Since the ERenderStyle enumeration was getting rather unwieldy, I converted it into a new FRenderStyle structure that lets each parameter of the blending equation be set separately. This simplified the set up for the blend quite a bit, and it means a number of new combinations are available by setting the parameters properly. SVN r710 (trunk)
2008-01-25 23:57:44 +00:00
ESPSResult R_SetPatchStyle (FRenderStyle style, fixed_t alpha, int translation, DWORD color);
// Call this after finished drawing the current thing, in case its
// style was STYLE_Shade
void R_FinishSetPatchStyle ();
// transmaskwallscan calls this to find out what column drawers to use
bool R_GetTransMaskDrawers (fixed_t (**tmvline1)(), void (**tmvline4)());
// Retrieve column data for wallscan. Should probably be removed
// to just use the texture's GetColumn() method. It just exists
// for double-layer skies.
const BYTE *R_GetColumn (FTexture *tex, int col);
void wallscan (int x1, int x2, short *uwal, short *dwal, fixed_t *swal, fixed_t *lwal, fixed_t yrepeat, const BYTE *(*getcol)(FTexture *tex, int col)=R_GetColumn);
// maskwallscan is exactly like wallscan but does not draw anything where the texture is color 0.
void maskwallscan (int x1, int x2, short *uwal, short *dwal, fixed_t *swal, fixed_t *lwal, fixed_t yrepeat, const BYTE *(*getcol)(FTexture *tex, int col)=R_GetColumn);
// transmaskwallscan is like maskwallscan, but it can also blend to the background
void transmaskwallscan (int x1, int x2, short *uwal, short *dwal, fixed_t *swal, fixed_t *lwal, fixed_t yrepeat, const BYTE *(*getcol)(FTexture *tex, int col)=R_GetColumn);
#endif