From 1e42c6f227aac0c2248abf9d7a4910caddee354f Mon Sep 17 00:00:00 2001 From: Christoph Oelckers Date: Wed, 7 Dec 2016 11:40:59 +0100 Subject: [PATCH 1/7] - added copyright headers to two files missing them. --- src/r_draw.cpp | 33 +++++++++++++++++++++++++++++++++ src/r_draw_pal.cpp | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+) diff --git a/src/r_draw.cpp b/src/r_draw.cpp index a2bf412e8b..52f5f24e16 100644 --- a/src/r_draw.cpp +++ b/src/r_draw.cpp @@ -1,3 +1,36 @@ +/* +** r_draw.cpp +** +**--------------------------------------------------------------------------- +** Copyright 1998-2016 Randy Heit +** Copyright 2016 Magnus Norddahl +** All rights reserved. +** +** Redistribution and use in source and binary forms, with or without +** modification, are permitted provided that the following conditions +** are met: +** +** 1. Redistributions of source code must retain the above copyright +** notice, this list of conditions and the following disclaimer. +** 2. Redistributions in binary form must reproduce the above copyright +** notice, this list of conditions and the following disclaimer in the +** documentation and/or other materials provided with the distribution. +** 3. The name of the author may not be used to endorse or promote products +** derived from this software without specific prior written permission. +** +** THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR +** IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +** OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +** IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, +** INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT +** NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +** DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +** THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +** (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +** THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**--------------------------------------------------------------------------- +** +*/ #include diff --git a/src/r_draw_pal.cpp b/src/r_draw_pal.cpp index 0264dcbf9e..b508dd221c 100644 --- a/src/r_draw_pal.cpp +++ b/src/r_draw_pal.cpp @@ -1,3 +1,36 @@ +/* +** r_draw_pal.cpp +** +**--------------------------------------------------------------------------- +** Copyright 1998-2016 Randy Heit +** Copyright 2016 Magnus Norddahl +** All rights reserved. +** +** Redistribution and use in source and binary forms, with or without +** modification, are permitted provided that the following conditions +** are met: +** +** 1. Redistributions of source code must retain the above copyright +** notice, this list of conditions and the following disclaimer. +** 2. Redistributions in binary form must reproduce the above copyright +** notice, this list of conditions and the following disclaimer in the +** documentation and/or other materials provided with the distribution. +** 3. The name of the author may not be used to endorse or promote products +** derived from this software without specific prior written permission. +** +** THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR +** IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +** OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +** IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, +** INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT +** NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +** DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +** THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +** (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +** THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**--------------------------------------------------------------------------- +** +*/ #include "templates.h" #include "doomtype.h" From 42346c58d3587fec0fdebd6a29c63b130049ead8 Mon Sep 17 00:00:00 2001 From: Christoph Oelckers Date: Wed, 7 Dec 2016 12:31:43 +0100 Subject: [PATCH 2/7] - disabled assembly entirely to make the MT drawer submission compile. This still requires a review of the two non-drawer functions that get 'lost'. --- src/CMakeLists.txt | 138 --------------------------------------------- src/doomtype.h | 51 ----------------- src/r_main.cpp | 3 - src/v_video.h | 4 -- 4 files changed, 196 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e164a338cb..3f54e0fcf9 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -14,12 +14,6 @@ include( CheckIncludeFiles ) include( CheckLibraryExists ) include( FindPkgConfig ) -if( NOT APPLE ) - option( NO_ASM "Disable assembly code" OFF ) -else() - # At the moment asm code doesn't work with OS X, so disable by default - option( NO_ASM "Disable assembly code" ON ) -endif() if( ZD_CMAKE_COMPILER_IS_GNUCXX_COMPATIBLE ) option( NO_STRIP "Do not strip Release or MinSizeRel builds" ) # At least some versions of Xcode fail if you strip with the linker @@ -114,7 +108,6 @@ if( WIN32 ) ) set( FMOD_INC_PATH_SUFFIXES PATH_SUFFIXES inc ) set( FMOD_LIB_PATH_SUFFIXES PATH_SUFFIXES lib ) - set( NASM_NAMES nasmw nasm ) find_path( D3D_INCLUDE_DIR d3d9.h PATHS ENV DXSDK_DIR @@ -239,7 +232,6 @@ else() endif() endif() endif() - set( NASM_NAMES nasm ) if( NO_GTK ) add_definitions( -DNO_GTK ) @@ -379,105 +371,6 @@ endif() find_package( FluidSynth ) -# Search for NASM - -if( NOT NO_ASM ) - if( UNIX AND X64 ) - find_program( GAS_PATH as ) - - if( GAS_PATH ) - set( ASSEMBLER ${GAS_PATH} ) - else() - message( STATUS "Could not find as. Disabling assembly code." ) - set( NO_ASM ON ) - endif() - else() - find_program( NASM_PATH NAMES ${NASM_NAMES} ) - find_program( YASM_PATH yasm ) - - if( X64 ) - if( YASM_PATH ) - set( ASSEMBLER ${YASM_PATH} ) - else() - message( STATUS "Could not find YASM. Disabling assembly code." ) - set( NO_ASM ON ) - endif() - else() - if( NASM_PATH ) - set( ASSEMBLER ${NASM_PATH} ) - else() - message( STATUS "Could not find NASM. Disabling assembly code." ) - set( NO_ASM ON ) - endif() - endif() - endif() - - # I think the only reason there was a version requirement was because the - # executable name for Windows changed from 0.x to 2.0, right? This is - # how to do it in case I need to do something similar later. - - # execute_process( COMMAND ${NASM_PATH} -v - # OUTPUT_VARIABLE NASM_VER_STRING ) - # string( REGEX REPLACE ".*version ([0-9]+[.][0-9]+).*" "\\1" NASM_VER "${NASM_VER_STRING}" ) - # if( NOT NASM_VER LESS 2 ) - # message( SEND_ERROR "NASM version should be 2 or later. (Installed version is ${NASM_VER}.)" ) - # endif() -endif() - -if( NOT NO_ASM ) - # Valgrind support is meaningless without assembly code. - if( VALGRIND ) - add_definitions( -DVALGRIND_AWARE=1 ) - # If you're Valgrinding, you probably want to keep symbols around. - set( NO_STRIP ON ) - endif() - - # Tell CMake how to assemble our files - if( UNIX ) - set( ASM_OUTPUT_EXTENSION .o ) - if( X64 ) - set( ASM_FLAGS ) - set( ASM_SOURCE_EXTENSION .s ) - else() - if( APPLE ) - set( ASM_FLAGS -fmacho -DM_TARGET_MACHO ) - else() - set( ASM_FLAGS -felf -DM_TARGET_LINUX ) - endif() - set( ASM_FLAGS "${ASM_FLAGS}" -i${CMAKE_CURRENT_SOURCE_DIR}/ ) - set( ASM_SOURCE_EXTENSION .asm ) - endif() - else() - set( ASM_OUTPUT_EXTENSION .obj ) - set( ASM_SOURCE_EXTENSION .asm ) - if( X64 ) - set( ASM_FLAGS -f win64 -DWIN32 -DWIN64 ) - else() - set( ASM_FLAGS -f win32 -DWIN32 -i${CMAKE_CURRENT_SOURCE_DIR}/ ) - endif() - endif() - if( WIN32 AND NOT X64 ) - set( FIXRTEXT fixrtext ) - else() - set( FIXRTEXT "" ) - endif() - message( STATUS "Selected assembler: ${ASSEMBLER}" ) - MACRO( ADD_ASM_FILE indir infile ) - set( ASM_OUTPUT_${infile} "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/zdoom.dir/${indir}/${infile}${ASM_OUTPUT_EXTENSION}" ) - if( WIN32 AND NOT X64 ) - set( FIXRTEXT_${infile} COMMAND ${FIXRTEXT} "${ASM_OUTPUT_${infile}}" ) - else() - set( FIXRTEXT_${infile} COMMAND "" ) - endif() - add_custom_command( OUTPUT ${ASM_OUTPUT_${infile}} - COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/zdoom.dir/${indir} - COMMAND ${ASSEMBLER} ${ASM_FLAGS} -o"${ASM_OUTPUT_${infile}}" "${CMAKE_CURRENT_SOURCE_DIR}/${indir}/${infile}${ASM_SOURCE_EXTENSION}" - ${FIXRTEXT_${infile}} - DEPENDS ${indir}/${infile}.asm ${FIXRTEXT} ) - set( ASM_SOURCES ${ASM_SOURCES} "${ASM_OUTPUT_${infile}}" ) - ENDMACRO() -endif() - # Decide on SSE setup set( SSE_MATTERS NO ) @@ -756,24 +649,6 @@ else() set( OTHER_SYSTEM_SOURCES ${PLAT_WIN32_SOURCES} ${PLAT_OSX_SOURCES} ${PLAT_COCOA_SOURCES} ) endif() -if( NOT ASM_SOURCES ) - set( ASM_SOURCES "" ) -endif() - -if( NO_ASM ) - add_definitions( -DNOASM ) -else() - if( X64 ) - ADD_ASM_FILE( asm_x86_64 tmap3 ) - else() - ADD_ASM_FILE( asm_ia32 a ) - ADD_ASM_FILE( asm_ia32 misc ) - ADD_ASM_FILE( asm_ia32 tmap ) - ADD_ASM_FILE( asm_ia32 tmap2 ) - ADD_ASM_FILE( asm_ia32 tmap3 ) - endif() -endif() - add_custom_command( OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/xlat_parser.c ${CMAKE_CURRENT_BINARY_DIR}/xlat_parser.h COMMAND lemon -C${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/xlat/xlat_parser.y DEPENDS lemon ${CMAKE_CURRENT_SOURCE_DIR}/xlat/xlat_parser.y ) @@ -866,16 +741,6 @@ set( NOT_COMPILED_SOURCE_FILES scripting/zscript/zcc-parse.lemon zcc-parse.c zcc-parse.h - - # We could have the ASM macro add these files, but it wouldn't add all - # platforms. - asm_ia32/a.asm - asm_ia32/misc.asm - asm_ia32/tmap.asm - asm_ia32/tmap2.asm - asm_ia32/tmap3.asm - asm_x86_64/tmap3.asm - asm_x86_64/tmap3.s ) set( FASTMATH_PCH_SOURCES @@ -1208,7 +1073,6 @@ add_executable( zdoom WIN32 MACOSX_BUNDLE ${HEADER_FILES} ${NOT_COMPILED_SOURCE_FILES} __autostart.cpp - ${ASM_SOURCES} ${SYSTEM_SOURCES} ${X86_SOURCES} ${FASTMATH_SOURCES} @@ -1371,8 +1235,6 @@ install(TARGETS zdoom DESTINATION ${INSTALL_PATH} COMPONENT "Game executable") -source_group("Assembly Files\\ia32" REGULAR_EXPRESSION "^${CMAKE_CURRENT_SOURCE_DIR}/asm_ia32/.+") -source_group("Assembly Files\\x86_64" REGULAR_EXPRESSION "^${CMAKE_CURRENT_SOURCE_DIR}/asm_x86_64/.+") source_group("Audio Files" REGULAR_EXPRESSION "^${CMAKE_CURRENT_SOURCE_DIR}/sound/.+") source_group("Audio Files\\OPL Synth" REGULAR_EXPRESSION "^${CMAKE_CURRENT_SOURCE_DIR}/oplsynth/.+") source_group("Audio Files\\OPL Synth\\DOSBox" FILES oplsynth/dosbox/opl.cpp oplsynth/dosbox/opl.h) diff --git a/src/doomtype.h b/src/doomtype.h index a9818df78c..264713d1b7 100644 --- a/src/doomtype.h +++ b/src/doomtype.h @@ -48,57 +48,6 @@ class PClassActor; typedef TMap FClassMap; -// Since this file is included by everything, it seems an appropriate place -// to check the NOASM/USEASM macros. - -// There are three assembly-related macros: -// -// NOASM - Assembly code is disabled -// X86_ASM - Using ia32 assembly code -// X64_ASM - Using amd64 assembly code -// -// Note that these relate only to using the pure assembly code. Inline -// assembly may still be used without respect to these macros, as -// deemed appropriate. - -#ifndef NOASM -// Select the appropriate type of assembly code to use. - -#if defined(_M_IX86) || defined(__i386__) - -#define X86_ASM -#ifdef X64_ASM -#undef X64_ASM -#endif - -#elif defined(_M_X64) || defined(__amd64__) - -#define X64_ASM -#ifdef X86_ASM -#undef X86_ASM -#endif - -#else - -#define NOASM - -#endif - -#endif - -#ifdef NOASM -// Ensure no assembly macros are defined if NOASM is defined. - -#ifdef X86_ASM -#undef X86_ASM -#endif - -#ifdef X64_ASM -#undef X64_ASM -#endif - -#endif - #if defined(_MSC_VER) #define NOVTABLE __declspec(novtable) diff --git a/src/r_main.cpp b/src/r_main.cpp index a6ae47de1b..0ee075140d 100644 --- a/src/r_main.cpp +++ b/src/r_main.cpp @@ -812,9 +812,6 @@ void R_SetupBuffer () { dc_pitch = pitch; R_InitFuzzTable (pitch); -#if defined(X86_ASM) || defined(X64_ASM) - ASM_PatchPitch (); -#endif } dc_destorg = lineptr; for (int i = 0; i < RenderTarget->GetHeight(); i++) diff --git a/src/v_video.h b/src/v_video.h index 971aa6c13d..b72f670947 100644 --- a/src/v_video.h +++ b/src/v_video.h @@ -516,10 +516,6 @@ void V_RefreshViewBorder (); void V_SetBorderNeedRefresh(); -#if defined(X86_ASM) || defined(X64_ASM) -extern "C" void ASM_PatchPitch (void); -#endif - int CheckRatio (int width, int height, int *trueratio=NULL); static inline int CheckRatio (double width, double height) { return CheckRatio(int(width), int(height)); } inline bool IsRatioWidescreen(int ratio) { return (ratio & 3) != 0; } From 5910067c4473a682727d8e1e7cdd92f0ea060260 Mon Sep 17 00:00:00 2001 From: Christoph Oelckers Date: Wed, 7 Dec 2016 14:26:26 +0100 Subject: [PATCH 3/7] - discontinue using the MMX assembly version of DoBlending. Some benchmarking shows that on SSE systems it only harms performance and compared to the intrinsics version the gains are too marginal for something this infrequently called. Doing 100000 calls of DoBlending results in a 5 ms decrease of using assembly vs intrinsics on a 3.4 GHz Core i7, meaning that even on a computer that is 10x slower you can still do 1000 or so blends per frame without a speed hit. --- src/v_palette.cpp | 8 +++++--- src/x86.cpp | 23 +++-------------------- 2 files changed, 8 insertions(+), 23 deletions(-) diff --git a/src/v_palette.cpp b/src/v_palette.cpp index 934a57dd3c..49fbd6cb6f 100644 --- a/src/v_palette.cpp +++ b/src/v_palette.cpp @@ -384,8 +384,8 @@ void InitPalette () R_InitColormaps (); } -extern "C" void DoBlending_MMX (const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a); -extern void DoBlending_SSE2 (const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a); +void DoBlending_MMX (const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a); +void DoBlending_SSE2 (const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a); void DoBlending (const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a) { @@ -395,6 +395,7 @@ void DoBlending (const PalEntry *from, PalEntry *to, int count, int r, int g, in { memcpy (to, from, count * sizeof(DWORD)); } + return; } else if (a == 256) { @@ -405,6 +406,7 @@ void DoBlending (const PalEntry *from, PalEntry *to, int count, int r, int g, in { to[i] = t; } + return; } #if defined(_M_X64) || defined(_M_IX86) || defined(__i386__) || defined(__amd64__) else if (CPU.bSSE2) @@ -423,7 +425,7 @@ void DoBlending (const PalEntry *from, PalEntry *to, int count, int r, int g, in } } #endif -#ifdef X86_ASM +#if defined(_M_IX86) || defined(__i386__) else if (CPU.bMMX) { if (count >= 4) diff --git a/src/x86.cpp b/src/x86.cpp index f6c878da61..17c946ac0f 100644 --- a/src/x86.cpp +++ b/src/x86.cpp @@ -227,10 +227,9 @@ void DumpCPUInfo(const CPUInfo *cpu) } } -#if 0 -// Compiler output for this function is crap compared to the assembly -// version, which is why it isn't used. -void DoBlending_MMX2(const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a) +#if !defined(__amd64__) && !defined(_M_X64) + +void DoBlending_MMX(const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a) { __m64 blendcolor; __m64 blendalpha; @@ -272,9 +271,6 @@ void DoBlending_MMX2(const PalEntry *from, PalEntry *to, int count, int r, int g } #endif -#ifdef X86_ASM -extern "C" void DoBlending_MMX(const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a); -#endif void DoBlending_SSE2(const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a) { @@ -288,17 +284,6 @@ void DoBlending_SSE2(const PalEntry *from, PalEntry *to, int count, int r, int g unaligned = ((size_t)from | (size_t)to) & 0xF; -#ifdef X86_ASM - // For unaligned accesses, the assembly MMX version is slightly faster. - // Note that using unaligned SSE loads and stores is still faster than - // the compiler-generated MMX version. - if (unaligned) - { - DoBlending_MMX(from, to, count, r, g, b, a); - return; - } -#endif - #if defined(__amd64__) || defined(_M_X64) long long color; @@ -326,7 +311,6 @@ void DoBlending_SSE2(const PalEntry *from, PalEntry *to, int count, int r, int g zero = _mm_setzero_si128(); -#ifndef X86_ASM if (unaligned) { for (count >>= 2; count > 0; --count) @@ -346,7 +330,6 @@ void DoBlending_SSE2(const PalEntry *from, PalEntry *to, int count, int r, int g } } else -#endif { for (count >>= 2; count > 0; --count) { From ed141943e1391e4abc2949e5f01feaf6ad53145a Mon Sep 17 00:00:00 2001 From: Christoph Oelckers Date: Wed, 7 Dec 2016 14:39:15 +0100 Subject: [PATCH 4/7] - removed use of BestColor_MMX because there is no measurable improvement at all on a modern system. On top of that this function does not get called nearly often enough to justify the hassle. Like DoBlending this would require hundreds of calls per frame to make any impact that would be measurable. --- src/v_palette.cpp | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/src/v_palette.cpp b/src/v_palette.cpp index 49fbd6cb6f..aa39ba7913 100644 --- a/src/v_palette.cpp +++ b/src/v_palette.cpp @@ -106,20 +106,11 @@ CCMD (bumpgamma) /* Palette management stuff */ /****************************/ -extern "C" BYTE BestColor_MMX (DWORD rgb, const DWORD *pal); - int BestColor (const uint32 *pal_in, int r, int g, int b, int first, int num) { -#ifdef X86_ASM - if (CPU.bMMX) - { - int pre = 256 - num - first; - return BestColor_MMX (((first+pre)<<24)|(r<<16)|(g<<8)|b, pal_in-pre) - pre; - } -#endif const PalEntry *pal = (const PalEntry *)pal_in; int bestcolor = first; - int bestdist = 257*257+257*257+257*257; + int bestdist = 257 * 257 + 257 * 257 + 257 * 257; for (int color = first; color < num; color++) { From a118903e3ef88e093ab5cfb3babef03684cd9608 Mon Sep 17 00:00:00 2001 From: Christoph Oelckers Date: Wed, 7 Dec 2016 14:41:21 +0100 Subject: [PATCH 5/7] - complete removal of assembly stuff. Nothing of this gets used anymore. --- src/asm_ia32/a.asm | 812 ------------------------------ src/asm_ia32/misc.asm | 200 -------- src/asm_ia32/tmap.asm | 1002 -------------------------------------- src/asm_ia32/tmap2.asm | 643 ------------------------ src/asm_ia32/tmap3.asm | 344 ------------- src/asm_x86_64/tmap3.asm | 150 ------ src/asm_x86_64/tmap3.s | 141 ------ 7 files changed, 3292 deletions(-) delete mode 100644 src/asm_ia32/a.asm delete mode 100644 src/asm_ia32/misc.asm delete mode 100644 src/asm_ia32/tmap.asm delete mode 100644 src/asm_ia32/tmap2.asm delete mode 100644 src/asm_ia32/tmap3.asm delete mode 100644 src/asm_x86_64/tmap3.asm delete mode 100644 src/asm_x86_64/tmap3.s diff --git a/src/asm_ia32/a.asm b/src/asm_ia32/a.asm deleted file mode 100644 index 786396d4a4..0000000000 --- a/src/asm_ia32/a.asm +++ /dev/null @@ -1,812 +0,0 @@ -; "Build Engine & Tools" Copyright (c) 1993-1997 Ken Silverman -; Ken Silverman's official web site: "http://www.advsys.net/ken" -; See the included license file "BUILDLIC.TXT" for license info. -; This file has been modified from Ken Silverman's original release - -%include "valgrind.inc" - - SECTION .data - -%ifndef M_TARGET_LINUX -%define ylookup _ylookup -%define vince _vince -%define vplce _vplce -%define palookupoffse _palookupoffse -%define bufplce _bufplce -%define dc_iscale _dc_iscale -%define dc_colormap _dc_colormap -%define dc_count _dc_count -%define dc_dest _dc_dest -%define dc_source _dc_source -%define dc_texturefrac _dc_texturefrac - -%define setupvlineasm _setupvlineasm -%define prevlineasm1 _prevlineasm1 -%define vlineasm1 _vlineasm1 -%define vlineasm4 _vlineasm4 - -%define setupmvlineasm _setupmvlineasm -%define mvlineasm1 _mvlineasm1 -%define mvlineasm4 _mvlineasm4 - -%define R_SetupDrawSlabA _R_SetupDrawSlabA -%define R_DrawSlabA _R_DrawSlabA -%endif - -EXTERN ylookup ; near - -EXTERN vplce ; near -EXTERN vince ; near -EXTERN palookupoffse ; near -EXTERN bufplce ; near - -EXTERN dc_iscale -EXTERN dc_colormap -EXTERN dc_count -EXTERN dc_dest -EXTERN dc_source -EXTERN dc_texturefrac - - SECTION .text - -ALIGN 16 -GLOBAL setvlinebpl_ -setvlinebpl_: - mov [fixchain1a+2], eax - mov [fixchain1b+2], eax - mov [fixchain2a+2], eax - mov [fixchain1m+2], eax - mov [fixchain2ma+2], eax - mov [fixchain2mb+2], eax - selfmod fixchain1a, fixchain2mb+6 - -setdrawslabbpl: - mov dword [voxbpl1+2], eax - mov dword [voxbpl2+2], eax - mov dword [voxbpl3+2], eax - mov dword [voxbpl4+2], eax - mov dword [voxbpl5+2], eax - mov dword [voxbpl6+2], eax - mov dword [voxbpl7+2], eax - mov dword [voxbpl8+2], eax - selfmod voxbpl1, voxpl8+6 - ret - - SECTION .data - -lastslabcolormap: - dd 4 - - SECTION .text - -GLOBAL R_SetupDrawSlabA -GLOBAL @R_SetupDrawSlabA@4 -R_SetupDrawSlabA: - mov ecx, [esp+4] -@R_SetupDrawSlabA@4: - cmp [lastslabcolormap], ecx - je .done - mov [lastslabcolormap], ecx - mov dword [voxpal1+2], ecx - mov dword [voxpal2+2], ecx - mov dword [voxpal3+2], ecx - mov dword [voxpal4+2], ecx - mov dword [voxpal5+2], ecx - mov dword [voxpal6+2], ecx - mov dword [voxpal7+2], ecx - mov dword [voxpal8+2], ecx -.done ret - - -; pass it log2(texheight) - -ALIGN 16 -GLOBAL setupvlineasm -setupvlineasm: - mov ecx, [esp+4] - - ;First 2 lines for VLINEASM1, rest for VLINEASM4 - mov byte [premach3a+2], cl - mov byte [mach3a+2], cl - - mov byte [machvsh1+2], cl ;32-shy - mov byte [machvsh3+2], cl ;32-shy - mov byte [machvsh5+2], cl ;32-shy - mov byte [machvsh6+2], cl ;32-shy - mov ch, cl - sub ch, 16 - mov byte [machvsh8+2], ch ;16-shy - neg cl - mov byte [machvsh7+2], cl ;shy - mov byte [machvsh9+2], cl ;shy - mov byte [machvsh10+2], cl ;shy - mov byte [machvsh11+2], cl ;shy - mov byte [machvsh12+2], cl ;shy - mov eax, 1 - shl eax, cl - dec eax - mov dword [machvsh2+2], eax ;(1<>sh) -;vplc3 = (ebp<<(32-sh))+((edx&65535)<<(16-sh)) -machvsh5: shl esi, 88h ;32-sh - mov eax, edx -machvsh6: shl ebp, 88h ;32-sh - and edx, 0000ffffh -machvsh7: shr eax, 88h ;sh - add esi, eax -machvsh8: shl edx, 88h ;16-sh - add ebp, edx - mov dword [vplce+12], esi - mov dword [vplce+4], ebp - - pop edi - pop esi - pop ebx - pop ebp - ret - -;************************************************************************* -;************************* Masked Vertical Lines ************************* -;************************************************************************* - -; pass it log2(texheight) - -ALIGN 16 -GLOBAL setupmvlineasm -setupmvlineasm: - mov ecx, dword [esp+4] - mov byte [maskmach3a+2], cl - mov byte [machmv13+2], cl - - mov byte [machmv14+2], cl - mov byte [machmv15+2], cl - mov byte [machmv16+2], cl - selfmod maskmach3a, machmv13+6 - ret - -ALIGN 16 -GLOBAL mvlineasm1 ;Masked vline -mvlineasm1: - push ebx - push edi - push esi - push ebp - mov ecx, [dc_count] - mov ebp, [dc_colormap] - mov edi, [dc_dest] - mov eax, [dc_iscale] - mov edx, [dc_texturefrac] - mov esi, [dc_source] -beginmvline: - mov ebx, edx -maskmach3a: shr ebx, 32 - movzx ebx, byte [esi+ebx] - cmp ebx, 0 - je short skipmask1 -maskmach3c: mov bl, byte [ebp+ebx] - mov [edi], bl -skipmask1: add edx, eax -fixchain1m: add edi, 320 - dec ecx - jnz short beginmvline - - pop ebp - pop esi - pop edi - pop ebx - mov eax, edx - ret - -ALIGN 16 -GLOBAL mvlineasm4 -mvlineasm4: - push ebx - push esi - push edi - push ebp - - mov ecx,[dc_count] - mov edi,[dc_dest] - - mov eax, [bufplce+0] - mov ebx, [bufplce+4] - mov [machmv1+3], eax - mov [machmv4+3], ebx - mov eax, [bufplce+8] - mov ebx, [bufplce+12] - mov [machmv7+3], eax - mov [machmv10+3], ebx - - mov eax, [palookupoffse] - mov ebx, [palookupoffse+4] - mov [machmv2+2], eax - mov [machmv5+2], ebx - mov eax, [palookupoffse+8] - mov ebx, [palookupoffse+12] - mov [machmv8+2], eax - mov [machmv11+2], ebx - - mov eax, [vince] ;vince - mov ebx, [vince+4] - xor bl, bl - mov [machmv3+2], eax - mov [machmv6+2], ebx - mov eax, [vince+8] - mov ebx, [vince+12] - mov [machmv9+2], eax - mov [machmv12+2], ebx - - inc ecx - push ecx - mov ecx, [vplce+0] - mov edx, [vplce+4] - mov esi, [vplce+8] - mov ebp, [vplce+12] -fixchain2ma: sub edi, 320 - - selfmod beginmvlineasm4, machmv2+6 - jmp short beginmvlineasm4 -ALIGN 16 -beginmvlineasm4: - dec dword [esp] - jz near endmvlineasm4 - - mov eax, ebp - mov ebx, esi -machmv16: shr eax, 32 -machmv12: add ebp, 0x88888888 ;vince[3] -machmv15: shr ebx, 32 -machmv9: add esi, 0x88888888 ;vince[2] -machmv10: movzx eax, byte [eax+0x88888888];bufplce[3] -machmv7: movzx ebx, byte [ebx+0x88888888];bufplce[2] - cmp eax, 1 - adc dl, dl - cmp ebx, 1 - adc dl, dl -machmv8: mov bl, [ebx+0x88888888] ;palookupoffs[2] -machmv11: mov bh, [eax+0x88888888] ;palookupoffs[3] - - mov eax, edx -machmv6: add edx, 0x88888888 ;vince[1] -machmv14: shr eax, 32 - shl ebx, 16 -machmv4: movzx eax, byte [eax+0x88888888];bufplce[1] - cmp eax, 1 - adc dl, dl -machmv5: mov bh, [eax+0x88888888] ;palookupoffs[1] - - mov eax, ecx -machmv3: add ecx, 0x88888888 ;vince[0] -machmv13: shr eax, 32 -machmv1: movzx eax, byte [eax+0x88888888];bufplce[0] - cmp eax, 1 - adc dl, dl -machmv2: mov bl, [eax+0x88888888] ;palookupoffs[0] - - xor eax, eax - shl dl, 4 -fixchain2mb: add edi, 320 - mov al, dl - add eax, mvcase15 - jmp eax ;16 byte cases - -ALIGN 16 -endmvlineasm4: - mov [vplce], ecx - mov [vplce+4], edx - mov [vplce+8], esi - mov [vplce+12], ebp - pop ecx - pop ebp - pop edi - pop esi - pop ebx - ret - - ;5,7,8,8,11,13,12,14,11,13,14,14,12,14,15,7 -ALIGN 16 -mvcase15: mov [edi], ebx - jmp beginmvlineasm4 -ALIGN 16 -mvcase14: mov [edi+1], bh - shr ebx, 16 - mov [edi+2], bx - jmp beginmvlineasm4 -ALIGN 16 -mvcase13: mov [edi], bl - shr ebx, 16 - mov [edi+2], bx - jmp beginmvlineasm4 -ALIGN 16 -mvcase12: shr ebx, 16 - mov [edi+2], bx - jmp beginmvlineasm4 -ALIGN 16 -mvcase11: mov [edi], bx - shr ebx, 16 - mov [edi+3], bh - jmp beginmvlineasm4 -ALIGN 16 -mvcase10: mov [edi+1], bh - shr ebx, 16 - mov [edi+3], bh - jmp beginmvlineasm4 -ALIGN 16 -mvcase9: mov [edi], bl - shr ebx, 16 - mov [edi+3], bh - jmp beginmvlineasm4 -ALIGN 16 -mvcase8: shr ebx, 16 - mov [edi+3], bh - jmp beginmvlineasm4 -ALIGN 16 -mvcase7: mov [edi], bx - shr ebx, 16 - mov [edi+2], bl - jmp beginmvlineasm4 -ALIGN 16 -mvcase6: shr ebx, 8 - mov [edi+1], bx - jmp beginmvlineasm4 -ALIGN 16 -mvcase5: mov [edi], bl - shr ebx, 16 - mov [edi+2], bl - jmp beginmvlineasm4 -ALIGN 16 -mvcase4: shr ebx, 16 - mov [edi+2], bl - jmp beginmvlineasm4 -ALIGN 16 -mvcase3: mov [edi], bx - jmp beginmvlineasm4 -ALIGN 16 -mvcase2: mov [edi+1], bh - jmp beginmvlineasm4 -ALIGN 16 -mvcase1: mov [edi], bl - jmp beginmvlineasm4 -ALIGN 16 -mvcase0: jmp beginmvlineasm4 - -align 16 - - -;************************************************************************* -;***************************** Voxel Slabs ******************************* -;************************************************************************* - -GLOBAL R_DrawSlabA -R_DrawSlabA: - push ebx - push ebp - push esi - push edi - - mov eax, [esp+5*4+0] - mov ebx, [esp+5*4+4] - mov ecx, [esp+5*4+8] - mov edx, [esp+5*4+12] - mov esi, [esp+5*4+16] - mov edi, [esp+5*4+20] - - cmp eax, 2 - je voxbegdraw2 - ja voxskip2 - xor eax, eax -voxbegdraw1: - mov ebp, ebx - shr ebp, 16 - add ebx, edx - dec ecx - mov al, byte [esi+ebp] -voxpal1: mov al, byte [eax+88888888h] - mov byte [edi], al -voxbpl1: lea edi, [edi+88888888h] - jnz voxbegdraw1 - jmp voxskipslab5 - -voxbegdraw2: - mov ebp, ebx - shr ebp, 16 - add ebx, edx - xor eax, eax - dec ecx - mov al, byte [esi+ebp] -voxpal2: mov al, byte [eax+88888888h] - mov ah, al - mov word [edi], ax -voxbpl2: lea edi, [edi+88888888h] - jnz voxbegdraw2 - jmp voxskipslab5 - -voxskip2: - cmp eax, 4 - jne voxskip4 - xor eax, eax -voxbegdraw4: - mov ebp, ebx - add ebx, edx - shr ebp, 16 - xor eax, eax - mov al, byte [esi+ebp] -voxpal3: mov al, byte [eax+88888888h] - mov ah, al - shl eax, 8 - mov al, ah - shl eax, 8 - mov al, ah - mov dword [edi], eax -voxbpl3: add edi, 88888888h - dec ecx - jnz voxbegdraw4 - jmp voxskipslab5 - -voxskip4: - add eax, edi - - test edi, 1 - jz voxskipslab1 - cmp edi, eax - je voxskipslab1 - - push eax - push ebx - push ecx - push edi -voxbegslab1: - mov ebp, ebx - add ebx, edx - shr ebp, 16 - xor eax, eax - mov al, byte [esi+ebp] -voxpal4: mov al, byte [eax+88888888h] - mov byte [edi], al -voxbpl4: add edi, 88888888h - dec ecx - jnz voxbegslab1 - pop edi - pop ecx - pop ebx - pop eax - inc edi - -voxskipslab1: - push eax - test edi, 2 - jz voxskipslab2 - dec eax - cmp edi, eax - jge voxskipslab2 - - push ebx - push ecx - push edi -voxbegslab2: - mov ebp, ebx - add ebx, edx - shr ebp, 16 - xor eax, eax - mov al, byte [esi+ebp] -voxpal5: mov al, byte [eax+88888888h] - mov ah, al - mov word [edi], ax -voxbpl5: add edi, 88888888h - dec ecx - jnz voxbegslab2 - pop edi - pop ecx - pop ebx - add edi, 2 - -voxskipslab2: - mov eax, [esp] - - sub eax, 3 - cmp edi, eax - jge voxskipslab3 - -voxprebegslab3: - push ebx - push ecx - push edi -voxbegslab3: - mov ebp, ebx - add ebx, edx - shr ebp, 16 - xor eax, eax - mov al, byte [esi+ebp] -voxpal6: mov al, byte [eax+88888888h] - mov ah, al - shl eax, 8 - mov al, ah - shl eax, 8 - mov al, ah - mov dword [edi], eax -voxbpl6: add edi, 88888888h - dec ecx - jnz voxbegslab3 - pop edi - pop ecx - pop ebx - add edi, 4 - - mov eax, [esp] - - sub eax, 3 - cmp edi, eax - jl voxprebegslab3 - -voxskipslab3: - mov eax, [esp] - - dec eax - cmp edi, eax - jge voxskipslab4 - - push ebx - push ecx - push edi -voxbegslab4: - mov ebp, ebx - add ebx, edx - shr ebp, 16 - xor eax, eax - mov al, byte [esi+ebp] -voxpal7: mov al, byte [eax+88888888h] - mov ah, al - mov word [edi], ax -voxbpl7: add edi, 88888888h - dec ecx - jnz voxbegslab4 - pop edi - pop ecx - pop ebx - add edi, 2 - -voxskipslab4: - pop eax - - cmp edi, eax - je voxskipslab5 - -voxbegslab5: - mov ebp, ebx - add ebx, edx - shr ebp, 16 - xor eax, eax - mov al, byte [esi+ebp] -voxpal8: mov al, byte [eax+88888888h] - mov byte [edi], al -voxbpl8: add edi, 88888888h - dec ecx - jnz voxbegslab5 - -voxskipslab5: - pop edi - pop esi - pop ebp - pop ebx - ret - -align 16 - -%ifdef M_TARGET_MACHO -GLOBAL _rtext_a_end -_rtext_a_end: -%endif diff --git a/src/asm_ia32/misc.asm b/src/asm_ia32/misc.asm deleted file mode 100644 index b825a4d02a..0000000000 --- a/src/asm_ia32/misc.asm +++ /dev/null @@ -1,200 +0,0 @@ -;* -;* misc.nas -;* Miscellaneous assembly functions -;* -;*--------------------------------------------------------------------------- -;* Copyright 1998-2006 Randy Heit -;* All rights reserved. -;* -;* Redistribution and use in source and binary forms, with or without -;* modification, are permitted provided that the following conditions -;* are met: -;* -;* 1. Redistributions of source code must retain the above copyright -;* notice, this list of conditions and the following disclaimer. -;* 2. Redistributions in binary form must reproduce the above copyright -;* notice, this list of conditions and the following disclaimer in the -;* documentation and/or other materials provided with the distribution. -;* 3. The name of the author may not be used to endorse or promote products -;* derived from this software without specific prior written permission. -;* -;* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR -;* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES -;* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. -;* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, -;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT -;* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -;* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -;* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -;* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF -;* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -;*--------------------------------------------------------------------------- -;* - -BITS 32 - -%ifndef M_TARGET_LINUX - -%define DoBlending_MMX _DoBlending_MMX -%define BestColor_MMX _BestColor_MMX - -%endif - -%ifdef M_TARGET_WATCOM - SEGMENT DATA PUBLIC ALIGN=16 CLASS=DATA USE32 - SEGMENT DATA -%else - SECTION .data -%endif - -Blending256: - dd 0x01000100,0x00000100 - -%ifdef M_TARGET_WATCOM - SEGMENT CODE PUBLIC ALIGN=16 CLASS=CODE USE32 - SEGMENT CODE -%else - SECTION .text -%endif - -;----------------------------------------------------------- -; -; DoBlending_MMX -; -; MMX version of DoBlending -; -; (DWORD *from, DWORD *to, count, tor, tog, tob, toa) -;----------------------------------------------------------- - -GLOBAL DoBlending_MMX - -DoBlending_MMX: - pxor mm0,mm0 ; mm0 = 0 - mov eax,[esp+4*4] - shl eax,16 - mov edx,[esp+4*5] - shl edx,8 - or eax,[esp+4*6] - or eax,edx - mov ecx,[esp+4*3] ; ecx = count - movd mm1,eax ; mm1 = 00000000 00RRGGBB - mov eax,[esp+4*7] - shl eax,16 - mov edx,[esp+4*7] - shl edx,8 - or eax,[esp+4*7] - or eax,edx - mov edx,[esp+4*2] ; edx = dest - movd mm6,eax ; mm6 = 00000000 00AAAAAA - punpcklbw mm1,mm0 ; mm1 = 000000RR 00GG00BB - movq mm7,[Blending256] - punpcklbw mm6,mm0 ; mm6 = 000000AA 00AA00AA - mov eax,[esp+4*1] ; eax = source - pmullw mm1,mm6 ; mm1 = 000000RR 00GG00BB (multiplied by alpha) - psubusw mm7,mm6 ; mm7 = 000000aa 00aa00aa (one minus alpha) - nop ; Does this actually pair on a Pentium? - -; Do four colors per iteration: Count must be a multiple of four. - -.loop movq mm2,[eax] ; mm2 = 00r2g2b2 00r1g1b1 - add eax,8 - movq mm3,mm2 ; mm3 = 00r2g2b2 00r1g1b1 - punpcklbw mm2,mm0 ; mm2 = 000000r1 00g100b1 - punpckhbw mm3,mm0 ; mm3 = 000000r2 00g200b2 - pmullw mm2,mm7 ; mm2 = 0000r1rr g1ggb1bb - add edx,8 - pmullw mm3,mm7 ; mm3 = 0000r2rr g2ggb2bb - sub ecx,2 - paddusw mm2,mm1 - psrlw mm2,8 - paddusw mm3,mm1 - psrlw mm3,8 - packuswb mm2,mm3 ; mm2 = 00r2g2b2 00r1g1b1 - movq [edx-8],mm2 - - movq mm2,[eax] ; mm2 = 00r2g2b2 00r1g1b1 - add eax,8 - movq mm3,mm2 ; mm3 = 00r2g2b2 00r1g1b1 - punpcklbw mm2,mm0 ; mm2 = 000000r1 00g100b1 - punpckhbw mm3,mm0 ; mm3 = 000000r2 00g200b2 - pmullw mm2,mm7 ; mm2 = 0000r1rr g1ggb1bb - add edx,8 - pmullw mm3,mm7 ; mm3 = 0000r2rr g2ggb2bb - sub ecx,2 - paddusw mm2,mm1 - psrlw mm2,8 - paddusw mm3,mm1 - psrlw mm3,8 - packuswb mm2,mm3 ; mm2 = 00r2g2b2 00r1g1b1 - movq [edx-8],mm2 - - jnz .loop - - emms - ret - -;----------------------------------------------------------- -; -; BestColor_MMX -; -; Picks the closest matching color from a palette -; -; Passed FFRRGGBB and palette array in same format -; FF is the index of the first palette entry to consider -; -;----------------------------------------------------------- - -GLOBAL BestColor_MMX -GLOBAL @BestColor_MMX@8 - -BestColor_MMX: - mov ecx,[esp+4] - mov edx,[esp+8] -@BestColor_MMX@8: - pxor mm0,mm0 - movd mm1,ecx ; mm1 = color searching for - mov eax,257*257+257*257+257*257 ;eax = bestdist - push ebx - punpcklbw mm1,mm0 - mov ebx,ecx ; ebx = best color - shr ecx,24 ; ecx = count - and ebx,0xffffff - push esi - push ebp - -.loop movd mm2,[edx+ecx*4] ; mm2 = color considering now - inc ecx - punpcklbw mm2,mm0 - movq mm3,mm1 - psubsw mm3,mm2 - pmullw mm3,mm3 ; mm3 = color distance squared - - movd ebp,mm3 ; add the three components - psrlq mm3,32 ; into ebp to get the real - mov esi,ebp ; (squared) distance - shr esi,16 - and ebp,0xffff - add ebp,esi - movd esi,mm3 - add ebp,esi - - jz .perf ; found a perfect match - cmp eax,ebp - jb .skip - mov eax,ebp - lea ebx,[ecx-1] -.skip cmp ecx,256 - jne .loop - mov eax,ebx - pop ebp - pop esi - pop ebx - emms - ret - -.perf lea eax,[ecx-1] - pop ebp - pop esi - pop ebx - emms - ret diff --git a/src/asm_ia32/tmap.asm b/src/asm_ia32/tmap.asm deleted file mode 100644 index 2096b92229..0000000000 --- a/src/asm_ia32/tmap.asm +++ /dev/null @@ -1,1002 +0,0 @@ -;* -;* tmap.nas -;* The texture-mapping inner loops in pure assembly language. -;* -;*--------------------------------------------------------------------------- -;* Copyright 1998-2006 Randy Heit -;* All rights reserved. -;* -;* Redistribution and use in source and binary forms, with or without -;* modification, are permitted provided that the following conditions -;* are met: -;* -;* 1. Redistributions of source code must retain the above copyright -;* notice, this list of conditions and the following disclaimer. -;* 2. Redistributions in binary form must reproduce the above copyright -;* notice, this list of conditions and the following disclaimer in the -;* documentation and/or other materials provided with the distribution. -;* 3. The name of the author may not be used to endorse or promote products -;* derived from this software without specific prior written permission. -;* -;* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR -;* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES -;* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. -;* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, -;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT -;* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -;* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -;* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -;* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF -;* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -;*--------------------------------------------------------------------------- -;* - -BITS 32 - -%include "valgrind.inc" - -; Segment/section definition macros. - - SECTION .data - -%define SPACEFILLER4 (0x44444444) - -; If you change this in r_draw.c, be sure to change it here, too! -FUZZTABLE equ 50 - -%ifndef M_TARGET_LINUX - -%define ylookup _ylookup -%define centery _centery -%define fuzzpos _fuzzpos -%define fuzzoffset _fuzzoffset -%define NormalLight _NormalLight -%define viewheight _viewheight -%define fuzzviewheight _fuzzviewheight -%define CPU _CPU - -%define dc_pitch _dc_pitch -%define dc_colormap _dc_colormap -%define dc_color _dc_color -%define dc_iscale _dc_iscale -%define dc_texturefrac _dc_texturefrac -%define dc_srcblend _dc_srcblend -%define dc_destblend _dc_destblend -%define dc_source _dc_source -%define dc_yl _dc_yl -%define dc_yh _dc_yh -%define dc_x _dc_x -%define dc_count _dc_count -%define dc_dest _dc_dest -%define dc_destorg _dc_destorg - -%define Col2RGB8 _Col2RGB8 -%define RGB32k _RGB32k - -%define dc_ctspan _dc_ctspan -%define dc_temp _dc_temp - -%define ds_xstep _ds_xstep -%define ds_ystep _ds_ystep -%define ds_colormap _ds_colormap -%define ds_source _ds_source -%define ds_x1 _ds_x1 -%define ds_x2 _ds_x2 -%define ds_xfrac _ds_xfrac -%define ds_yfrac _ds_yfrac -%define ds_y _ds_y - -%define ds_cursource _ds_cursource -%define ds_curcolormap _ds_curcolormap - -%define R_SetSpanSource_ASM _R_SetSpanSource_ASM -%define R_SetSpanSize_ASM _R_SetSpanSize_ASM -%define R_SetSpanColormap_ASM _R_SetSpanColormap_ASM -%define R_SetupShadedCol _R_SetupShadedCol -%define R_SetupAddCol _R_SetupAddCol -%define R_SetupAddClampCol _R_SetupAddClampCol - -%endif - -EXTERN ylookup -EXTERN centery -EXTERN fuzzpos -EXTERN fuzzoffset -EXTERN NormalLight -EXTERN viewheight -EXTERN fuzzviewheight -EXTERN CPU - -EXTERN dc_pitch -EXTERN dc_colormap -EXTERN dc_color -EXTERN dc_iscale -EXTERN dc_texturefrac -EXTERN dc_srcblend -EXTERN dc_destblend -EXTERN dc_source -EXTERN dc_yl -EXTERN dc_yh -EXTERN dc_x -EXTERN dc_count -EXTERN dc_dest -EXTERN dc_destorg - -EXTERN dc_ctspan -EXTERN dc_temp - -EXTERN Col2RGB8 -EXTERN RGB32k - -EXTERN ds_xstep -EXTERN ds_ystep -EXTERN ds_colormap -EXTERN ds_source -EXTERN ds_x1 -EXTERN ds_x2 -EXTERN ds_xfrac -EXTERN ds_yfrac -EXTERN ds_y - -GLOBAL ds_cursource -GLOBAL ds_curcolormap - - -ds_cursource: - DD 0 - -ds_curcolormap: - DD 0 - - -; Local stuff: -lastAddress DD 0 -pixelcount DD 0 - - SECTION .text - - -GLOBAL @R_SetSpanSource_ASM@4 -GLOBAL R_SetSpanSource_ASM - -R_SetSpanSource_ASM: - mov ecx,[esp+4] - -@R_SetSpanSource_ASM@4: - mov [spreada+2],ecx - mov [spreadb+2],ecx - mov [spreadc+2],ecx - mov [spreadd+2],ecx - mov [spreade+2],ecx - mov [spreadf+2],ecx - mov [spreadg+2],ecx - - mov [mspreada+2],ecx - mov [mspreadb+2],ecx - mov [mspreadc+2],ecx - mov [mspreadd+2],ecx - mov [mspreade+2],ecx - mov [mspreadf+2],ecx - mov [mspreadg+2],ecx - - selfmod spreada, mspreadg+6 - - mov [ds_cursource],ecx - ret - -GLOBAL @R_SetSpanColormap_ASM@4 -GLOBAL R_SetSpanColormap_ASM - -R_SetSpanColormap_ASM: - mov ecx,[esp+4] - -@R_SetSpanColormap_ASM@4: - mov [spmapa+2],ecx - mov [spmapb+2],ecx - mov [spmapc+2],ecx - mov [spmapd+2],ecx - mov [spmape+2],ecx - mov [spmapf+2],ecx - mov [spmapg+2],ecx - - mov [mspmapa+2],ecx - mov [mspmapb+2],ecx - mov [mspmapc+2],ecx - mov [mspmapd+2],ecx - mov [mspmape+2],ecx - mov [mspmapf+2],ecx - mov [mspmapg+2],ecx - - selfmod spmapa, mspmapg+6 - - mov [ds_curcolormap],ecx - ret - -GLOBAL R_SetSpanSize_ASM - -EXTERN SetTiltedSpanSize - -R_SetSpanSize_ASM: - mov edx,[esp+4] - mov ecx,[esp+8] - call SetTiltedSpanSize - - mov [dsy1+2],dl - mov [dsy2+2],dl - - mov [dsx1+2],cl - mov [dsx2+2],cl - mov [dsx3+2],cl - mov [dsx4+2],cl - mov [dsx5+2],cl - mov [dsx6+2],cl - mov [dsx7+2],cl - - mov [dmsy1+2],dl - mov [dmsy2+2],dl - - mov [dmsx1+2],cl - mov [dmsx2+2],cl - mov [dmsx3+2],cl - mov [dmsx4+2],cl - mov [dmsx5+2],cl - mov [dmsx6+2],cl - mov [dmsx7+2],cl - - push ecx - add ecx,edx - mov eax,1 - shl eax,cl - dec eax - mov [dsm1+2],eax - mov [dsm5+1],eax - mov [dsm6+1],eax - mov [dsm7+1],eax - - mov [dmsm1+2],eax - mov [dmsm5+1],eax - mov [dmsm6+1],eax - mov [dmsm7+1],eax - pop ecx - ror eax,cl - mov [dsm2+2],eax - mov [dsm3+2],eax - mov [dsm4+2],eax - - mov [dmsm2+2],eax - mov [dmsm3+2],eax - mov [dmsm4+2],eax - and eax,0xffff - not eax - mov [dsm8+2],eax - mov [dsm9+2],eax - - mov [dmsm8+2],eax - mov [dmsm9+2],eax - - neg dl - mov [dsy3+2],dl - mov [dsy4+2],dl - - mov [dmsy3+2],dl - mov [dmsy4+2],dl - - selfmod dsy1, dmsm7+6 - -aret: ret - -%ifdef M_TARGET_MACHO - SECTION .text align=64 -%else - SECTION .rtext progbits alloc exec write align=64 -%endif - -%ifdef M_TARGET_MACHO -GLOBAL _rtext_tmap_start -_rtext_tmap_start: -%endif - -rtext_start: - -GLOBAL @R_DrawSpanP_ASM@0 -GLOBAL _R_DrawSpanP_ASM -GLOBAL R_DrawSpanP_ASM - -; eax: scratch -; ebx: zero -; ecx: yfrac at top end, xfrac int part at low end -; edx: xfrac frac part at top end -; edi: dest -; ebp: scratch -; esi: count -; [esp]: xstep -; [esp+4]: ystep - - align 16 - -@R_DrawSpanP_ASM@0: -_R_DrawSpanP_ASM: -R_DrawSpanP_ASM: - mov eax,[ds_x2] - mov ecx,[ds_x1] - sub eax,ecx - jl near rdspret ; count < 0: nothing to do, so leave - - push ebx - push edi - push ebp - push esi - sub esp, 8 - - mov edi,ecx - add edi,[dc_destorg] - mov ecx,[ds_y] - add edi,[ylookup+ecx*4] - mov edx,[ds_xstep] -dsy1: shl edx,6 - mov ebp,[ds_xstep] -dsy3: shr ebp,26 - xor ebx,ebx - lea esi,[eax+1] - mov [esp],edx - mov edx,[ds_ystep] - mov ecx,[ds_xfrac] -dsy4: shr ecx,26 -dsm8: and edx,strict dword 0xffffffc0 - or ebp,edx - mov [esp+4],ebp - mov ebp,[ds_yfrac] - mov edx,[ds_xfrac] -dsy2: shl edx,6 -dsm9: and ebp,strict dword 0xffffffc0 - or ecx,ebp - shr esi,1 - jnc dseven1 - -; do odd pixel - - mov ebp,ecx -dsx1: rol ebp,6 -dsm1: and ebp,0xfff - add edx,[esp] - adc ecx,[esp+4] -spreada mov bl,[ebp+SPACEFILLER4] -spmapa mov bl,[ebx+SPACEFILLER4] - mov [edi],bl - inc edi - -dseven1 shr esi,1 - jnc dsrest - -; do two more pixels - mov ebp,ecx - add edx,[esp] - adc ecx,[esp+4] -dsm2: and ebp,0xfc00003f -dsx2: rol ebp,6 - mov eax,ecx - add edx,[esp] - adc ecx,[esp+4] -spreadb mov bl,[ebp+SPACEFILLER4] ;read texel1 -dsx3: rol eax,6 -dsm6: and eax,0xfff -spmapb mov bl,[ebx+SPACEFILLER4] ;map texel1 - mov [edi],bl ;store texel1 - add edi,2 -spreadc mov bl,[eax+SPACEFILLER4] ;read texel2 -spmapc mov bl,[ebx+SPACEFILLER4] ;map texel2 - mov [edi-1],bl ;store texel2 - -; do the rest - -dsrest test esi,esi - jz near dsdone - - align 16 - -dsloop mov ebp,ecx -spstep1d add edx,[esp] -spstep2d adc ecx,[esp+4] -dsm3: and ebp,0xfc00003f -dsx4: rol ebp,6 - mov eax,ecx -spstep1e add edx,[esp] -spstep2e adc ecx,[esp+4] -spreadd mov bl,[ebp+SPACEFILLER4] ;read texel1 -dsx5: rol eax,6 -dsm5: and eax,0xfff -spmapd mov bl,[ebx+SPACEFILLER4] ;map texel1 - mov [edi],bl ;store texel1 - mov ebp,ecx -spreade mov bl,[eax+SPACEFILLER4] ;read texel2 -spstep1f add edx,[esp] -spstep2f adc ecx,[esp+4] -dsm4: and ebp,0xfc00003f -dsx6: rol ebp,6 -spmape mov bl,[ebx+SPACEFILLER4] ;map texel2 - mov eax,ecx - mov [edi+1],bl ;store texel2 -spreadf mov bl,[ebp+SPACEFILLER4] ;read texel3 -spmapf mov bl,[ebx+SPACEFILLER4] ;map texel3 - add edi,4 -dsx7: rol eax,6 -dsm7: and eax,0xfff - mov [edi-2],bl ;store texel3 -spreadg mov bl,[eax+SPACEFILLER4] ;read texel4 -spstep1g add edx,[esp] -spstep2g adc ecx,[esp+4] -spmapg mov bl,[ebx+SPACEFILLER4] ;map texel4 - dec esi - mov [edi-1],bl ;store texel4 - jnz near dsloop - -dsdone add esp,8 - pop esi - pop ebp - pop edi - pop ebx - -rdspret ret - -; This is the same as the previous routine, except it doesn't draw pixels -; where the texture's color value is 0. - -GLOBAL @R_DrawSpanMaskedP_ASM@0 -GLOBAL _R_DrawSpanMaskedP_ASM -GLOBAL R_DrawSpanMaskedP_ASM - -; eax: scratch -; ebx: zero -; ecx: yfrac at top end, xfrac int part at low end -; edx: xfrac frac part at top end -; edi: dest -; ebp: scratch -; esi: count -; [esp]: xstep -; [esp+4]: ystep - - align 16 - -@R_DrawSpanMaskedP_ASM@0: -_R_DrawSpanMaskedP_ASM: -R_DrawSpanMaskedP_ASM: - mov eax,[ds_x2] - mov ecx,[ds_x1] - sub eax,ecx - jl rdspret ; count < 0: nothing to do, so leave - - push ebx - push edi - push ebp - push esi - sub esp,8 - - mov edi,ecx - add edi,[dc_destorg] - mov ecx,[ds_y] - add edi,[ylookup+ecx*4] - mov edx,[ds_xstep] -dmsy1: shl edx,6 - mov ebp,[ds_xstep] -dmsy3: shr ebp,26 - xor ebx,ebx - lea esi,[eax+1] - mov [esp],edx - mov edx,[ds_ystep] - mov ecx,[ds_xfrac] -dmsy4: shr ecx,26 -dmsm8: and edx,strict dword 0xffffffc0 - or ebp,edx - mov [esp+4],ebp - mov ebp,[ds_yfrac] - mov edx,[ds_xfrac] -dmsy2: shl edx,6 -dmsm9: and ebp,strict dword 0xffffffc0 - or ecx,ebp - shr esi,1 - jnc dmseven1 - -; do odd pixel - - mov ebp,ecx -dmsx1: rol ebp,6 -dmsm1: and ebp,0xfff - add edx,[esp] - adc ecx,[esp+4] -mspreada mov bl,[ebp+SPACEFILLER4] - cmp bl,0 - je mspskipa -mspmapa mov bl,[ebx+SPACEFILLER4] - mov [edi],bl -mspskipa: inc edi - -dmseven1 shr esi,1 - jnc dmsrest - -; do two more pixels - mov ebp,ecx - add edx,[esp] - adc ecx,[esp+4] -dmsm2: and ebp,0xfc00003f -dmsx2: rol ebp,6 - mov eax,ecx - add edx,[esp] - adc ecx,[esp+4] -mspreadb mov bl,[ebp+SPACEFILLER4] ;read texel1 -dmsx3: rol eax,6 -dmsm6: and eax,0xfff - cmp bl,0 - je mspskipb -mspmapb mov bl,[ebx+SPACEFILLER4] ;map texel1 - mov [edi],bl ;store texel1 -mspskipb add edi,2 -mspreadc mov bl,[eax+SPACEFILLER4] ;read texel2 - cmp bl,0 - je dmsrest -mspmapc mov bl,[ebx+SPACEFILLER4] ;map texel2 - mov [edi-1],bl ;store texel2 - -; do the rest - -dmsrest test esi,esi - jz near dmsdone - - align 16 - -dmsloop mov ebp,ecx -mspstep1d add edx,[esp] -mspstep2d adc ecx,[esp+4] -dmsm3: and ebp,0xfc00003f -dmsx4: rol ebp,6 - mov eax,ecx -mspstep1e add edx,[esp] -mspstep2e adc ecx,[esp+4] -mspreadd mov bl,[ebp+SPACEFILLER4] ;read texel1 -dmsx5: rol eax,6 -dmsm5: and eax,0xfff - cmp bl,0 - mov ebp,ecx - je mspreade -mspmapd mov bl,[ebx+SPACEFILLER4] ;map texel1 - mov [edi],bl ;store texel1 -mspreade mov bl,[eax+SPACEFILLER4] ;read texel2 -mspstep1f add edx,[esp] -mspstep2f adc ecx,[esp+4] -dmsm4: and ebp,0xfc00003f -dmsx6: rol ebp,6 - cmp bl,0 - mov eax,ecx - je mspreadf -mspmape mov bl,[ebx+SPACEFILLER4] ;map texel2 - mov [edi+1],bl ;store texel2 -mspreadf mov bl,[ebp+SPACEFILLER4] ;read texel3 - add edi,4 -dmsx7: rol eax,6 -dmsm7: and eax,0xfff - cmp bl,0 - je mspreadg -mspmapf mov bl,[ebx+SPACEFILLER4] ;map texel3 - mov [edi-2],bl ;store texel3 -mspreadg mov bl,[eax+SPACEFILLER4] ;read texel4 -mspstep1g add edx,[esp] -mspstep2g adc ecx,[esp+4] - cmp bl,0 - je mspskipg -mspmapg mov bl,[ebx+SPACEFILLER4] ;map texel4 - mov [edi-1],bl ;store texel4 -mspskipg dec esi - jnz near dmsloop - -dmsdone add esp,8 - pop esi - pop ebp - pop edi - pop ebx - - ret - - - - -GLOBAL rt_shaded4cols_asm -GLOBAL _rt_shaded4cols_asm - -rt_shaded4cols_asm: -_rt_shaded4cols_asm: - mov ecx,[esp+8] - push ebp - mov ebp,[esp+16] - sub ebp,ecx - js near s4nil - mov eax,[ylookup+ecx*4] - add eax,[dc_destorg] ; eax = destination - push ebx - push esi - mov esi,[dc_temp] - inc ebp ; ebp = count - add eax,[esp+16] - push edi - lea esi,[esi+ecx*4] ; esi = source - - align 16 - -s4loop: movzx edx,byte [esi] - movzx ecx,byte [esi+1] -s4cm1: movzx edx,byte [SPACEFILLER4+edx] ; colormap -s4cm2: movzx edi,byte [SPACEFILLER4+ecx] ; colormap - shl edx,8 - movzx ebx,byte [eax] - shl edi,8 - movzx ecx,byte [eax+1] - sub ebx,edx - sub ecx,edi - mov ebx,[Col2RGB8+0x10000+ebx*4] - mov ecx,[Col2RGB8+0x10000+ecx*4] -s4fg1: add ebx,[SPACEFILLER4+edx*4] -s4fg2: add ecx,[SPACEFILLER4+edi*4] - or ebx,0x1f07c1f - or ecx,0x1f07c1f - mov edx,ebx - shr ebx,15 - mov edi,ecx - shr ecx,15 - and edx,ebx - and ecx,edi - mov bl,[RGB32k+edx] - movzx edx,byte [esi+2] - mov bh,[RGB32k+ecx] - movzx ecx,byte [esi+3] - mov [eax],bl - mov [eax+1],bh - -s4cm3: movzx edx,byte [SPACEFILLER4+edx] ; colormap -s4cm4: movzx edi,byte [SPACEFILLER4+ecx] ; colormap - shl edx,8 - movzx ebx,byte [eax+2] - shl edi,8 - movzx ecx,byte [eax+3] - sub ebx,edx - sub ecx,edi - mov ebx,[Col2RGB8+0x10000+ebx*4] - mov ecx,[Col2RGB8+0x10000+ecx*4] -s4fg3: add ebx,[SPACEFILLER4+edx*4] -s4fg4: add ecx,[SPACEFILLER4+edi*4] - or ebx,0x1f07c1f - or ecx,0x1f07c1f - mov edx,ebx - shr ebx,15 - mov edi,ecx - shr ecx,15 - and edx,ebx - and ecx,edi -s4p: add eax,320 ; pitch - add esi,4 - mov bl,[RGB32k+edx] - mov bh,[RGB32k+ecx] -s4p2: mov [eax-320+2],bl -s4p3: mov [eax-320+3],bh - dec ebp - jne s4loop - - pop edi - pop esi - pop ebx -s4nil: pop ebp - ret - - align 16 - -GLOBAL rt_add4cols_asm -GLOBAL _rt_add4cols_asm - -rt_add4cols_asm: -_rt_add4cols_asm: - mov ecx,[esp+8] - push edi - mov edi,[esp+16] - sub edi,ecx - js near a4nil - mov eax,[ylookup+ecx*4] - add eax,[dc_destorg] - push ebx - push esi - mov esi,[dc_temp] - push ebp - inc edi - add eax,[esp+20] - lea esi,[esi+ecx*4] - - align 16 -a4loop: - movzx ebx,byte [esi] - movzx edx,byte [esi+1] - movzx ecx,byte [eax] - movzx ebp,byte [eax+1] -a4cm1: movzx ebx,byte [SPACEFILLER4+ebx] ; colormap -a4cm2: movzx edx,byte [SPACEFILLER4+edx] ; colormap -a4bg1: mov ecx,[SPACEFILLER4+ecx*4] ; bg2rgb -a4bg2: mov ebp,[SPACEFILLER4+ebp*4] ; bg2rgb -a4fg1: add ecx,[SPACEFILLER4+ebx*4] ; fg2rgb -a4fg2: add ebp,[SPACEFILLER4+edx*4] ; fg2rgb - or ecx,0x01f07c1f - or ebp,0x01f07c1f - mov ebx,ecx - shr ecx,15 - mov edx,ebp - shr ebp,15 - and ecx,ebx - and ebp,edx - movzx ebx,byte [esi+2] - movzx edx,byte [esi+3] - mov cl,[RGB32k+ecx] - mov ch,[RGB32k+ebp] - mov [eax],cl - mov [eax+1],ch - - movzx ecx,byte [eax+2] - movzx ebp,byte [eax+3] -a4cm3: movzx ebx,byte [SPACEFILLER4+ebx] ; colormap -a4cm4: movzx edx,byte [SPACEFILLER4+edx] ; colormap -a4bg3: mov ecx,[SPACEFILLER4+ecx*4] ; bg2rgb -a4bg4: mov ebp,[SPACEFILLER4+ebp*4] ; bg2rgb -a4fg3: add ecx,[SPACEFILLER4+ebx*4] ; fg2rgb -a4fg4: add ebp,[SPACEFILLER4+edx*4] ; fg2rgb - or ecx,0x01f07c1f - or ebp,0x01f07c1f - mov ebx,ecx - shr ecx,15 - mov edx,ebp - shr ebp,15 - and ebx,ecx - and edx,ebp - mov cl,[RGB32k+ebx] - mov ch,[RGB32k+edx] - mov [eax+2],cl - mov [eax+3],ch - - add esi,4 -a4p: add eax,320 ; pitch - sub edi,1 - jne a4loop - pop ebp - pop esi - pop ebx -a4nil: pop edi - ret - - align 16 - -GLOBAL rt_addclamp4cols_asm -GLOBAL _rt_addclamp4cols_asm - -rt_addclamp4cols_asm: -_rt_addclamp4cols_asm: - mov ecx,[esp+8] - push edi - mov edi,[esp+16] - sub edi,ecx - js near ac4nil - mov eax,[ylookup+ecx*4] - add eax,[dc_destorg] - push ebx - push esi - mov esi,[dc_temp] - push ebp - inc edi - add eax,[esp+20] - lea esi,[esi+ecx*4] - push edi - - align 16 -ac4loop: - movzx ebx,byte [esi] - movzx edx,byte [esi+1] - mov [esp],edi -ac4cm1: movzx ebx,byte [SPACEFILLER4+ebx] ; colormap -ac4cm2: movzx edx,byte [SPACEFILLER4+edx] ; colormap - movzx ecx,byte [eax] - movzx ebp,byte [eax+1] -ac4fg1: mov ebx,[SPACEFILLER4+ebx*4] ; fg2rgb -ac4fg2: mov edx,[SPACEFILLER4+edx*4] ; fg2rgb -ac4bg1: add ebx,[SPACEFILLER4+ecx*4] ; bg2rgb -ac4bg2: add edx,[SPACEFILLER4+ebp*4] ; bg2rgb - mov ecx,ebx - or ebx,0x01f07c1f - and ecx,0x40100400 - and ebx,0x3fffffff - mov edi,ecx - shr ecx,5 - mov ebp,edx - sub edi,ecx - or edx,0x01f07c1f - or ebx,edi - mov ecx,ebx - shr ebx,15 - and ebp,0x40100400 - and ebx,ecx - and edx,0x3fffffff - mov edi,ebp - shr ebp,5 - mov cl,[RGB32k+ebx] - sub edi,ebp - mov [eax],cl - or edx,edi - mov ebp,edx - shr edx,15 - movzx ebx,byte [esi+2] - and ebp,edx - movzx edx,byte [esi+3] -ac4cm3: movzx ebx,byte [SPACEFILLER4+ebx] ; colormap - mov cl,[RGB32k+ebp] -ac4cm4: movzx edx,byte [SPACEFILLER4+edx] ; colormap - mov [eax+1],cl - movzx ecx,byte [eax+2] - movzx ebp,byte [eax+3] -ac4fg3: mov ebx,[SPACEFILLER4+ebx*4] ; fg2rgb -ac4fg4: mov edx,[SPACEFILLER4+edx*4] ; fg2rgb -ac4bg3: add ebx,[SPACEFILLER4+ecx*4] ; bg2rgb -ac4bg4: add edx,[SPACEFILLER4+ebp*4] ; bg2rgb - mov ecx,ebx - or ebx,0x01f07c1f - and ecx,0x40100400 - and ebx,0x3fffffff - mov edi,ecx - shr ecx,5 - mov ebp,edx - sub edi,ecx - or edx,0x01f07c1f - or ebx,edi - mov ecx,ebx - shr ebx,15 - and ebp,0x40100400 - and ebx,ecx - and edx,0x3fffffff - mov edi,ebp - shr ebp,5 - mov cl,[RGB32k+ebx] - sub edi,ebp - mov [eax+2],cl - or edx,edi - mov edi,[esp] - mov ebp,edx - shr edx,15 - add esi,4 - and edx,ebp - mov cl,[RGB32k+edx] - mov [eax+3],cl - -ac4p: add eax,320 ; pitch - sub edi,1 - jne ac4loop - pop edi - - pop ebp - pop esi - pop ebx -ac4nil: pop edi - ret - -rtext_end: -%ifdef M_TARGET_MACHO -GLOBAL _rtext_tmap_end -_rtext_tmap_end: -%endif - align 16 - -;************************ - - SECTION .text - -GLOBAL R_SetupShadedCol -GLOBAL @R_SetupShadedCol@0 - -# Patch the values of dc_colormap and dc_color into the shaded column drawer. - -R_SetupShadedCol: -@R_SetupShadedCol@0: - mov eax,[dc_colormap] - cmp [s4cm1+3],eax - je .cmdone - mov [s4cm1+3],eax - mov [s4cm2+3],eax - mov [s4cm3+3],eax - mov [s4cm4+3],eax -.cmdone mov eax,[dc_color] - lea eax,[Col2RGB8+eax*4] - cmp [s4fg1+3],eax - je .cdone - mov [s4fg1+3],eax - mov [s4fg2+3],eax - mov [s4fg3+3],eax - mov [s4fg4+3],eax - selfmod s4cm1, s4fg4+7 -.cdone ret - -GLOBAL R_SetupAddCol -GLOBAL @R_SetupAddCol@0 - -# Patch the values of dc_colormap, dc_srcblend, and dc_destblend into the -# unclamped adding column drawer. - -R_SetupAddCol: -@R_SetupAddCol@0: - mov eax,[dc_colormap] - cmp [a4cm1+3],eax - je .cmdone - mov [a4cm1+3],eax - mov [a4cm2+3],eax - mov [a4cm3+3],eax - mov [a4cm4+3],eax -.cmdone mov eax,[dc_srcblend] - cmp [a4fg1+3],eax - je .sbdone - mov [a4fg1+3],eax - mov [a4fg2+3],eax - mov [a4fg3+3],eax - mov [a4fg4+3],eax -.sbdone mov eax,[dc_destblend] - cmp [a4bg1+3],eax - je .dbdone - mov [a4bg1+3],eax - mov [a4bg2+3],eax - mov [a4bg3+3],eax - mov [a4bg4+3],eax - selfmod a4cm1, a4bg4+7 -.dbdone ret - -GLOBAL R_SetupAddClampCol -GLOBAL @R_SetupAddClampCol@0 - -# Patch the values of dc_colormap, dc_srcblend, and dc_destblend into the -# add with clamping column drawer. - -R_SetupAddClampCol: -@R_SetupAddClampCol@0: - mov eax,[dc_colormap] - cmp [ac4cm1+3],eax - je .cmdone - mov [ac4cm1+3],eax - mov [ac4cm2+3],eax - mov [ac4cm3+3],eax - mov [ac4cm4+3],eax -.cmdone mov eax,[dc_srcblend] - cmp [ac4fg1+3],eax - je .sbdone - mov [ac4fg1+3],eax - mov [ac4fg2+3],eax - mov [ac4fg3+3],eax - mov [ac4fg4+3],eax -.sbdone mov eax,[dc_destblend] - cmp [ac4bg1+3],eax - je .dbdone - mov [ac4bg1+3],eax - mov [ac4bg2+3],eax - mov [ac4bg3+3],eax - mov [ac4bg4+3],eax - selfmod ac4cm1, ac4bg4+7 -.dbdone ret - -EXTERN setvlinebpl_ -EXTERN setpitch3 - -GLOBAL @ASM_PatchPitch@0 -GLOBAL _ASM_PatchPitch -GLOBAL ASM_PatchPitch - -ASM_PatchPitch: -_ASM_PatchPitch: -@ASM_PatchPitch@0: - mov eax,[dc_pitch] - mov [s4p+1],eax - mov [a4p+1],eax - mov [ac4p+1],eax - mov ecx,eax - neg ecx - inc ecx - inc ecx - mov [s4p2+2],ecx - inc ecx - mov [s4p3+2],ecx - selfmod rtext_start, rtext_end - call setpitch3 - jmp setvlinebpl_ diff --git a/src/asm_ia32/tmap2.asm b/src/asm_ia32/tmap2.asm deleted file mode 100644 index ab1695d3cd..0000000000 --- a/src/asm_ia32/tmap2.asm +++ /dev/null @@ -1,643 +0,0 @@ -;* -;* tmap2.nas -;* The tilted plane inner loop. -;* -;*--------------------------------------------------------------------------- -;* Copyright 1998-2006 Randy Heit -;* All rights reserved. -;* -;* Redistribution and use in source and binary forms, with or without -;* modification, are permitted provided that the following conditions -;* are met: -;* -;* 1. Redistributions of source code must retain the above copyright -;* notice, this list of conditions and the following disclaimer. -;* 2. Redistributions in binary form must reproduce the above copyright -;* notice, this list of conditions and the following disclaimer in the -;* documentation and/or other materials provided with the distribution. -;* 3. The name of the author may not be used to endorse or promote products -;* derived from this software without specific prior written permission. -;* -;* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR -;* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES -;* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. -;* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, -;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT -;* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -;* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -;* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -;* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF -;* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -;*--------------------------------------------------------------------------- -;* -;* I tried doing the ROL trick that R_DrawSpanP_ASM uses, and it was -;* actually slightly slower than the more straight-forward approach -;* used here, probably because the trick requires too much setup time. -;* - -BITS 32 - -%include "valgrind.inc" - -%define SPACEFILLER4 (0x44444444) - -%ifndef M_TARGET_LINUX - -%define plane_sz _plane_sz -%define plane_su _plane_su -%define plane_sv _plane_sv -%define plane_shade _plane_shade -%define planelightfloat _planelightfloat -%define spanend _spanend -%define ylookup _ylookup -%define dc_destorg _dc_destorg -%define ds_colormap _ds_colormap -%define ds_source _ds_source -%define centery _centery -%define centerx _centerx -%define ds_curtiltedsource _ds_curtiltedsource -%define pviewx _pviewx -%define pviewy _pviewy -%define tiltlighting _tiltlighting - -%define R_DrawTiltedPlane_ASM _R_DrawTiltedPlane_ASM -%define R_SetTiltedSpanSource_ASM _R_SetTiltedSpanSource_ASM -%define R_CalcTiltedLighting _R_CalcTiltedLighting - -%endif - -EXTERN plane_sz -EXTERN plane_su -EXTERN plane_sv -EXTERN planelightfloat -EXTERN spanend -EXTERN ylookup -EXTERN dc_destorg -EXTERN ds_colormap -EXTERN centery -EXTERN centerx -EXTERN ds_source -EXTERN plane_shade -EXTERN pviewx -EXTERN pviewy -EXTERN tiltlighting -EXTERN R_CalcTiltedLighting - -GLOBAL ds_curtiltedsource - -%define sv_i plane_sv -%define sv_j plane_sv+4 -%define sv_k plane_sv+8 - -%define su_i plane_su -%define su_j plane_su+4 -%define su_k plane_su+8 - -%define sz_i plane_sz -%define sz_j plane_sz+4 -%define sz_k plane_sz+8 - -%define SPANBITS 3 - - section .bss - -start_u: resq 1 -start_v: resq 1 -step_u: resq 1 -step_v: resq 1 - -step_iz: resq 1 -step_uz: resq 1 -step_vz: resq 1 - -end_z: resd 1 - - section .data - -ds_curtiltedsource: dd SPACEFILLER4 - -fp_1: -spanrecips: dd 0x3f800000 ; 1/1 - dd 0x3f000000 ; 1/2 - dd 0x3eaaaaab ; 1/3 - dd 0x3e800000 ; 1/4 - dd 0x3e4ccccd ; 1/5 - dd 0x3e2aaaab ; 1/6 - dd 0x3e124925 ; 1/7 -fp_8recip: dd 0x3e000000 ; 1/8 - dd 0x3de38e39 ; 1/9 - dd 0x3dcccccd ; 1/10 - dd 0x3dba2e8c ; 1/11 - dd 0x3daaaaab ; 1/12 - dd 0x3d9d89d9 ; 1/13 - dd 0x3d924925 ; 1/14 - dd 0x3d888889 ; 1/15 - -fp_quickint: dd 0x3f800000 ; 1 - dd 0x40000000 ; 2 - dd 0x40400000 ; 3 - dd 0x40800000 ; 4 - dd 0x40a00000 ; 5 - dd 0x40c00000 ; 6 - dd 0x40e00000 ; 7 -fp_8: dd 0x41000000 ; 8 - - section .text - -GLOBAL R_SetTiltedSpanSource_ASM -GLOBAL @R_SetTiltedSpanSource_ASM@4 - -R_SetTiltedSpanSource_ASM: - mov ecx,[esp+4] - -@R_SetTiltedSpanSource_ASM@4: - mov [fetch1+3],ecx - mov [fetch2+3],ecx - mov [fetch3+3],ecx - mov [fetch4+3],ecx - mov [fetch5+3],ecx - mov [fetch6+3],ecx - mov [fetch7+3],ecx - mov [fetch8+3],ecx - mov [fetch9+3],ecx - mov [fetch10+3],ecx - mov [ds_curtiltedsource],ecx - selfmod rtext_start, rtext_end - ret - -GLOBAL SetTiltedSpanSize - -SetTiltedSpanSize: - push ecx - mov cl,dl - neg cl - mov eax,1 - shl eax,cl - mov cl,[esp] - neg cl - mov [x1+2],cl - mov [x2+2],cl - mov [x3+2],cl - mov [x4+2],cl - mov [x5+2],cl - mov [x6+2],cl - mov [x7+2],cl - mov [x8+2],cl - mov [x9+2],cl - mov [x10+2],cl - - sub cl,dl - dec eax - mov [y1+2],cl - mov [y2+2],cl - mov [y3+2],cl - mov [y4+2],cl - mov [y5+2],cl - mov [y6+2],cl - mov [y7+2],cl - mov [y8+2],cl - mov [y9+2],cl - mov [y10+2],cl - cmp eax,0 ; if x bits is 0, mask must be 0 too. - jz .notted - not eax -.notted: - pop ecx - - mov [m1+2],eax - mov [m2+2],eax - mov [m3+2],eax - mov [m4+2],eax - mov [m5+2],eax - mov [m6+2],eax - mov [m7+2],eax - mov [m8+2],eax - mov [m9+2],eax - mov [m10+2],eax - - selfmod rtext_start, rtext_end - - ret - -%ifndef M_TARGET_MACHO - SECTION .rtext progbits alloc exec write align=64 -%else - SECTION .text align=64 -GLOBAL _rtext_tmap2_start -_rtext_tmap2_start: -%endif - -rtext_start: - -GLOBAL R_DrawTiltedPlane_ASM -GLOBAL @R_DrawTiltedPlane_ASM@8 - -R_DrawTiltedPlane_ASM: - mov ecx,[esp+4] - mov edx,[esp+8] - - ; ecx = y - ; edx = x - -@R_DrawTiltedPlane_ASM@8: - push ebx - push esi - push edi - push ebp - - mov eax,[centery] - movzx ebx,word [spanend+ecx*2] - sub eax,ecx ; eax = centery-y - sub ebx,edx ; ebx = span length - 1 - mov edi,[ylookup+ecx*4] - push eax - add edi,[dc_destorg] - add edi,edx ; edi = frame buffer pointer - sub edx,[centerx] ; edx = x-centerx - push edx - xor eax,eax - - fild dword [esp+4] ; ymul - fild dword [esp] ; xmul | ymul - fld dword [sv_j] ; sv.j | xmul | ymul - fmul st0,st2 ; sv.j*ymul | xmul | ymul - fld dword [su_j] ; su.j | sv.j*ymul | xmul | ymul - fmul st0,st3 ; su.j*ymul | sv.j*ymul | xmul | ymul - fld dword [sz_j] ; sz.j | su.j*ymul | sv.j*ymul | xmul | ymul - fmulp st4,st0 ; su.j*ymul | sv.j*ymul | xmul | sz.j*ymul - fld dword [sv_i] ; sv.i | su.j*ymul | sv.j*ymul | xmul | sz.j*ymul - fmul st0,st3 ; sv.i*xmul | su.j*ymul | sv.j*ymul | xmul | sz.j*ymul - fld dword [su_i] ; su.i | sv.i*xmul | su.j*ymul | sv.j*ymul | xmul | sz.j*ymul - fmul st0,st4 ; su.i*xmul | sv.i*xmul | su.j*ymul | sv.j*ymul | xmul | sz.j*ymul - fld dword [sz_i] ; sz.i | su.i*xmul | sv.i*xmul | su.j*ymul | sv.j*ymul | xmul | sz.j*ymul - fmulp st5,st0 ; su.i*xmul | sv.i*xmul | su.j*ymul | sv.j*ymul | sz.i*xmul | sz.j*ymul - fxch st1 ; sv.i*xmul | su.i*xmul | su.j*ymul | sv.j*ymul | sz.i*xmul | sz.j*ymul - faddp st3,st0 ; su.i*xmul | su.j*ymul | sv.i*xmul+sv.j*ymul | sz.i*xmul | sz.j*ymul - faddp st1,st0 ; su.i*xmul+su.j*ymul | sv.i*xmul+sv.j*ymul | sz.i*xmul | sz.j*ymul - fxch st3 ; sz.j*ymul | sv.i*xmul+sv.j*ymul | sz.i*xmul | su.i*xmul+su.j*ymul - faddp st2,st0 ; sv.i*xmul+sv.j*ymul | sz.i*xmul+sz.j*ymul | su.i*xmul+su.j*ymul - fadd dword [sv_k] ; v/z | sz.i*xmul+sz.j*ymul | su.i*xmul+su.j*ymul - fxch st1 ; sz.i*xmul+sz.j*ymul | v/z | su.i*xmul+su.j*ymul - fadd dword [sz_k] ; 1/z | v/z | su.i*xmul+su.j*ymul - fxch st2 ; su.i*xmul+su.j*ymul | v/z | 1/z - fadd dword [su_k] ; u/z | v/z | 1/z - fxch st2 ; 1/z | v/z | u/z - fxch st1 ; v/z | 1/z | u/z - -; if lighting is on, fill out the light table - mov al,[plane_shade] - test al,al - jz .litup - - push ebx - fild dword [esp] ; width | v/z | 1/z | u/z - fmul dword [sz_i] ; width*sz.i | v/z | 1/z | u/z - fadd st0,st2 ; 1/endz | v/z | 1/z | u/z - fld st2 ; 1/z | 1/endz | v/z | 1/z | u/z - fmul dword [planelightfloat] - fxch st1 - fmul dword [planelightfloat] - sub esp,16 - fstp qword [esp] - fstp qword [esp+8] - call R_CalcTiltedLighting - add esp, 20 - xor eax, eax - -.litup add esp, 8 - -; calculate initial z, u, and v values - fld st1 ; 1/z | v/z | 1/z | u/z - fdivr dword [fp_1] ; z | v/z | 1/z | u/z - - fld st3 ; u/z | z | v/z | 1/z | u/z - fmul st0,st1 ; u | z | v/z | 1/z | u/z - fld st2 ; v/z | u | z | v/z | 1/z | u/z - fmulp st2,st0 ; u | v | v/z | 1/z | u/z - fld st0 - fistp qword [start_u] - fld st1 - fistp qword [start_v] - - cmp ebx,7 ; Do we have at least 8 pixels to plot? - jl near ShortStrip - -; yes, we do, so figure out tex coords at end of this span - -; multiply i values by span length (8) - fld dword [su_i] ; su.i - fmul dword [fp_8] ; su.i*8 - fld dword [sv_i] ; sv.i | su.i*8 - fmul dword [fp_8] ; sv.i*8 | su.i*8 - fld dword [sz_i] ; sz.i | sv.i*8 | su.i*8 - fmul dword [fp_8] ; sz.i*8 | sv.i*8 | su.i*8 - fxch st2 ; su.i*8 | sv.i*8 | sz.i*8 - fstp qword [step_uz] ; sv.i*8 | sz.i*8 - fstp qword [step_vz] ; sz.i*8 - fst qword [step_iz] ; sz.i*8 - -; find tex coords at start of next span - faddp st4 - fld qword [step_vz] - faddp st3 - fld qword [step_uz] - faddp st5 - - fld st3 ; 1/z | u | v | v/z | 1/z | u/z - fdivr dword [fp_1] ; z | u | v | v/z | 1/z | u/z - fst dword [end_z] - fld st5 ; u/z | z | u | v | v/z | 1/z | u/z - fmul st0,st1 ; u' | z | u | v | v/z | 1/z | u/z - fxch st1 ; z | u' | u | v | v/z | 1/z | u/z - fmul st0,st4 ; v' | u' | u | v | v/z | 1/z | u/z - fxch st3 ; v | u' | u | v' | v/z | 1/z | u/z - -; now subtract to get stepping values for this span - fsubr st0,st3 ; v'-v | u' | u | v' | v/z | 1/z | u/z - fxch st2 ; u | u' | v'-v | v' | v/z | 1/z | u/z - fsubr st0,st1 ; u'-u | u' | v'-v | v' | v/z | 1/z | u/z - fxch st2 ; v'-v | u' | u'-u | v' | v/z | 1/z | u/z - fmul dword [fp_8recip] ; vstep | u' | u'-u | v' | v/z | 1/z | u/z - fxch st1 ; u' | vstep | u'-u | v' | v/z | 1/z | u/z - fxch st2 ; u'-u | vstep | u' | v' | v/z | 1/z | u/z - fmul dword [fp_8recip] ; ustep | vstep | u' | v' | v/z | 1/z | u/z - fxch st1 ; vstep | ustep | u' | v' | v/z | 1/z | u/z - fistp qword [step_v] ; ustep | u' | v' | v/z | 1/z | u/z - fistp qword [step_u] ; u | v | v/z | 1/z | u/z - -FullSpan: - xor eax,eax - cmp ebx,15 ; is there another complete span after this one? - jl NextIsShort - -; there is a complete span after this one - fld qword [step_iz] - faddp st4,st0 - fld qword [step_vz] - faddp st3,st0 - fld qword [step_uz] - faddp st5,st0 - jmp StartDiv - -NextIsShort: - cmp ebx,8 ; if next span is no more than 1 pixel, then we already - jle DrawFullSpan ; know everything we need to draw it - - fld dword [sz_i] ; sz.i | u | v | v/z | 1/z | u/z - fmul dword [fp_quickint-8*4+ebx*4] - fld dword [sv_i] ; sv.i | sz.i | u | v | v/z | 1/z | u/z - fmul dword [fp_quickint-8*4+ebx*4] - fld dword [su_i] ; su.i | sv.i | sz.i | u | v | v/z | 1/z | u/z - fmul dword [fp_quickint-8*4+ebx*4] - fxch st2 ; sz.i | sv.i | su.i | u | v | v/z | 1/z | u/z - faddp st6,st0 ; sv.i | su.i | u | v | v/z | 1/z | u/z - faddp st4,st0 ; su.i | u | v | v/z | 1/z | u/z - faddp st5,st0 ; u | v | v/z | 1/z | u/z - -StartDiv: - fld st3 ; 1/z | u | v | v/z | 1/z | u/z - fdivr dword [fp_1] ; z | u | v | v/z | 1/z | u/z - -DrawFullSpan: - mov ecx,[start_v] - mov edx,[start_u] - - add ecx,[pviewy] - add edx,[pviewx] - - mov esi,edx - mov ebp,ecx -x1 shr ebp,26 -m1 and esi,0xfc000000 -y1 shr esi,20 - add ecx,[step_v] - add edx,[step_u] -fetch1 mov al,[ebp+esi+SPACEFILLER4] - mov ebp,[tiltlighting+ebx*4] - mov esi,edx - mov al,[ebp+eax] - mov ebp,ecx - mov [edi+0],al - -x2 shr ebp,26 -m2 and esi,0xfc000000 -y2 shr esi,20 - add ecx,[step_v] - add edx,[step_u] -fetch2 mov al,[ebp+esi+SPACEFILLER4] - mov ebp,[tiltlighting+ebx*4-4] - mov esi,edx - mov al,[ebp+eax] - mov ebp,ecx - mov [edi+1],al - -x3 shr ebp,26 -m3 and esi,0xfc000000 -y3 shr esi,20 - add ecx,[step_v] - add edx,[step_u] -fetch3 mov al,[ebp+esi+SPACEFILLER4] - mov ebp,[tiltlighting+ebx*4-8] - mov esi,edx - mov al,[ebp+eax] - mov ebp,ecx - mov [edi+2],al - -x4 shr ebp,26 -m4 and esi,0xfc000000 -y4 shr esi,20 - add ecx,[step_v] - add edx,[step_u] -fetch4 mov al,[ebp+esi+SPACEFILLER4] - mov ebp,[tiltlighting+ebx*4-12] - mov esi,edx - mov al,[ebp+eax] - mov ebp,ecx - mov [edi+3],al - -x5 shr ebp,26 -m5 and esi,0xfc000000 -y5 shr esi,20 - add ecx,[step_v] - add edx,[step_u] -fetch5 mov al,[ebp+esi+SPACEFILLER4] - mov ebp,[tiltlighting+ebx*4-16] - mov esi,edx - mov al,[ebp+eax] - mov ebp,ecx - mov [edi+4],al - -x6 shr ebp,26 -m6 and esi,0xfc000000 -y6 shr esi,20 - add ecx,[step_v] - add edx,[step_u] -fetch6 mov al,[ebp+esi+SPACEFILLER4] - mov ebp,[tiltlighting+ebx*4-20] - mov esi,edx - mov al,[ebp+eax] - mov ebp,ecx - mov [edi+5],al - -x7 shr ebp,26 -m7 and esi,0xfc000000 -y7 shr esi,20 - add ecx,[step_v] - add edx,[step_u] -fetch7 mov al,[ebp+esi+SPACEFILLER4] - mov ebp,[tiltlighting+ebx*4-24] -x8 shr ecx,26 - mov al,[ebp+eax] -m8 and edx,0xfc000000 - mov [edi+6],al - -y8 shr edx,20 - mov ebp,[tiltlighting+ebx*4-28] -fetch8 mov al,[edx+ecx+SPACEFILLER4] - mov al,[ebp+eax] - mov [edi+7],al - add edi,8 - - sub ebx,8 - jl near Done - - fld st1 - fistp qword [start_u] - fld st2 - fistp qword [start_v] - - cmp ebx,7 - jl near EndIsShort - - fst dword [end_z] - fld st5 ; u/z | z | u | v | v/z | 1/z | u/z - fmul st0,st1 ; u' | z | u | v | v/z | 1/z | u/z - fxch st1 ; z | u' | u | v | v/z | 1/z | u/z - fmul st0,st4 ; v' | u' | u | v | v/z | 1/z | u/z - fxch st3 ; v | u' | u | v' | v/z | 1/z | u/z - fsubr st0,st3 ; v'-v | u' | u | v' | v/z | 1/z | u/z - fxch st2 ; u | u' | v'-v | v' | v/z | 1/z | u/z - fsubr st0,st1 ; u'-u | u' | v'-v | v' | v/z | 1/z | u/z - fxch st2 ; v'-v | u' | u'-u | v' | v/z | 1/z | u/z - fmul dword [fp_8recip] ; vstep | u' | u'-u | v' | v/z | 1/z | u/z - fxch st1 ; u' | vstep | u'-u | v' | v/z | 1/z | u/z - fxch st2 ; u'-u | vstep | u' | v' | v/z | 1/z | u/z - fmul dword [fp_8recip] ; ustep | vstep | u' | v' | v/z | 1/z | u/z - fxch st1 ; vstep | ustep | u' | v' | v/z | 1/z | u/z - fistp qword [step_v] ; ustep | u' | v' | v/z | 1/z | u/z - fistp qword [step_u] ; u | v | v/z | 1/z | u/z - jmp FullSpan - -OnlyOnePixelAtEnd: - fld st0 - fistp qword [start_u] - fld st1 - fistp qword [start_v] - -OnlyOnePixel: - mov edx,[start_v] - mov ecx,[start_u] - add edx,[pviewy] - add ecx,[pviewx] -x9 shr edx,26 -m9 and ecx,0xfc000000 -y9 shr ecx,20 - mov ebp,[tiltlighting] -fetch9 mov al,[ecx+edx+SPACEFILLER4] - mov al,[ebp+eax] - mov [edi],al - -Done: - fcompp - fcompp - fstp st0 - - pop ebp - pop edi - pop esi - pop ebx - ret - -ShortStrip: - cmp ebx,0 - jle near OnlyOnePixel - -MoreThanOnePixel: - fld dword [sz_i] ; sz.i | u | v | v/z | 1/z | u/z - fmul dword [fp_quickint+ebx*4] - fld dword [sv_i] ; sv.i | sz.i | u | v | v/z | 1/z | u/z - fmul dword [fp_quickint+ebx*4] - fld dword [su_i] ; su.i | sv.i | sz.i | u | v | v/z | 1/z | u/z - fmul dword [fp_quickint+ebx*4] - fxch st2 ; sz.i | sv.i | su.i | u | v | v/z | 1/z | u/z - faddp st6,st0 ; sv.i | su.i | u | v | v/z | 1/z | u/z - faddp st4,st0 ; su.i | u | v | v/z | 1/z | u/z - faddp st5,st0 ; u | v | v/z | 1/z | u/z - fld st3 ; 1/z | u | v | v/z | 1/z | u/z - fdivr dword [fp_1] ; z | u | v | v/z | 1/z | u/z - jmp CalcPartialSteps - -EndIsShort: - cmp ebx,0 - je near OnlyOnePixelAtEnd - -CalcPartialSteps: - fst dword [end_z] - fld st5 ; u/z | z | u | v | v/z | 1/z | u/z - fmul st0,st1 ; u' | z | u | v | v/z | 1/z | u/z - fxch st1 ; z | u' | u | v | v/z | 1/z | u/z - fmul st0,st4 ; v' | u' | u | v | v/z | 1/z | u/z - fxch st1 ; u' | v' | u | v | v/z | 1/z | u/z - fsubrp st2,st0 ; v' | u'-u | v | v/z | 1/z | u/z - fsubrp st2,st0 ; u'-u | v'-v | v/z | 1/z | u/z - fmul dword [spanrecips+ebx*4] ;ustep | v'-v | v/z | 1/z | u/z - fxch st1 ; v'-v | ustep | v/z | 1/z | u/z - fmul dword [spanrecips+ebx*4] ;vstep | ustep | v/z | 1/z | u/z - fxch st1 ; ustep | vstep | v/z | 1/z | u/z - fistp qword [step_u] ; vstep | v/z | 1/z | u/z - fistp qword [step_v] ; v/z | 1/z | u/z - - mov ecx,[start_v] - mov edx,[start_u] - - add ecx,[pviewy] - add edx,[pviewx] - - mov esi,edx - mov ebp,ecx -endloop: -x10 shr ebp,26 -m10 and esi,0xfc000000 - -y10 shr esi,20 - inc edi - - add ecx,[step_v] - add edx,[step_u] - -fetch10 mov al,[ebp+esi+SPACEFILLER4] - mov ebp,[tiltlighting+ebx*4] - - mov esi,edx - dec ebx - - mov al,[ebp+eax] - mov ebp,ecx - - mov [edi-1],al - jge endloop - - fcompp - fstp st0 - - pop ebp - pop edi - pop esi - pop ebx - ret - -rtext_end: -%ifdef M_TARGET_MACHO -GLOBAL _rtext_tmap2_end -_rtext_tmap2_end: -%endif diff --git a/src/asm_ia32/tmap3.asm b/src/asm_ia32/tmap3.asm deleted file mode 100644 index bafc33627f..0000000000 --- a/src/asm_ia32/tmap3.asm +++ /dev/null @@ -1,344 +0,0 @@ -%include "valgrind.inc" - -%ifdef M_TARGET_WATCOM - SEGMENT DATA PUBLIC ALIGN=16 CLASS=DATA USE32 - SEGMENT DATA -%else - SECTION .data -%endif - -%ifndef M_TARGET_LINUX -%define ylookup _ylookup -%define vplce _vplce -%define vince _vince -%define palookupoffse _palookupoffse -%define bufplce _bufplce -%define dc_iscale _dc_iscale -%define dc_colormap _dc_colormap -%define dc_count _dc_count -%define dc_dest _dc_dest -%define dc_source _dc_source -%define dc_texturefrac _dc_texturefrac -%define dc_pitch _dc_pitch - -%define setupvlinetallasm _setupvlinetallasm -%define vlinetallasm4 _vlinetallasm4 -%define vlinetallasmathlon4 _vlinetallasmathlon4 -%define vlinetallasm1 _vlinetallasm1 -%define prevlinetallasm1 _prevlinetallasm1 -%endif - -EXTERN vplce -EXTERN vince -EXTERN palookupoffse -EXTERN bufplce - -EXTERN ylookup -EXTERN dc_iscale -EXTERN dc_colormap -EXTERN dc_count -EXTERN dc_dest -EXTERN dc_source -EXTERN dc_texturefrac -EXTERN dc_pitch - -GLOBAL vlt4pitch -GLOBAL vlt1pitch - -%ifdef M_TARGET_WATCOM - SEGMENT CODE PUBLIC ALIGN=16 CLASS=CODE USE32 - SEGMENT CODE -%else - SECTION .text -%endif - -ALIGN 16 -GLOBAL setpitch3 -setpitch3: - mov [vltpitch+2], eax - mov [vltpitcha+2],eax - mov [vlt1pitch1+2], eax - mov [vlt1pitch2+2], eax - selfmod vltpitch, vlt1pitch2+6 - ret - -ALIGN 16 -GLOBAL setupvlinetallasm -setupvlinetallasm: - mov ecx, [esp+4] - mov [shifter1+2], cl - mov [shifter2+2], cl - mov [shifter3+2], cl - mov [shifter4+2], cl - mov [shifter1a+2], cl - mov [shifter2a+2], cl - mov [shifter3a+2], cl - mov [shifter4a+2], cl - mov [preshift+2], cl - mov [shift11+2], cl - mov [shift12+2], cl - selfmod shifter1, shift12+6 - ret - -%ifdef M_TARGET_MACHO - SECTION .text align=64 -GLOBAL _rtext_tmap3_start -_rtext_tmap3_start: -%else - SECTION .rtext progbits alloc exec write align=64 -%endif - -ALIGN 16 - -GLOBAL vlinetallasm4 -vlinetallasm4: - push ebx - mov eax, [bufplce+0] - mov ebx, [bufplce+4] - mov ecx, [bufplce+8] - mov edx, [bufplce+12] - mov [source1+3], eax - mov [source2+3], ebx - mov [source3+3], ecx - mov [source4+3], edx - mov eax, [palookupoffse+0] - mov ebx, [palookupoffse+4] - mov ecx, [palookupoffse+8] - mov edx, [palookupoffse+12] - mov [lookup1+2], eax - mov [lookup2+2], ebx - mov [lookup3+2], ecx - mov [lookup4+2], edx - mov eax, [vince+0] - mov ebx, [vince+4] - mov ecx, [vince+8] - mov edx, [vince+12] - mov [step1+2], eax - mov [step2+2], ebx - mov [step3+2], ecx - mov [step4+1], edx - push ebp - push esi - push edi - mov ecx, [dc_count] - mov edi, [dc_dest] - mov eax, dword [ylookup+ecx*4-4] - add eax, edi - sub edi, eax - mov [write1+2],eax - inc eax - mov [write2+2],eax - inc eax - mov [write3+2],eax - inc eax - mov [write4+2],eax - mov ebx, [vplce] - mov ecx, [vplce+4] - mov esi, [vplce+8] - mov eax, [vplce+12] - selfmod loopit, vltpitch - jmp loopit - -ALIGN 16 -loopit: - mov edx, ebx -shifter1: shr edx, 24 -source1: movzx edx, BYTE [edx+0x88888888] -lookup1: mov dl, [edx+0x88888888] -write1: mov [edi+0x88888880], dl -step1: add ebx, 0x88888888 - mov edx, ecx -shifter2: shr edx, 24 -source2: movzx edx, BYTE [edx+0x88888888] -lookup2: mov dl, [edx+0x88888888] -write2: mov [edi+0x88888881], dl -step2: add ecx, 0x88888888 - mov edx, esi -shifter3: shr edx, 24 -source3: movzx edx, BYTE [edx+0x88888888] -lookup3: mov dl, BYTE [edx+0x88888888] -write3: mov [edi+0x88888882], dl -step3: add esi, 0x88888888 - mov edx, eax -shifter4: shr edx, 24 -source4: movzx edx, BYTE [edx+0x88888888] -lookup4: mov dl, [edx+0x88888888] -write4: mov [edi+0x88888883], dl -step4: add eax, 0x88888888 -vltpitch: add edi, 320 - jle near loopit - - mov [vplce], ebx - mov [vplce+4], ecx - mov [vplce+8], esi - mov [vplce+12], eax - - pop edi - pop esi - pop ebp - pop ebx - - ret - - ALIGN 16 - -GLOBAL vlinetallasmathlon4 -vlinetallasmathlon4: - push ebx - mov eax, [bufplce+0] - mov ebx, [bufplce+4] - mov ecx, [bufplce+8] - mov edx, [bufplce+12] - mov [source1a+3], eax - mov [source2a+3], ebx - mov [source3a+3], ecx - mov [source4a+3], edx - mov eax, [palookupoffse+0] - mov ebx, [palookupoffse+4] - mov ecx, [palookupoffse+8] - mov edx, [palookupoffse+12] - mov [lookup1a+2], eax - mov [lookup2a+2], ebx - mov [lookup3a+2], ecx - mov [lookup4a+2], edx - mov eax, [vince+0] - mov ebx, [vince+4] - mov ecx, [vince+8] - mov edx, [vince+12] - mov [step1a+2], eax - mov [step2a+2], ebx - mov [step3a+2], ecx - mov [step4a+1], edx - push ebp - push esi - push edi - mov ecx, [dc_count] - mov edi, [dc_dest] - mov eax, dword [ylookup+ecx*4-4] - add eax, edi - sub edi, eax - mov [write1a+2],eax - inc eax - mov [write2a+2],eax - inc eax - mov [write3a+2],eax - inc eax - mov [write4a+2],eax - mov ebp, [vplce] - mov ecx, [vplce+4] - mov esi, [vplce+8] - mov eax, [vplce+12] - selfmod loopita, vltpitcha - jmp loopita - -; Unfortunately, this code has not been carefully analyzed to determine -; how well it utilizes the processor's instruction units. Instead, I just -; kept rearranging code, seeing what sped it up and what slowed it down -; until I arrived at this. The is the fastest version I was able to -; manage, but that does not mean it cannot be made faster with careful -; instructing shuffling. - - ALIGN 64 - -loopita: mov edx, ebp - mov ebx, ecx -shifter1a: shr edx, 24 -shifter2a: shr ebx, 24 -source1a: movzx edx, BYTE [edx+0x88888888] -source2a: movzx ebx, BYTE [ebx+0x88888888] -step1a: add ebp, 0x88888888 -step2a: add ecx, 0x88888888 -lookup1a: mov dl, [edx+0x88888888] -lookup2a: mov dh, [ebx+0x88888888] - mov ebx, esi -write1a: mov [edi+0x88888880], dl -write2a: mov [edi+0x88888881], dh -shifter3a: shr ebx, 24 - mov edx, eax -source3a: movzx ebx, BYTE [ebx+0x88888888] -shifter4a: shr edx, 24 -step3a: add esi, 0x88888888 -source4a: movzx edx, BYTE [edx+0x88888888] -step4a: add eax, 0x88888888 -lookup3a: mov bl, [ebx+0x88888888] -lookup4a: mov dl, [edx+0x88888888] -write3a: mov [edi+0x88888882], bl -write4a: mov [edi+0x88888883], dl -vltpitcha: add edi, 320 - jle near loopita - - mov [vplce], ebp - mov [vplce+4], ecx - mov [vplce+8], esi - mov [vplce+12], eax - - pop edi - pop esi - pop ebp - pop ebx - - ret - -ALIGN 16 -GLOBAL prevlinetallasm1 -prevlinetallasm1: - mov ecx, [dc_count] - cmp ecx, 1 - ja vlinetallasm1 - - mov eax, [dc_iscale] - mov edx, [dc_texturefrac] - add eax, edx - mov ecx, [dc_source] -preshift: shr edx, 16 - push ebx - push edi - mov edi, [dc_colormap] - movzx ebx, byte [ecx+edx] - mov ecx, [dc_dest] - mov bl, byte [edi+ebx] - pop edi - mov byte [ecx], bl - pop ebx - ret - -ALIGN 16 -GLOBAL vlinetallasm1 -vlinetallasm1: - push ebp - push ebx - push edi - push esi - - mov ebp, [dc_count] - mov ebx, [dc_texturefrac] ; ebx = frac - mov edi, [dc_dest] - mov ecx, ebx -shift11: shr ecx, 16 - mov esi, [dc_source] - mov edx, [dc_iscale] -vlt1pitch1: sub edi, 0x88888888 - mov eax, [dc_colormap] - -loop2: - movzx ecx, BYTE [esi+ecx] - add ebx, edx -vlt1pitch2: add edi, 0x88888888 - mov cl,[eax+ecx] - mov [edi],cl - mov ecx,ebx -shift12: shr ecx,16 - dec ebp - jnz loop2 - - mov eax,ebx - pop esi - pop edi - pop ebx - pop ebp - ret - -%ifdef M_TARGET_MACHO -GLOBAL _rtext_tmap3_end -_rtext_tmap3_end: -%endif diff --git a/src/asm_x86_64/tmap3.asm b/src/asm_x86_64/tmap3.asm deleted file mode 100644 index e0f568fea1..0000000000 --- a/src/asm_x86_64/tmap3.asm +++ /dev/null @@ -1,150 +0,0 @@ -%ifnidn __OUTPUT_FORMAT__,win64 -%error tmap3.asm is for Win64 output. You should use tmap.s for other systems. -%endif - -BITS 64 -DEFAULT REL - -EXTERN vplce -EXTERN vince -EXTERN palookupoffse -EXTERN bufplce - -EXTERN dc_count -EXTERN dc_dest -EXTERN dc_pitch - -SECTION .text - -GLOBAL ASM_PatchPitch -ASM_PatchPitch: - mov ecx, [dc_pitch] - mov [pm+3], ecx - mov [vltpitch+3], ecx - ret - align 16 - -GLOBAL setupvlinetallasm -setupvlinetallasm: - mov [shifter1+2], cl - mov [shifter2+2], cl - mov [shifter3+2], cl - mov [shifter4+2], cl - ret - align 16 - -; Yasm can't do progbits alloc exec for win64? -; Hmm, looks like it's automatic. No worries, then. -SECTION .rtext write ;progbits alloc exec - -GLOBAL vlinetallasm4 -PROC_FRAME vlinetallasm4 - rex_push_reg rbx - push_reg rdi - push_reg r15 - push_reg r14 - push_reg r13 - push_reg r12 - push_reg rbp - push_reg rsi - alloc_stack 8 ; Stack must be 16-byte aligned -END_PROLOGUE -; rax = bufplce base address -; rbx = -; rcx = offset from rdi/count (negative) -; edx/rdx = scratch -; rdi = bottom of columns to write to -; r8d-r11d = column offsets -; r12-r15 = palookupoffse[0] - palookupoffse[4] - - mov ecx, [dc_count] - mov rdi, [dc_dest] - test ecx, ecx - jle vltepilog ; count must be positive - - mov rax, [bufplce] - mov r8, [bufplce+8] - sub r8, rax - mov r9, [bufplce+16] - sub r9, rax - mov r10, [bufplce+24] - sub r10, rax - mov [source2+4], r8d - mov [source3+4], r9d - mov [source4+4], r10d - -pm: imul rcx, 320 - - mov r12, [palookupoffse] - mov r13, [palookupoffse+8] - mov r14, [palookupoffse+16] - mov r15, [palookupoffse+24] - - mov r8d, [vince] - mov r9d, [vince+4] - mov r10d, [vince+8] - mov r11d, [vince+12] - mov [step1+3], r8d - mov [step2+3], r9d - mov [step3+3], r10d - mov [step4+3], r11d - - add rdi, rcx - neg rcx - - mov r8d, [vplce] - mov r9d, [vplce+4] - mov r10d, [vplce+8] - mov r11d, [vplce+12] - jmp loopit - -ALIGN 16 -loopit: - mov edx, r8d -shifter1: shr edx, 24 -step1: add r8d, 0x88888888 - movzx edx, BYTE [rax+rdx] - mov ebx, r9d - mov dl, [r12+rdx] -shifter2: shr ebx, 24 -step2: add r9d, 0x88888888 -source2: movzx ebx, BYTE [rax+rbx+0x88888888] - mov ebp, r10d - mov bl, [r13+rbx] -shifter3: shr ebp, 24 -step3: add r10d, 0x88888888 -source3: movzx ebp, BYTE [rax+rbp+0x88888888] - mov esi, r11d - mov bpl, BYTE [r14+rbp] -shifter4: shr esi, 24 -step4: add r11d, 0x88888888 -source4: movzx esi, BYTE [rax+rsi+0x88888888] - mov [rdi+rcx], dl - mov [rdi+rcx+1], bl - mov sil, BYTE [r15+rsi] - mov [rdi+rcx+2], bpl - mov [rdi+rcx+3], sil - -vltpitch: add rcx, 320 - jl loopit - - mov [vplce], r8d - mov [vplce+4], r9d - mov [vplce+8], r10d - mov [vplce+12], r11d - -vltepilog: - add rsp, 8 - pop rsi - pop rbp - pop r12 - pop r13 - pop r14 - pop r15 - pop rdi - pop rbx - ret -vlinetallasm4_end: -ENDPROC_FRAME - ALIGN 16 - diff --git a/src/asm_x86_64/tmap3.s b/src/asm_x86_64/tmap3.s deleted file mode 100644 index 867d11c759..0000000000 --- a/src/asm_x86_64/tmap3.s +++ /dev/null @@ -1,141 +0,0 @@ -#%include "valgrind.inc" - - .section .text - -.globl ASM_PatchPitch -ASM_PatchPitch: - movl dc_pitch(%rip), %ecx - movl %ecx, pm+3(%rip) - movl %ecx, vltpitch+3(%rip) -# selfmod pm, vltpitch+6 - ret - .align 16 - -.globl setupvlinetallasm -setupvlinetallasm: - movb %dil, shifter1+2(%rip) - movb %dil, shifter2+2(%rip) - movb %dil, shifter3+2(%rip) - movb %dil, shifter4+2(%rip) -# selfmod shifter1, shifter4+3 - ret - .align 16 - - .section .rtext,"awx" - -.globl vlinetallasm4 - .type vlinetallasm4,@function -vlinetallasm4: - .cfi_startproc - push %rbx - push %rdi - push %r15 - push %r14 - push %r13 - push %r12 - push %rbp - push %rsi - subq $8, %rsp # Does the stack need to be 16-byte aligned for Linux? - .cfi_adjust_cfa_offset 8 - -# rax = bufplce base address -# rbx = -# rcx = offset from rdi/count (negative) -# edx/rdx = scratch -# rdi = bottom of columns to write to -# r8d-r11d = column offsets -# r12-r15 = palookupoffse[0] - palookupoffse[4] - - movl dc_count(%rip), %ecx - movq dc_dest(%rip), %rdi - testl %ecx, %ecx - jle vltepilog # count must be positive - - movq bufplce(%rip), %rax - movq bufplce+8(%rip), %r8 - subq %rax, %r8 - movq bufplce+16(%rip), %r9 - subq %rax, %r9 - movq bufplce+24(%rip), %r10 - subq %rax, %r10 - movl %r8d, source2+4(%rip) - movl %r9d, source3+4(%rip) - movl %r10d, source4+4(%rip) - -pm: imulq $320, %rcx - - movq palookupoffse(%rip), %r12 - movq palookupoffse+8(%rip), %r13 - movq palookupoffse+16(%rip), %r14 - movq palookupoffse+24(%rip), %r15 - - movl vince(%rip), %r8d - movl vince+4(%rip), %r9d - movl vince+8(%rip), %r10d - movl vince+12(%rip), %r11d - movl %r8d, step1+3(%rip) - movl %r9d, step2+3(%rip) - movl %r10d, step3+3(%rip) - movl %r11d, step4+3(%rip) - - addq %rcx, %rdi - negq %rcx - - movl vplce(%rip), %r8d - movl vplce+4(%rip), %r9d - movl vplce+8(%rip), %r10d - movl vplce+12(%rip), %r11d -# selfmod loopit, vltepilog - jmp loopit - - .align 16 -loopit: - movl %r8d, %edx -shifter1: shrl $24, %edx -step1: addl $0x44444444, %r8d - movzbl (%rax,%rdx), %edx - movl %r9d, %ebx - movb (%r12,%rdx), %dl -shifter2: shrl $24, %ebx -step2: addl $0x44444444, %r9d -source2: movzbl 0x44444444(%rax,%rbx), %ebx - movl %r10d, %ebp - movb (%r13,%rbx), %bl -shifter3: shr $24, %ebp -step3: addl $0x44444444, %r10d -source3: movzbl 0x44444444(%rax,%rbp), %ebp - movl %r11d, %esi - movb (%r14,%rbp), %bpl -shifter4: shr $24, %esi -step4: add $0x44444444, %r11d -source4: movzbl 0x44444444(%rax,%rsi), %esi - movb %dl, (%rdi,%rcx) - movb %bl, 1(%rdi,%rcx) - movb (%r15,%rsi), %sil - movb %bpl, 2(%rdi,%rcx) - movb %sil, 3(%rdi,%rcx) - -vltpitch: addq $320, %rcx - jl loopit - - movl %r8d, vplce(%rip) - movl %r9d, vplce+4(%rip) - movl %r10d, vplce+8(%rip) - movl %r11d, vplce+12(%rip) - -vltepilog: - addq $8, %rsp - .cfi_adjust_cfa_offset -8 - pop %rsi - pop %rbp - pop %r12 - pop %r13 - pop %r14 - pop %r15 - pop %rdi - pop %rbx - ret - .cfi_endproc - .align 16 - - From 2677976cae8c628c49c2f4df23ddfcb42f63222c Mon Sep 17 00:00:00 2001 From: Christoph Oelckers Date: Wed, 7 Dec 2016 15:44:25 +0100 Subject: [PATCH 6/7] - r_drawt.cpp is no longer needed. --- src/r_drawt.cpp | 1118 ----------------------------------------------- 1 file changed, 1118 deletions(-) delete mode 100644 src/r_drawt.cpp diff --git a/src/r_drawt.cpp b/src/r_drawt.cpp deleted file mode 100644 index a4f581d12a..0000000000 --- a/src/r_drawt.cpp +++ /dev/null @@ -1,1118 +0,0 @@ -/* -** r_drawt.cpp -** Faster column drawers for modern processors -** -**--------------------------------------------------------------------------- -** Copyright 1998-2006 Randy Heit -** All rights reserved. -** -** Redistribution and use in source and binary forms, with or without -** modification, are permitted provided that the following conditions -** are met: -** -** 1. Redistributions of source code must retain the above copyright -** notice, this list of conditions and the following disclaimer. -** 2. Redistributions in binary form must reproduce the above copyright -** notice, this list of conditions and the following disclaimer in the -** documentation and/or other materials provided with the distribution. -** 3. The name of the author may not be used to endorse or promote products -** derived from this software without specific prior written permission. -** -** THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR -** IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES -** OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. -** IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, -** INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT -** NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -** DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -** THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -** (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF -** THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -**--------------------------------------------------------------------------- -** -** These functions stretch columns into a temporary buffer and then -** map them to the screen. On modern machines, this is faster than drawing -** them directly to the screen. -** -** Will I be able to even understand any of this if I come back to it later? -** Let's hope so. :-) -*/ - -#include "templates.h" -#include "doomtype.h" -#include "doomdef.h" -#include "r_defs.h" -#include "r_draw.h" -#include "r_main.h" -#include "r_things.h" -#include "v_video.h" - -// I should have commented this stuff better. -// -// dc_temp is the buffer R_DrawColumnHoriz writes into. -// dc_tspans points into it. -// dc_ctspan points into dc_tspans. -// horizspan also points into dc_tspans. - -// dc_ctspan is advanced while drawing into dc_temp. -// horizspan is advanced up to dc_ctspan when drawing from dc_temp to the screen. - -BYTE dc_tempbuff[MAXHEIGHT*4]; -BYTE *dc_temp; -unsigned int dc_tspans[4][MAXHEIGHT]; -unsigned int *dc_ctspan[4]; -unsigned int *horizspan[4]; - -#ifdef X86_ASM -extern "C" void R_SetupShadedCol(); -extern "C" void R_SetupAddCol(); -extern "C" void R_SetupAddClampCol(); -#endif - -// Copies one span at hx to the screen at sx. -void rt_copy1col (int hx, int sx, int yl, int yh) -{ - BYTE *source; - BYTE *dest; - int count; - int pitch; - - count = yh-yl; - if (count < 0) - return; - count++; - - dest = ylookup[yl] + sx + dc_destorg; - source = &dc_temp[yl*4 + hx]; - pitch = dc_pitch; - - if (count & 1) { - *dest = *source; - source += 4; - dest += pitch; - } - if (count & 2) { - dest[0] = source[0]; - dest[pitch] = source[4]; - source += 8; - dest += pitch*2; - } - if (!(count >>= 2)) - return; - - do { - dest[0] = source[0]; - dest[pitch] = source[4]; - dest[pitch*2] = source[8]; - dest[pitch*3] = source[12]; - source += 16; - dest += pitch*4; - } while (--count); -} - -// Copies all four spans to the screen starting at sx. -void rt_copy4cols (int sx, int yl, int yh) -{ - int *source; - int *dest; - int count; - int pitch; - - count = yh-yl; - if (count < 0) - return; - count++; - - dest = (int *)(ylookup[yl] + sx + dc_destorg); - source = (int *)(&dc_temp[yl*4]); - pitch = dc_pitch/sizeof(int); - - if (count & 1) { - *dest = *source; - source += 4/sizeof(int); - dest += pitch; - } - if (!(count >>= 1)) - return; - - do { - dest[0] = source[0]; - dest[pitch] = source[4/sizeof(int)]; - source += 8/sizeof(int); - dest += pitch*2; - } while (--count); -} - -// Maps one span at hx to the screen at sx. -void rt_map1col (int hx, int sx, int yl, int yh) -{ - BYTE *colormap; - BYTE *source; - BYTE *dest; - int count; - int pitch; - - count = yh-yl; - if (count < 0) - return; - count++; - - colormap = dc_colormap; - dest = ylookup[yl] + sx + dc_destorg; - source = &dc_temp[yl*4 + hx]; - pitch = dc_pitch; - - if (count & 1) { - *dest = colormap[*source]; - source += 4; - dest += pitch; - } - if (!(count >>= 1)) - return; - - do { - dest[0] = colormap[source[0]]; - dest[pitch] = colormap[source[4]]; - source += 8; - dest += pitch*2; - } while (--count); -} - -// Maps all four spans to the screen starting at sx. -void rt_map4cols (int sx, int yl, int yh) -{ - BYTE *colormap; - BYTE *source; - BYTE *dest; - int count; - int pitch; - - count = yh-yl; - if (count < 0) - return; - count++; - - colormap = dc_colormap; - dest = ylookup[yl] + sx + dc_destorg; - source = &dc_temp[yl*4]; - pitch = dc_pitch; - - if (count & 1) { - dest[0] = colormap[source[0]]; - dest[1] = colormap[source[1]]; - dest[2] = colormap[source[2]]; - dest[3] = colormap[source[3]]; - source += 4; - dest += pitch; - } - if (!(count >>= 1)) - return; - - do { - dest[0] = colormap[source[0]]; - dest[1] = colormap[source[1]]; - dest[2] = colormap[source[2]]; - dest[3] = colormap[source[3]]; - dest[pitch] = colormap[source[4]]; - dest[pitch+1] = colormap[source[5]]; - dest[pitch+2] = colormap[source[6]]; - dest[pitch+3] = colormap[source[7]]; - source += 8; - dest += pitch*2; - } while (--count); -} - -void rt_Translate1col(const BYTE *translation, int hx, int yl, int yh) -{ - int count = yh - yl + 1; - BYTE *source = &dc_temp[yl*4 + hx]; - - // Things we do to hit the compiler's optimizer with a clue bat: - // 1. Parallelism is explicitly spelled out by using a separate - // C instruction for each assembly instruction. GCC lets me - // have four temporaries, but VC++ spills to the stack with - // more than two. Two is probably optimal, anyway. - // 2. The results of the translation lookups are explicitly - // stored in byte-sized variables. This causes the VC++ code - // to use byte mov instructions in most cases; for apparently - // random reasons, it will use movzx for some places. GCC - // ignores this and uses movzx always. - - // Do 8 rows at a time. - for (int count8 = count >> 3; count8; --count8) - { - int c0, c1; - BYTE b0, b1; - - c0 = source[0]; c1 = source[4]; - b0 = translation[c0]; b1 = translation[c1]; - source[0] = b0; source[4] = b1; - - c0 = source[8]; c1 = source[12]; - b0 = translation[c0]; b1 = translation[c1]; - source[8] = b0; source[12] = b1; - - c0 = source[16]; c1 = source[20]; - b0 = translation[c0]; b1 = translation[c1]; - source[16] = b0; source[20] = b1; - - c0 = source[24]; c1 = source[28]; - b0 = translation[c0]; b1 = translation[c1]; - source[24] = b0; source[28] = b1; - - source += 32; - } - // Finish by doing 1 row at a time. - for (count &= 7; count; --count, source += 4) - { - source[0] = translation[source[0]]; - } -} - -void rt_Translate4cols(const BYTE *translation, int yl, int yh) -{ - int count = yh - yl + 1; - BYTE *source = &dc_temp[yl*4]; - int c0, c1; - BYTE b0, b1; - - // Do 2 rows at a time. - for (int count8 = count >> 1; count8; --count8) - { - c0 = source[0]; c1 = source[1]; - b0 = translation[c0]; b1 = translation[c1]; - source[0] = b0; source[1] = b1; - - c0 = source[2]; c1 = source[3]; - b0 = translation[c0]; b1 = translation[c1]; - source[2] = b0; source[3] = b1; - - c0 = source[4]; c1 = source[5]; - b0 = translation[c0]; b1 = translation[c1]; - source[4] = b0; source[5] = b1; - - c0 = source[6]; c1 = source[7]; - b0 = translation[c0]; b1 = translation[c1]; - source[6] = b0; source[7] = b1; - - source += 8; - } - // Do the final row if count was odd. - if (count & 1) - { - c0 = source[0]; c1 = source[1]; - b0 = translation[c0]; b1 = translation[c1]; - source[0] = b0; source[1] = b1; - - c0 = source[2]; c1 = source[3]; - b0 = translation[c0]; b1 = translation[c1]; - source[2] = b0; source[3] = b1; - } -} - -// Translates one span at hx to the screen at sx. -void rt_tlate1col (int hx, int sx, int yl, int yh) -{ - rt_Translate1col(dc_translation, hx, yl, yh); - rt_map1col(hx, sx, yl, yh); -} - -// Translates all four spans to the screen starting at sx. -void rt_tlate4cols (int sx, int yl, int yh) -{ - rt_Translate4cols(dc_translation, yl, yh); - rt_map4cols(sx, yl, yh); -} - -// Adds one span at hx to the screen at sx without clamping. -void rt_add1col (int hx, int sx, int yl, int yh) -{ - BYTE *colormap; - BYTE *source; - BYTE *dest; - int count; - int pitch; - - count = yh-yl; - if (count < 0) - return; - count++; - - DWORD *fg2rgb = dc_srcblend; - DWORD *bg2rgb = dc_destblend; - dest = ylookup[yl] + sx + dc_destorg; - source = &dc_temp[yl*4 + hx]; - pitch = dc_pitch; - colormap = dc_colormap; - - do { - DWORD fg = colormap[*source]; - DWORD bg = *dest; - - fg = fg2rgb[fg]; - bg = bg2rgb[bg]; - fg = (fg+bg) | 0x1f07c1f; - *dest = RGB32k.All[fg & (fg>>15)]; - source += 4; - dest += pitch; - } while (--count); -} - -// Adds all four spans to the screen starting at sx without clamping. -void rt_add4cols_c (int sx, int yl, int yh) -{ - BYTE *colormap; - BYTE *source; - BYTE *dest; - int count; - int pitch; - - count = yh-yl; - if (count < 0) - return; - count++; - - DWORD *fg2rgb = dc_srcblend; - DWORD *bg2rgb = dc_destblend; - dest = ylookup[yl] + sx + dc_destorg; - source = &dc_temp[yl*4]; - pitch = dc_pitch; - colormap = dc_colormap; - - do { - DWORD fg = colormap[source[0]]; - DWORD bg = dest[0]; - fg = fg2rgb[fg]; - bg = bg2rgb[bg]; - fg = (fg+bg) | 0x1f07c1f; - dest[0] = RGB32k.All[fg & (fg>>15)]; - - fg = colormap[source[1]]; - bg = dest[1]; - fg = fg2rgb[fg]; - bg = bg2rgb[bg]; - fg = (fg+bg) | 0x1f07c1f; - dest[1] = RGB32k.All[fg & (fg>>15)]; - - - fg = colormap[source[2]]; - bg = dest[2]; - fg = fg2rgb[fg]; - bg = bg2rgb[bg]; - fg = (fg+bg) | 0x1f07c1f; - dest[2] = RGB32k.All[fg & (fg>>15)]; - - fg = colormap[source[3]]; - bg = dest[3]; - fg = fg2rgb[fg]; - bg = bg2rgb[bg]; - fg = (fg+bg) | 0x1f07c1f; - dest[3] = RGB32k.All[fg & (fg>>15)]; - - source += 4; - dest += pitch; - } while (--count); -} - -// Translates and adds one span at hx to the screen at sx without clamping. -void rt_tlateadd1col (int hx, int sx, int yl, int yh) -{ - rt_Translate1col(dc_translation, hx, yl, yh); - rt_add1col(hx, sx, yl, yh); -} - -// Translates and adds all four spans to the screen starting at sx without clamping. -void rt_tlateadd4cols (int sx, int yl, int yh) -{ - rt_Translate4cols(dc_translation, yl, yh); - rt_add4cols(sx, yl, yh); -} - -// Shades one span at hx to the screen at sx. -void rt_shaded1col (int hx, int sx, int yl, int yh) -{ - DWORD *fgstart; - BYTE *colormap; - BYTE *source; - BYTE *dest; - int count; - int pitch; - - count = yh-yl; - if (count < 0) - return; - count++; - - fgstart = &Col2RGB8[0][dc_color]; - colormap = dc_colormap; - dest = ylookup[yl] + sx + dc_destorg; - source = &dc_temp[yl*4 + hx]; - pitch = dc_pitch; - - do { - DWORD val = colormap[*source]; - DWORD fg = fgstart[val<<8]; - val = (Col2RGB8[64-val][*dest] + fg) | 0x1f07c1f; - *dest = RGB32k.All[val & (val>>15)]; - source += 4; - dest += pitch; - } while (--count); -} - -// Shades all four spans to the screen starting at sx. -void rt_shaded4cols_c (int sx, int yl, int yh) -{ - DWORD *fgstart; - BYTE *colormap; - BYTE *source; - BYTE *dest; - int count; - int pitch; - - count = yh-yl; - if (count < 0) - return; - count++; - - fgstart = &Col2RGB8[0][dc_color]; - colormap = dc_colormap; - dest = ylookup[yl] + sx + dc_destorg; - source = &dc_temp[yl*4]; - pitch = dc_pitch; - - do { - DWORD val; - - val = colormap[source[0]]; - val = (Col2RGB8[64-val][dest[0]] + fgstart[val<<8]) | 0x1f07c1f; - dest[0] = RGB32k.All[val & (val>>15)]; - - val = colormap[source[1]]; - val = (Col2RGB8[64-val][dest[1]] + fgstart[val<<8]) | 0x1f07c1f; - dest[1] = RGB32k.All[val & (val>>15)]; - - val = colormap[source[2]]; - val = (Col2RGB8[64-val][dest[2]] + fgstart[val<<8]) | 0x1f07c1f; - dest[2] = RGB32k.All[val & (val>>15)]; - - val = colormap[source[3]]; - val = (Col2RGB8[64-val][dest[3]] + fgstart[val<<8]) | 0x1f07c1f; - dest[3] = RGB32k.All[val & (val>>15)]; - - source += 4; - dest += pitch; - } while (--count); -} - -// Adds one span at hx to the screen at sx with clamping. -void rt_addclamp1col (int hx, int sx, int yl, int yh) -{ - BYTE *colormap; - BYTE *source; - BYTE *dest; - int count; - int pitch; - - count = yh-yl; - if (count < 0) - return; - count++; - - DWORD *fg2rgb = dc_srcblend; - DWORD *bg2rgb = dc_destblend; - dest = ylookup[yl] + sx + dc_destorg; - source = &dc_temp[yl*4 + hx]; - pitch = dc_pitch; - colormap = dc_colormap; - - do { - DWORD a = fg2rgb[colormap[*source]] + bg2rgb[*dest]; - DWORD b = a; - - a |= 0x01f07c1f; - b &= 0x40100400; - a &= 0x3fffffff; - b = b - (b >> 5); - a |= b; - *dest = RGB32k.All[(a>>15) & a]; - source += 4; - dest += pitch; - } while (--count); -} - -// Adds all four spans to the screen starting at sx with clamping. -void rt_addclamp4cols_c (int sx, int yl, int yh) -{ - BYTE *colormap; - BYTE *source; - BYTE *dest; - int count; - int pitch; - - count = yh-yl; - if (count < 0) - return; - count++; - - DWORD *fg2rgb = dc_srcblend; - DWORD *bg2rgb = dc_destblend; - dest = ylookup[yl] + sx + dc_destorg; - source = &dc_temp[yl*4]; - pitch = dc_pitch; - colormap = dc_colormap; - - do { - DWORD a = fg2rgb[colormap[source[0]]] + bg2rgb[dest[0]]; - DWORD b = a; - - a |= 0x01f07c1f; - b &= 0x40100400; - a &= 0x3fffffff; - b = b - (b >> 5); - a |= b; - dest[0] = RGB32k.All[(a>>15) & a]; - - a = fg2rgb[colormap[source[1]]] + bg2rgb[dest[1]]; - b = a; - a |= 0x01f07c1f; - b &= 0x40100400; - a &= 0x3fffffff; - b = b - (b >> 5); - a |= b; - dest[1] = RGB32k.All[(a>>15) & a]; - - a = fg2rgb[colormap[source[2]]] + bg2rgb[dest[2]]; - b = a; - a |= 0x01f07c1f; - b &= 0x40100400; - a &= 0x3fffffff; - b = b - (b >> 5); - a |= b; - dest[2] = RGB32k.All[(a>>15) & a]; - - a = fg2rgb[colormap[source[3]]] + bg2rgb[dest[3]]; - b = a; - a |= 0x01f07c1f; - b &= 0x40100400; - a &= 0x3fffffff; - b = b - (b >> 5); - a |= b; - dest[3] = RGB32k.All[(a>>15) & a]; - - source += 4; - dest += pitch; - } while (--count); -} - -// Translates and adds one span at hx to the screen at sx with clamping. -void rt_tlateaddclamp1col (int hx, int sx, int yl, int yh) -{ - rt_Translate1col(dc_translation, hx, yl, yh); - rt_addclamp1col(hx, sx, yl, yh); -} - -// Translates and adds all four spans to the screen starting at sx with clamping. -void rt_tlateaddclamp4cols (int sx, int yl, int yh) -{ - rt_Translate4cols(dc_translation, yl, yh); - rt_addclamp4cols(sx, yl, yh); -} - -// Subtracts one span at hx to the screen at sx with clamping. -void rt_subclamp1col (int hx, int sx, int yl, int yh) -{ - BYTE *colormap; - BYTE *source; - BYTE *dest; - int count; - int pitch; - - count = yh-yl; - if (count < 0) - return; - count++; - - DWORD *fg2rgb = dc_srcblend; - DWORD *bg2rgb = dc_destblend; - dest = ylookup[yl] + sx + dc_destorg; - source = &dc_temp[yl*4 + hx]; - pitch = dc_pitch; - colormap = dc_colormap; - - do { - DWORD a = (fg2rgb[colormap[*source]] | 0x40100400) - bg2rgb[*dest]; - DWORD b = a; - - b &= 0x40100400; - b = b - (b >> 5); - a &= b; - a |= 0x01f07c1f; - *dest = RGB32k.All[(a>>15) & a]; - source += 4; - dest += pitch; - } while (--count); -} - -// Subtracts all four spans to the screen starting at sx with clamping. -void rt_subclamp4cols (int sx, int yl, int yh) -{ - BYTE *colormap; - BYTE *source; - BYTE *dest; - int count; - int pitch; - - count = yh-yl; - if (count < 0) - return; - count++; - - DWORD *fg2rgb = dc_srcblend; - DWORD *bg2rgb = dc_destblend; - dest = ylookup[yl] + sx + dc_destorg; - source = &dc_temp[yl*4]; - pitch = dc_pitch; - colormap = dc_colormap; - - do { - DWORD a = (fg2rgb[colormap[source[0]]] | 0x40100400) - bg2rgb[dest[0]]; - DWORD b = a; - - b &= 0x40100400; - b = b - (b >> 5); - a &= b; - a |= 0x01f07c1f; - dest[0] = RGB32k.All[(a>>15) & a]; - - a = (fg2rgb[colormap[source[1]]] | 0x40100400) - bg2rgb[dest[1]]; - b = a; - b &= 0x40100400; - b = b - (b >> 5); - a &= b; - a |= 0x01f07c1f; - dest[1] = RGB32k.All[(a>>15) & a]; - - a = (fg2rgb[colormap[source[2]]] | 0x40100400) - bg2rgb[dest[2]]; - b = a; - b &= 0x40100400; - b = b - (b >> 5); - a &= b; - a |= 0x01f07c1f; - dest[2] = RGB32k.All[(a>>15) & a]; - - a = (fg2rgb[colormap[source[3]]] | 0x40100400) - bg2rgb[dest[3]]; - b = a; - b &= 0x40100400; - b = b - (b >> 5); - a &= b; - a |= 0x01f07c1f; - dest[3] = RGB32k.All[(a>>15) & a]; - - source += 4; - dest += pitch; - } while (--count); -} - -// Translates and subtracts one span at hx to the screen at sx with clamping. -void rt_tlatesubclamp1col (int hx, int sx, int yl, int yh) -{ - rt_Translate1col(dc_translation, hx, yl, yh); - rt_subclamp1col(hx, sx, yl, yh); -} - -// Translates and subtracts all four spans to the screen starting at sx with clamping. -void rt_tlatesubclamp4cols (int sx, int yl, int yh) -{ - rt_Translate4cols(dc_translation, yl, yh); - rt_subclamp4cols(sx, yl, yh); -} - -// Subtracts one span at hx from the screen at sx with clamping. -void rt_revsubclamp1col (int hx, int sx, int yl, int yh) -{ - BYTE *colormap; - BYTE *source; - BYTE *dest; - int count; - int pitch; - - count = yh-yl; - if (count < 0) - return; - count++; - - DWORD *fg2rgb = dc_srcblend; - DWORD *bg2rgb = dc_destblend; - dest = ylookup[yl] + sx + dc_destorg; - source = &dc_temp[yl*4 + hx]; - pitch = dc_pitch; - colormap = dc_colormap; - - do { - DWORD a = (bg2rgb[*dest] | 0x40100400) - fg2rgb[colormap[*source]]; - DWORD b = a; - - b &= 0x40100400; - b = b - (b >> 5); - a &= b; - a |= 0x01f07c1f; - *dest = RGB32k.All[(a>>15) & a]; - source += 4; - dest += pitch; - } while (--count); -} - -// Subtracts all four spans from the screen starting at sx with clamping. -void rt_revsubclamp4cols (int sx, int yl, int yh) -{ - BYTE *colormap; - BYTE *source; - BYTE *dest; - int count; - int pitch; - - count = yh-yl; - if (count < 0) - return; - count++; - - DWORD *fg2rgb = dc_srcblend; - DWORD *bg2rgb = dc_destblend; - dest = ylookup[yl] + sx + dc_destorg; - source = &dc_temp[yl*4]; - pitch = dc_pitch; - colormap = dc_colormap; - - do { - DWORD a = (bg2rgb[dest[0]] | 0x40100400) - fg2rgb[colormap[source[0]]]; - DWORD b = a; - - b &= 0x40100400; - b = b - (b >> 5); - a &= b; - a |= 0x01f07c1f; - dest[0] = RGB32k.All[(a>>15) & a]; - - a = (bg2rgb[dest[1]] | 0x40100400) - fg2rgb[colormap[source[1]]]; - b = a; - b &= 0x40100400; - b = b - (b >> 5); - a &= b; - a |= 0x01f07c1f; - dest[1] = RGB32k.All[(a>>15) & a]; - - a = (bg2rgb[dest[2]] | 0x40100400) - fg2rgb[colormap[source[2]]]; - b = a; - b &= 0x40100400; - b = b - (b >> 5); - a &= b; - a |= 0x01f07c1f; - dest[2] = RGB32k.All[(a>>15) & a]; - - a = (bg2rgb[dest[3]] | 0x40100400) - fg2rgb[colormap[source[3]]]; - b = a; - b &= 0x40100400; - b = b - (b >> 5); - a &= b; - a |= 0x01f07c1f; - dest[3] = RGB32k.All[(a>>15) & a]; - - source += 4; - dest += pitch; - } while (--count); -} - -// Translates and subtracts one span at hx from the screen at sx with clamping. -void rt_tlaterevsubclamp1col (int hx, int sx, int yl, int yh) -{ - rt_Translate1col(dc_translation, hx, yl, yh); - rt_revsubclamp1col(hx, sx, yl, yh); -} - -// Translates and subtracts all four spans from the screen starting at sx with clamping. -void rt_tlaterevsubclamp4cols (int sx, int yl, int yh) -{ - rt_Translate4cols(dc_translation, yl, yh); - rt_revsubclamp4cols(sx, yl, yh); -} - -// Reorder the posts so that they get drawn top-to-bottom instead of bottom-to-top. -void rt_flip_posts() -{ - unsigned int *front = horizspan[dc_x & 3]; - unsigned int *back = dc_ctspan[dc_x & 3] - 2; - - while (front < back) - { - swapvalues(front[0], back[0]); - swapvalues(front[1], back[1]); - front += 2; - back -= 2; - } -} - -// Copies all spans in all four columns to the screen starting at sx. -// sx should be dword-aligned. -void rt_draw4cols (int sx) -{ - int x, bad; - unsigned int maxtop, minbot, minnexttop; - - // Place a dummy "span" in each column. These don't get - // drawn. They're just here to avoid special cases in the - // max/min calculations below. - for (x = 0; x < 4; ++x) - { - dc_ctspan[x][0] = screen->GetHeight()+1; - dc_ctspan[x][1] = screen->GetHeight(); - } - -#ifdef X86_ASM - // Setup assembly routines for changed colormaps or other parameters. - if (hcolfunc_post4 == rt_shaded4cols) - { - R_SetupShadedCol(); - } - else if (hcolfunc_post4 == rt_addclamp4cols || hcolfunc_post4 == rt_tlateaddclamp4cols) - { - R_SetupAddClampCol(); - } - else if (hcolfunc_post4 == rt_add4cols || hcolfunc_post4 == rt_tlateadd4cols) - { - R_SetupAddCol(); - } -#endif - - for (;;) - { - // If a column is out of spans, mark it as such - bad = 0; - minnexttop = 0xffffffff; - for (x = 0; x < 4; ++x) - { - if (horizspan[x] >= dc_ctspan[x]) - { - bad |= 1 << x; - } - else if ((horizspan[x]+2)[0] < minnexttop) - { - minnexttop = (horizspan[x]+2)[0]; - } - } - // Once all columns are out of spans, we're done - if (bad == 15) - { - return; - } - - // Find the largest shared area for the spans in each column - maxtop = MAX (MAX (horizspan[0][0], horizspan[1][0]), - MAX (horizspan[2][0], horizspan[3][0])); - minbot = MIN (MIN (horizspan[0][1], horizspan[1][1]), - MIN (horizspan[2][1], horizspan[3][1])); - - // If there is no shared area with these spans, draw each span - // individually and advance to the next spans until we reach a shared area. - // However, only draw spans down to the highest span in the next set of - // spans. If we allow the entire height of a span to be drawn, it could - // prevent any more shared areas from being drawn in these four columns. - // - // Example: Suppose we have the following arrangement: - // A CD - // A CD - // B D - // B D - // aB D - // aBcD - // aBcD - // aBc - // - // If we draw the entire height of the spans, we end up drawing this first: - // A CD - // A CD - // B D - // B D - // B D - // B D - // B D - // B D - // B - // - // This leaves only the "a" and "c" columns to be drawn, and they are not - // part of a shared area, but if we can include B and D with them, we can - // get a shared area. So we cut off everything in the first set just - // above the "a" column and end up drawing this first: - // A CD - // A CD - // B D - // B D - // - // Then the next time through, we have the following arrangement with an - // easily shared area to draw: - // aB D - // aBcD - // aBcD - // aBc - if (bad != 0 || maxtop > minbot) - { - int drawcount = 0; - for (x = 0; x < 4; ++x) - { - if (!(bad & 1)) - { - if (horizspan[x][1] < minnexttop) - { - hcolfunc_post1 (x, sx+x, horizspan[x][0], horizspan[x][1]); - horizspan[x] += 2; - drawcount++; - } - else if (minnexttop > horizspan[x][0]) - { - hcolfunc_post1 (x, sx+x, horizspan[x][0], minnexttop-1); - horizspan[x][0] = minnexttop; - drawcount++; - } - } - bad >>= 1; - } - // Drawcount *should* always be non-zero. The reality is that some situations - // can make this not true. Unfortunately, I'm not sure what those situations are. - if (drawcount == 0) - { - return; - } - continue; - } - - // Draw any span fragments above the shared area. - for (x = 0; x < 4; ++x) - { - if (maxtop > horizspan[x][0]) - { - hcolfunc_post1 (x, sx+x, horizspan[x][0], maxtop-1); - } - } - - // Draw the shared area. - hcolfunc_post4 (sx, maxtop, minbot); - - // For each column, if part of the span is past the shared area, - // set its top to just below the shared area. Otherwise, advance - // to the next span in that column. - for (x = 0; x < 4; ++x) - { - if (minbot < horizspan[x][1]) - { - horizspan[x][0] = minbot+1; - } - else - { - horizspan[x] += 2; - } - } - } -} - -// Before each pass through a rendering loop that uses these routines, -// call this function to set up the span pointers. -void rt_initcols (BYTE *buff) -{ - int y; - - dc_temp = buff == NULL ? dc_tempbuff : buff; - for (y = 3; y >= 0; y--) - horizspan[y] = dc_ctspan[y] = &dc_tspans[y][0]; -} - -// Stretches a column into a temporary buffer which is later -// drawn to the screen along with up to three other columns. -void R_DrawColumnHorizP_C (void) -{ - int count = dc_count; - BYTE *dest; - fixed_t fracstep; - fixed_t frac; - - if (count <= 0) - return; - - { - int x = dc_x & 3; - unsigned int **span; - - span = &dc_ctspan[x]; - (*span)[0] = dc_yl; - (*span)[1] = dc_yh; - *span += 2; - dest = &dc_temp[x + 4*dc_yl]; - } - fracstep = dc_iscale; - frac = dc_texturefrac; - - { - const BYTE *source = dc_source; - - if (count & 1) { - *dest = source[frac>>FRACBITS]; dest += 4; frac += fracstep; - } - if (count & 2) { - dest[0] = source[frac>>FRACBITS]; frac += fracstep; - dest[4] = source[frac>>FRACBITS]; frac += fracstep; - dest += 8; - } - if (count & 4) { - dest[0] = source[frac>>FRACBITS]; frac += fracstep; - dest[4] = source[frac>>FRACBITS]; frac += fracstep; - dest[8] = source[frac>>FRACBITS]; frac += fracstep; - dest[12]= source[frac>>FRACBITS]; frac += fracstep; - dest += 16; - } - count >>= 3; - if (!count) return; - - do - { - dest[0] = source[frac>>FRACBITS]; frac += fracstep; - dest[4] = source[frac>>FRACBITS]; frac += fracstep; - dest[8] = source[frac>>FRACBITS]; frac += fracstep; - dest[12]= source[frac>>FRACBITS]; frac += fracstep; - dest[16]= source[frac>>FRACBITS]; frac += fracstep; - dest[20]= source[frac>>FRACBITS]; frac += fracstep; - dest[24]= source[frac>>FRACBITS]; frac += fracstep; - dest[28]= source[frac>>FRACBITS]; frac += fracstep; - dest += 32; - } while (--count); - } -} - -// [RH] Just fills a column with a given color -void R_FillColumnHorizP (void) -{ - int count = dc_count; - BYTE color = dc_color; - BYTE *dest; - - if (count <= 0) - return; - - { - int x = dc_x & 3; - unsigned int **span = &dc_ctspan[x]; - - (*span)[0] = dc_yl; - (*span)[1] = dc_yh; - *span += 2; - dest = &dc_temp[x + 4*dc_yl]; - } - - if (count & 1) { - *dest = color; - dest += 4; - } - if (!(count >>= 1)) - return; - do { - dest[0] = color; dest[4] = color; - dest += 8; - } while (--count); -} From e4c208602dab77e91fc24d1ff17ba24d3cddb497 Mon Sep 17 00:00:00 2001 From: Edoardo Prezioso Date: Wed, 7 Dec 2016 11:28:40 +0100 Subject: [PATCH 7/7] - Fixed GCC/Clang warning/error. --- src/d_main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/d_main.cpp b/src/d_main.cpp index 0f2d5af92b..04ba34cbb0 100644 --- a/src/d_main.cpp +++ b/src/d_main.cpp @@ -1030,7 +1030,7 @@ void D_DoomLoop () catch (CVMAbortException &error) { error.MaybePrintMessage(); - Printf("%s", error.stacktrace); + Printf("%s", error.stacktrace.GetChars()); D_ErrorCleanup(); } }