From a118903e3ef88e093ab5cfb3babef03684cd9608 Mon Sep 17 00:00:00 2001 From: Christoph Oelckers Date: Wed, 7 Dec 2016 14:41:21 +0100 Subject: [PATCH] - complete removal of assembly stuff. Nothing of this gets used anymore. --- src/asm_ia32/a.asm | 812 ------------------------------ src/asm_ia32/misc.asm | 200 -------- src/asm_ia32/tmap.asm | 1002 -------------------------------------- src/asm_ia32/tmap2.asm | 643 ------------------------ src/asm_ia32/tmap3.asm | 344 ------------- src/asm_x86_64/tmap3.asm | 150 ------ src/asm_x86_64/tmap3.s | 141 ------ 7 files changed, 3292 deletions(-) delete mode 100644 src/asm_ia32/a.asm delete mode 100644 src/asm_ia32/misc.asm delete mode 100644 src/asm_ia32/tmap.asm delete mode 100644 src/asm_ia32/tmap2.asm delete mode 100644 src/asm_ia32/tmap3.asm delete mode 100644 src/asm_x86_64/tmap3.asm delete mode 100644 src/asm_x86_64/tmap3.s diff --git a/src/asm_ia32/a.asm b/src/asm_ia32/a.asm deleted file mode 100644 index 786396d4a4..0000000000 --- a/src/asm_ia32/a.asm +++ /dev/null @@ -1,812 +0,0 @@ -; "Build Engine & Tools" Copyright (c) 1993-1997 Ken Silverman -; Ken Silverman's official web site: "http://www.advsys.net/ken" -; See the included license file "BUILDLIC.TXT" for license info. -; This file has been modified from Ken Silverman's original release - -%include "valgrind.inc" - - SECTION .data - -%ifndef M_TARGET_LINUX -%define ylookup _ylookup -%define vince _vince -%define vplce _vplce -%define palookupoffse _palookupoffse -%define bufplce _bufplce -%define dc_iscale _dc_iscale -%define dc_colormap _dc_colormap -%define dc_count _dc_count -%define dc_dest _dc_dest -%define dc_source _dc_source -%define dc_texturefrac _dc_texturefrac - -%define setupvlineasm _setupvlineasm -%define prevlineasm1 _prevlineasm1 -%define vlineasm1 _vlineasm1 -%define vlineasm4 _vlineasm4 - -%define setupmvlineasm _setupmvlineasm -%define mvlineasm1 _mvlineasm1 -%define mvlineasm4 _mvlineasm4 - -%define R_SetupDrawSlabA _R_SetupDrawSlabA -%define R_DrawSlabA _R_DrawSlabA -%endif - -EXTERN ylookup ; near - -EXTERN vplce ; near -EXTERN vince ; near -EXTERN palookupoffse ; near -EXTERN bufplce ; near - -EXTERN dc_iscale -EXTERN dc_colormap -EXTERN dc_count -EXTERN dc_dest -EXTERN dc_source -EXTERN dc_texturefrac - - SECTION .text - -ALIGN 16 -GLOBAL setvlinebpl_ -setvlinebpl_: - mov [fixchain1a+2], eax - mov [fixchain1b+2], eax - mov [fixchain2a+2], eax - mov [fixchain1m+2], eax - mov [fixchain2ma+2], eax - mov [fixchain2mb+2], eax - selfmod fixchain1a, fixchain2mb+6 - -setdrawslabbpl: - mov dword [voxbpl1+2], eax - mov dword [voxbpl2+2], eax - mov dword [voxbpl3+2], eax - mov dword [voxbpl4+2], eax - mov dword [voxbpl5+2], eax - mov dword [voxbpl6+2], eax - mov dword [voxbpl7+2], eax - mov dword [voxbpl8+2], eax - selfmod voxbpl1, voxpl8+6 - ret - - SECTION .data - -lastslabcolormap: - dd 4 - - SECTION .text - -GLOBAL R_SetupDrawSlabA -GLOBAL @R_SetupDrawSlabA@4 -R_SetupDrawSlabA: - mov ecx, [esp+4] -@R_SetupDrawSlabA@4: - cmp [lastslabcolormap], ecx - je .done - mov [lastslabcolormap], ecx - mov dword [voxpal1+2], ecx - mov dword [voxpal2+2], ecx - mov dword [voxpal3+2], ecx - mov dword [voxpal4+2], ecx - mov dword [voxpal5+2], ecx - mov dword [voxpal6+2], ecx - mov dword [voxpal7+2], ecx - mov dword [voxpal8+2], ecx -.done ret - - -; pass it log2(texheight) - -ALIGN 16 -GLOBAL setupvlineasm -setupvlineasm: - mov ecx, [esp+4] - - ;First 2 lines for VLINEASM1, rest for VLINEASM4 - mov byte [premach3a+2], cl - mov byte [mach3a+2], cl - - mov byte [machvsh1+2], cl ;32-shy - mov byte [machvsh3+2], cl ;32-shy - mov byte [machvsh5+2], cl ;32-shy - mov byte [machvsh6+2], cl ;32-shy - mov ch, cl - sub ch, 16 - mov byte [machvsh8+2], ch ;16-shy - neg cl - mov byte [machvsh7+2], cl ;shy - mov byte [machvsh9+2], cl ;shy - mov byte [machvsh10+2], cl ;shy - mov byte [machvsh11+2], cl ;shy - mov byte [machvsh12+2], cl ;shy - mov eax, 1 - shl eax, cl - dec eax - mov dword [machvsh2+2], eax ;(1<>sh) -;vplc3 = (ebp<<(32-sh))+((edx&65535)<<(16-sh)) -machvsh5: shl esi, 88h ;32-sh - mov eax, edx -machvsh6: shl ebp, 88h ;32-sh - and edx, 0000ffffh -machvsh7: shr eax, 88h ;sh - add esi, eax -machvsh8: shl edx, 88h ;16-sh - add ebp, edx - mov dword [vplce+12], esi - mov dword [vplce+4], ebp - - pop edi - pop esi - pop ebx - pop ebp - ret - -;************************************************************************* -;************************* Masked Vertical Lines ************************* -;************************************************************************* - -; pass it log2(texheight) - -ALIGN 16 -GLOBAL setupmvlineasm -setupmvlineasm: - mov ecx, dword [esp+4] - mov byte [maskmach3a+2], cl - mov byte [machmv13+2], cl - - mov byte [machmv14+2], cl - mov byte [machmv15+2], cl - mov byte [machmv16+2], cl - selfmod maskmach3a, machmv13+6 - ret - -ALIGN 16 -GLOBAL mvlineasm1 ;Masked vline -mvlineasm1: - push ebx - push edi - push esi - push ebp - mov ecx, [dc_count] - mov ebp, [dc_colormap] - mov edi, [dc_dest] - mov eax, [dc_iscale] - mov edx, [dc_texturefrac] - mov esi, [dc_source] -beginmvline: - mov ebx, edx -maskmach3a: shr ebx, 32 - movzx ebx, byte [esi+ebx] - cmp ebx, 0 - je short skipmask1 -maskmach3c: mov bl, byte [ebp+ebx] - mov [edi], bl -skipmask1: add edx, eax -fixchain1m: add edi, 320 - dec ecx - jnz short beginmvline - - pop ebp - pop esi - pop edi - pop ebx - mov eax, edx - ret - -ALIGN 16 -GLOBAL mvlineasm4 -mvlineasm4: - push ebx - push esi - push edi - push ebp - - mov ecx,[dc_count] - mov edi,[dc_dest] - - mov eax, [bufplce+0] - mov ebx, [bufplce+4] - mov [machmv1+3], eax - mov [machmv4+3], ebx - mov eax, [bufplce+8] - mov ebx, [bufplce+12] - mov [machmv7+3], eax - mov [machmv10+3], ebx - - mov eax, [palookupoffse] - mov ebx, [palookupoffse+4] - mov [machmv2+2], eax - mov [machmv5+2], ebx - mov eax, [palookupoffse+8] - mov ebx, [palookupoffse+12] - mov [machmv8+2], eax - mov [machmv11+2], ebx - - mov eax, [vince] ;vince - mov ebx, [vince+4] - xor bl, bl - mov [machmv3+2], eax - mov [machmv6+2], ebx - mov eax, [vince+8] - mov ebx, [vince+12] - mov [machmv9+2], eax - mov [machmv12+2], ebx - - inc ecx - push ecx - mov ecx, [vplce+0] - mov edx, [vplce+4] - mov esi, [vplce+8] - mov ebp, [vplce+12] -fixchain2ma: sub edi, 320 - - selfmod beginmvlineasm4, machmv2+6 - jmp short beginmvlineasm4 -ALIGN 16 -beginmvlineasm4: - dec dword [esp] - jz near endmvlineasm4 - - mov eax, ebp - mov ebx, esi -machmv16: shr eax, 32 -machmv12: add ebp, 0x88888888 ;vince[3] -machmv15: shr ebx, 32 -machmv9: add esi, 0x88888888 ;vince[2] -machmv10: movzx eax, byte [eax+0x88888888];bufplce[3] -machmv7: movzx ebx, byte [ebx+0x88888888];bufplce[2] - cmp eax, 1 - adc dl, dl - cmp ebx, 1 - adc dl, dl -machmv8: mov bl, [ebx+0x88888888] ;palookupoffs[2] -machmv11: mov bh, [eax+0x88888888] ;palookupoffs[3] - - mov eax, edx -machmv6: add edx, 0x88888888 ;vince[1] -machmv14: shr eax, 32 - shl ebx, 16 -machmv4: movzx eax, byte [eax+0x88888888];bufplce[1] - cmp eax, 1 - adc dl, dl -machmv5: mov bh, [eax+0x88888888] ;palookupoffs[1] - - mov eax, ecx -machmv3: add ecx, 0x88888888 ;vince[0] -machmv13: shr eax, 32 -machmv1: movzx eax, byte [eax+0x88888888];bufplce[0] - cmp eax, 1 - adc dl, dl -machmv2: mov bl, [eax+0x88888888] ;palookupoffs[0] - - xor eax, eax - shl dl, 4 -fixchain2mb: add edi, 320 - mov al, dl - add eax, mvcase15 - jmp eax ;16 byte cases - -ALIGN 16 -endmvlineasm4: - mov [vplce], ecx - mov [vplce+4], edx - mov [vplce+8], esi - mov [vplce+12], ebp - pop ecx - pop ebp - pop edi - pop esi - pop ebx - ret - - ;5,7,8,8,11,13,12,14,11,13,14,14,12,14,15,7 -ALIGN 16 -mvcase15: mov [edi], ebx - jmp beginmvlineasm4 -ALIGN 16 -mvcase14: mov [edi+1], bh - shr ebx, 16 - mov [edi+2], bx - jmp beginmvlineasm4 -ALIGN 16 -mvcase13: mov [edi], bl - shr ebx, 16 - mov [edi+2], bx - jmp beginmvlineasm4 -ALIGN 16 -mvcase12: shr ebx, 16 - mov [edi+2], bx - jmp beginmvlineasm4 -ALIGN 16 -mvcase11: mov [edi], bx - shr ebx, 16 - mov [edi+3], bh - jmp beginmvlineasm4 -ALIGN 16 -mvcase10: mov [edi+1], bh - shr ebx, 16 - mov [edi+3], bh - jmp beginmvlineasm4 -ALIGN 16 -mvcase9: mov [edi], bl - shr ebx, 16 - mov [edi+3], bh - jmp beginmvlineasm4 -ALIGN 16 -mvcase8: shr ebx, 16 - mov [edi+3], bh - jmp beginmvlineasm4 -ALIGN 16 -mvcase7: mov [edi], bx - shr ebx, 16 - mov [edi+2], bl - jmp beginmvlineasm4 -ALIGN 16 -mvcase6: shr ebx, 8 - mov [edi+1], bx - jmp beginmvlineasm4 -ALIGN 16 -mvcase5: mov [edi], bl - shr ebx, 16 - mov [edi+2], bl - jmp beginmvlineasm4 -ALIGN 16 -mvcase4: shr ebx, 16 - mov [edi+2], bl - jmp beginmvlineasm4 -ALIGN 16 -mvcase3: mov [edi], bx - jmp beginmvlineasm4 -ALIGN 16 -mvcase2: mov [edi+1], bh - jmp beginmvlineasm4 -ALIGN 16 -mvcase1: mov [edi], bl - jmp beginmvlineasm4 -ALIGN 16 -mvcase0: jmp beginmvlineasm4 - -align 16 - - -;************************************************************************* -;***************************** Voxel Slabs ******************************* -;************************************************************************* - -GLOBAL R_DrawSlabA -R_DrawSlabA: - push ebx - push ebp - push esi - push edi - - mov eax, [esp+5*4+0] - mov ebx, [esp+5*4+4] - mov ecx, [esp+5*4+8] - mov edx, [esp+5*4+12] - mov esi, [esp+5*4+16] - mov edi, [esp+5*4+20] - - cmp eax, 2 - je voxbegdraw2 - ja voxskip2 - xor eax, eax -voxbegdraw1: - mov ebp, ebx - shr ebp, 16 - add ebx, edx - dec ecx - mov al, byte [esi+ebp] -voxpal1: mov al, byte [eax+88888888h] - mov byte [edi], al -voxbpl1: lea edi, [edi+88888888h] - jnz voxbegdraw1 - jmp voxskipslab5 - -voxbegdraw2: - mov ebp, ebx - shr ebp, 16 - add ebx, edx - xor eax, eax - dec ecx - mov al, byte [esi+ebp] -voxpal2: mov al, byte [eax+88888888h] - mov ah, al - mov word [edi], ax -voxbpl2: lea edi, [edi+88888888h] - jnz voxbegdraw2 - jmp voxskipslab5 - -voxskip2: - cmp eax, 4 - jne voxskip4 - xor eax, eax -voxbegdraw4: - mov ebp, ebx - add ebx, edx - shr ebp, 16 - xor eax, eax - mov al, byte [esi+ebp] -voxpal3: mov al, byte [eax+88888888h] - mov ah, al - shl eax, 8 - mov al, ah - shl eax, 8 - mov al, ah - mov dword [edi], eax -voxbpl3: add edi, 88888888h - dec ecx - jnz voxbegdraw4 - jmp voxskipslab5 - -voxskip4: - add eax, edi - - test edi, 1 - jz voxskipslab1 - cmp edi, eax - je voxskipslab1 - - push eax - push ebx - push ecx - push edi -voxbegslab1: - mov ebp, ebx - add ebx, edx - shr ebp, 16 - xor eax, eax - mov al, byte [esi+ebp] -voxpal4: mov al, byte [eax+88888888h] - mov byte [edi], al -voxbpl4: add edi, 88888888h - dec ecx - jnz voxbegslab1 - pop edi - pop ecx - pop ebx - pop eax - inc edi - -voxskipslab1: - push eax - test edi, 2 - jz voxskipslab2 - dec eax - cmp edi, eax - jge voxskipslab2 - - push ebx - push ecx - push edi -voxbegslab2: - mov ebp, ebx - add ebx, edx - shr ebp, 16 - xor eax, eax - mov al, byte [esi+ebp] -voxpal5: mov al, byte [eax+88888888h] - mov ah, al - mov word [edi], ax -voxbpl5: add edi, 88888888h - dec ecx - jnz voxbegslab2 - pop edi - pop ecx - pop ebx - add edi, 2 - -voxskipslab2: - mov eax, [esp] - - sub eax, 3 - cmp edi, eax - jge voxskipslab3 - -voxprebegslab3: - push ebx - push ecx - push edi -voxbegslab3: - mov ebp, ebx - add ebx, edx - shr ebp, 16 - xor eax, eax - mov al, byte [esi+ebp] -voxpal6: mov al, byte [eax+88888888h] - mov ah, al - shl eax, 8 - mov al, ah - shl eax, 8 - mov al, ah - mov dword [edi], eax -voxbpl6: add edi, 88888888h - dec ecx - jnz voxbegslab3 - pop edi - pop ecx - pop ebx - add edi, 4 - - mov eax, [esp] - - sub eax, 3 - cmp edi, eax - jl voxprebegslab3 - -voxskipslab3: - mov eax, [esp] - - dec eax - cmp edi, eax - jge voxskipslab4 - - push ebx - push ecx - push edi -voxbegslab4: - mov ebp, ebx - add ebx, edx - shr ebp, 16 - xor eax, eax - mov al, byte [esi+ebp] -voxpal7: mov al, byte [eax+88888888h] - mov ah, al - mov word [edi], ax -voxbpl7: add edi, 88888888h - dec ecx - jnz voxbegslab4 - pop edi - pop ecx - pop ebx - add edi, 2 - -voxskipslab4: - pop eax - - cmp edi, eax - je voxskipslab5 - -voxbegslab5: - mov ebp, ebx - add ebx, edx - shr ebp, 16 - xor eax, eax - mov al, byte [esi+ebp] -voxpal8: mov al, byte [eax+88888888h] - mov byte [edi], al -voxbpl8: add edi, 88888888h - dec ecx - jnz voxbegslab5 - -voxskipslab5: - pop edi - pop esi - pop ebp - pop ebx - ret - -align 16 - -%ifdef M_TARGET_MACHO -GLOBAL _rtext_a_end -_rtext_a_end: -%endif diff --git a/src/asm_ia32/misc.asm b/src/asm_ia32/misc.asm deleted file mode 100644 index b825a4d02a..0000000000 --- a/src/asm_ia32/misc.asm +++ /dev/null @@ -1,200 +0,0 @@ -;* -;* misc.nas -;* Miscellaneous assembly functions -;* -;*--------------------------------------------------------------------------- -;* Copyright 1998-2006 Randy Heit -;* All rights reserved. -;* -;* Redistribution and use in source and binary forms, with or without -;* modification, are permitted provided that the following conditions -;* are met: -;* -;* 1. Redistributions of source code must retain the above copyright -;* notice, this list of conditions and the following disclaimer. -;* 2. Redistributions in binary form must reproduce the above copyright -;* notice, this list of conditions and the following disclaimer in the -;* documentation and/or other materials provided with the distribution. -;* 3. The name of the author may not be used to endorse or promote products -;* derived from this software without specific prior written permission. -;* -;* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR -;* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES -;* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. -;* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, -;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT -;* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -;* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -;* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -;* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF -;* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -;*--------------------------------------------------------------------------- -;* - -BITS 32 - -%ifndef M_TARGET_LINUX - -%define DoBlending_MMX _DoBlending_MMX -%define BestColor_MMX _BestColor_MMX - -%endif - -%ifdef M_TARGET_WATCOM - SEGMENT DATA PUBLIC ALIGN=16 CLASS=DATA USE32 - SEGMENT DATA -%else - SECTION .data -%endif - -Blending256: - dd 0x01000100,0x00000100 - -%ifdef M_TARGET_WATCOM - SEGMENT CODE PUBLIC ALIGN=16 CLASS=CODE USE32 - SEGMENT CODE -%else - SECTION .text -%endif - -;----------------------------------------------------------- -; -; DoBlending_MMX -; -; MMX version of DoBlending -; -; (DWORD *from, DWORD *to, count, tor, tog, tob, toa) -;----------------------------------------------------------- - -GLOBAL DoBlending_MMX - -DoBlending_MMX: - pxor mm0,mm0 ; mm0 = 0 - mov eax,[esp+4*4] - shl eax,16 - mov edx,[esp+4*5] - shl edx,8 - or eax,[esp+4*6] - or eax,edx - mov ecx,[esp+4*3] ; ecx = count - movd mm1,eax ; mm1 = 00000000 00RRGGBB - mov eax,[esp+4*7] - shl eax,16 - mov edx,[esp+4*7] - shl edx,8 - or eax,[esp+4*7] - or eax,edx - mov edx,[esp+4*2] ; edx = dest - movd mm6,eax ; mm6 = 00000000 00AAAAAA - punpcklbw mm1,mm0 ; mm1 = 000000RR 00GG00BB - movq mm7,[Blending256] - punpcklbw mm6,mm0 ; mm6 = 000000AA 00AA00AA - mov eax,[esp+4*1] ; eax = source - pmullw mm1,mm6 ; mm1 = 000000RR 00GG00BB (multiplied by alpha) - psubusw mm7,mm6 ; mm7 = 000000aa 00aa00aa (one minus alpha) - nop ; Does this actually pair on a Pentium? - -; Do four colors per iteration: Count must be a multiple of four. - -.loop movq mm2,[eax] ; mm2 = 00r2g2b2 00r1g1b1 - add eax,8 - movq mm3,mm2 ; mm3 = 00r2g2b2 00r1g1b1 - punpcklbw mm2,mm0 ; mm2 = 000000r1 00g100b1 - punpckhbw mm3,mm0 ; mm3 = 000000r2 00g200b2 - pmullw mm2,mm7 ; mm2 = 0000r1rr g1ggb1bb - add edx,8 - pmullw mm3,mm7 ; mm3 = 0000r2rr g2ggb2bb - sub ecx,2 - paddusw mm2,mm1 - psrlw mm2,8 - paddusw mm3,mm1 - psrlw mm3,8 - packuswb mm2,mm3 ; mm2 = 00r2g2b2 00r1g1b1 - movq [edx-8],mm2 - - movq mm2,[eax] ; mm2 = 00r2g2b2 00r1g1b1 - add eax,8 - movq mm3,mm2 ; mm3 = 00r2g2b2 00r1g1b1 - punpcklbw mm2,mm0 ; mm2 = 000000r1 00g100b1 - punpckhbw mm3,mm0 ; mm3 = 000000r2 00g200b2 - pmullw mm2,mm7 ; mm2 = 0000r1rr g1ggb1bb - add edx,8 - pmullw mm3,mm7 ; mm3 = 0000r2rr g2ggb2bb - sub ecx,2 - paddusw mm2,mm1 - psrlw mm2,8 - paddusw mm3,mm1 - psrlw mm3,8 - packuswb mm2,mm3 ; mm2 = 00r2g2b2 00r1g1b1 - movq [edx-8],mm2 - - jnz .loop - - emms - ret - -;----------------------------------------------------------- -; -; BestColor_MMX -; -; Picks the closest matching color from a palette -; -; Passed FFRRGGBB and palette array in same format -; FF is the index of the first palette entry to consider -; -;----------------------------------------------------------- - -GLOBAL BestColor_MMX -GLOBAL @BestColor_MMX@8 - -BestColor_MMX: - mov ecx,[esp+4] - mov edx,[esp+8] -@BestColor_MMX@8: - pxor mm0,mm0 - movd mm1,ecx ; mm1 = color searching for - mov eax,257*257+257*257+257*257 ;eax = bestdist - push ebx - punpcklbw mm1,mm0 - mov ebx,ecx ; ebx = best color - shr ecx,24 ; ecx = count - and ebx,0xffffff - push esi - push ebp - -.loop movd mm2,[edx+ecx*4] ; mm2 = color considering now - inc ecx - punpcklbw mm2,mm0 - movq mm3,mm1 - psubsw mm3,mm2 - pmullw mm3,mm3 ; mm3 = color distance squared - - movd ebp,mm3 ; add the three components - psrlq mm3,32 ; into ebp to get the real - mov esi,ebp ; (squared) distance - shr esi,16 - and ebp,0xffff - add ebp,esi - movd esi,mm3 - add ebp,esi - - jz .perf ; found a perfect match - cmp eax,ebp - jb .skip - mov eax,ebp - lea ebx,[ecx-1] -.skip cmp ecx,256 - jne .loop - mov eax,ebx - pop ebp - pop esi - pop ebx - emms - ret - -.perf lea eax,[ecx-1] - pop ebp - pop esi - pop ebx - emms - ret diff --git a/src/asm_ia32/tmap.asm b/src/asm_ia32/tmap.asm deleted file mode 100644 index 2096b92229..0000000000 --- a/src/asm_ia32/tmap.asm +++ /dev/null @@ -1,1002 +0,0 @@ -;* -;* tmap.nas -;* The texture-mapping inner loops in pure assembly language. -;* -;*--------------------------------------------------------------------------- -;* Copyright 1998-2006 Randy Heit -;* All rights reserved. -;* -;* Redistribution and use in source and binary forms, with or without -;* modification, are permitted provided that the following conditions -;* are met: -;* -;* 1. Redistributions of source code must retain the above copyright -;* notice, this list of conditions and the following disclaimer. -;* 2. Redistributions in binary form must reproduce the above copyright -;* notice, this list of conditions and the following disclaimer in the -;* documentation and/or other materials provided with the distribution. -;* 3. The name of the author may not be used to endorse or promote products -;* derived from this software without specific prior written permission. -;* -;* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR -;* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES -;* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. -;* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, -;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT -;* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -;* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -;* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -;* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF -;* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -;*--------------------------------------------------------------------------- -;* - -BITS 32 - -%include "valgrind.inc" - -; Segment/section definition macros. - - SECTION .data - -%define SPACEFILLER4 (0x44444444) - -; If you change this in r_draw.c, be sure to change it here, too! -FUZZTABLE equ 50 - -%ifndef M_TARGET_LINUX - -%define ylookup _ylookup -%define centery _centery -%define fuzzpos _fuzzpos -%define fuzzoffset _fuzzoffset -%define NormalLight _NormalLight -%define viewheight _viewheight -%define fuzzviewheight _fuzzviewheight -%define CPU _CPU - -%define dc_pitch _dc_pitch -%define dc_colormap _dc_colormap -%define dc_color _dc_color -%define dc_iscale _dc_iscale -%define dc_texturefrac _dc_texturefrac -%define dc_srcblend _dc_srcblend -%define dc_destblend _dc_destblend -%define dc_source _dc_source -%define dc_yl _dc_yl -%define dc_yh _dc_yh -%define dc_x _dc_x -%define dc_count _dc_count -%define dc_dest _dc_dest -%define dc_destorg _dc_destorg - -%define Col2RGB8 _Col2RGB8 -%define RGB32k _RGB32k - -%define dc_ctspan _dc_ctspan -%define dc_temp _dc_temp - -%define ds_xstep _ds_xstep -%define ds_ystep _ds_ystep -%define ds_colormap _ds_colormap -%define ds_source _ds_source -%define ds_x1 _ds_x1 -%define ds_x2 _ds_x2 -%define ds_xfrac _ds_xfrac -%define ds_yfrac _ds_yfrac -%define ds_y _ds_y - -%define ds_cursource _ds_cursource -%define ds_curcolormap _ds_curcolormap - -%define R_SetSpanSource_ASM _R_SetSpanSource_ASM -%define R_SetSpanSize_ASM _R_SetSpanSize_ASM -%define R_SetSpanColormap_ASM _R_SetSpanColormap_ASM -%define R_SetupShadedCol _R_SetupShadedCol -%define R_SetupAddCol _R_SetupAddCol -%define R_SetupAddClampCol _R_SetupAddClampCol - -%endif - -EXTERN ylookup -EXTERN centery -EXTERN fuzzpos -EXTERN fuzzoffset -EXTERN NormalLight -EXTERN viewheight -EXTERN fuzzviewheight -EXTERN CPU - -EXTERN dc_pitch -EXTERN dc_colormap -EXTERN dc_color -EXTERN dc_iscale -EXTERN dc_texturefrac -EXTERN dc_srcblend -EXTERN dc_destblend -EXTERN dc_source -EXTERN dc_yl -EXTERN dc_yh -EXTERN dc_x -EXTERN dc_count -EXTERN dc_dest -EXTERN dc_destorg - -EXTERN dc_ctspan -EXTERN dc_temp - -EXTERN Col2RGB8 -EXTERN RGB32k - -EXTERN ds_xstep -EXTERN ds_ystep -EXTERN ds_colormap -EXTERN ds_source -EXTERN ds_x1 -EXTERN ds_x2 -EXTERN ds_xfrac -EXTERN ds_yfrac -EXTERN ds_y - -GLOBAL ds_cursource -GLOBAL ds_curcolormap - - -ds_cursource: - DD 0 - -ds_curcolormap: - DD 0 - - -; Local stuff: -lastAddress DD 0 -pixelcount DD 0 - - SECTION .text - - -GLOBAL @R_SetSpanSource_ASM@4 -GLOBAL R_SetSpanSource_ASM - -R_SetSpanSource_ASM: - mov ecx,[esp+4] - -@R_SetSpanSource_ASM@4: - mov [spreada+2],ecx - mov [spreadb+2],ecx - mov [spreadc+2],ecx - mov [spreadd+2],ecx - mov [spreade+2],ecx - mov [spreadf+2],ecx - mov [spreadg+2],ecx - - mov [mspreada+2],ecx - mov [mspreadb+2],ecx - mov [mspreadc+2],ecx - mov [mspreadd+2],ecx - mov [mspreade+2],ecx - mov [mspreadf+2],ecx - mov [mspreadg+2],ecx - - selfmod spreada, mspreadg+6 - - mov [ds_cursource],ecx - ret - -GLOBAL @R_SetSpanColormap_ASM@4 -GLOBAL R_SetSpanColormap_ASM - -R_SetSpanColormap_ASM: - mov ecx,[esp+4] - -@R_SetSpanColormap_ASM@4: - mov [spmapa+2],ecx - mov [spmapb+2],ecx - mov [spmapc+2],ecx - mov [spmapd+2],ecx - mov [spmape+2],ecx - mov [spmapf+2],ecx - mov [spmapg+2],ecx - - mov [mspmapa+2],ecx - mov [mspmapb+2],ecx - mov [mspmapc+2],ecx - mov [mspmapd+2],ecx - mov [mspmape+2],ecx - mov [mspmapf+2],ecx - mov [mspmapg+2],ecx - - selfmod spmapa, mspmapg+6 - - mov [ds_curcolormap],ecx - ret - -GLOBAL R_SetSpanSize_ASM - -EXTERN SetTiltedSpanSize - -R_SetSpanSize_ASM: - mov edx,[esp+4] - mov ecx,[esp+8] - call SetTiltedSpanSize - - mov [dsy1+2],dl - mov [dsy2+2],dl - - mov [dsx1+2],cl - mov [dsx2+2],cl - mov [dsx3+2],cl - mov [dsx4+2],cl - mov [dsx5+2],cl - mov [dsx6+2],cl - mov [dsx7+2],cl - - mov [dmsy1+2],dl - mov [dmsy2+2],dl - - mov [dmsx1+2],cl - mov [dmsx2+2],cl - mov [dmsx3+2],cl - mov [dmsx4+2],cl - mov [dmsx5+2],cl - mov [dmsx6+2],cl - mov [dmsx7+2],cl - - push ecx - add ecx,edx - mov eax,1 - shl eax,cl - dec eax - mov [dsm1+2],eax - mov [dsm5+1],eax - mov [dsm6+1],eax - mov [dsm7+1],eax - - mov [dmsm1+2],eax - mov [dmsm5+1],eax - mov [dmsm6+1],eax - mov [dmsm7+1],eax - pop ecx - ror eax,cl - mov [dsm2+2],eax - mov [dsm3+2],eax - mov [dsm4+2],eax - - mov [dmsm2+2],eax - mov [dmsm3+2],eax - mov [dmsm4+2],eax - and eax,0xffff - not eax - mov [dsm8+2],eax - mov [dsm9+2],eax - - mov [dmsm8+2],eax - mov [dmsm9+2],eax - - neg dl - mov [dsy3+2],dl - mov [dsy4+2],dl - - mov [dmsy3+2],dl - mov [dmsy4+2],dl - - selfmod dsy1, dmsm7+6 - -aret: ret - -%ifdef M_TARGET_MACHO - SECTION .text align=64 -%else - SECTION .rtext progbits alloc exec write align=64 -%endif - -%ifdef M_TARGET_MACHO -GLOBAL _rtext_tmap_start -_rtext_tmap_start: -%endif - -rtext_start: - -GLOBAL @R_DrawSpanP_ASM@0 -GLOBAL _R_DrawSpanP_ASM -GLOBAL R_DrawSpanP_ASM - -; eax: scratch -; ebx: zero -; ecx: yfrac at top end, xfrac int part at low end -; edx: xfrac frac part at top end -; edi: dest -; ebp: scratch -; esi: count -; [esp]: xstep -; [esp+4]: ystep - - align 16 - -@R_DrawSpanP_ASM@0: -_R_DrawSpanP_ASM: -R_DrawSpanP_ASM: - mov eax,[ds_x2] - mov ecx,[ds_x1] - sub eax,ecx - jl near rdspret ; count < 0: nothing to do, so leave - - push ebx - push edi - push ebp - push esi - sub esp, 8 - - mov edi,ecx - add edi,[dc_destorg] - mov ecx,[ds_y] - add edi,[ylookup+ecx*4] - mov edx,[ds_xstep] -dsy1: shl edx,6 - mov ebp,[ds_xstep] -dsy3: shr ebp,26 - xor ebx,ebx - lea esi,[eax+1] - mov [esp],edx - mov edx,[ds_ystep] - mov ecx,[ds_xfrac] -dsy4: shr ecx,26 -dsm8: and edx,strict dword 0xffffffc0 - or ebp,edx - mov [esp+4],ebp - mov ebp,[ds_yfrac] - mov edx,[ds_xfrac] -dsy2: shl edx,6 -dsm9: and ebp,strict dword 0xffffffc0 - or ecx,ebp - shr esi,1 - jnc dseven1 - -; do odd pixel - - mov ebp,ecx -dsx1: rol ebp,6 -dsm1: and ebp,0xfff - add edx,[esp] - adc ecx,[esp+4] -spreada mov bl,[ebp+SPACEFILLER4] -spmapa mov bl,[ebx+SPACEFILLER4] - mov [edi],bl - inc edi - -dseven1 shr esi,1 - jnc dsrest - -; do two more pixels - mov ebp,ecx - add edx,[esp] - adc ecx,[esp+4] -dsm2: and ebp,0xfc00003f -dsx2: rol ebp,6 - mov eax,ecx - add edx,[esp] - adc ecx,[esp+4] -spreadb mov bl,[ebp+SPACEFILLER4] ;read texel1 -dsx3: rol eax,6 -dsm6: and eax,0xfff -spmapb mov bl,[ebx+SPACEFILLER4] ;map texel1 - mov [edi],bl ;store texel1 - add edi,2 -spreadc mov bl,[eax+SPACEFILLER4] ;read texel2 -spmapc mov bl,[ebx+SPACEFILLER4] ;map texel2 - mov [edi-1],bl ;store texel2 - -; do the rest - -dsrest test esi,esi - jz near dsdone - - align 16 - -dsloop mov ebp,ecx -spstep1d add edx,[esp] -spstep2d adc ecx,[esp+4] -dsm3: and ebp,0xfc00003f -dsx4: rol ebp,6 - mov eax,ecx -spstep1e add edx,[esp] -spstep2e adc ecx,[esp+4] -spreadd mov bl,[ebp+SPACEFILLER4] ;read texel1 -dsx5: rol eax,6 -dsm5: and eax,0xfff -spmapd mov bl,[ebx+SPACEFILLER4] ;map texel1 - mov [edi],bl ;store texel1 - mov ebp,ecx -spreade mov bl,[eax+SPACEFILLER4] ;read texel2 -spstep1f add edx,[esp] -spstep2f adc ecx,[esp+4] -dsm4: and ebp,0xfc00003f -dsx6: rol ebp,6 -spmape mov bl,[ebx+SPACEFILLER4] ;map texel2 - mov eax,ecx - mov [edi+1],bl ;store texel2 -spreadf mov bl,[ebp+SPACEFILLER4] ;read texel3 -spmapf mov bl,[ebx+SPACEFILLER4] ;map texel3 - add edi,4 -dsx7: rol eax,6 -dsm7: and eax,0xfff - mov [edi-2],bl ;store texel3 -spreadg mov bl,[eax+SPACEFILLER4] ;read texel4 -spstep1g add edx,[esp] -spstep2g adc ecx,[esp+4] -spmapg mov bl,[ebx+SPACEFILLER4] ;map texel4 - dec esi - mov [edi-1],bl ;store texel4 - jnz near dsloop - -dsdone add esp,8 - pop esi - pop ebp - pop edi - pop ebx - -rdspret ret - -; This is the same as the previous routine, except it doesn't draw pixels -; where the texture's color value is 0. - -GLOBAL @R_DrawSpanMaskedP_ASM@0 -GLOBAL _R_DrawSpanMaskedP_ASM -GLOBAL R_DrawSpanMaskedP_ASM - -; eax: scratch -; ebx: zero -; ecx: yfrac at top end, xfrac int part at low end -; edx: xfrac frac part at top end -; edi: dest -; ebp: scratch -; esi: count -; [esp]: xstep -; [esp+4]: ystep - - align 16 - -@R_DrawSpanMaskedP_ASM@0: -_R_DrawSpanMaskedP_ASM: -R_DrawSpanMaskedP_ASM: - mov eax,[ds_x2] - mov ecx,[ds_x1] - sub eax,ecx - jl rdspret ; count < 0: nothing to do, so leave - - push ebx - push edi - push ebp - push esi - sub esp,8 - - mov edi,ecx - add edi,[dc_destorg] - mov ecx,[ds_y] - add edi,[ylookup+ecx*4] - mov edx,[ds_xstep] -dmsy1: shl edx,6 - mov ebp,[ds_xstep] -dmsy3: shr ebp,26 - xor ebx,ebx - lea esi,[eax+1] - mov [esp],edx - mov edx,[ds_ystep] - mov ecx,[ds_xfrac] -dmsy4: shr ecx,26 -dmsm8: and edx,strict dword 0xffffffc0 - or ebp,edx - mov [esp+4],ebp - mov ebp,[ds_yfrac] - mov edx,[ds_xfrac] -dmsy2: shl edx,6 -dmsm9: and ebp,strict dword 0xffffffc0 - or ecx,ebp - shr esi,1 - jnc dmseven1 - -; do odd pixel - - mov ebp,ecx -dmsx1: rol ebp,6 -dmsm1: and ebp,0xfff - add edx,[esp] - adc ecx,[esp+4] -mspreada mov bl,[ebp+SPACEFILLER4] - cmp bl,0 - je mspskipa -mspmapa mov bl,[ebx+SPACEFILLER4] - mov [edi],bl -mspskipa: inc edi - -dmseven1 shr esi,1 - jnc dmsrest - -; do two more pixels - mov ebp,ecx - add edx,[esp] - adc ecx,[esp+4] -dmsm2: and ebp,0xfc00003f -dmsx2: rol ebp,6 - mov eax,ecx - add edx,[esp] - adc ecx,[esp+4] -mspreadb mov bl,[ebp+SPACEFILLER4] ;read texel1 -dmsx3: rol eax,6 -dmsm6: and eax,0xfff - cmp bl,0 - je mspskipb -mspmapb mov bl,[ebx+SPACEFILLER4] ;map texel1 - mov [edi],bl ;store texel1 -mspskipb add edi,2 -mspreadc mov bl,[eax+SPACEFILLER4] ;read texel2 - cmp bl,0 - je dmsrest -mspmapc mov bl,[ebx+SPACEFILLER4] ;map texel2 - mov [edi-1],bl ;store texel2 - -; do the rest - -dmsrest test esi,esi - jz near dmsdone - - align 16 - -dmsloop mov ebp,ecx -mspstep1d add edx,[esp] -mspstep2d adc ecx,[esp+4] -dmsm3: and ebp,0xfc00003f -dmsx4: rol ebp,6 - mov eax,ecx -mspstep1e add edx,[esp] -mspstep2e adc ecx,[esp+4] -mspreadd mov bl,[ebp+SPACEFILLER4] ;read texel1 -dmsx5: rol eax,6 -dmsm5: and eax,0xfff - cmp bl,0 - mov ebp,ecx - je mspreade -mspmapd mov bl,[ebx+SPACEFILLER4] ;map texel1 - mov [edi],bl ;store texel1 -mspreade mov bl,[eax+SPACEFILLER4] ;read texel2 -mspstep1f add edx,[esp] -mspstep2f adc ecx,[esp+4] -dmsm4: and ebp,0xfc00003f -dmsx6: rol ebp,6 - cmp bl,0 - mov eax,ecx - je mspreadf -mspmape mov bl,[ebx+SPACEFILLER4] ;map texel2 - mov [edi+1],bl ;store texel2 -mspreadf mov bl,[ebp+SPACEFILLER4] ;read texel3 - add edi,4 -dmsx7: rol eax,6 -dmsm7: and eax,0xfff - cmp bl,0 - je mspreadg -mspmapf mov bl,[ebx+SPACEFILLER4] ;map texel3 - mov [edi-2],bl ;store texel3 -mspreadg mov bl,[eax+SPACEFILLER4] ;read texel4 -mspstep1g add edx,[esp] -mspstep2g adc ecx,[esp+4] - cmp bl,0 - je mspskipg -mspmapg mov bl,[ebx+SPACEFILLER4] ;map texel4 - mov [edi-1],bl ;store texel4 -mspskipg dec esi - jnz near dmsloop - -dmsdone add esp,8 - pop esi - pop ebp - pop edi - pop ebx - - ret - - - - -GLOBAL rt_shaded4cols_asm -GLOBAL _rt_shaded4cols_asm - -rt_shaded4cols_asm: -_rt_shaded4cols_asm: - mov ecx,[esp+8] - push ebp - mov ebp,[esp+16] - sub ebp,ecx - js near s4nil - mov eax,[ylookup+ecx*4] - add eax,[dc_destorg] ; eax = destination - push ebx - push esi - mov esi,[dc_temp] - inc ebp ; ebp = count - add eax,[esp+16] - push edi - lea esi,[esi+ecx*4] ; esi = source - - align 16 - -s4loop: movzx edx,byte [esi] - movzx ecx,byte [esi+1] -s4cm1: movzx edx,byte [SPACEFILLER4+edx] ; colormap -s4cm2: movzx edi,byte [SPACEFILLER4+ecx] ; colormap - shl edx,8 - movzx ebx,byte [eax] - shl edi,8 - movzx ecx,byte [eax+1] - sub ebx,edx - sub ecx,edi - mov ebx,[Col2RGB8+0x10000+ebx*4] - mov ecx,[Col2RGB8+0x10000+ecx*4] -s4fg1: add ebx,[SPACEFILLER4+edx*4] -s4fg2: add ecx,[SPACEFILLER4+edi*4] - or ebx,0x1f07c1f - or ecx,0x1f07c1f - mov edx,ebx - shr ebx,15 - mov edi,ecx - shr ecx,15 - and edx,ebx - and ecx,edi - mov bl,[RGB32k+edx] - movzx edx,byte [esi+2] - mov bh,[RGB32k+ecx] - movzx ecx,byte [esi+3] - mov [eax],bl - mov [eax+1],bh - -s4cm3: movzx edx,byte [SPACEFILLER4+edx] ; colormap -s4cm4: movzx edi,byte [SPACEFILLER4+ecx] ; colormap - shl edx,8 - movzx ebx,byte [eax+2] - shl edi,8 - movzx ecx,byte [eax+3] - sub ebx,edx - sub ecx,edi - mov ebx,[Col2RGB8+0x10000+ebx*4] - mov ecx,[Col2RGB8+0x10000+ecx*4] -s4fg3: add ebx,[SPACEFILLER4+edx*4] -s4fg4: add ecx,[SPACEFILLER4+edi*4] - or ebx,0x1f07c1f - or ecx,0x1f07c1f - mov edx,ebx - shr ebx,15 - mov edi,ecx - shr ecx,15 - and edx,ebx - and ecx,edi -s4p: add eax,320 ; pitch - add esi,4 - mov bl,[RGB32k+edx] - mov bh,[RGB32k+ecx] -s4p2: mov [eax-320+2],bl -s4p3: mov [eax-320+3],bh - dec ebp - jne s4loop - - pop edi - pop esi - pop ebx -s4nil: pop ebp - ret - - align 16 - -GLOBAL rt_add4cols_asm -GLOBAL _rt_add4cols_asm - -rt_add4cols_asm: -_rt_add4cols_asm: - mov ecx,[esp+8] - push edi - mov edi,[esp+16] - sub edi,ecx - js near a4nil - mov eax,[ylookup+ecx*4] - add eax,[dc_destorg] - push ebx - push esi - mov esi,[dc_temp] - push ebp - inc edi - add eax,[esp+20] - lea esi,[esi+ecx*4] - - align 16 -a4loop: - movzx ebx,byte [esi] - movzx edx,byte [esi+1] - movzx ecx,byte [eax] - movzx ebp,byte [eax+1] -a4cm1: movzx ebx,byte [SPACEFILLER4+ebx] ; colormap -a4cm2: movzx edx,byte [SPACEFILLER4+edx] ; colormap -a4bg1: mov ecx,[SPACEFILLER4+ecx*4] ; bg2rgb -a4bg2: mov ebp,[SPACEFILLER4+ebp*4] ; bg2rgb -a4fg1: add ecx,[SPACEFILLER4+ebx*4] ; fg2rgb -a4fg2: add ebp,[SPACEFILLER4+edx*4] ; fg2rgb - or ecx,0x01f07c1f - or ebp,0x01f07c1f - mov ebx,ecx - shr ecx,15 - mov edx,ebp - shr ebp,15 - and ecx,ebx - and ebp,edx - movzx ebx,byte [esi+2] - movzx edx,byte [esi+3] - mov cl,[RGB32k+ecx] - mov ch,[RGB32k+ebp] - mov [eax],cl - mov [eax+1],ch - - movzx ecx,byte [eax+2] - movzx ebp,byte [eax+3] -a4cm3: movzx ebx,byte [SPACEFILLER4+ebx] ; colormap -a4cm4: movzx edx,byte [SPACEFILLER4+edx] ; colormap -a4bg3: mov ecx,[SPACEFILLER4+ecx*4] ; bg2rgb -a4bg4: mov ebp,[SPACEFILLER4+ebp*4] ; bg2rgb -a4fg3: add ecx,[SPACEFILLER4+ebx*4] ; fg2rgb -a4fg4: add ebp,[SPACEFILLER4+edx*4] ; fg2rgb - or ecx,0x01f07c1f - or ebp,0x01f07c1f - mov ebx,ecx - shr ecx,15 - mov edx,ebp - shr ebp,15 - and ebx,ecx - and edx,ebp - mov cl,[RGB32k+ebx] - mov ch,[RGB32k+edx] - mov [eax+2],cl - mov [eax+3],ch - - add esi,4 -a4p: add eax,320 ; pitch - sub edi,1 - jne a4loop - pop ebp - pop esi - pop ebx -a4nil: pop edi - ret - - align 16 - -GLOBAL rt_addclamp4cols_asm -GLOBAL _rt_addclamp4cols_asm - -rt_addclamp4cols_asm: -_rt_addclamp4cols_asm: - mov ecx,[esp+8] - push edi - mov edi,[esp+16] - sub edi,ecx - js near ac4nil - mov eax,[ylookup+ecx*4] - add eax,[dc_destorg] - push ebx - push esi - mov esi,[dc_temp] - push ebp - inc edi - add eax,[esp+20] - lea esi,[esi+ecx*4] - push edi - - align 16 -ac4loop: - movzx ebx,byte [esi] - movzx edx,byte [esi+1] - mov [esp],edi -ac4cm1: movzx ebx,byte [SPACEFILLER4+ebx] ; colormap -ac4cm2: movzx edx,byte [SPACEFILLER4+edx] ; colormap - movzx ecx,byte [eax] - movzx ebp,byte [eax+1] -ac4fg1: mov ebx,[SPACEFILLER4+ebx*4] ; fg2rgb -ac4fg2: mov edx,[SPACEFILLER4+edx*4] ; fg2rgb -ac4bg1: add ebx,[SPACEFILLER4+ecx*4] ; bg2rgb -ac4bg2: add edx,[SPACEFILLER4+ebp*4] ; bg2rgb - mov ecx,ebx - or ebx,0x01f07c1f - and ecx,0x40100400 - and ebx,0x3fffffff - mov edi,ecx - shr ecx,5 - mov ebp,edx - sub edi,ecx - or edx,0x01f07c1f - or ebx,edi - mov ecx,ebx - shr ebx,15 - and ebp,0x40100400 - and ebx,ecx - and edx,0x3fffffff - mov edi,ebp - shr ebp,5 - mov cl,[RGB32k+ebx] - sub edi,ebp - mov [eax],cl - or edx,edi - mov ebp,edx - shr edx,15 - movzx ebx,byte [esi+2] - and ebp,edx - movzx edx,byte [esi+3] -ac4cm3: movzx ebx,byte [SPACEFILLER4+ebx] ; colormap - mov cl,[RGB32k+ebp] -ac4cm4: movzx edx,byte [SPACEFILLER4+edx] ; colormap - mov [eax+1],cl - movzx ecx,byte [eax+2] - movzx ebp,byte [eax+3] -ac4fg3: mov ebx,[SPACEFILLER4+ebx*4] ; fg2rgb -ac4fg4: mov edx,[SPACEFILLER4+edx*4] ; fg2rgb -ac4bg3: add ebx,[SPACEFILLER4+ecx*4] ; bg2rgb -ac4bg4: add edx,[SPACEFILLER4+ebp*4] ; bg2rgb - mov ecx,ebx - or ebx,0x01f07c1f - and ecx,0x40100400 - and ebx,0x3fffffff - mov edi,ecx - shr ecx,5 - mov ebp,edx - sub edi,ecx - or edx,0x01f07c1f - or ebx,edi - mov ecx,ebx - shr ebx,15 - and ebp,0x40100400 - and ebx,ecx - and edx,0x3fffffff - mov edi,ebp - shr ebp,5 - mov cl,[RGB32k+ebx] - sub edi,ebp - mov [eax+2],cl - or edx,edi - mov edi,[esp] - mov ebp,edx - shr edx,15 - add esi,4 - and edx,ebp - mov cl,[RGB32k+edx] - mov [eax+3],cl - -ac4p: add eax,320 ; pitch - sub edi,1 - jne ac4loop - pop edi - - pop ebp - pop esi - pop ebx -ac4nil: pop edi - ret - -rtext_end: -%ifdef M_TARGET_MACHO -GLOBAL _rtext_tmap_end -_rtext_tmap_end: -%endif - align 16 - -;************************ - - SECTION .text - -GLOBAL R_SetupShadedCol -GLOBAL @R_SetupShadedCol@0 - -# Patch the values of dc_colormap and dc_color into the shaded column drawer. - -R_SetupShadedCol: -@R_SetupShadedCol@0: - mov eax,[dc_colormap] - cmp [s4cm1+3],eax - je .cmdone - mov [s4cm1+3],eax - mov [s4cm2+3],eax - mov [s4cm3+3],eax - mov [s4cm4+3],eax -.cmdone mov eax,[dc_color] - lea eax,[Col2RGB8+eax*4] - cmp [s4fg1+3],eax - je .cdone - mov [s4fg1+3],eax - mov [s4fg2+3],eax - mov [s4fg3+3],eax - mov [s4fg4+3],eax - selfmod s4cm1, s4fg4+7 -.cdone ret - -GLOBAL R_SetupAddCol -GLOBAL @R_SetupAddCol@0 - -# Patch the values of dc_colormap, dc_srcblend, and dc_destblend into the -# unclamped adding column drawer. - -R_SetupAddCol: -@R_SetupAddCol@0: - mov eax,[dc_colormap] - cmp [a4cm1+3],eax - je .cmdone - mov [a4cm1+3],eax - mov [a4cm2+3],eax - mov [a4cm3+3],eax - mov [a4cm4+3],eax -.cmdone mov eax,[dc_srcblend] - cmp [a4fg1+3],eax - je .sbdone - mov [a4fg1+3],eax - mov [a4fg2+3],eax - mov [a4fg3+3],eax - mov [a4fg4+3],eax -.sbdone mov eax,[dc_destblend] - cmp [a4bg1+3],eax - je .dbdone - mov [a4bg1+3],eax - mov [a4bg2+3],eax - mov [a4bg3+3],eax - mov [a4bg4+3],eax - selfmod a4cm1, a4bg4+7 -.dbdone ret - -GLOBAL R_SetupAddClampCol -GLOBAL @R_SetupAddClampCol@0 - -# Patch the values of dc_colormap, dc_srcblend, and dc_destblend into the -# add with clamping column drawer. - -R_SetupAddClampCol: -@R_SetupAddClampCol@0: - mov eax,[dc_colormap] - cmp [ac4cm1+3],eax - je .cmdone - mov [ac4cm1+3],eax - mov [ac4cm2+3],eax - mov [ac4cm3+3],eax - mov [ac4cm4+3],eax -.cmdone mov eax,[dc_srcblend] - cmp [ac4fg1+3],eax - je .sbdone - mov [ac4fg1+3],eax - mov [ac4fg2+3],eax - mov [ac4fg3+3],eax - mov [ac4fg4+3],eax -.sbdone mov eax,[dc_destblend] - cmp [ac4bg1+3],eax - je .dbdone - mov [ac4bg1+3],eax - mov [ac4bg2+3],eax - mov [ac4bg3+3],eax - mov [ac4bg4+3],eax - selfmod ac4cm1, ac4bg4+7 -.dbdone ret - -EXTERN setvlinebpl_ -EXTERN setpitch3 - -GLOBAL @ASM_PatchPitch@0 -GLOBAL _ASM_PatchPitch -GLOBAL ASM_PatchPitch - -ASM_PatchPitch: -_ASM_PatchPitch: -@ASM_PatchPitch@0: - mov eax,[dc_pitch] - mov [s4p+1],eax - mov [a4p+1],eax - mov [ac4p+1],eax - mov ecx,eax - neg ecx - inc ecx - inc ecx - mov [s4p2+2],ecx - inc ecx - mov [s4p3+2],ecx - selfmod rtext_start, rtext_end - call setpitch3 - jmp setvlinebpl_ diff --git a/src/asm_ia32/tmap2.asm b/src/asm_ia32/tmap2.asm deleted file mode 100644 index ab1695d3cd..0000000000 --- a/src/asm_ia32/tmap2.asm +++ /dev/null @@ -1,643 +0,0 @@ -;* -;* tmap2.nas -;* The tilted plane inner loop. -;* -;*--------------------------------------------------------------------------- -;* Copyright 1998-2006 Randy Heit -;* All rights reserved. -;* -;* Redistribution and use in source and binary forms, with or without -;* modification, are permitted provided that the following conditions -;* are met: -;* -;* 1. Redistributions of source code must retain the above copyright -;* notice, this list of conditions and the following disclaimer. -;* 2. Redistributions in binary form must reproduce the above copyright -;* notice, this list of conditions and the following disclaimer in the -;* documentation and/or other materials provided with the distribution. -;* 3. The name of the author may not be used to endorse or promote products -;* derived from this software without specific prior written permission. -;* -;* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR -;* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES -;* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. -;* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, -;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT -;* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -;* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -;* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -;* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF -;* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -;*--------------------------------------------------------------------------- -;* -;* I tried doing the ROL trick that R_DrawSpanP_ASM uses, and it was -;* actually slightly slower than the more straight-forward approach -;* used here, probably because the trick requires too much setup time. -;* - -BITS 32 - -%include "valgrind.inc" - -%define SPACEFILLER4 (0x44444444) - -%ifndef M_TARGET_LINUX - -%define plane_sz _plane_sz -%define plane_su _plane_su -%define plane_sv _plane_sv -%define plane_shade _plane_shade -%define planelightfloat _planelightfloat -%define spanend _spanend -%define ylookup _ylookup -%define dc_destorg _dc_destorg -%define ds_colormap _ds_colormap -%define ds_source _ds_source -%define centery _centery -%define centerx _centerx -%define ds_curtiltedsource _ds_curtiltedsource -%define pviewx _pviewx -%define pviewy _pviewy -%define tiltlighting _tiltlighting - -%define R_DrawTiltedPlane_ASM _R_DrawTiltedPlane_ASM -%define R_SetTiltedSpanSource_ASM _R_SetTiltedSpanSource_ASM -%define R_CalcTiltedLighting _R_CalcTiltedLighting - -%endif - -EXTERN plane_sz -EXTERN plane_su -EXTERN plane_sv -EXTERN planelightfloat -EXTERN spanend -EXTERN ylookup -EXTERN dc_destorg -EXTERN ds_colormap -EXTERN centery -EXTERN centerx -EXTERN ds_source -EXTERN plane_shade -EXTERN pviewx -EXTERN pviewy -EXTERN tiltlighting -EXTERN R_CalcTiltedLighting - -GLOBAL ds_curtiltedsource - -%define sv_i plane_sv -%define sv_j plane_sv+4 -%define sv_k plane_sv+8 - -%define su_i plane_su -%define su_j plane_su+4 -%define su_k plane_su+8 - -%define sz_i plane_sz -%define sz_j plane_sz+4 -%define sz_k plane_sz+8 - -%define SPANBITS 3 - - section .bss - -start_u: resq 1 -start_v: resq 1 -step_u: resq 1 -step_v: resq 1 - -step_iz: resq 1 -step_uz: resq 1 -step_vz: resq 1 - -end_z: resd 1 - - section .data - -ds_curtiltedsource: dd SPACEFILLER4 - -fp_1: -spanrecips: dd 0x3f800000 ; 1/1 - dd 0x3f000000 ; 1/2 - dd 0x3eaaaaab ; 1/3 - dd 0x3e800000 ; 1/4 - dd 0x3e4ccccd ; 1/5 - dd 0x3e2aaaab ; 1/6 - dd 0x3e124925 ; 1/7 -fp_8recip: dd 0x3e000000 ; 1/8 - dd 0x3de38e39 ; 1/9 - dd 0x3dcccccd ; 1/10 - dd 0x3dba2e8c ; 1/11 - dd 0x3daaaaab ; 1/12 - dd 0x3d9d89d9 ; 1/13 - dd 0x3d924925 ; 1/14 - dd 0x3d888889 ; 1/15 - -fp_quickint: dd 0x3f800000 ; 1 - dd 0x40000000 ; 2 - dd 0x40400000 ; 3 - dd 0x40800000 ; 4 - dd 0x40a00000 ; 5 - dd 0x40c00000 ; 6 - dd 0x40e00000 ; 7 -fp_8: dd 0x41000000 ; 8 - - section .text - -GLOBAL R_SetTiltedSpanSource_ASM -GLOBAL @R_SetTiltedSpanSource_ASM@4 - -R_SetTiltedSpanSource_ASM: - mov ecx,[esp+4] - -@R_SetTiltedSpanSource_ASM@4: - mov [fetch1+3],ecx - mov [fetch2+3],ecx - mov [fetch3+3],ecx - mov [fetch4+3],ecx - mov [fetch5+3],ecx - mov [fetch6+3],ecx - mov [fetch7+3],ecx - mov [fetch8+3],ecx - mov [fetch9+3],ecx - mov [fetch10+3],ecx - mov [ds_curtiltedsource],ecx - selfmod rtext_start, rtext_end - ret - -GLOBAL SetTiltedSpanSize - -SetTiltedSpanSize: - push ecx - mov cl,dl - neg cl - mov eax,1 - shl eax,cl - mov cl,[esp] - neg cl - mov [x1+2],cl - mov [x2+2],cl - mov [x3+2],cl - mov [x4+2],cl - mov [x5+2],cl - mov [x6+2],cl - mov [x7+2],cl - mov [x8+2],cl - mov [x9+2],cl - mov [x10+2],cl - - sub cl,dl - dec eax - mov [y1+2],cl - mov [y2+2],cl - mov [y3+2],cl - mov [y4+2],cl - mov [y5+2],cl - mov [y6+2],cl - mov [y7+2],cl - mov [y8+2],cl - mov [y9+2],cl - mov [y10+2],cl - cmp eax,0 ; if x bits is 0, mask must be 0 too. - jz .notted - not eax -.notted: - pop ecx - - mov [m1+2],eax - mov [m2+2],eax - mov [m3+2],eax - mov [m4+2],eax - mov [m5+2],eax - mov [m6+2],eax - mov [m7+2],eax - mov [m8+2],eax - mov [m9+2],eax - mov [m10+2],eax - - selfmod rtext_start, rtext_end - - ret - -%ifndef M_TARGET_MACHO - SECTION .rtext progbits alloc exec write align=64 -%else - SECTION .text align=64 -GLOBAL _rtext_tmap2_start -_rtext_tmap2_start: -%endif - -rtext_start: - -GLOBAL R_DrawTiltedPlane_ASM -GLOBAL @R_DrawTiltedPlane_ASM@8 - -R_DrawTiltedPlane_ASM: - mov ecx,[esp+4] - mov edx,[esp+8] - - ; ecx = y - ; edx = x - -@R_DrawTiltedPlane_ASM@8: - push ebx - push esi - push edi - push ebp - - mov eax,[centery] - movzx ebx,word [spanend+ecx*2] - sub eax,ecx ; eax = centery-y - sub ebx,edx ; ebx = span length - 1 - mov edi,[ylookup+ecx*4] - push eax - add edi,[dc_destorg] - add edi,edx ; edi = frame buffer pointer - sub edx,[centerx] ; edx = x-centerx - push edx - xor eax,eax - - fild dword [esp+4] ; ymul - fild dword [esp] ; xmul | ymul - fld dword [sv_j] ; sv.j | xmul | ymul - fmul st0,st2 ; sv.j*ymul | xmul | ymul - fld dword [su_j] ; su.j | sv.j*ymul | xmul | ymul - fmul st0,st3 ; su.j*ymul | sv.j*ymul | xmul | ymul - fld dword [sz_j] ; sz.j | su.j*ymul | sv.j*ymul | xmul | ymul - fmulp st4,st0 ; su.j*ymul | sv.j*ymul | xmul | sz.j*ymul - fld dword [sv_i] ; sv.i | su.j*ymul | sv.j*ymul | xmul | sz.j*ymul - fmul st0,st3 ; sv.i*xmul | su.j*ymul | sv.j*ymul | xmul | sz.j*ymul - fld dword [su_i] ; su.i | sv.i*xmul | su.j*ymul | sv.j*ymul | xmul | sz.j*ymul - fmul st0,st4 ; su.i*xmul | sv.i*xmul | su.j*ymul | sv.j*ymul | xmul | sz.j*ymul - fld dword [sz_i] ; sz.i | su.i*xmul | sv.i*xmul | su.j*ymul | sv.j*ymul | xmul | sz.j*ymul - fmulp st5,st0 ; su.i*xmul | sv.i*xmul | su.j*ymul | sv.j*ymul | sz.i*xmul | sz.j*ymul - fxch st1 ; sv.i*xmul | su.i*xmul | su.j*ymul | sv.j*ymul | sz.i*xmul | sz.j*ymul - faddp st3,st0 ; su.i*xmul | su.j*ymul | sv.i*xmul+sv.j*ymul | sz.i*xmul | sz.j*ymul - faddp st1,st0 ; su.i*xmul+su.j*ymul | sv.i*xmul+sv.j*ymul | sz.i*xmul | sz.j*ymul - fxch st3 ; sz.j*ymul | sv.i*xmul+sv.j*ymul | sz.i*xmul | su.i*xmul+su.j*ymul - faddp st2,st0 ; sv.i*xmul+sv.j*ymul | sz.i*xmul+sz.j*ymul | su.i*xmul+su.j*ymul - fadd dword [sv_k] ; v/z | sz.i*xmul+sz.j*ymul | su.i*xmul+su.j*ymul - fxch st1 ; sz.i*xmul+sz.j*ymul | v/z | su.i*xmul+su.j*ymul - fadd dword [sz_k] ; 1/z | v/z | su.i*xmul+su.j*ymul - fxch st2 ; su.i*xmul+su.j*ymul | v/z | 1/z - fadd dword [su_k] ; u/z | v/z | 1/z - fxch st2 ; 1/z | v/z | u/z - fxch st1 ; v/z | 1/z | u/z - -; if lighting is on, fill out the light table - mov al,[plane_shade] - test al,al - jz .litup - - push ebx - fild dword [esp] ; width | v/z | 1/z | u/z - fmul dword [sz_i] ; width*sz.i | v/z | 1/z | u/z - fadd st0,st2 ; 1/endz | v/z | 1/z | u/z - fld st2 ; 1/z | 1/endz | v/z | 1/z | u/z - fmul dword [planelightfloat] - fxch st1 - fmul dword [planelightfloat] - sub esp,16 - fstp qword [esp] - fstp qword [esp+8] - call R_CalcTiltedLighting - add esp, 20 - xor eax, eax - -.litup add esp, 8 - -; calculate initial z, u, and v values - fld st1 ; 1/z | v/z | 1/z | u/z - fdivr dword [fp_1] ; z | v/z | 1/z | u/z - - fld st3 ; u/z | z | v/z | 1/z | u/z - fmul st0,st1 ; u | z | v/z | 1/z | u/z - fld st2 ; v/z | u | z | v/z | 1/z | u/z - fmulp st2,st0 ; u | v | v/z | 1/z | u/z - fld st0 - fistp qword [start_u] - fld st1 - fistp qword [start_v] - - cmp ebx,7 ; Do we have at least 8 pixels to plot? - jl near ShortStrip - -; yes, we do, so figure out tex coords at end of this span - -; multiply i values by span length (8) - fld dword [su_i] ; su.i - fmul dword [fp_8] ; su.i*8 - fld dword [sv_i] ; sv.i | su.i*8 - fmul dword [fp_8] ; sv.i*8 | su.i*8 - fld dword [sz_i] ; sz.i | sv.i*8 | su.i*8 - fmul dword [fp_8] ; sz.i*8 | sv.i*8 | su.i*8 - fxch st2 ; su.i*8 | sv.i*8 | sz.i*8 - fstp qword [step_uz] ; sv.i*8 | sz.i*8 - fstp qword [step_vz] ; sz.i*8 - fst qword [step_iz] ; sz.i*8 - -; find tex coords at start of next span - faddp st4 - fld qword [step_vz] - faddp st3 - fld qword [step_uz] - faddp st5 - - fld st3 ; 1/z | u | v | v/z | 1/z | u/z - fdivr dword [fp_1] ; z | u | v | v/z | 1/z | u/z - fst dword [end_z] - fld st5 ; u/z | z | u | v | v/z | 1/z | u/z - fmul st0,st1 ; u' | z | u | v | v/z | 1/z | u/z - fxch st1 ; z | u' | u | v | v/z | 1/z | u/z - fmul st0,st4 ; v' | u' | u | v | v/z | 1/z | u/z - fxch st3 ; v | u' | u | v' | v/z | 1/z | u/z - -; now subtract to get stepping values for this span - fsubr st0,st3 ; v'-v | u' | u | v' | v/z | 1/z | u/z - fxch st2 ; u | u' | v'-v | v' | v/z | 1/z | u/z - fsubr st0,st1 ; u'-u | u' | v'-v | v' | v/z | 1/z | u/z - fxch st2 ; v'-v | u' | u'-u | v' | v/z | 1/z | u/z - fmul dword [fp_8recip] ; vstep | u' | u'-u | v' | v/z | 1/z | u/z - fxch st1 ; u' | vstep | u'-u | v' | v/z | 1/z | u/z - fxch st2 ; u'-u | vstep | u' | v' | v/z | 1/z | u/z - fmul dword [fp_8recip] ; ustep | vstep | u' | v' | v/z | 1/z | u/z - fxch st1 ; vstep | ustep | u' | v' | v/z | 1/z | u/z - fistp qword [step_v] ; ustep | u' | v' | v/z | 1/z | u/z - fistp qword [step_u] ; u | v | v/z | 1/z | u/z - -FullSpan: - xor eax,eax - cmp ebx,15 ; is there another complete span after this one? - jl NextIsShort - -; there is a complete span after this one - fld qword [step_iz] - faddp st4,st0 - fld qword [step_vz] - faddp st3,st0 - fld qword [step_uz] - faddp st5,st0 - jmp StartDiv - -NextIsShort: - cmp ebx,8 ; if next span is no more than 1 pixel, then we already - jle DrawFullSpan ; know everything we need to draw it - - fld dword [sz_i] ; sz.i | u | v | v/z | 1/z | u/z - fmul dword [fp_quickint-8*4+ebx*4] - fld dword [sv_i] ; sv.i | sz.i | u | v | v/z | 1/z | u/z - fmul dword [fp_quickint-8*4+ebx*4] - fld dword [su_i] ; su.i | sv.i | sz.i | u | v | v/z | 1/z | u/z - fmul dword [fp_quickint-8*4+ebx*4] - fxch st2 ; sz.i | sv.i | su.i | u | v | v/z | 1/z | u/z - faddp st6,st0 ; sv.i | su.i | u | v | v/z | 1/z | u/z - faddp st4,st0 ; su.i | u | v | v/z | 1/z | u/z - faddp st5,st0 ; u | v | v/z | 1/z | u/z - -StartDiv: - fld st3 ; 1/z | u | v | v/z | 1/z | u/z - fdivr dword [fp_1] ; z | u | v | v/z | 1/z | u/z - -DrawFullSpan: - mov ecx,[start_v] - mov edx,[start_u] - - add ecx,[pviewy] - add edx,[pviewx] - - mov esi,edx - mov ebp,ecx -x1 shr ebp,26 -m1 and esi,0xfc000000 -y1 shr esi,20 - add ecx,[step_v] - add edx,[step_u] -fetch1 mov al,[ebp+esi+SPACEFILLER4] - mov ebp,[tiltlighting+ebx*4] - mov esi,edx - mov al,[ebp+eax] - mov ebp,ecx - mov [edi+0],al - -x2 shr ebp,26 -m2 and esi,0xfc000000 -y2 shr esi,20 - add ecx,[step_v] - add edx,[step_u] -fetch2 mov al,[ebp+esi+SPACEFILLER4] - mov ebp,[tiltlighting+ebx*4-4] - mov esi,edx - mov al,[ebp+eax] - mov ebp,ecx - mov [edi+1],al - -x3 shr ebp,26 -m3 and esi,0xfc000000 -y3 shr esi,20 - add ecx,[step_v] - add edx,[step_u] -fetch3 mov al,[ebp+esi+SPACEFILLER4] - mov ebp,[tiltlighting+ebx*4-8] - mov esi,edx - mov al,[ebp+eax] - mov ebp,ecx - mov [edi+2],al - -x4 shr ebp,26 -m4 and esi,0xfc000000 -y4 shr esi,20 - add ecx,[step_v] - add edx,[step_u] -fetch4 mov al,[ebp+esi+SPACEFILLER4] - mov ebp,[tiltlighting+ebx*4-12] - mov esi,edx - mov al,[ebp+eax] - mov ebp,ecx - mov [edi+3],al - -x5 shr ebp,26 -m5 and esi,0xfc000000 -y5 shr esi,20 - add ecx,[step_v] - add edx,[step_u] -fetch5 mov al,[ebp+esi+SPACEFILLER4] - mov ebp,[tiltlighting+ebx*4-16] - mov esi,edx - mov al,[ebp+eax] - mov ebp,ecx - mov [edi+4],al - -x6 shr ebp,26 -m6 and esi,0xfc000000 -y6 shr esi,20 - add ecx,[step_v] - add edx,[step_u] -fetch6 mov al,[ebp+esi+SPACEFILLER4] - mov ebp,[tiltlighting+ebx*4-20] - mov esi,edx - mov al,[ebp+eax] - mov ebp,ecx - mov [edi+5],al - -x7 shr ebp,26 -m7 and esi,0xfc000000 -y7 shr esi,20 - add ecx,[step_v] - add edx,[step_u] -fetch7 mov al,[ebp+esi+SPACEFILLER4] - mov ebp,[tiltlighting+ebx*4-24] -x8 shr ecx,26 - mov al,[ebp+eax] -m8 and edx,0xfc000000 - mov [edi+6],al - -y8 shr edx,20 - mov ebp,[tiltlighting+ebx*4-28] -fetch8 mov al,[edx+ecx+SPACEFILLER4] - mov al,[ebp+eax] - mov [edi+7],al - add edi,8 - - sub ebx,8 - jl near Done - - fld st1 - fistp qword [start_u] - fld st2 - fistp qword [start_v] - - cmp ebx,7 - jl near EndIsShort - - fst dword [end_z] - fld st5 ; u/z | z | u | v | v/z | 1/z | u/z - fmul st0,st1 ; u' | z | u | v | v/z | 1/z | u/z - fxch st1 ; z | u' | u | v | v/z | 1/z | u/z - fmul st0,st4 ; v' | u' | u | v | v/z | 1/z | u/z - fxch st3 ; v | u' | u | v' | v/z | 1/z | u/z - fsubr st0,st3 ; v'-v | u' | u | v' | v/z | 1/z | u/z - fxch st2 ; u | u' | v'-v | v' | v/z | 1/z | u/z - fsubr st0,st1 ; u'-u | u' | v'-v | v' | v/z | 1/z | u/z - fxch st2 ; v'-v | u' | u'-u | v' | v/z | 1/z | u/z - fmul dword [fp_8recip] ; vstep | u' | u'-u | v' | v/z | 1/z | u/z - fxch st1 ; u' | vstep | u'-u | v' | v/z | 1/z | u/z - fxch st2 ; u'-u | vstep | u' | v' | v/z | 1/z | u/z - fmul dword [fp_8recip] ; ustep | vstep | u' | v' | v/z | 1/z | u/z - fxch st1 ; vstep | ustep | u' | v' | v/z | 1/z | u/z - fistp qword [step_v] ; ustep | u' | v' | v/z | 1/z | u/z - fistp qword [step_u] ; u | v | v/z | 1/z | u/z - jmp FullSpan - -OnlyOnePixelAtEnd: - fld st0 - fistp qword [start_u] - fld st1 - fistp qword [start_v] - -OnlyOnePixel: - mov edx,[start_v] - mov ecx,[start_u] - add edx,[pviewy] - add ecx,[pviewx] -x9 shr edx,26 -m9 and ecx,0xfc000000 -y9 shr ecx,20 - mov ebp,[tiltlighting] -fetch9 mov al,[ecx+edx+SPACEFILLER4] - mov al,[ebp+eax] - mov [edi],al - -Done: - fcompp - fcompp - fstp st0 - - pop ebp - pop edi - pop esi - pop ebx - ret - -ShortStrip: - cmp ebx,0 - jle near OnlyOnePixel - -MoreThanOnePixel: - fld dword [sz_i] ; sz.i | u | v | v/z | 1/z | u/z - fmul dword [fp_quickint+ebx*4] - fld dword [sv_i] ; sv.i | sz.i | u | v | v/z | 1/z | u/z - fmul dword [fp_quickint+ebx*4] - fld dword [su_i] ; su.i | sv.i | sz.i | u | v | v/z | 1/z | u/z - fmul dword [fp_quickint+ebx*4] - fxch st2 ; sz.i | sv.i | su.i | u | v | v/z | 1/z | u/z - faddp st6,st0 ; sv.i | su.i | u | v | v/z | 1/z | u/z - faddp st4,st0 ; su.i | u | v | v/z | 1/z | u/z - faddp st5,st0 ; u | v | v/z | 1/z | u/z - fld st3 ; 1/z | u | v | v/z | 1/z | u/z - fdivr dword [fp_1] ; z | u | v | v/z | 1/z | u/z - jmp CalcPartialSteps - -EndIsShort: - cmp ebx,0 - je near OnlyOnePixelAtEnd - -CalcPartialSteps: - fst dword [end_z] - fld st5 ; u/z | z | u | v | v/z | 1/z | u/z - fmul st0,st1 ; u' | z | u | v | v/z | 1/z | u/z - fxch st1 ; z | u' | u | v | v/z | 1/z | u/z - fmul st0,st4 ; v' | u' | u | v | v/z | 1/z | u/z - fxch st1 ; u' | v' | u | v | v/z | 1/z | u/z - fsubrp st2,st0 ; v' | u'-u | v | v/z | 1/z | u/z - fsubrp st2,st0 ; u'-u | v'-v | v/z | 1/z | u/z - fmul dword [spanrecips+ebx*4] ;ustep | v'-v | v/z | 1/z | u/z - fxch st1 ; v'-v | ustep | v/z | 1/z | u/z - fmul dword [spanrecips+ebx*4] ;vstep | ustep | v/z | 1/z | u/z - fxch st1 ; ustep | vstep | v/z | 1/z | u/z - fistp qword [step_u] ; vstep | v/z | 1/z | u/z - fistp qword [step_v] ; v/z | 1/z | u/z - - mov ecx,[start_v] - mov edx,[start_u] - - add ecx,[pviewy] - add edx,[pviewx] - - mov esi,edx - mov ebp,ecx -endloop: -x10 shr ebp,26 -m10 and esi,0xfc000000 - -y10 shr esi,20 - inc edi - - add ecx,[step_v] - add edx,[step_u] - -fetch10 mov al,[ebp+esi+SPACEFILLER4] - mov ebp,[tiltlighting+ebx*4] - - mov esi,edx - dec ebx - - mov al,[ebp+eax] - mov ebp,ecx - - mov [edi-1],al - jge endloop - - fcompp - fstp st0 - - pop ebp - pop edi - pop esi - pop ebx - ret - -rtext_end: -%ifdef M_TARGET_MACHO -GLOBAL _rtext_tmap2_end -_rtext_tmap2_end: -%endif diff --git a/src/asm_ia32/tmap3.asm b/src/asm_ia32/tmap3.asm deleted file mode 100644 index bafc33627f..0000000000 --- a/src/asm_ia32/tmap3.asm +++ /dev/null @@ -1,344 +0,0 @@ -%include "valgrind.inc" - -%ifdef M_TARGET_WATCOM - SEGMENT DATA PUBLIC ALIGN=16 CLASS=DATA USE32 - SEGMENT DATA -%else - SECTION .data -%endif - -%ifndef M_TARGET_LINUX -%define ylookup _ylookup -%define vplce _vplce -%define vince _vince -%define palookupoffse _palookupoffse -%define bufplce _bufplce -%define dc_iscale _dc_iscale -%define dc_colormap _dc_colormap -%define dc_count _dc_count -%define dc_dest _dc_dest -%define dc_source _dc_source -%define dc_texturefrac _dc_texturefrac -%define dc_pitch _dc_pitch - -%define setupvlinetallasm _setupvlinetallasm -%define vlinetallasm4 _vlinetallasm4 -%define vlinetallasmathlon4 _vlinetallasmathlon4 -%define vlinetallasm1 _vlinetallasm1 -%define prevlinetallasm1 _prevlinetallasm1 -%endif - -EXTERN vplce -EXTERN vince -EXTERN palookupoffse -EXTERN bufplce - -EXTERN ylookup -EXTERN dc_iscale -EXTERN dc_colormap -EXTERN dc_count -EXTERN dc_dest -EXTERN dc_source -EXTERN dc_texturefrac -EXTERN dc_pitch - -GLOBAL vlt4pitch -GLOBAL vlt1pitch - -%ifdef M_TARGET_WATCOM - SEGMENT CODE PUBLIC ALIGN=16 CLASS=CODE USE32 - SEGMENT CODE -%else - SECTION .text -%endif - -ALIGN 16 -GLOBAL setpitch3 -setpitch3: - mov [vltpitch+2], eax - mov [vltpitcha+2],eax - mov [vlt1pitch1+2], eax - mov [vlt1pitch2+2], eax - selfmod vltpitch, vlt1pitch2+6 - ret - -ALIGN 16 -GLOBAL setupvlinetallasm -setupvlinetallasm: - mov ecx, [esp+4] - mov [shifter1+2], cl - mov [shifter2+2], cl - mov [shifter3+2], cl - mov [shifter4+2], cl - mov [shifter1a+2], cl - mov [shifter2a+2], cl - mov [shifter3a+2], cl - mov [shifter4a+2], cl - mov [preshift+2], cl - mov [shift11+2], cl - mov [shift12+2], cl - selfmod shifter1, shift12+6 - ret - -%ifdef M_TARGET_MACHO - SECTION .text align=64 -GLOBAL _rtext_tmap3_start -_rtext_tmap3_start: -%else - SECTION .rtext progbits alloc exec write align=64 -%endif - -ALIGN 16 - -GLOBAL vlinetallasm4 -vlinetallasm4: - push ebx - mov eax, [bufplce+0] - mov ebx, [bufplce+4] - mov ecx, [bufplce+8] - mov edx, [bufplce+12] - mov [source1+3], eax - mov [source2+3], ebx - mov [source3+3], ecx - mov [source4+3], edx - mov eax, [palookupoffse+0] - mov ebx, [palookupoffse+4] - mov ecx, [palookupoffse+8] - mov edx, [palookupoffse+12] - mov [lookup1+2], eax - mov [lookup2+2], ebx - mov [lookup3+2], ecx - mov [lookup4+2], edx - mov eax, [vince+0] - mov ebx, [vince+4] - mov ecx, [vince+8] - mov edx, [vince+12] - mov [step1+2], eax - mov [step2+2], ebx - mov [step3+2], ecx - mov [step4+1], edx - push ebp - push esi - push edi - mov ecx, [dc_count] - mov edi, [dc_dest] - mov eax, dword [ylookup+ecx*4-4] - add eax, edi - sub edi, eax - mov [write1+2],eax - inc eax - mov [write2+2],eax - inc eax - mov [write3+2],eax - inc eax - mov [write4+2],eax - mov ebx, [vplce] - mov ecx, [vplce+4] - mov esi, [vplce+8] - mov eax, [vplce+12] - selfmod loopit, vltpitch - jmp loopit - -ALIGN 16 -loopit: - mov edx, ebx -shifter1: shr edx, 24 -source1: movzx edx, BYTE [edx+0x88888888] -lookup1: mov dl, [edx+0x88888888] -write1: mov [edi+0x88888880], dl -step1: add ebx, 0x88888888 - mov edx, ecx -shifter2: shr edx, 24 -source2: movzx edx, BYTE [edx+0x88888888] -lookup2: mov dl, [edx+0x88888888] -write2: mov [edi+0x88888881], dl -step2: add ecx, 0x88888888 - mov edx, esi -shifter3: shr edx, 24 -source3: movzx edx, BYTE [edx+0x88888888] -lookup3: mov dl, BYTE [edx+0x88888888] -write3: mov [edi+0x88888882], dl -step3: add esi, 0x88888888 - mov edx, eax -shifter4: shr edx, 24 -source4: movzx edx, BYTE [edx+0x88888888] -lookup4: mov dl, [edx+0x88888888] -write4: mov [edi+0x88888883], dl -step4: add eax, 0x88888888 -vltpitch: add edi, 320 - jle near loopit - - mov [vplce], ebx - mov [vplce+4], ecx - mov [vplce+8], esi - mov [vplce+12], eax - - pop edi - pop esi - pop ebp - pop ebx - - ret - - ALIGN 16 - -GLOBAL vlinetallasmathlon4 -vlinetallasmathlon4: - push ebx - mov eax, [bufplce+0] - mov ebx, [bufplce+4] - mov ecx, [bufplce+8] - mov edx, [bufplce+12] - mov [source1a+3], eax - mov [source2a+3], ebx - mov [source3a+3], ecx - mov [source4a+3], edx - mov eax, [palookupoffse+0] - mov ebx, [palookupoffse+4] - mov ecx, [palookupoffse+8] - mov edx, [palookupoffse+12] - mov [lookup1a+2], eax - mov [lookup2a+2], ebx - mov [lookup3a+2], ecx - mov [lookup4a+2], edx - mov eax, [vince+0] - mov ebx, [vince+4] - mov ecx, [vince+8] - mov edx, [vince+12] - mov [step1a+2], eax - mov [step2a+2], ebx - mov [step3a+2], ecx - mov [step4a+1], edx - push ebp - push esi - push edi - mov ecx, [dc_count] - mov edi, [dc_dest] - mov eax, dword [ylookup+ecx*4-4] - add eax, edi - sub edi, eax - mov [write1a+2],eax - inc eax - mov [write2a+2],eax - inc eax - mov [write3a+2],eax - inc eax - mov [write4a+2],eax - mov ebp, [vplce] - mov ecx, [vplce+4] - mov esi, [vplce+8] - mov eax, [vplce+12] - selfmod loopita, vltpitcha - jmp loopita - -; Unfortunately, this code has not been carefully analyzed to determine -; how well it utilizes the processor's instruction units. Instead, I just -; kept rearranging code, seeing what sped it up and what slowed it down -; until I arrived at this. The is the fastest version I was able to -; manage, but that does not mean it cannot be made faster with careful -; instructing shuffling. - - ALIGN 64 - -loopita: mov edx, ebp - mov ebx, ecx -shifter1a: shr edx, 24 -shifter2a: shr ebx, 24 -source1a: movzx edx, BYTE [edx+0x88888888] -source2a: movzx ebx, BYTE [ebx+0x88888888] -step1a: add ebp, 0x88888888 -step2a: add ecx, 0x88888888 -lookup1a: mov dl, [edx+0x88888888] -lookup2a: mov dh, [ebx+0x88888888] - mov ebx, esi -write1a: mov [edi+0x88888880], dl -write2a: mov [edi+0x88888881], dh -shifter3a: shr ebx, 24 - mov edx, eax -source3a: movzx ebx, BYTE [ebx+0x88888888] -shifter4a: shr edx, 24 -step3a: add esi, 0x88888888 -source4a: movzx edx, BYTE [edx+0x88888888] -step4a: add eax, 0x88888888 -lookup3a: mov bl, [ebx+0x88888888] -lookup4a: mov dl, [edx+0x88888888] -write3a: mov [edi+0x88888882], bl -write4a: mov [edi+0x88888883], dl -vltpitcha: add edi, 320 - jle near loopita - - mov [vplce], ebp - mov [vplce+4], ecx - mov [vplce+8], esi - mov [vplce+12], eax - - pop edi - pop esi - pop ebp - pop ebx - - ret - -ALIGN 16 -GLOBAL prevlinetallasm1 -prevlinetallasm1: - mov ecx, [dc_count] - cmp ecx, 1 - ja vlinetallasm1 - - mov eax, [dc_iscale] - mov edx, [dc_texturefrac] - add eax, edx - mov ecx, [dc_source] -preshift: shr edx, 16 - push ebx - push edi - mov edi, [dc_colormap] - movzx ebx, byte [ecx+edx] - mov ecx, [dc_dest] - mov bl, byte [edi+ebx] - pop edi - mov byte [ecx], bl - pop ebx - ret - -ALIGN 16 -GLOBAL vlinetallasm1 -vlinetallasm1: - push ebp - push ebx - push edi - push esi - - mov ebp, [dc_count] - mov ebx, [dc_texturefrac] ; ebx = frac - mov edi, [dc_dest] - mov ecx, ebx -shift11: shr ecx, 16 - mov esi, [dc_source] - mov edx, [dc_iscale] -vlt1pitch1: sub edi, 0x88888888 - mov eax, [dc_colormap] - -loop2: - movzx ecx, BYTE [esi+ecx] - add ebx, edx -vlt1pitch2: add edi, 0x88888888 - mov cl,[eax+ecx] - mov [edi],cl - mov ecx,ebx -shift12: shr ecx,16 - dec ebp - jnz loop2 - - mov eax,ebx - pop esi - pop edi - pop ebx - pop ebp - ret - -%ifdef M_TARGET_MACHO -GLOBAL _rtext_tmap3_end -_rtext_tmap3_end: -%endif diff --git a/src/asm_x86_64/tmap3.asm b/src/asm_x86_64/tmap3.asm deleted file mode 100644 index e0f568fea1..0000000000 --- a/src/asm_x86_64/tmap3.asm +++ /dev/null @@ -1,150 +0,0 @@ -%ifnidn __OUTPUT_FORMAT__,win64 -%error tmap3.asm is for Win64 output. You should use tmap.s for other systems. -%endif - -BITS 64 -DEFAULT REL - -EXTERN vplce -EXTERN vince -EXTERN palookupoffse -EXTERN bufplce - -EXTERN dc_count -EXTERN dc_dest -EXTERN dc_pitch - -SECTION .text - -GLOBAL ASM_PatchPitch -ASM_PatchPitch: - mov ecx, [dc_pitch] - mov [pm+3], ecx - mov [vltpitch+3], ecx - ret - align 16 - -GLOBAL setupvlinetallasm -setupvlinetallasm: - mov [shifter1+2], cl - mov [shifter2+2], cl - mov [shifter3+2], cl - mov [shifter4+2], cl - ret - align 16 - -; Yasm can't do progbits alloc exec for win64? -; Hmm, looks like it's automatic. No worries, then. -SECTION .rtext write ;progbits alloc exec - -GLOBAL vlinetallasm4 -PROC_FRAME vlinetallasm4 - rex_push_reg rbx - push_reg rdi - push_reg r15 - push_reg r14 - push_reg r13 - push_reg r12 - push_reg rbp - push_reg rsi - alloc_stack 8 ; Stack must be 16-byte aligned -END_PROLOGUE -; rax = bufplce base address -; rbx = -; rcx = offset from rdi/count (negative) -; edx/rdx = scratch -; rdi = bottom of columns to write to -; r8d-r11d = column offsets -; r12-r15 = palookupoffse[0] - palookupoffse[4] - - mov ecx, [dc_count] - mov rdi, [dc_dest] - test ecx, ecx - jle vltepilog ; count must be positive - - mov rax, [bufplce] - mov r8, [bufplce+8] - sub r8, rax - mov r9, [bufplce+16] - sub r9, rax - mov r10, [bufplce+24] - sub r10, rax - mov [source2+4], r8d - mov [source3+4], r9d - mov [source4+4], r10d - -pm: imul rcx, 320 - - mov r12, [palookupoffse] - mov r13, [palookupoffse+8] - mov r14, [palookupoffse+16] - mov r15, [palookupoffse+24] - - mov r8d, [vince] - mov r9d, [vince+4] - mov r10d, [vince+8] - mov r11d, [vince+12] - mov [step1+3], r8d - mov [step2+3], r9d - mov [step3+3], r10d - mov [step4+3], r11d - - add rdi, rcx - neg rcx - - mov r8d, [vplce] - mov r9d, [vplce+4] - mov r10d, [vplce+8] - mov r11d, [vplce+12] - jmp loopit - -ALIGN 16 -loopit: - mov edx, r8d -shifter1: shr edx, 24 -step1: add r8d, 0x88888888 - movzx edx, BYTE [rax+rdx] - mov ebx, r9d - mov dl, [r12+rdx] -shifter2: shr ebx, 24 -step2: add r9d, 0x88888888 -source2: movzx ebx, BYTE [rax+rbx+0x88888888] - mov ebp, r10d - mov bl, [r13+rbx] -shifter3: shr ebp, 24 -step3: add r10d, 0x88888888 -source3: movzx ebp, BYTE [rax+rbp+0x88888888] - mov esi, r11d - mov bpl, BYTE [r14+rbp] -shifter4: shr esi, 24 -step4: add r11d, 0x88888888 -source4: movzx esi, BYTE [rax+rsi+0x88888888] - mov [rdi+rcx], dl - mov [rdi+rcx+1], bl - mov sil, BYTE [r15+rsi] - mov [rdi+rcx+2], bpl - mov [rdi+rcx+3], sil - -vltpitch: add rcx, 320 - jl loopit - - mov [vplce], r8d - mov [vplce+4], r9d - mov [vplce+8], r10d - mov [vplce+12], r11d - -vltepilog: - add rsp, 8 - pop rsi - pop rbp - pop r12 - pop r13 - pop r14 - pop r15 - pop rdi - pop rbx - ret -vlinetallasm4_end: -ENDPROC_FRAME - ALIGN 16 - diff --git a/src/asm_x86_64/tmap3.s b/src/asm_x86_64/tmap3.s deleted file mode 100644 index 867d11c759..0000000000 --- a/src/asm_x86_64/tmap3.s +++ /dev/null @@ -1,141 +0,0 @@ -#%include "valgrind.inc" - - .section .text - -.globl ASM_PatchPitch -ASM_PatchPitch: - movl dc_pitch(%rip), %ecx - movl %ecx, pm+3(%rip) - movl %ecx, vltpitch+3(%rip) -# selfmod pm, vltpitch+6 - ret - .align 16 - -.globl setupvlinetallasm -setupvlinetallasm: - movb %dil, shifter1+2(%rip) - movb %dil, shifter2+2(%rip) - movb %dil, shifter3+2(%rip) - movb %dil, shifter4+2(%rip) -# selfmod shifter1, shifter4+3 - ret - .align 16 - - .section .rtext,"awx" - -.globl vlinetallasm4 - .type vlinetallasm4,@function -vlinetallasm4: - .cfi_startproc - push %rbx - push %rdi - push %r15 - push %r14 - push %r13 - push %r12 - push %rbp - push %rsi - subq $8, %rsp # Does the stack need to be 16-byte aligned for Linux? - .cfi_adjust_cfa_offset 8 - -# rax = bufplce base address -# rbx = -# rcx = offset from rdi/count (negative) -# edx/rdx = scratch -# rdi = bottom of columns to write to -# r8d-r11d = column offsets -# r12-r15 = palookupoffse[0] - palookupoffse[4] - - movl dc_count(%rip), %ecx - movq dc_dest(%rip), %rdi - testl %ecx, %ecx - jle vltepilog # count must be positive - - movq bufplce(%rip), %rax - movq bufplce+8(%rip), %r8 - subq %rax, %r8 - movq bufplce+16(%rip), %r9 - subq %rax, %r9 - movq bufplce+24(%rip), %r10 - subq %rax, %r10 - movl %r8d, source2+4(%rip) - movl %r9d, source3+4(%rip) - movl %r10d, source4+4(%rip) - -pm: imulq $320, %rcx - - movq palookupoffse(%rip), %r12 - movq palookupoffse+8(%rip), %r13 - movq palookupoffse+16(%rip), %r14 - movq palookupoffse+24(%rip), %r15 - - movl vince(%rip), %r8d - movl vince+4(%rip), %r9d - movl vince+8(%rip), %r10d - movl vince+12(%rip), %r11d - movl %r8d, step1+3(%rip) - movl %r9d, step2+3(%rip) - movl %r10d, step3+3(%rip) - movl %r11d, step4+3(%rip) - - addq %rcx, %rdi - negq %rcx - - movl vplce(%rip), %r8d - movl vplce+4(%rip), %r9d - movl vplce+8(%rip), %r10d - movl vplce+12(%rip), %r11d -# selfmod loopit, vltepilog - jmp loopit - - .align 16 -loopit: - movl %r8d, %edx -shifter1: shrl $24, %edx -step1: addl $0x44444444, %r8d - movzbl (%rax,%rdx), %edx - movl %r9d, %ebx - movb (%r12,%rdx), %dl -shifter2: shrl $24, %ebx -step2: addl $0x44444444, %r9d -source2: movzbl 0x44444444(%rax,%rbx), %ebx - movl %r10d, %ebp - movb (%r13,%rbx), %bl -shifter3: shr $24, %ebp -step3: addl $0x44444444, %r10d -source3: movzbl 0x44444444(%rax,%rbp), %ebp - movl %r11d, %esi - movb (%r14,%rbp), %bpl -shifter4: shr $24, %esi -step4: add $0x44444444, %r11d -source4: movzbl 0x44444444(%rax,%rsi), %esi - movb %dl, (%rdi,%rcx) - movb %bl, 1(%rdi,%rcx) - movb (%r15,%rsi), %sil - movb %bpl, 2(%rdi,%rcx) - movb %sil, 3(%rdi,%rcx) - -vltpitch: addq $320, %rcx - jl loopit - - movl %r8d, vplce(%rip) - movl %r9d, vplce+4(%rip) - movl %r10d, vplce+8(%rip) - movl %r11d, vplce+12(%rip) - -vltepilog: - addq $8, %rsp - .cfi_adjust_cfa_offset -8 - pop %rsi - pop %rbp - pop %r12 - pop %r13 - pop %r14 - pop %r15 - pop %rdi - pop %rbx - ret - .cfi_endproc - .align 16 - -