- complete removal of assembly stuff. Nothing of this gets used anymore.

This commit is contained in:
Christoph Oelckers 2016-12-07 14:41:21 +01:00
parent ed141943e1
commit a118903e3e
7 changed files with 0 additions and 3292 deletions

View File

@ -1,812 +0,0 @@
; "Build Engine & Tools" Copyright (c) 1993-1997 Ken Silverman
; Ken Silverman's official web site: "http://www.advsys.net/ken"
; See the included license file "BUILDLIC.TXT" for license info.
; This file has been modified from Ken Silverman's original release
%include "valgrind.inc"
SECTION .data
%ifndef M_TARGET_LINUX
%define ylookup _ylookup
%define vince _vince
%define vplce _vplce
%define palookupoffse _palookupoffse
%define bufplce _bufplce
%define dc_iscale _dc_iscale
%define dc_colormap _dc_colormap
%define dc_count _dc_count
%define dc_dest _dc_dest
%define dc_source _dc_source
%define dc_texturefrac _dc_texturefrac
%define setupvlineasm _setupvlineasm
%define prevlineasm1 _prevlineasm1
%define vlineasm1 _vlineasm1
%define vlineasm4 _vlineasm4
%define setupmvlineasm _setupmvlineasm
%define mvlineasm1 _mvlineasm1
%define mvlineasm4 _mvlineasm4
%define R_SetupDrawSlabA _R_SetupDrawSlabA
%define R_DrawSlabA _R_DrawSlabA
%endif
EXTERN ylookup ; near
EXTERN vplce ; near
EXTERN vince ; near
EXTERN palookupoffse ; near
EXTERN bufplce ; near
EXTERN dc_iscale
EXTERN dc_colormap
EXTERN dc_count
EXTERN dc_dest
EXTERN dc_source
EXTERN dc_texturefrac
SECTION .text
ALIGN 16
;-----------------------------------------------------------------------
; setvlinebpl_ / setdrawslabbpl
; Patches a new destination stride into the column drawers and the
; voxel slab drawer of this file (self-modifying code).
;
; In:    eax = value written into the 32-bit operand of each patched
;              instruction (all of them add/sub/lea on edi)
; Out:   nothing; the stores themselves change no registers
; Note:  setvlinebpl_ falls through into setdrawslabbpl, so a call to
;        the former patches both groups.
;        selfmod is a macro from valgrind.inc (not visible here);
;        presumably it reports the modified address range to Valgrind
;        -- confirm against valgrind.inc.
;-----------------------------------------------------------------------
GLOBAL setvlinebpl_
setvlinebpl_:
; operand of every "add/sub edi, imm32" starts 2 bytes into the instruction
mov [fixchain1a+2], eax
mov [fixchain1b+2], eax
mov [fixchain2a+2], eax
mov [fixchain1m+2], eax
mov [fixchain2ma+2], eax
mov [fixchain2mb+2], eax
selfmod fixchain1a, fixchain2mb+6
setdrawslabbpl:
; voxbpl1/voxbpl2 are "lea edi,[edi+disp32]", the rest "add edi, imm32";
; in both encodings the 32-bit field sits at offset 2
mov dword [voxbpl1+2], eax
mov dword [voxbpl2+2], eax
mov dword [voxbpl3+2], eax
mov dword [voxbpl4+2], eax
mov dword [voxbpl5+2], eax
mov dword [voxbpl6+2], eax
mov dword [voxbpl7+2], eax
mov dword [voxbpl8+2], eax
; the range must end after the last patched instruction, voxbpl8
; (the original named a non-existent label "voxpl8" here)
selfmod voxbpl1, voxbpl8+6
ret
SECTION .data
; Colormap pointer most recently patched into the slab drawer.
lastslabcolormap:       dd      4

SECTION .text
;-----------------------------------------------------------------------
; R_SetupDrawSlabA
; Patches the colormap base address into the eight voxpalN lookups of
; R_DrawSlabA, skipping the work when the same value is already cached.
;
; In:    stack entry  R_SetupDrawSlabA      : [esp+4] = colormap address
;        register entry @R_SetupDrawSlabA@4 : ecx     = colormap address
; Out:   nothing
; Clobb: ecx (stack entry only), flags
;-----------------------------------------------------------------------
        GLOBAL  R_SetupDrawSlabA
        GLOBAL  @R_SetupDrawSlabA@4
R_SetupDrawSlabA:
        mov     ecx, [esp+4]
@R_SetupDrawSlabA@4:
        cmp     ecx, [lastslabcolormap]
        je      .unchanged
        ; 32-bit displacement of each "mov al,[eax+disp32]" is at offset 2
        mov     [voxpal8+2], ecx
        mov     [voxpal7+2], ecx
        mov     [voxpal6+2], ecx
        mov     [voxpal5+2], ecx
        mov     [voxpal4+2], ecx
        mov     [voxpal3+2], ecx
        mov     [voxpal2+2], ecx
        mov     [voxpal1+2], ecx
        mov     [lastslabcolormap], ecx
.unchanged:
        ret
;-----------------------------------------------------------------------
; setupvlineasm
; Patches the shift counts and index masks of prevlineasm1, vlineasm1
; and vlineasm4 from a single shift parameter.
; (Original note: "pass it log2(texheight)"; the per-line notes of the
; original call the raw value "32-shy".)
;
; In:    [esp+4] = shift parameter (only the low byte is used)
; Out:   eax = (1 << (-param & 31)) - 1, the mask that was patched in
; Clobb: ecx, flags
;-----------------------------------------------------------------------
ALIGN 16
GLOBAL setupvlineasm
setupvlineasm:
        mov     ecx, [esp+4]
        mov     ch, cl
        sub     ch, 16                  ; ch = param - 16       ("16-shy")

        ; raw parameter: one-column drawers first, then vlineasm4
        mov     [premach3a+2], cl
        mov     [mach3a+2], cl
        mov     [machvsh1+2], cl        ; 32-shy
        mov     [machvsh3+2], cl        ; 32-shy
        mov     [machvsh5+2], cl        ; 32-shy
        mov     [machvsh6+2], cl        ; 32-shy
        mov     [machvsh8+2], ch        ; 16-shy

        neg     cl                      ; cl = -param           ("shy")
        mov     eax, 1
        shl     eax, cl                 ; count is taken mod 32 by the CPU
        dec     eax                     ; eax = (1<<shy)-1
        mov     [machvsh2+2], eax       ; (1<<shy)-1
        mov     [machvsh4+2], eax       ; (1<<shy)-1
        mov     [machvsh7+2], cl        ; shy
        mov     [machvsh9+2], cl        ; shy
        mov     [machvsh10+2], cl       ; shy
        mov     [machvsh11+2], cl       ; shy
        mov     [machvsh12+2], cl       ; shy
        selfmod premach3a, machvsh8+6
        ret
; Everything from here on is patched at run time, so outside Mach-O it is
; placed in its own writable + executable section (.rtext). On Mach-O it
; stays in .text and is bracketed by _rtext_a_start/_rtext_a_end instead.
%ifdef M_TARGET_MACHO
SECTION .text align=64
%else
SECTION .rtext progbits alloc exec write align=64
%endif
%ifdef M_TARGET_MACHO
GLOBAL _rtext_a_start
_rtext_a_start:
%endif
; Register notes inherited from the original release. NOTE(review): the
; routines below load their inputs from the dc_* globals rather than
; taking them in these registers -- treat the list as historical.
;eax = xscale
;ebx = palookupoffse
;ecx = # pixels to draw-1
;edx = texturefrac
;esi = texturecolumn
;edi = buffer pointer
;-----------------------------------------------------------------------
; prevlineasm1
; Draws a column of dc_count pixels. Counts above 1 are handed to
; vlineasm1; otherwise exactly one pixel is drawn inline.
; In:    globals dc_count, dc_iscale, dc_texturefrac, dc_source,
;        dc_colormap, dc_dest
; Out:   eax = dc_texturefrac + dc_iscale (single-pixel path)
; Clobb: ecx, edx, flags (ebx and edi are saved and restored)
;-----------------------------------------------------------------------
ALIGN 16
GLOBAL prevlineasm1
prevlineasm1:
mov ecx, [dc_count]
cmp ecx, 1
ja vlineasm1
mov eax, [dc_iscale]
mov edx, [dc_texturefrac]
add eax, edx
mov ecx, [dc_source]
; the shift count 32 is a placeholder: setupvlineasm overwrites the
; immediate byte at premach3a+2
premach3a: shr edx, 32
push ebx
push edi
mov edi, [dc_colormap]
xor ebx, ebx
; texel = dc_source[frac >> shift]
mov bl, byte [ecx+edx]
mov ecx, [dc_dest]
; pixel = dc_colormap[texel]
mov bl, byte [edi+ebx]
pop edi
mov byte [ecx], bl
pop ebx
ret
;-----------------------------------------------------------------------
; vlineasm1
; Draws one column of dc_count pixels, one pixel per loop iteration.
; In:    globals dc_count, dc_colormap, dc_dest, dc_iscale,
;        dc_texturefrac, dc_source
; Out:   eax = texture fraction after the last pixel
; Clobb: ecx, edx, flags (ebx, edi, esi, ebp are saved and restored)
; Patched operands: fixchain1a / fixchain1b (stride, by setvlinebpl_),
;        mach3a (shift count, by setupvlineasm).
;-----------------------------------------------------------------------
GLOBAL vlineasm1
ALIGN 16
vlineasm1:
push ebx
push edi
push esi
push ebp
mov ecx, [dc_count]
mov ebp, [dc_colormap]
mov edi, [dc_dest]
mov eax, [dc_iscale]
mov edx, [dc_texturefrac]
mov esi, [dc_source]
; step back one row up front because the loop advances edi before storing
fixchain1a: sub edi, 320
nop
nop
nop
beginvline:
mov ebx, edx
mach3a: shr ebx, 32
fixchain1b: add edi, 320
mov bl, byte [esi+ebx]
add edx, eax
; the movs between this dec and the jnz leave the flags untouched
dec ecx
mov bl, byte [ebp+ebx]
mov byte [edi], bl
jnz short beginvline
pop ebp
pop esi
pop edi
pop ebx
mov eax, edx
ret
;-----------------------------------------------------------------------
; vlineasm4
; Draws four adjacent columns at once, storing one dword (four pixels)
; per row.
; In:    globals dc_count, dc_dest, ylookup[], bufplce[0..3],
;        palookupoffse[0..3], vince[0..3], vplce[0..3]
; Out:   vplce[0..3] are written back with the positions reached
; Clobb: eax, ecx, edx, flags (ebp, ebx, esi, edi saved and restored)
; The loop body consists almost entirely of placeholder operands (88h
; patterns) that are patched here or by setupvlineasm / setvlinebpl_
; before the loop runs.
;
; Register use inside the loop (original notes):
;-----------------------------------------------------------------------
;eax: -------temp1-------
;ebx: -------temp2-------
;ecx: dat dat dat dat
;edx: ylo2 ylo4
;esi: yhi1 yhi2
;edi: ---videoplc/cnt----
;ebp: yhi3 yhi4
;esp:
ALIGN 16
GLOBAL vlineasm4
vlineasm4:
mov ecx, [dc_count]
push ebp
push ebx
push esi
push edi
mov edi, [dc_dest]
; eax = dc_dest + ylookup[dc_count-1]; it becomes the base of the final
; store, and edi becomes the (non-positive) offset from it, so the loop
; can end on the sign of edi
mov eax, dword [ylookup+ecx*4-4]
add eax, edi
mov dword [machvline4end+2], eax
sub edi, eax
; texture bases: machvbuf1/2 get bufplce[2]/[3], machvbuf3/4 get [0]/[1]
mov eax, dword [bufplce+0]
mov ebx, dword [bufplce+4]
mov ecx, dword [bufplce+8]
mov edx, dword [bufplce+12]
mov dword [machvbuf1+2], ecx
mov dword [machvbuf2+2], edx
mov dword [machvbuf3+2], eax
mov dword [machvbuf4+2], ebx
; colormap bases, paired the same way
mov eax, dword [palookupoffse+0]
mov ebx, dword [palookupoffse+4]
mov ecx, dword [palookupoffse+8]
mov edx, dword [palookupoffse+12]
mov dword [machvpal1+2], ecx
mov dword [machvpal2+2], edx
mov dword [machvpal3+2], eax
mov dword [machvpal4+2], ebx
; Packed layout of the four positions/steps (original diagram):
;      +---------------+---------------+
;edx:  |v3lo           |v1lo           |
;      +---------------+-------+-------+
;esi:  |v2hi  v2lo             |   v3hi|
;      +-----------------------+-------+
;ebp:  |v0hi  v0lo             |   v1hi|
;      +-----------------------+-------+
; pack the four steps (vince) into that layout and patch them into the
; machvinc1..5 add/adc chain
mov ebp, dword [vince+0]
mov ebx, dword [vince+4]
mov esi, dword [vince+8]
mov eax, dword [vince+12]
and esi, 0fffffe00h
and ebp, 0fffffe00h
machvsh9: rol eax, 88h ;sh
machvsh10: rol ebx, 88h ;sh
mov edx, eax
mov ecx, ebx
shr ecx, 16
and edx, 0ffff0000h
add edx, ecx
and eax, 000001ffh
and ebx, 000001ffh
add esi, eax
add ebp, ebx
;
mov eax, edx
and eax, 0ffff0000h
mov dword [machvinc1+2], eax
mov dword [machvinc2+2], esi
mov byte [machvinc3+2], dl
mov byte [machvinc4+2], dh
mov dword [machvinc5+2], ebp
; pack the four start positions (vplce) the same way; they stay in
; edx / esi / ebp for the loop
mov ebp, dword [vplce+0]
mov ebx, dword [vplce+4]
mov esi, dword [vplce+8]
mov eax, dword [vplce+12]
and esi, 0fffffe00h
and ebp, 0fffffe00h
machvsh11: rol eax, 88h ;sh
machvsh12: rol ebx, 88h ;sh
mov edx, eax
mov ecx, ebx
shr ecx, 16
and edx, 0ffff0000h
add edx, ecx
and eax, 000001ffh
and ebx, 000001ffh
add esi, eax
add ebp, ebx
mov ecx, esi
selfmod beginvlineasm4, machvline4end+6
jmp short beginvlineasm4
ALIGN 16
beginvlineasm4:
machvsh1: shr ecx, 88h ;32-sh
mov ebx, esi
machvsh2: and ebx, 00000088h ;(1<<sh)-1
machvinc1: add edx, 88880000h
machvinc2: adc esi, 88888088h
machvbuf1: mov cl, byte [ecx+88888888h]
machvbuf2: mov bl, byte [ebx+88888888h]
mov eax, ebp
machvsh3: shr eax, 88h ;32-sh
machvpal1: mov cl, byte [ecx+88888888h]
machvpal2: mov ch, byte [ebx+88888888h]
mov ebx, ebp
shl ecx, 16
machvsh4: and ebx, 00000088h ;(1<<sh)-1
machvinc3: add dl, 88h
machvbuf3: mov al, byte [eax+88888888h]
machvinc4: adc dh, 88h
machvbuf4: mov bl, byte [ebx+88888888h]
machvinc5: adc ebp, 88888088h
machvpal3: mov cl, byte [eax+88888888h]
machvpal4: mov ch, byte [ebx+88888888h]
machvline4end: mov dword [edi+88888888h], ecx
; flags for the jle come from this add; the mov in between preserves them
fixchain2a: add edi, 88888888h
mov ecx, esi
jle short beginvlineasm4
; Unpack the final positions back into vplce[]. Same layout as above:
;      +---------------+---------------+
;edx:  |v3lo           |v1lo           |
;      +---------------+-------+-------+
;esi:  |v2hi  v2lo             |   v3hi|
;      +-----------------------+-------+
;ebp:  |v0hi  v0lo             |   v1hi|
;      +-----------------------+-------+
mov dword [vplce+8], esi
mov dword [vplce+0], ebp
;vplc2 = (esi<<(32-sh))+(edx>>sh)
;vplc3 = (ebp<<(32-sh))+((edx&65535)<<(16-sh))
machvsh5: shl esi, 88h ;32-sh
mov eax, edx
machvsh6: shl ebp, 88h ;32-sh
and edx, 0000ffffh
machvsh7: shr eax, 88h ;sh
add esi, eax
machvsh8: shl edx, 88h ;16-sh
add ebp, edx
mov dword [vplce+12], esi
mov dword [vplce+4], ebp
pop edi
pop esi
pop ebx
pop ebp
ret
;*************************************************************************
;************************* Masked Vertical Lines *************************
;*************************************************************************
;-----------------------------------------------------------------------
; setupmvlineasm
; Patches the shift count used by the masked column drawers
; (mvlineasm1 and mvlineasm4).
; (Original note: "pass it log2(texheight)".)
;
; In:    [esp+4] = shift parameter (only the low byte is used)
; Out:   nothing
; Clobb: ecx
;-----------------------------------------------------------------------
ALIGN 16
GLOBAL setupmvlineasm
setupmvlineasm:
        mov     ecx, [esp+4]
        ; immediate of each "shr reg, imm8" is the third byte
        mov     [machmv16+2], cl
        mov     [machmv15+2], cl
        mov     [machmv14+2], cl
        mov     [machmv13+2], cl
        mov     [maskmach3a+2], cl
        selfmod maskmach3a, machmv13+6
        ret
;-----------------------------------------------------------------------
; mvlineasm1
; Masked version of vlineasm1: draws one column of dc_count pixels but
; leaves the destination untouched wherever the source texel is 0.
; In:    globals dc_count, dc_colormap, dc_dest, dc_iscale,
;        dc_texturefrac, dc_source
; Out:   eax = texture fraction after the last pixel
; Clobb: ecx, edx, flags (ebx, edi, esi, ebp are saved and restored)
; Patched operands: maskmach3a (shift, by setupmvlineasm),
;        fixchain1m (stride, by setvlinebpl_).
;-----------------------------------------------------------------------
ALIGN 16
GLOBAL mvlineasm1 ;Masked vline
mvlineasm1:
push ebx
push edi
push esi
push ebp
mov ecx, [dc_count]
mov ebp, [dc_colormap]
mov edi, [dc_dest]
mov eax, [dc_iscale]
mov edx, [dc_texturefrac]
mov esi, [dc_source]
beginmvline:
mov ebx, edx
maskmach3a: shr ebx, 32
movzx ebx, byte [esi+ebx]
; texel 0 means transparent: skip the colormap lookup and the store
cmp ebx, 0
je short skipmask1
maskmach3c: mov bl, byte [ebp+ebx]
mov [edi], bl
skipmask1: add edx, eax
fixchain1m: add edi, 320
dec ecx
jnz short beginmvline
pop ebp
pop esi
pop edi
pop ebx
mov eax, edx
ret
;-----------------------------------------------------------------------
; mvlineasm4
; Masked four-column drawer: for each row it fetches one texel from each
; of four columns and stores only those whose texel is non-zero.
; In:    globals dc_count, dc_dest, bufplce[0..3], palookupoffse[0..3],
;        vince[0..3], vplce[0..3]
; Out:   vplce[0..3] are written back with the positions reached
; Clobb: eax, ecx, edx, flags (ebx, esi, edi, ebp saved and restored)
; The remaining row count lives on the stack at [esp] during the loop.
; The low byte of edx doubles as the 4-bit transparency mask, which is
; why the low byte of the vince[1] step is cleared before patching.
;-----------------------------------------------------------------------
ALIGN 16
GLOBAL mvlineasm4
mvlineasm4:
push ebx
push esi
push edi
push ebp
mov ecx,[dc_count]
mov edi,[dc_dest]
; texture bases go into the movzx loads (displacement at offset 3)
mov eax, [bufplce+0]
mov ebx, [bufplce+4]
mov [machmv1+3], eax
mov [machmv4+3], ebx
mov eax, [bufplce+8]
mov ebx, [bufplce+12]
mov [machmv7+3], eax
mov [machmv10+3], ebx
; colormap bases go into the byte loads (displacement at offset 2)
mov eax, [palookupoffse]
mov ebx, [palookupoffse+4]
mov [machmv2+2], eax
mov [machmv5+2], ebx
mov eax, [palookupoffse+8]
mov ebx, [palookupoffse+12]
mov [machmv8+2], eax
mov [machmv11+2], ebx
mov eax, [vince] ;vince
mov ebx, [vince+4]
; keep the step for column 1 from disturbing dl (the mask byte)
xor bl, bl
mov [machmv3+2], eax
mov [machmv6+2], ebx
mov eax, [vince+8]
mov ebx, [vince+12]
mov [machmv9+2], eax
mov [machmv12+2], ebx
; count+1 because the loop decrements before testing
inc ecx
push ecx
mov ecx, [vplce+0]
mov edx, [vplce+4]
mov esi, [vplce+8]
mov ebp, [vplce+12]
fixchain2ma: sub edi, 320
selfmod beginmvlineasm4, machmv2+6
jmp short beginmvlineasm4
ALIGN 16
beginmvlineasm4:
dec dword [esp]
jz near endmvlineasm4
mov eax, ebp
mov ebx, esi
machmv16: shr eax, 32
machmv12: add ebp, 0x88888888 ;vince[3]
machmv15: shr ebx, 32
machmv9: add esi, 0x88888888 ;vince[2]
machmv10: movzx eax, byte [eax+0x88888888];bufplce[3]
machmv7: movzx ebx, byte [ebx+0x88888888];bufplce[2]
; "cmp x,1" sets carry only when x == 0; adc dl,dl shifts that bit into
; the mask, so a set bit marks a transparent column
cmp eax, 1
adc dl, dl
cmp ebx, 1
adc dl, dl
machmv8: mov bl, [ebx+0x88888888] ;palookupoffs[2]
machmv11: mov bh, [eax+0x88888888] ;palookupoffs[3]
mov eax, edx
machmv6: add edx, 0x88888888 ;vince[1]
machmv14: shr eax, 32
shl ebx, 16
machmv4: movzx eax, byte [eax+0x88888888];bufplce[1]
cmp eax, 1
adc dl, dl
machmv5: mov bh, [eax+0x88888888] ;palookupoffs[1]
mov eax, ecx
machmv3: add ecx, 0x88888888 ;vince[0]
machmv13: shr eax, 32
machmv1: movzx eax, byte [eax+0x88888888];bufplce[0]
cmp eax, 1
adc dl, dl
machmv2: mov bl, [eax+0x88888888] ;palookupoffs[0]
; ebx now holds the four output pixels (column 0 in the low byte).
; mask*16 selects one of the 16-byte-aligned store cases below; the shl
; also pushes the mask bits out of the low nibble for the next row.
xor eax, eax
shl dl, 4
fixchain2mb: add edi, 320
mov al, dl
add eax, mvcase15
jmp eax ;16 byte cases
ALIGN 16
endmvlineasm4:
mov [vplce], ecx
mov [vplce+4], edx
mov [vplce+8], esi
mov [vplce+12], ebp
; discard the row counter, then restore the saved registers
pop ecx
pop ebp
pop edi
pop esi
pop ebx
ret
; Store cases. mvcaseN writes the columns whose bit is set in N (bit 0 =
; column 0). They are laid out in descending order, one per 16-byte
; slot, so that slot index == transparency mask; every case must
; therefore fit in 16 bytes. Original size note:
;5,7,8,8,11,13,12,14,11,13,14,14,12,14,15,7
ALIGN 16
mvcase15: mov [edi], ebx
jmp beginmvlineasm4
ALIGN 16
mvcase14: mov [edi+1], bh
shr ebx, 16
mov [edi+2], bx
jmp beginmvlineasm4
ALIGN 16
mvcase13: mov [edi], bl
shr ebx, 16
mov [edi+2], bx
jmp beginmvlineasm4
ALIGN 16
mvcase12: shr ebx, 16
mov [edi+2], bx
jmp beginmvlineasm4
ALIGN 16
mvcase11: mov [edi], bx
shr ebx, 16
mov [edi+3], bh
jmp beginmvlineasm4
ALIGN 16
mvcase10: mov [edi+1], bh
shr ebx, 16
mov [edi+3], bh
jmp beginmvlineasm4
ALIGN 16
mvcase9: mov [edi], bl
shr ebx, 16
mov [edi+3], bh
jmp beginmvlineasm4
ALIGN 16
mvcase8: shr ebx, 16
mov [edi+3], bh
jmp beginmvlineasm4
ALIGN 16
mvcase7: mov [edi], bx
shr ebx, 16
mov [edi+2], bl
jmp beginmvlineasm4
ALIGN 16
mvcase6: shr ebx, 8
mov [edi+1], bx
jmp beginmvlineasm4
ALIGN 16
mvcase5: mov [edi], bl
shr ebx, 16
mov [edi+2], bl
jmp beginmvlineasm4
ALIGN 16
mvcase4: shr ebx, 16
mov [edi+2], bl
jmp beginmvlineasm4
ALIGN 16
mvcase3: mov [edi], bx
jmp beginmvlineasm4
ALIGN 16
mvcase2: mov [edi+1], bh
jmp beginmvlineasm4
ALIGN 16
mvcase1: mov [edi], bl
jmp beginmvlineasm4
ALIGN 16
mvcase0: jmp beginmvlineasm4
align 16
;*************************************************************************
;***************************** Voxel Slabs *******************************
;*************************************************************************
;-----------------------------------------------------------------------
; R_DrawSlabA
; Draws a rectangle in which every row is filled with a single
; colormapped texel.
; Stack arguments, in order (register they are loaded into):
;   1 (eax) width of the slab in bytes
;   2 (ebx) texture position; bits 16 and up index the source
;   3 (ecx) number of rows; must be non-zero, every loop is
;           decrement-then-test
;   4 (edx) amount added to the texture position per row
;   5 (esi) source texel base address
;   6 (edi) destination address of the top-left pixel
; Out:   nothing
; Clobb: eax, ecx, edx, flags (ebx, ebp, esi, edi saved and restored)
; Patched operands: voxpal1..8 (colormap base, by R_SetupDrawSlabA) and
;        voxbpl1..8 (destination stride, by setdrawslabbpl).
; Widths 1 (or less), 2 and 4 have dedicated loops. Any other width is
; drawn as vertical strips: an optional 1-byte and 2-byte strip to reach
; 4-byte alignment, then 4-byte strips, then an optional trailing 2-byte
; and 1-byte strip. Each strip runs the full height and then restores
; ebx/ecx/edi from the stack, except the last one.
; NOTE(review): the strip bounds are compared with signed jumps
; (jge/jl) on addresses; confirm the destination never straddles
; 80000000h.
;-----------------------------------------------------------------------
GLOBAL R_DrawSlabA
R_DrawSlabA:
push ebx
push ebp
push esi
push edi
; 4 saved registers + return address = 5 dwords below the first argument
mov eax, [esp+5*4+0]
mov ebx, [esp+5*4+4]
mov ecx, [esp+5*4+8]
mov edx, [esp+5*4+12]
mov esi, [esp+5*4+16]
mov edi, [esp+5*4+20]
cmp eax, 2
je voxbegdraw2
ja voxskip2
xor eax, eax
; --- width below 2: one byte per row ---
voxbegdraw1:
mov ebp, ebx
shr ebp, 16
add ebx, edx
dec ecx
mov al, byte [esi+ebp]
voxpal1: mov al, byte [eax+88888888h]
mov byte [edi], al
; lea rather than add, so the flags of "dec ecx" survive to the jnz
voxbpl1: lea edi, [edi+88888888h]
jnz voxbegdraw1
jmp voxskipslab5
; --- width 2: one word per row ---
voxbegdraw2:
mov ebp, ebx
shr ebp, 16
add ebx, edx
xor eax, eax
dec ecx
mov al, byte [esi+ebp]
voxpal2: mov al, byte [eax+88888888h]
mov ah, al
mov word [edi], ax
voxbpl2: lea edi, [edi+88888888h]
jnz voxbegdraw2
jmp voxskipslab5
voxskip2:
cmp eax, 4
jne voxskip4
xor eax, eax
; --- width 4: one dword per row ---
voxbegdraw4:
mov ebp, ebx
add ebx, edx
shr ebp, 16
xor eax, eax
mov al, byte [esi+ebp]
voxpal3: mov al, byte [eax+88888888h]
; replicate the pixel into all four bytes of eax
mov ah, al
shl eax, 8
mov al, ah
shl eax, 8
mov al, ah
mov dword [edi], eax
voxbpl3: add edi, 88888888h
dec ecx
jnz voxbegdraw4
jmp voxskipslab5
; --- any other width: strips ---
voxskip4:
; eax = address one past the right edge of the top row
add eax, edi
; leading 1-byte strip if the start address is odd
test edi, 1
jz voxskipslab1
cmp edi, eax
je voxskipslab1
push eax
push ebx
push ecx
push edi
voxbegslab1:
mov ebp, ebx
add ebx, edx
shr ebp, 16
xor eax, eax
mov al, byte [esi+ebp]
voxpal4: mov al, byte [eax+88888888h]
mov byte [edi], al
voxbpl4: add edi, 88888888h
dec ecx
jnz voxbegslab1
pop edi
pop ecx
pop ebx
pop eax
inc edi
voxskipslab1:
; the right-edge address stays on the stack until voxskipslab4
push eax
; leading 2-byte strip if bit 1 of the address is set and 2 bytes remain
test edi, 2
jz voxskipslab2
dec eax
cmp edi, eax
jge voxskipslab2
push ebx
push ecx
push edi
voxbegslab2:
mov ebp, ebx
add ebx, edx
shr ebp, 16
xor eax, eax
mov al, byte [esi+ebp]
voxpal5: mov al, byte [eax+88888888h]
mov ah, al
mov word [edi], ax
voxbpl5: add edi, 88888888h
dec ecx
jnz voxbegslab2
pop edi
pop ecx
pop ebx
add edi, 2
voxskipslab2:
; 4-byte strips while at least 4 bytes remain (edi < right edge - 3)
mov eax, [esp]
sub eax, 3
cmp edi, eax
jge voxskipslab3
voxprebegslab3:
push ebx
push ecx
push edi
voxbegslab3:
mov ebp, ebx
add ebx, edx
shr ebp, 16
xor eax, eax
mov al, byte [esi+ebp]
voxpal6: mov al, byte [eax+88888888h]
mov ah, al
shl eax, 8
mov al, ah
shl eax, 8
mov al, ah
mov dword [edi], eax
voxbpl6: add edi, 88888888h
dec ecx
jnz voxbegslab3
pop edi
pop ecx
pop ebx
add edi, 4
mov eax, [esp]
sub eax, 3
cmp edi, eax
jl voxprebegslab3
voxskipslab3:
; trailing 2-byte strip if at least 2 bytes remain
mov eax, [esp]
dec eax
cmp edi, eax
jge voxskipslab4
push ebx
push ecx
push edi
voxbegslab4:
mov ebp, ebx
add ebx, edx
shr ebp, 16
xor eax, eax
mov al, byte [esi+ebp]
voxpal7: mov al, byte [eax+88888888h]
mov ah, al
mov word [edi], ax
voxbpl7: add edi, 88888888h
dec ecx
jnz voxbegslab4
pop edi
pop ecx
pop ebx
add edi, 2
voxskipslab4:
; trailing 1-byte strip if one byte remains; nothing follows it, so
; ebx/ecx/edi are not preserved here
pop eax
cmp edi, eax
je voxskipslab5
voxbegslab5:
mov ebp, ebx
add ebx, edx
shr ebp, 16
xor eax, eax
mov al, byte [esi+ebp]
voxpal8: mov al, byte [eax+88888888h]
mov byte [edi], al
voxbpl8: add edi, 88888888h
dec ecx
jnz voxbegslab5
voxskipslab5:
pop edi
pop esi
pop ebp
pop ebx
ret
align 16
; end marker of the run-time patched region (Mach-O builds only)
%ifdef M_TARGET_MACHO
GLOBAL _rtext_a_end
_rtext_a_end:
%endif

View File

@ -1,200 +0,0 @@
;*
;* misc.nas
;* Miscellaneous assembly functions
;*
;*---------------------------------------------------------------------------
;* Copyright 1998-2006 Randy Heit
;* All rights reserved.
;*
;* Redistribution and use in source and binary forms, with or without
;* modification, are permitted provided that the following conditions
;* are met:
;*
;* 1. Redistributions of source code must retain the above copyright
;* notice, this list of conditions and the following disclaimer.
;* 2. Redistributions in binary form must reproduce the above copyright
;* notice, this list of conditions and the following disclaimer in the
;* documentation and/or other materials provided with the distribution.
;* 3. The name of the author may not be used to endorse or promote products
;* derived from this software without specific prior written permission.
;*
;* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
;* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
;* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
;* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
;* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
;* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;*---------------------------------------------------------------------------
;*
BITS 32
%ifndef M_TARGET_LINUX
%define DoBlending_MMX _DoBlending_MMX
%define BestColor_MMX _BestColor_MMX
%endif
%ifdef M_TARGET_WATCOM
SEGMENT DATA PUBLIC ALIGN=16 CLASS=DATA USE32
SEGMENT DATA
%else
SECTION .data
%endif
Blending256:
dd 0x01000100,0x00000100
%ifdef M_TARGET_WATCOM
SEGMENT CODE PUBLIC ALIGN=16 CLASS=CODE USE32
SEGMENT CODE
%else
SECTION .text
%endif
;-----------------------------------------------------------
;
; DoBlending_MMX
;
; MMX version of DoBlending
;
; (DWORD *from, DWORD *to, count, tor, tog, tob, toa)
;
; All arguments are on the stack:
;   [esp+4]  from    [esp+8]  to     [esp+12] count
;   [esp+16] tor     [esp+20] tog    [esp+24] tob    [esp+28] toa
;
; For every colour channel c of every source colour:
;   to.c = (from.c * (256 - toa) + toc * toa) >> 8   (saturating adds)
;
; count must be a non-zero multiple of four: each pass of the loop
; handles four colours and the loop ends only when the remaining
; count is exactly zero.
;
; Clobbers eax, ecx, edx, flags, mm0-mm3, mm6, mm7 (MMX state is
; released with emms before returning).
;-----------------------------------------------------------
GLOBAL DoBlending_MMX
DoBlending_MMX:
        ; mm1 = target colour, one 16-bit lane per channel
        mov     edx, [esp+20]           ; tog
        mov     eax, [esp+16]           ; tor
        shl     edx, 8
        shl     eax, 16
        or      eax, edx
        or      eax, [esp+24]           ; tob
        pxor    mm0, mm0                ; mm0 = 0
        movd    mm1, eax                ; mm1 = 00000000 00RRGGBB
        punpcklbw mm1, mm0              ; mm1 = 000000RR 00GG00BB

        ; mm6 = alpha replicated into the same three lanes
        mov     eax, [esp+28]           ; toa
        mov     edx, eax
        shl     eax, 16
        shl     edx, 8
        or      eax, edx
        or      eax, [esp+28]
        movd    mm6, eax                ; mm6 = 00000000 00AAAAAA
        punpcklbw mm6, mm0              ; mm6 = 000000AA 00AA00AA

        movq    mm7, [Blending256]
        pmullw  mm1, mm6                ; mm1 = target * alpha
        psubusw mm7, mm6                ; mm7 = 256 - alpha per lane

        mov     eax, [esp+4]            ; eax = source
        mov     edx, [esp+8]            ; edx = dest
        mov     ecx, [esp+12]           ; ecx = count

.quad:
%rep 2                                  ; two colours per repetition
        movq    mm2, [eax]              ; mm2 = 00r2g2b2 00r1g1b1
        add     eax, 8
        movq    mm3, mm2
        punpcklbw mm2, mm0              ; mm2 = 000000r1 00g100b1
        punpckhbw mm3, mm0              ; mm3 = 000000r2 00g200b2
        pmullw  mm2, mm7                ; scale by (256 - alpha)
        add     edx, 8
        pmullw  mm3, mm7
        sub     ecx, 2                  ; MMX ops below leave the flags alone
        paddusw mm2, mm1
        psrlw   mm2, 8
        paddusw mm3, mm1
        psrlw   mm3, 8
        packuswb mm2, mm3               ; mm2 = 00r2g2b2 00r1g1b1
        movq    [edx-8], mm2
%endrep
        jnz     .quad                   ; tests the second "sub ecx, 2"
        emms
        ret
;-----------------------------------------------------------
;
; BestColor_MMX
;
; Picks the closest matching color from a palette
;
; Passed FFRRGGBB and palette array in same format
; FF is the index of the first palette entry to consider
;
; Stack entry    BestColor_MMX    : [esp+4] = FFRRGGBB, [esp+8] = palette
; Register entry @BestColor_MMX@8 : ecx = FFRRGGBB, edx = palette
;
; Scans entries FF..255 and returns in eax the index with the
; smallest sum of squared channel differences; the scan stops
; early at the first entry whose sum is zero. On equal sums the
; later entry wins.
;
; NOTE(review): the top byte of each dword takes part in the
; subtraction as well, and its squared difference is added into
; bits 16 and up of the sum. Confirm that callers keep the top
; byte of the palette entries equal to FF, or that this effect
; is intended.
;
; Clobbers ecx, edx, flags, mm0-mm3 (ebx, esi, ebp are saved and
; restored; emms is issued before returning).
;-----------------------------------------------------------
GLOBAL BestColor_MMX
GLOBAL @BestColor_MMX@8
BestColor_MMX:
        mov     ecx, [esp+4]
        mov     edx, [esp+8]
@BestColor_MMX@8:
        push    ebx
        push    esi
        push    ebp
        pxor    mm0, mm0
        movd    mm1, ecx                ; mm1 = color searching for
        punpcklbw mm1, mm0              ; one 16-bit lane per byte
        mov     ebx, ecx
        and     ebx, 0xffffff           ; ebx = best color so far
        shr     ecx, 24                 ; ecx = index of entry to examine
        mov     eax, 257*257+257*257+257*257 ; eax = bestdist
.scan:
        movd    mm2, [edx+ecx*4]        ; mm2 = color considering now
        punpcklbw mm2, mm0
        movq    mm3, mm1
        psubsw  mm3, mm2
        pmullw  mm3, mm3                ; mm3 = per-lane difference squared
        ; ebp = lane0 + lane1 + (upper two lanes taken as one dword)
        movd    esi, mm3
        psrlq   mm3, 32
        movzx   ebp, si
        shr     esi, 16
        add     ebp, esi
        movd    esi, mm3
        add     ebp, esi
        jz      .exact                  ; found a perfect match
        cmp     eax, ebp
        jb      .next                   ; strictly worse than the best
        mov     eax, ebp
        mov     ebx, ecx
.next:
        inc     ecx
        cmp     ecx, 256
        jne     .scan
        mov     eax, ebx
.leave:
        pop     ebp
        pop     esi
        pop     ebx
        emms
        ret
.exact:
        mov     eax, ecx
        jmp     short .leave

File diff suppressed because it is too large Load Diff

View File

@ -1,643 +0,0 @@
;*
;* tmap2.nas
;* The tilted plane inner loop.
;*
;*---------------------------------------------------------------------------
;* Copyright 1998-2006 Randy Heit
;* All rights reserved.
;*
;* Redistribution and use in source and binary forms, with or without
;* modification, are permitted provided that the following conditions
;* are met:
;*
;* 1. Redistributions of source code must retain the above copyright
;* notice, this list of conditions and the following disclaimer.
;* 2. Redistributions in binary form must reproduce the above copyright
;* notice, this list of conditions and the following disclaimer in the
;* documentation and/or other materials provided with the distribution.
;* 3. The name of the author may not be used to endorse or promote products
;* derived from this software without specific prior written permission.
;*
;* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
;* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
;* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
;* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
;* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
;* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;*---------------------------------------------------------------------------
;*
;* I tried doing the ROL trick that R_DrawSpanP_ASM uses, and it was
;* actually slightly slower than the more straight-forward approach
;* used here, probably because the trick requires too much setup time.
;*
BITS 32
%include "valgrind.inc"
%define SPACEFILLER4 (0x44444444)
%ifndef M_TARGET_LINUX
%define plane_sz _plane_sz
%define plane_su _plane_su
%define plane_sv _plane_sv
%define plane_shade _plane_shade
%define planelightfloat _planelightfloat
%define spanend _spanend
%define ylookup _ylookup
%define dc_destorg _dc_destorg
%define ds_colormap _ds_colormap
%define ds_source _ds_source
%define centery _centery
%define centerx _centerx
%define ds_curtiltedsource _ds_curtiltedsource
%define pviewx _pviewx
%define pviewy _pviewy
%define tiltlighting _tiltlighting
%define R_DrawTiltedPlane_ASM _R_DrawTiltedPlane_ASM
%define R_SetTiltedSpanSource_ASM _R_SetTiltedSpanSource_ASM
%define R_CalcTiltedLighting _R_CalcTiltedLighting
%endif
EXTERN plane_sz
EXTERN plane_su
EXTERN plane_sv
EXTERN planelightfloat
EXTERN spanend
EXTERN ylookup
EXTERN dc_destorg
EXTERN ds_colormap
EXTERN centery
EXTERN centerx
EXTERN ds_source
EXTERN plane_shade
EXTERN pviewx
EXTERN pviewy
EXTERN tiltlighting
EXTERN R_CalcTiltedLighting
GLOBAL ds_curtiltedsource
%define sv_i plane_sv
%define sv_j plane_sv+4
%define sv_k plane_sv+8
%define su_i plane_su
%define su_j plane_su+4
%define su_k plane_su+8
%define sz_i plane_sz
%define sz_j plane_sz+4
%define sz_k plane_sz+8
%define SPANBITS 3
; Scratch storage for R_DrawTiltedPlane_ASM. The start/step values are
; written as 64-bit integers by fistp and read back as 32-bit integers;
; step_iz/uz/vz are stored and reloaded as 64-bit floats.
section .bss
start_u: resq 1
start_v: resq 1
step_u: resq 1
step_v: resq 1
step_iz: resq 1
step_uz: resq 1
step_vz: resq 1
end_z: resd 1
section .data
; texture address last passed to R_SetTiltedSpanSource_ASM
ds_curtiltedsource: dd SPACEFILLER4
; IEEE-754 single-precision constants, indexed as table+n*4.
; spanrecips[n] = 1/(n+1); fp_1 and fp_8recip alias entries 0 and 7.
fp_1:
spanrecips: dd 0x3f800000 ; 1/1
dd 0x3f000000 ; 1/2
dd 0x3eaaaaab ; 1/3
dd 0x3e800000 ; 1/4
dd 0x3e4ccccd ; 1/5
dd 0x3e2aaaab ; 1/6
dd 0x3e124925 ; 1/7
fp_8recip: dd 0x3e000000 ; 1/8
dd 0x3de38e39 ; 1/9
dd 0x3dcccccd ; 1/10
dd 0x3dba2e8c ; 1/11
dd 0x3daaaaab ; 1/12
dd 0x3d9d89d9 ; 1/13
dd 0x3d924925 ; 1/14
dd 0x3d888889 ; 1/15
; fp_quickint[n] = n+1 as a float; fp_8 aliases entry 7.
fp_quickint: dd 0x3f800000 ; 1
dd 0x40000000 ; 2
dd 0x40400000 ; 3
dd 0x40800000 ; 4
dd 0x40a00000 ; 5
dd 0x40c00000 ; 6
dd 0x40e00000 ; 7
fp_8: dd 0x41000000 ; 8
section .text
;-----------------------------------------------------------------------
; R_SetTiltedSpanSource_ASM
; Patches the texture base address into the ten texel fetches of
; R_DrawTiltedPlane_ASM and records it in ds_curtiltedsource.
;
; In:    stack entry    R_SetTiltedSpanSource_ASM    : [esp+4] = address
;        register entry @R_SetTiltedSpanSource_ASM@4 : ecx     = address
; Out:   nothing
; Clobb: ecx (stack entry only)
;-----------------------------------------------------------------------
        GLOBAL  R_SetTiltedSpanSource_ASM
        GLOBAL  @R_SetTiltedSpanSource_ASM@4
R_SetTiltedSpanSource_ASM:
        mov     ecx, [esp+4]
@R_SetTiltedSpanSource_ASM@4:
        mov     [ds_curtiltedsource], ecx
        ; the fetches use base+index+disp32 addressing, so the
        ; displacement starts at offset 3 (opcode, ModRM, SIB)
        mov     [fetch10+3], ecx
        mov     [fetch9+3], ecx
        mov     [fetch8+3], ecx
        mov     [fetch7+3], ecx
        mov     [fetch6+3], ecx
        mov     [fetch5+3], ecx
        mov     [fetch4+3], ecx
        mov     [fetch3+3], ecx
        mov     [fetch2+3], ecx
        mov     [fetch1+3], ecx
        selfmod rtext_start, rtext_end
        ret
;-----------------------------------------------------------------------
; SetTiltedSpanSize
; Patches the shift counts and the index mask of the ten texel address
; computations in R_DrawTiltedPlane_ASM.
;
; In:    cl = first bit count, dl = second bit count
;        (the original comment calls the dl-derived quantity "x bits")
; Out:   eax = mask that was patched in: the top dl bits set, or 0 when
;              dl is 0
; Clobb: flags (ecx is saved and restored, edx is not modified)
;
; Patched values:
;   xN  shr imm8  = -cl
;   yN  shr imm8  = -cl - dl
;   mN  and imm32 = mask
; (shift counts are taken mod 32 by the CPU)
;-----------------------------------------------------------------------
        GLOBAL  SetTiltedSpanSize
SetTiltedSpanSize:
        push    ecx

        ; eax = 1 << (-dl & 31), turned into the mask further down
        mov     cl, dl
        neg     cl
        mov     eax, 1
        shl     eax, cl

        mov     cl, [esp]               ; reload the caller's cl
        neg     cl
        mov     [x10+2], cl
        mov     [x9+2], cl
        mov     [x8+2], cl
        mov     [x7+2], cl
        mov     [x6+2], cl
        mov     [x5+2], cl
        mov     [x4+2], cl
        mov     [x3+2], cl
        mov     [x2+2], cl
        mov     [x1+2], cl

        sub     cl, dl
        mov     [y10+2], cl
        mov     [y9+2], cl
        mov     [y8+2], cl
        mov     [y7+2], cl
        mov     [y6+2], cl
        mov     [y5+2], cl
        mov     [y4+2], cl
        mov     [y3+2], cl
        mov     [y2+2], cl
        mov     [y1+2], cl

        dec     eax
        test    eax, eax                ; if x bits is 0, mask must be 0 too.
        jz      .mask_ready
        not     eax
.mask_ready:
        pop     ecx
        mov     [m10+2], eax
        mov     [m9+2], eax
        mov     [m8+2], eax
        mov     [m7+2], eax
        mov     [m6+2], eax
        mov     [m5+2], eax
        mov     [m4+2], eax
        mov     [m3+2], eax
        mov     [m2+2], eax
        mov     [m1+2], eax
        selfmod rtext_start, rtext_end
        ret
; The code below is patched at run time, so outside Mach-O it lives in a
; writable + executable section. rtext_start/rtext_end bracket the range
; handed to selfmod by the two setup routines.
%ifndef M_TARGET_MACHO
SECTION .rtext progbits alloc exec write align=64
%else
SECTION .text align=64
GLOBAL _rtext_tmap2_start
_rtext_tmap2_start:
%endif
rtext_start:
;-----------------------------------------------------------------------
; R_DrawTiltedPlane_ASM
; Draws one horizontal span of a tilted plane.
; In:    stack entry    R_DrawTiltedPlane_ASM   : [esp+4] = y, [esp+8] = x
;        register entry @R_DrawTiltedPlane_ASM@8: ecx = y,     edx = x
;        globals centery, centerx, spanend[] (words), ylookup[],
;        dc_destorg, plane_su/sv/sz (three floats each: i, j, k),
;        plane_shade, planelightfloat, pviewx, pviewy, tiltlighting[]
; Out:   nothing; writes spanend[y]-x+1 pixels starting at
;        dc_destorg + ylookup[y] + x
; Clobb: eax, ecx, edx, flags, x87 stack (left empty on return);
;        ebx, esi, edi, ebp are saved and restored
; Method: u/z, v/z and 1/z are evaluated with the FPU at the span start
;        and every 8 pixels (or at the span end for a shorter tail); u
;        and v are stepped linearly in between. The divide for the next
;        group is issued before the current group's pixels are drawn.
; Patched operands: xN / yN (shift counts) and mN (mask) by
;        SetTiltedSpanSize, fetchN (texture base) by
;        R_SetTiltedSpanSource_ASM.
; The trailing comments on FPU lines show the x87 stack, top first.
;-----------------------------------------------------------------------
GLOBAL R_DrawTiltedPlane_ASM
GLOBAL @R_DrawTiltedPlane_ASM@8
R_DrawTiltedPlane_ASM:
mov ecx,[esp+4]
mov edx,[esp+8]
; ecx = y
; edx = x
@R_DrawTiltedPlane_ASM@8:
push ebx
push esi
push edi
push ebp
mov eax,[centery]
movzx ebx,word [spanend+ecx*2]
sub eax,ecx ; eax = centery-y
sub ebx,edx ; ebx = span length - 1
mov edi,[ylookup+ecx*4]
push eax
add edi,[dc_destorg]
add edi,edx ; edi = frame buffer pointer
sub edx,[centerx] ; edx = x-centerx
push edx
xor eax,eax
; ymul = centery-y and xmul = x-centerx are read back from the stack as
; integers; both temporaries are released at .litup
fild dword [esp+4] ; ymul
fild dword [esp] ; xmul | ymul
fld dword [sv_j] ; sv.j | xmul | ymul
fmul st0,st2 ; sv.j*ymul | xmul | ymul
fld dword [su_j] ; su.j | sv.j*ymul | xmul | ymul
fmul st0,st3 ; su.j*ymul | sv.j*ymul | xmul | ymul
fld dword [sz_j] ; sz.j | su.j*ymul | sv.j*ymul | xmul | ymul
fmulp st4,st0 ; su.j*ymul | sv.j*ymul | xmul | sz.j*ymul
fld dword [sv_i] ; sv.i | su.j*ymul | sv.j*ymul | xmul | sz.j*ymul
fmul st0,st3 ; sv.i*xmul | su.j*ymul | sv.j*ymul | xmul | sz.j*ymul
fld dword [su_i] ; su.i | sv.i*xmul | su.j*ymul | sv.j*ymul | xmul | sz.j*ymul
fmul st0,st4 ; su.i*xmul | sv.i*xmul | su.j*ymul | sv.j*ymul | xmul | sz.j*ymul
fld dword [sz_i] ; sz.i | su.i*xmul | sv.i*xmul | su.j*ymul | sv.j*ymul | xmul | sz.j*ymul
fmulp st5,st0 ; su.i*xmul | sv.i*xmul | su.j*ymul | sv.j*ymul | sz.i*xmul | sz.j*ymul
fxch st1 ; sv.i*xmul | su.i*xmul | su.j*ymul | sv.j*ymul | sz.i*xmul | sz.j*ymul
faddp st3,st0 ; su.i*xmul | su.j*ymul | sv.i*xmul+sv.j*ymul | sz.i*xmul | sz.j*ymul
faddp st1,st0 ; su.i*xmul+su.j*ymul | sv.i*xmul+sv.j*ymul | sz.i*xmul | sz.j*ymul
fxch st3 ; sz.j*ymul | sv.i*xmul+sv.j*ymul | sz.i*xmul | su.i*xmul+su.j*ymul
faddp st2,st0 ; sv.i*xmul+sv.j*ymul | sz.i*xmul+sz.j*ymul | su.i*xmul+su.j*ymul
fadd dword [sv_k] ; v/z | sz.i*xmul+sz.j*ymul | su.i*xmul+su.j*ymul
fxch st1 ; sz.i*xmul+sz.j*ymul | v/z | su.i*xmul+su.j*ymul
fadd dword [sz_k] ; 1/z | v/z | su.i*xmul+su.j*ymul
fxch st2 ; su.i*xmul+su.j*ymul | v/z | 1/z
fadd dword [su_k] ; u/z | v/z | 1/z
fxch st2 ; 1/z | v/z | u/z
fxch st1 ; v/z | 1/z | u/z
; if lighting is on, fill out the light table
mov al,[plane_shade]
test al,al
jz .litup
; calls R_CalcTiltedLighting with two doubles (1/z and 1/endz, each
; times planelightfloat) followed by the width in ebx, all on the stack
push ebx
fild dword [esp] ; width | v/z | 1/z | u/z
fmul dword [sz_i] ; width*sz.i | v/z | 1/z | u/z
fadd st0,st2 ; 1/endz | v/z | 1/z | u/z
fld st2 ; 1/z | 1/endz | v/z | 1/z | u/z
fmul dword [planelightfloat]
fxch st1
fmul dword [planelightfloat]
sub esp,16
fstp qword [esp]
fstp qword [esp+8]
call R_CalcTiltedLighting
; 16 bytes of doubles + 4 bytes for the pushed width
add esp, 20
xor eax, eax
; drop the xmul / ymul temporaries
.litup add esp, 8
; calculate initial z, u, and v values
fld st1 ; 1/z | v/z | 1/z | u/z
fdivr dword [fp_1] ; z | v/z | 1/z | u/z
fld st3 ; u/z | z | v/z | 1/z | u/z
fmul st0,st1 ; u | z | v/z | 1/z | u/z
fld st2 ; v/z | u | z | v/z | 1/z | u/z
fmulp st2,st0 ; u | v | v/z | 1/z | u/z
fld st0
fistp qword [start_u]
fld st1
fistp qword [start_v]
cmp ebx,7 ; Do we have at least 8 pixels to plot?
jl near ShortStrip
; yes, we do, so figure out tex coords at end of this span
; multiply i values by span length (8)
fld dword [su_i] ; su.i
fmul dword [fp_8] ; su.i*8
fld dword [sv_i] ; sv.i | su.i*8
fmul dword [fp_8] ; sv.i*8 | su.i*8
fld dword [sz_i] ; sz.i | sv.i*8 | su.i*8
fmul dword [fp_8] ; sz.i*8 | sv.i*8 | su.i*8
fxch st2 ; su.i*8 | sv.i*8 | sz.i*8
fstp qword [step_uz] ; sv.i*8 | sz.i*8
fstp qword [step_vz] ; sz.i*8
fst qword [step_iz] ; sz.i*8
; find tex coords at start of next span
faddp st4
fld qword [step_vz]
faddp st3
fld qword [step_uz]
faddp st5
fld st3 ; 1/z | u | v | v/z | 1/z | u/z
fdivr dword [fp_1] ; z | u | v | v/z | 1/z | u/z
fst dword [end_z]
fld st5 ; u/z | z | u | v | v/z | 1/z | u/z
fmul st0,st1 ; u' | z | u | v | v/z | 1/z | u/z
fxch st1 ; z | u' | u | v | v/z | 1/z | u/z
fmul st0,st4 ; v' | u' | u | v | v/z | 1/z | u/z
fxch st3 ; v | u' | u | v' | v/z | 1/z | u/z
; now subtract to get stepping values for this span
fsubr st0,st3 ; v'-v | u' | u | v' | v/z | 1/z | u/z
fxch st2 ; u | u' | v'-v | v' | v/z | 1/z | u/z
fsubr st0,st1 ; u'-u | u' | v'-v | v' | v/z | 1/z | u/z
fxch st2 ; v'-v | u' | u'-u | v' | v/z | 1/z | u/z
fmul dword [fp_8recip] ; vstep | u' | u'-u | v' | v/z | 1/z | u/z
fxch st1 ; u' | vstep | u'-u | v' | v/z | 1/z | u/z
fxch st2 ; u'-u | vstep | u' | v' | v/z | 1/z | u/z
fmul dword [fp_8recip] ; ustep | vstep | u' | v' | v/z | 1/z | u/z
fxch st1 ; vstep | ustep | u' | v' | v/z | 1/z | u/z
fistp qword [step_v] ; ustep | u' | v' | v/z | 1/z | u/z
fistp qword [step_u] ; u | v | v/z | 1/z | u/z
; ---- draw one full group of 8 pixels; ebx = pixels remaining - 1 ----
FullSpan:
xor eax,eax
cmp ebx,15 ; is there another complete span after this one?
jl NextIsShort
; there is a complete span after this one
fld qword [step_iz]
faddp st4,st0
fld qword [step_vz]
faddp st3,st0
fld qword [step_uz]
faddp st5,st0
jmp StartDiv
NextIsShort:
cmp ebx,8 ; if next span is no more than 1 pixel, then we already
jle DrawFullSpan ; know everything we need to draw it
; here ebx is 9..14, so fp_quickint[ebx-8] is the float (ebx-7), the
; number of pixels left after this group
fld dword [sz_i] ; sz.i | u | v | v/z | 1/z | u/z
fmul dword [fp_quickint-8*4+ebx*4]
fld dword [sv_i] ; sv.i | sz.i | u | v | v/z | 1/z | u/z
fmul dword [fp_quickint-8*4+ebx*4]
fld dword [su_i] ; su.i | sv.i | sz.i | u | v | v/z | 1/z | u/z
fmul dword [fp_quickint-8*4+ebx*4]
fxch st2 ; sz.i | sv.i | su.i | u | v | v/z | 1/z | u/z
faddp st6,st0 ; sv.i | su.i | u | v | v/z | 1/z | u/z
faddp st4,st0 ; su.i | u | v | v/z | 1/z | u/z
faddp st5,st0 ; u | v | v/z | 1/z | u/z
StartDiv:
fld st3 ; 1/z | u | v | v/z | 1/z | u/z
fdivr dword [fp_1] ; z | u | v | v/z | 1/z | u/z
DrawFullSpan:
; ecx = v position, edx = u position; both advance by step_v/step_u per
; pixel. Each pixel: offset = (v >> x) + ((u & m) >> y), texel is
; fetched from the patched base, then mapped through the per-column
; light table tiltlighting[ebx - n].
mov ecx,[start_v]
mov edx,[start_u]
add ecx,[pviewy]
add edx,[pviewx]
mov esi,edx
mov ebp,ecx
x1 shr ebp,26
m1 and esi,0xfc000000
y1 shr esi,20
add ecx,[step_v]
add edx,[step_u]
fetch1 mov al,[ebp+esi+SPACEFILLER4]
mov ebp,[tiltlighting+ebx*4]
mov esi,edx
mov al,[ebp+eax]
mov ebp,ecx
mov [edi+0],al
x2 shr ebp,26
m2 and esi,0xfc000000
y2 shr esi,20
add ecx,[step_v]
add edx,[step_u]
fetch2 mov al,[ebp+esi+SPACEFILLER4]
mov ebp,[tiltlighting+ebx*4-4]
mov esi,edx
mov al,[ebp+eax]
mov ebp,ecx
mov [edi+1],al
x3 shr ebp,26
m3 and esi,0xfc000000
y3 shr esi,20
add ecx,[step_v]
add edx,[step_u]
fetch3 mov al,[ebp+esi+SPACEFILLER4]
mov ebp,[tiltlighting+ebx*4-8]
mov esi,edx
mov al,[ebp+eax]
mov ebp,ecx
mov [edi+2],al
x4 shr ebp,26
m4 and esi,0xfc000000
y4 shr esi,20
add ecx,[step_v]
add edx,[step_u]
fetch4 mov al,[ebp+esi+SPACEFILLER4]
mov ebp,[tiltlighting+ebx*4-12]
mov esi,edx
mov al,[ebp+eax]
mov ebp,ecx
mov [edi+3],al
x5 shr ebp,26
m5 and esi,0xfc000000
y5 shr esi,20
add ecx,[step_v]
add edx,[step_u]
fetch5 mov al,[ebp+esi+SPACEFILLER4]
mov ebp,[tiltlighting+ebx*4-16]
mov esi,edx
mov al,[ebp+eax]
mov ebp,ecx
mov [edi+4],al
x6 shr ebp,26
m6 and esi,0xfc000000
y6 shr esi,20
add ecx,[step_v]
add edx,[step_u]
fetch6 mov al,[ebp+esi+SPACEFILLER4]
mov ebp,[tiltlighting+ebx*4-20]
mov esi,edx
mov al,[ebp+eax]
mov ebp,ecx
mov [edi+5],al
x7 shr ebp,26
m7 and esi,0xfc000000
y7 shr esi,20
add ecx,[step_v]
add edx,[step_u]
fetch7 mov al,[ebp+esi+SPACEFILLER4]
mov ebp,[tiltlighting+ebx*4-24]
; last pixel of the group: ecx/edx are consumed directly, no further
; stepping is needed
x8 shr ecx,26
mov al,[ebp+eax]
m8 and edx,0xfc000000
mov [edi+6],al
y8 shr edx,20
mov ebp,[tiltlighting+ebx*4-28]
fetch8 mov al,[edx+ecx+SPACEFILLER4]
mov al,[ebp+eax]
mov [edi+7],al
add edi,8
sub ebx,8
jl near Done
fld st1
fistp qword [start_u]
fld st2
fistp qword [start_v]
cmp ebx,7
jl near EndIsShort
fst dword [end_z]
fld st5 ; u/z | z | u | v | v/z | 1/z | u/z
fmul st0,st1 ; u' | z | u | v | v/z | 1/z | u/z
fxch st1 ; z | u' | u | v | v/z | 1/z | u/z
fmul st0,st4 ; v' | u' | u | v | v/z | 1/z | u/z
fxch st3 ; v | u' | u | v' | v/z | 1/z | u/z
fsubr st0,st3 ; v'-v | u' | u | v' | v/z | 1/z | u/z
fxch st2 ; u | u' | v'-v | v' | v/z | 1/z | u/z
fsubr st0,st1 ; u'-u | u' | v'-v | v' | v/z | 1/z | u/z
fxch st2 ; v'-v | u' | u'-u | v' | v/z | 1/z | u/z
fmul dword [fp_8recip] ; vstep | u' | u'-u | v' | v/z | 1/z | u/z
fxch st1 ; u' | vstep | u'-u | v' | v/z | 1/z | u/z
fxch st2 ; u'-u | vstep | u' | v' | v/z | 1/z | u/z
fmul dword [fp_8recip] ; ustep | vstep | u' | v' | v/z | 1/z | u/z
fxch st1 ; vstep | ustep | u' | v' | v/z | 1/z | u/z
fistp qword [step_v] ; ustep | u' | v' | v/z | 1/z | u/z
fistp qword [step_u] ; u | v | v/z | 1/z | u/z
jmp FullSpan
; ---- exactly one pixel left after a full group ----
; reached with u | v | v/z | 1/z | u/z on the FPU stack (no z was pushed
; for this case), so the start values are re-stored from st0/st1
OnlyOnePixelAtEnd:
fld st0
fistp qword [start_u]
fld st1
fistp qword [start_v]
OnlyOnePixel:
mov edx,[start_v]
mov ecx,[start_u]
add edx,[pviewy]
add ecx,[pviewx]
x9 shr edx,26
m9 and ecx,0xfc000000
y9 shr ecx,20
mov ebp,[tiltlighting]
fetch9 mov al,[ecx+edx+SPACEFILLER4]
mov al,[ebp+eax]
mov [edi],al
Done:
; pop the five values still on the FPU stack
fcompp
fcompp
fstp st0
pop ebp
pop edi
pop esi
pop ebx
ret
; ---- span shorter than 8 pixels from the outset ----
ShortStrip:
cmp ebx,0
jle near OnlyOnePixel
MoreThanOnePixel:
; ebx is 1..6 here; fp_quickint[ebx] is the float (ebx+1), the pixel count
fld dword [sz_i] ; sz.i | u | v | v/z | 1/z | u/z
fmul dword [fp_quickint+ebx*4]
fld dword [sv_i] ; sv.i | sz.i | u | v | v/z | 1/z | u/z
fmul dword [fp_quickint+ebx*4]
fld dword [su_i] ; su.i | sv.i | sz.i | u | v | v/z | 1/z | u/z
fmul dword [fp_quickint+ebx*4]
fxch st2 ; sz.i | sv.i | su.i | u | v | v/z | 1/z | u/z
faddp st6,st0 ; sv.i | su.i | u | v | v/z | 1/z | u/z
faddp st4,st0 ; su.i | u | v | v/z | 1/z | u/z
faddp st5,st0 ; u | v | v/z | 1/z | u/z
fld st3 ; 1/z | u | v | v/z | 1/z | u/z
fdivr dword [fp_1] ; z | u | v | v/z | 1/z | u/z
jmp CalcPartialSteps
; ---- fewer than 8 pixels left after a full group ----
EndIsShort:
cmp ebx,0
je near OnlyOnePixelAtEnd
CalcPartialSteps:
fst dword [end_z]
fld st5 ; u/z | z | u | v | v/z | 1/z | u/z
fmul st0,st1 ; u' | z | u | v | v/z | 1/z | u/z
fxch st1 ; z | u' | u | v | v/z | 1/z | u/z
fmul st0,st4 ; v' | u' | u | v | v/z | 1/z | u/z
fxch st1 ; u' | v' | u | v | v/z | 1/z | u/z
fsubrp st2,st0 ; v' | u'-u | v | v/z | 1/z | u/z
fsubrp st2,st0 ; u'-u | v'-v | v/z | 1/z | u/z
; spanrecips[ebx] = 1/(ebx+1), the reciprocal of the pixel count
fmul dword [spanrecips+ebx*4] ;ustep | v'-v | v/z | 1/z | u/z
fxch st1 ; v'-v | ustep | v/z | 1/z | u/z
fmul dword [spanrecips+ebx*4] ;vstep | ustep | v/z | 1/z | u/z
fxch st1 ; ustep | vstep | v/z | 1/z | u/z
fistp qword [step_u] ; vstep | v/z | 1/z | u/z
fistp qword [step_v] ; v/z | 1/z | u/z
mov ecx,[start_v]
mov edx,[start_u]
add ecx,[pviewy]
add edx,[pviewx]
mov esi,edx
mov ebp,ecx
; one pixel per iteration, ebx counting down to -1
endloop:
x10 shr ebp,26
m10 and esi,0xfc000000
y10 shr esi,20
inc edi
add ecx,[step_v]
add edx,[step_u]
fetch10 mov al,[ebp+esi+SPACEFILLER4]
mov ebp,[tiltlighting+ebx*4]
mov esi,edx
; the movs between this dec and the jge leave the flags untouched
dec ebx
mov al,[ebp+eax]
mov ebp,ecx
mov [edi-1],al
jge endloop
; pop the three values still on the FPU stack
fcompp
fstp st0
pop ebp
pop edi
pop esi
pop ebx
ret
rtext_end:
%ifdef M_TARGET_MACHO
GLOBAL _rtext_tmap2_end
_rtext_tmap2_end:
%endif

View File

@ -1,344 +0,0 @@
; Shared preamble for the 32-bit vlinetall* routines (NASM syntax).
; valgrind.inc presumably supplies the `selfmod` macro that is invoked after
; each batch of instruction patches further down -- TODO confirm.
%include "valgrind.inc"
; Pick the data segment/section syntax that matches the target toolchain.
%ifdef M_TARGET_WATCOM
SEGMENT DATA PUBLIC ALIGN=16 CLASS=DATA USE32
SEGMENT DATA
%else
SECTION .data
%endif
; Every target except Linux refers to these symbols with a leading underscore.
%ifndef M_TARGET_LINUX
%define ylookup _ylookup
%define vplce _vplce
%define vince _vince
%define palookupoffse _palookupoffse
%define bufplce _bufplce
%define dc_iscale _dc_iscale
%define dc_colormap _dc_colormap
%define dc_count _dc_count
%define dc_dest _dc_dest
%define dc_source _dc_source
%define dc_texturefrac _dc_texturefrac
%define dc_pitch _dc_pitch
%define setupvlinetallasm _setupvlinetallasm
%define vlinetallasm4 _vlinetallasm4
%define vlinetallasmathlon4 _vlinetallasmathlon4
%define vlinetallasm1 _vlinetallasm1
%define prevlinetallasm1 _prevlinetallasm1
%endif
; Variables defined outside this file that the routines below read or update.
EXTERN vplce
EXTERN vince
EXTERN palookupoffse
EXTERN bufplce
EXTERN ylookup
EXTERN dc_iscale
EXTERN dc_colormap
EXTERN dc_count
EXTERN dc_dest
EXTERN dc_source
EXTERN dc_texturefrac
EXTERN dc_pitch
; NOTE(review): no label named vlt4pitch or vlt1pitch is visible in this
; file; the pitch patch sites are vltpitch, vltpitcha, vlt1pitch1 and
; vlt1pitch2. Confirm whether these two GLOBAL declarations are stale.
GLOBAL vlt4pitch
GLOBAL vlt1pitch
; Switch to the code segment/section for the patch-setup routines.
%ifdef M_TARGET_WATCOM
SEGMENT CODE PUBLIC ALIGN=16 CLASS=CODE USE32
SEGMENT CODE
%else
SECTION .text
%endif
ALIGN 16
GLOBAL setpitch3
;-----------------------------------------------------------------------
; setpitch3
; In:    eax = value to install as the per-iteration destination step
; Out:   nothing
; Writes the 32-bit immediate operand (2 bytes into each instruction) of
; the four pitch instructions used by the vlinetall* drawers:
;   vltpitch   - add edi, imm32 in vlinetallasm4
;   vltpitcha  - add edi, imm32 in vlinetallasmathlon4
;   vlt1pitch1 - sub edi, imm32 in vlinetallasm1
;   vlt1pitch2 - add edi, imm32 in vlinetallasm1
; The argument arrives in eax rather than on the stack; no register is
; modified by the stores themselves.
;-----------------------------------------------------------------------
setpitch3:
        ; single-column drawer
        mov     dword [vlt1pitch2 + 2], eax
        mov     dword [vlt1pitch1 + 2], eax

        ; four-column drawers
        mov     dword [vltpitcha + 2], eax
        mov     dword [vltpitch + 2], eax

        ; presumably notifies Valgrind that the patched range changed
        ; (macro comes from valgrind.inc) -- TODO confirm
        selfmod vltpitch, vlt1pitch2+6
        ret
ALIGN 16
GLOBAL setupvlinetallasm
;-----------------------------------------------------------------------
; setupvlinetallasm
; In:    [esp+4] = shift count (only the low byte is used)
; Out:   nothing
; Clobb: ecx
; Installs the shift count as the 8-bit immediate (2 bytes into the
; instruction) of every `shr reg, imm8` in the vlinetall* drawers.
;-----------------------------------------------------------------------
setupvlinetallasm:
        mov     ecx, [esp+4]                    ; cl = shift count

        ; single-column drawers: prevlinetallasm1 / vlinetallasm1
        mov     byte [preshift + 2], cl
        mov     byte [shift11 + 2], cl
        mov     byte [shift12 + 2], cl

        ; vlinetallasmathlon4 loop
        mov     byte [shifter1a + 2], cl
        mov     byte [shifter2a + 2], cl
        mov     byte [shifter3a + 2], cl
        mov     byte [shifter4a + 2], cl

        ; vlinetallasm4 loop
        mov     byte [shifter1 + 2], cl
        mov     byte [shifter2 + 2], cl
        mov     byte [shifter3 + 2], cl
        mov     byte [shifter4 + 2], cl

        ; presumably notifies Valgrind that the patched range changed
        ; (macro comes from valgrind.inc) -- TODO confirm
        selfmod shifter1, shift12+6
        ret
; The drawers that follow overwrite operands of their own instructions, so
; the code must live in memory that is writable as well as executable.
; On Mach-O targets the code stays in .text and its start is marked with
; _rtext_tmap3_start (a matching _rtext_tmap3_end closes the range at the
; end of the file); presumably the host program changes the protection of
; that range at run time -- TODO confirm. Every other target places the
; code in a dedicated .rtext section declared with the write flag.
%ifdef M_TARGET_MACHO
SECTION .text align=64
GLOBAL _rtext_tmap3_start
_rtext_tmap3_start:
%else
SECTION .rtext progbits alloc exec write align=64
%endif
ALIGN 16
GLOBAL vlinetallasm4
;-----------------------------------------------------------------------
; vlinetallasm4
; Writes four horizontally adjacent bytes per iteration, stepping the
; destination by the pitch immediate at `vltpitch` each time.
;
; Reads:   bufplce[0..3], palookupoffse[0..3], vince[0..3], vplce[0..3],
;          dc_count, dc_dest, ylookup[dc_count-1]
; Writes:  vplce[0..3] (final positions), the destination bytes, and the
;          operands of the instructions in its own loop
; Saves:   ebx, ebp, esi, edi (ebp is saved but not otherwise used here)
; Clobb:   eax, ecx, edx, flags
;
; The shift immediates (shifter1-4) and the pitch immediate (vltpitch) are
; installed beforehand by setupvlinetallasm and setpitch3.
;-----------------------------------------------------------------------
vlinetallasm4:
push ebx
; Patch the four source base addresses (disp32 of each movzx, at +3).
mov eax, [bufplce+0]
mov ebx, [bufplce+4]
mov ecx, [bufplce+8]
mov edx, [bufplce+12]
mov [source1+3], eax
mov [source2+3], ebx
mov [source3+3], ecx
mov [source4+3], edx
; Patch the four lookup-table base addresses (disp32 of each mov, at +2).
mov eax, [palookupoffse+0]
mov ebx, [palookupoffse+4]
mov ecx, [palookupoffse+8]
mov edx, [palookupoffse+12]
mov [lookup1+2], eax
mov [lookup2+2], ebx
mov [lookup3+2], ecx
mov [lookup4+2], edx
; Patch the four per-iteration increments (imm32 of each add).
; step4 is patched at +1 instead of +2; presumably `add eax, imm32`
; assembles to the one-byte-opcode form -- TODO confirm.
mov eax, [vince+0]
mov ebx, [vince+4]
mov ecx, [vince+8]
mov edx, [vince+12]
mov [step1+2], eax
mov [step2+2], ebx
mov [step3+2], ecx
mov [step4+1], edx
push ebp
push esi
push edi
; eax = dc_dest + ylookup[dc_count-1]; edi = -ylookup[dc_count-1].
; The stores in the loop address [edi + eax + k], so the first iteration
; writes at dc_dest + k and edi counts upward towards zero.
mov ecx, [dc_count]
mov edi, [dc_dest]
mov eax, dword [ylookup+ecx*4-4]
add eax, edi
sub edi, eax
; Patch the four store displacements with eax, eax+1, eax+2, eax+3.
mov [write1+2],eax
inc eax
mov [write2+2],eax
inc eax
mov [write3+2],eax
inc eax
mov [write4+2],eax
; Current positions: ebx, ecx, esi, eax = vplce[0..3].
mov ebx, [vplce]
mov ecx, [vplce+4]
mov esi, [vplce+8]
mov eax, [vplce+12]
selfmod loopit, vltpitch
jmp loopit
ALIGN 16
; Per byte k: edx = position >> shift; edx = source_k[edx];
; dl = lookup_k[edx]; store dl; position += step_k.
; The 0x8888888x operands are placeholders overwritten above.
loopit:
mov edx, ebx
shifter1: shr edx, 24
source1: movzx edx, BYTE [edx+0x88888888]
lookup1: mov dl, [edx+0x88888888]
write1: mov [edi+0x88888880], dl
step1: add ebx, 0x88888888
mov edx, ecx
shifter2: shr edx, 24
source2: movzx edx, BYTE [edx+0x88888888]
lookup2: mov dl, [edx+0x88888888]
write2: mov [edi+0x88888881], dl
step2: add ecx, 0x88888888
mov edx, esi
shifter3: shr edx, 24
source3: movzx edx, BYTE [edx+0x88888888]
lookup3: mov dl, BYTE [edx+0x88888888]
write3: mov [edi+0x88888882], dl
step3: add esi, 0x88888888
mov edx, eax
shifter4: shr edx, 24
source4: movzx edx, BYTE [edx+0x88888888]
lookup4: mov dl, [edx+0x88888888]
write4: mov [edi+0x88888883], dl
step4: add eax, 0x88888888
; Advance by the patched pitch; keep looping while edi <= 0 (signed).
; Assumes ylookup[i] is i times the pitch, which would make this run
; dc_count times -- TODO confirm.
vltpitch: add edi, 320
jle near loopit
; Store the advanced positions back for the caller.
mov [vplce], ebx
mov [vplce+4], ecx
mov [vplce+8], esi
mov [vplce+12], eax
pop edi
pop esi
pop ebp
pop ebx
ret
ALIGN 16
GLOBAL vlinetallasmathlon4
;-----------------------------------------------------------------------
; vlinetallasmathlon4
; Same job as vlinetallasm4 (four adjacent bytes per iteration, stepping
; the destination by the pitch immediate at `vltpitcha`), but with the
; loop instructions interleaved differently and its own set of patch
; labels (the ...a suffix).
;
; Reads:   bufplce[0..3], palookupoffse[0..3], vince[0..3], vplce[0..3],
;          dc_count, dc_dest, ylookup[dc_count-1]
; Writes:  vplce[0..3] (final positions), the destination bytes, and the
;          operands of the instructions in its own loop
; Saves:   ebx, ebp, esi, edi
; Clobb:   eax, ecx, edx, flags
;-----------------------------------------------------------------------
vlinetallasmathlon4:
push ebx
; Patch the four source base addresses (disp32 of each movzx, at +3).
mov eax, [bufplce+0]
mov ebx, [bufplce+4]
mov ecx, [bufplce+8]
mov edx, [bufplce+12]
mov [source1a+3], eax
mov [source2a+3], ebx
mov [source3a+3], ecx
mov [source4a+3], edx
; Patch the four lookup-table base addresses (disp32 of each mov, at +2).
mov eax, [palookupoffse+0]
mov ebx, [palookupoffse+4]
mov ecx, [palookupoffse+8]
mov edx, [palookupoffse+12]
mov [lookup1a+2], eax
mov [lookup2a+2], ebx
mov [lookup3a+2], ecx
mov [lookup4a+2], edx
; Patch the four per-iteration increments (imm32 of each add).
; step4a is patched at +1 instead of +2; presumably `add eax, imm32`
; assembles to the one-byte-opcode form -- TODO confirm.
mov eax, [vince+0]
mov ebx, [vince+4]
mov ecx, [vince+8]
mov edx, [vince+12]
mov [step1a+2], eax
mov [step2a+2], ebx
mov [step3a+2], ecx
mov [step4a+1], edx
push ebp
push esi
push edi
; eax = dc_dest + ylookup[dc_count-1]; edi = -ylookup[dc_count-1].
; The stores in the loop address [edi + eax + k], so the first iteration
; writes at dc_dest + k and edi counts upward towards zero.
mov ecx, [dc_count]
mov edi, [dc_dest]
mov eax, dword [ylookup+ecx*4-4]
add eax, edi
sub edi, eax
; Patch the four store displacements with eax, eax+1, eax+2, eax+3.
mov [write1a+2],eax
inc eax
mov [write2a+2],eax
inc eax
mov [write3a+2],eax
inc eax
mov [write4a+2],eax
; Current positions: ebp, ecx, esi, eax = vplce[0..3].
; (ebx and edx are the scratch registers in this variant.)
mov ebp, [vplce]
mov ecx, [vplce+4]
mov esi, [vplce+8]
mov eax, [vplce+12]
selfmod loopita, vltpitcha
jmp loopita
; Unfortunately, this code has not been carefully analyzed to determine
; how well it utilizes the processor's instruction units. Instead, I just
; kept rearranging code, seeing what sped it up and what slowed it down
; until I arrived at this. This is the fastest version I was able to
; manage, but that does not mean it cannot be made faster with careful
; instruction shuffling.
ALIGN 64
; Bytes 0 and 1 are processed together (edx/ebx, results in dl/dh), then
; bytes 2 and 3 (ebx/edx, results in bl/dl). The 0x8888888x operands are
; placeholders overwritten above.
loopita: mov edx, ebp
mov ebx, ecx
shifter1a: shr edx, 24
shifter2a: shr ebx, 24
source1a: movzx edx, BYTE [edx+0x88888888]
source2a: movzx ebx, BYTE [ebx+0x88888888]
step1a: add ebp, 0x88888888
step2a: add ecx, 0x88888888
lookup1a: mov dl, [edx+0x88888888]
lookup2a: mov dh, [ebx+0x88888888]
mov ebx, esi
write1a: mov [edi+0x88888880], dl
write2a: mov [edi+0x88888881], dh
shifter3a: shr ebx, 24
mov edx, eax
source3a: movzx ebx, BYTE [ebx+0x88888888]
shifter4a: shr edx, 24
step3a: add esi, 0x88888888
source4a: movzx edx, BYTE [edx+0x88888888]
step4a: add eax, 0x88888888
lookup3a: mov bl, [ebx+0x88888888]
lookup4a: mov dl, [edx+0x88888888]
write3a: mov [edi+0x88888882], bl
write4a: mov [edi+0x88888883], dl
; Advance by the patched pitch; keep looping while edi <= 0 (signed).
vltpitcha: add edi, 320
jle near loopita
; Store the advanced positions back for the caller.
mov [vplce], ebp
mov [vplce+4], ecx
mov [vplce+8], esi
mov [vplce+12], eax
pop edi
pop esi
pop ebp
pop ebx
ret
ALIGN 16
GLOBAL prevlinetallasm1
;-----------------------------------------------------------------------
; prevlinetallasm1
; Front end for the single-column drawer. When dc_count is above 1
; (unsigned compare) it jumps straight into vlinetallasm1; otherwise it
; writes exactly one byte itself.
;
; Reads:  dc_count, dc_iscale, dc_texturefrac, dc_source, dc_colormap,
;         dc_dest
; Out:    eax = dc_texturefrac + dc_iscale on the one-byte path
;         (presumably the return value -- TODO confirm against callers)
; Saves:  ebx, edi
; Clobb:  ecx, edx, flags
; The shift immediate at `preshift` is installed by setupvlinetallasm.
;-----------------------------------------------------------------------
prevlinetallasm1:
mov ecx, [dc_count]
cmp ecx, 1
; Unsigned: counts of 0 and 1 both fall through to the one-byte path.
ja vlinetallasm1
mov eax, [dc_iscale]
mov edx, [dc_texturefrac]
; eax = position after one step; edx keeps the current position.
add eax, edx
mov ecx, [dc_source]
preshift: shr edx, 16
push ebx
push edi
mov edi, [dc_colormap]
; ebx = dc_source[position >> shift], then bl = dc_colormap[ebx].
movzx ebx, byte [ecx+edx]
mov ecx, [dc_dest]
mov bl, byte [edi+ebx]
pop edi
mov byte [ecx], bl
pop ebx
ret
ALIGN 16
GLOBAL vlinetallasm1
;-----------------------------------------------------------------------
; vlinetallasm1
; Writes dc_count bytes, one per iteration, stepping the destination by
; the pitch immediate patched into vlt1pitch1/vlt1pitch2 by setpitch3.
;
; Reads:  dc_count, dc_texturefrac, dc_dest, dc_source, dc_iscale,
;         dc_colormap
; Out:    eax = position after the last step (ebx is copied to eax)
; Saves:  ebp, ebx, edi, esi
; Clobb:  ecx, edx, flags
; The shift immediates at shift11/shift12 are installed by
; setupvlinetallasm.
; Note: there is no check for dc_count == 0; the loop body runs once
; before the counter is first tested.
;-----------------------------------------------------------------------
vlinetallasm1:
push ebp
push ebx
push edi
push esi
mov ebp, [dc_count]
mov ebx, [dc_texturefrac] ; ebx = frac
mov edi, [dc_dest]
mov ecx, ebx
shift11: shr ecx, 16
mov esi, [dc_source]
mov edx, [dc_iscale]
; Back the destination up by one pitch because the loop adds the pitch
; before each store.
vlt1pitch1: sub edi, 0x88888888
mov eax, [dc_colormap]
loop2:
; ecx = dc_source[frac >> shift]
movzx ecx, BYTE [esi+ecx]
; frac += dc_iscale
add ebx, edx
vlt1pitch2: add edi, 0x88888888
; cl = dc_colormap[ecx], stored at the current destination
mov cl,[eax+ecx]
mov [edi],cl
; Precompute the source index for the next iteration.
mov ecx,ebx
shift12: shr ecx,16
dec ebp
jnz loop2
mov eax,ebx
pop esi
pop edi
pop ebx
pop ebp
ret
%ifdef M_TARGET_MACHO
GLOBAL _rtext_tmap3_end
_rtext_tmap3_end:
%endif

View File

@ -1,150 +0,0 @@
; Win64-only build of the vlinetall routines: assembling for any other
; output format stops with the error below.
%ifnidn __OUTPUT_FORMAT__,win64
%error tmap3.asm is for Win64 output. You should use tmap.s for other systems.
%endif
BITS 64
; Plain [symbol] operands are assembled as RIP-relative references.
DEFAULT REL
; Variables defined outside this file that the routines below read or update.
EXTERN vplce
EXTERN vince
EXTERN palookupoffse
EXTERN bufplce
EXTERN dc_count
EXTERN dc_dest
EXTERN dc_pitch
SECTION .text
GLOBAL ASM_PatchPitch
;-----------------------------------------------------------------------
; ASM_PatchPitch
; In:    nothing (reads the 32-bit dc_pitch variable)
; Out:   nothing
; Clobb: ecx
; Copies dc_pitch into the 32-bit immediate (3 bytes into the instruction)
; of the two pitch-dependent instructions in vlinetallasm4:
;   pm       - imul rcx, imm32
;   vltpitch - add  rcx, imm32
;-----------------------------------------------------------------------
ASM_PatchPitch:
        mov     ecx, dword [dc_pitch]

        mov     dword [vltpitch + 3], ecx       ; loop increment
        mov     dword [pm + 3], ecx             ; count * pitch multiply
        ret
align 16
GLOBAL setupvlinetallasm
;-----------------------------------------------------------------------
; setupvlinetallasm
; In:    cl = shift count
; Out:   nothing
; Clobb: nothing
; Installs the shift count as the 8-bit immediate (2 bytes into the
; instruction) of the four `shr reg, imm8` instructions in the
; vlinetallasm4 loop.
;-----------------------------------------------------------------------
setupvlinetallasm:
        mov     byte [shifter4 + 2], cl
        mov     byte [shifter3 + 2], cl
        mov     byte [shifter2 + 2], cl
        mov     byte [shifter1 + 2], cl
        ret
align 16
; Yasm can't do progbits alloc exec for win64?
; Hmm, looks like it's automatic. No worries, then.
; The section is declared writable because the routine below overwrites
; operands of its own instructions.
SECTION .rtext write ;progbits alloc exec
GLOBAL vlinetallasm4
;-----------------------------------------------------------------------
; vlinetallasm4 (Win64)
; Writes four horizontally adjacent bytes per iteration, stepping the
; destination offset by the pitch immediate at `vltpitch` each time.
; Does nothing when dc_count <= 0.
;
; Reads:   bufplce[0..3] (8-byte entries), palookupoffse[0..3] (8-byte
;          entries), vince[0..3], vplce[0..3] (4-byte entries), dc_count,
;          dc_dest
; Writes:  vplce[0..3] (final positions), the destination bytes, and the
;          operands of the instructions in its own loop
; Saves:   rbx, rdi, r15, r14, r13, r12, rbp, rsi
; Clobb:   rax, rcx, rdx, r8-r11, flags
;
; PROC_FRAME / push_reg / alloc_stack / END_PROLOGUE / ENDPROC_FRAME are
; presumably the assembler's Win64 unwind-info macros -- TODO confirm.
; The shift immediates (shifter1-4) are installed by setupvlinetallasm and
; the pitch immediates (pm, vltpitch) by ASM_PatchPitch.
;-----------------------------------------------------------------------
PROC_FRAME vlinetallasm4
rex_push_reg rbx
push_reg rdi
push_reg r15
push_reg r14
push_reg r13
push_reg r12
push_reg rbp
push_reg rsi
alloc_stack 8 ; Stack must be 16-byte aligned
END_PROLOGUE
; rax = bufplce base address (bufplce[0])
; rbx, rbp, rsi = scratch for bytes 1, 2 and 3
; rcx = offset from rdi/count (negative)
; edx/rdx = scratch (byte 0)
; rdi = bottom of columns to write to
; r8d-r11d = column offsets (vplce[0..3], advanced by vince[0..3])
; r12-r15 = palookupoffse[0] - palookupoffse[3]
mov ecx, [dc_count]
mov rdi, [dc_dest]
test ecx, ecx
jle vltepilog ; count must be positive
; Express bufplce[1..3] as offsets from bufplce[0] and patch them into the
; disp32 of source2-4 (at +4). Only the low 32 bits of each difference are
; stored.
; NOTE(review): this assumes every buffer lies within a signed 32-bit
; displacement of bufplce[0] -- confirm.
mov rax, [bufplce]
mov r8, [bufplce+8]
sub r8, rax
mov r9, [bufplce+16]
sub r9, rax
mov r10, [bufplce+24]
sub r10, rax
mov [source2+4], r8d
mov [source3+4], r9d
mov [source4+4], r10d
; rcx = dc_count * pitch (the 320 is a placeholder patched by ASM_PatchPitch).
pm: imul rcx, 320
mov r12, [palookupoffse]
mov r13, [palookupoffse+8]
mov r14, [palookupoffse+16]
mov r15, [palookupoffse+24]
; Patch the four per-iteration increments (imm32 of each add, at +3).
mov r8d, [vince]
mov r9d, [vince+4]
mov r10d, [vince+8]
mov r11d, [vince+12]
mov [step1+3], r8d
mov [step2+3], r9d
mov [step3+3], r10d
mov [step4+3], r11d
; rdi = dc_dest + count*pitch, rcx = -(count*pitch): the loop addresses
; [rdi+rcx] and runs rcx upward until it is no longer negative.
add rdi, rcx
neg rcx
mov r8d, [vplce]
mov r9d, [vplce+4]
mov r10d, [vplce+8]
mov r11d, [vplce+12]
jmp loopit
ALIGN 16
; Per byte k: index = position >> shift; index = source_k[index];
; result = palookupoffse_k[index]; position += step_k.
; The four results are stored together near the end of the iteration.
; The 0x88888888 operands are placeholders overwritten above.
loopit:
mov edx, r8d
shifter1: shr edx, 24
step1: add r8d, 0x88888888
movzx edx, BYTE [rax+rdx]
mov ebx, r9d
mov dl, [r12+rdx]
shifter2: shr ebx, 24
step2: add r9d, 0x88888888
source2: movzx ebx, BYTE [rax+rbx+0x88888888]
mov ebp, r10d
mov bl, [r13+rbx]
shifter3: shr ebp, 24
step3: add r10d, 0x88888888
source3: movzx ebp, BYTE [rax+rbp+0x88888888]
mov esi, r11d
mov bpl, BYTE [r14+rbp]
shifter4: shr esi, 24
step4: add r11d, 0x88888888
source4: movzx esi, BYTE [rax+rsi+0x88888888]
mov [rdi+rcx], dl
mov [rdi+rcx+1], bl
mov sil, BYTE [r15+rsi]
mov [rdi+rcx+2], bpl
mov [rdi+rcx+3], sil
; Advance by the patched pitch; keep looping while rcx is still negative.
vltpitch: add rcx, 320
jl loopit
; Store the advanced positions back for the caller.
mov [vplce], r8d
mov [vplce+4], r9d
mov [vplce+8], r10d
mov [vplce+12], r11d
; Common exit: also reached directly when dc_count <= 0.
vltepilog:
add rsp, 8
pop rsi
pop rbp
pop r12
pop r13
pop r14
pop r15
pop rdi
pop rbx
ret
vlinetallasm4_end:
ENDPROC_FRAME
ALIGN 16

View File

@ -1,141 +0,0 @@
#%include "valgrind.inc"
.section .text
.globl ASM_PatchPitch
#-----------------------------------------------------------------------
# ASM_PatchPitch
# In:    nothing (reads the 32-bit dc_pitch variable)
# Out:   nothing
# Clobb: ecx
# Copies dc_pitch into the 32-bit immediate (3 bytes into the instruction)
# of the two pitch-dependent instructions in vlinetallasm4:
#   pm       - imulq $imm32, %rcx
#   vltpitch - addq  $imm32, %rcx
#-----------------------------------------------------------------------
ASM_PatchPitch:
        movl    dc_pitch(%rip), %ecx

        movl    %ecx, vltpitch+3(%rip)          # loop increment
        movl    %ecx, pm+3(%rip)                # count * pitch multiply
#       selfmod pm, vltpitch+6
        ret
.align 16
.globl setupvlinetallasm
#-----------------------------------------------------------------------
# setupvlinetallasm
# In:    dil = shift count
# Out:   nothing
# Clobb: nothing
# Installs the shift count as the 8-bit immediate (2 bytes into the
# instruction) of the four shift instructions in the vlinetallasm4 loop.
#-----------------------------------------------------------------------
setupvlinetallasm:
        movb    %dil, shifter4+2(%rip)
        movb    %dil, shifter3+2(%rip)
        movb    %dil, shifter2+2(%rip)
        movb    %dil, shifter1+2(%rip)
#       selfmod shifter1, shifter4+3
        ret
.align 16
.section .rtext,"awx"
.globl vlinetallasm4
.type vlinetallasm4,@function
#-----------------------------------------------------------------------
# vlinetallasm4 (x86-64, AT&T syntax)
# Writes four horizontally adjacent bytes per iteration, stepping the
# destination offset by the pitch immediate at `vltpitch` each time.
# Does nothing when dc_count <= 0.
#
# Reads:   bufplce[0..3] (8-byte entries), palookupoffse[0..3] (8-byte
#          entries), vince[0..3], vplce[0..3] (4-byte entries), dc_count,
#          dc_dest
# Writes:  vplce[0..3] (final positions), the destination bytes, and the
#          operands of the instructions in its own loop (the section is
#          flagged "awx" for that reason)
# Saves:   rbx, rdi, r15, r14, r13, r12, rbp, rsi
# Clobb:   rax, rcx, rdx, r8-r11, flags
#
# The shift immediates (shifter1-4) are installed by setupvlinetallasm and
# the pitch immediates (pm, vltpitch) by ASM_PatchPitch.
#
# Every push/pop carries its own CFI so that the recorded CFA and the
# saved locations of rbx, rbp and r12-r15 stay correct at every
# instruction. The CFI directives emit no machine code, so the patch
# offsets used by the setup routines are unaffected.
#-----------------------------------------------------------------------
vlinetallasm4:
        .cfi_startproc
        push    %rbx
        .cfi_adjust_cfa_offset 8
        .cfi_rel_offset %rbx, 0
        push    %rdi
        .cfi_adjust_cfa_offset 8
        push    %r15
        .cfi_adjust_cfa_offset 8
        .cfi_rel_offset %r15, 0
        push    %r14
        .cfi_adjust_cfa_offset 8
        .cfi_rel_offset %r14, 0
        push    %r13
        .cfi_adjust_cfa_offset 8
        .cfi_rel_offset %r13, 0
        push    %r12
        .cfi_adjust_cfa_offset 8
        .cfi_rel_offset %r12, 0
        push    %rbp
        .cfi_adjust_cfa_offset 8
        .cfi_rel_offset %rbp, 0
        push    %rsi
        .cfi_adjust_cfa_offset 8
        # Pads the frame so rsp is 16-byte aligned. This routine makes no
        # calls, so the padding is not relied upon here.
        subq    $8, %rsp
        .cfi_adjust_cfa_offset 8

# rax = bufplce base address (bufplce[0])
# rbx, rbp, rsi = scratch for bytes 1, 2 and 3
# rcx = offset from rdi/count (negative)
# edx/rdx = scratch (byte 0)
# rdi = bottom of columns to write to
# r8d-r11d = column offsets (vplce[0..3], advanced by vince[0..3])
# r12-r15 = palookupoffse[0] - palookupoffse[3]
        movl    dc_count(%rip), %ecx
        movq    dc_dest(%rip), %rdi
        testl   %ecx, %ecx
        jle     vltepilog                       # count must be positive

        # Express bufplce[1..3] as offsets from bufplce[0] and patch them
        # into the disp32 of source2-4 (at +4). Only the low 32 bits of
        # each difference are stored.
        # NOTE(review): assumes every buffer lies within a signed 32-bit
        # displacement of bufplce[0] -- confirm.
        movq    bufplce(%rip), %rax
        movq    bufplce+8(%rip), %r8
        subq    %rax, %r8
        movq    bufplce+16(%rip), %r9
        subq    %rax, %r9
        movq    bufplce+24(%rip), %r10
        subq    %rax, %r10
        movl    %r8d, source2+4(%rip)
        movl    %r9d, source3+4(%rip)
        movl    %r10d, source4+4(%rip)

        # rcx = dc_count * pitch ($320 is a placeholder patched by
        # ASM_PatchPitch).
pm:     imulq   $320, %rcx

        movq    palookupoffse(%rip), %r12
        movq    palookupoffse+8(%rip), %r13
        movq    palookupoffse+16(%rip), %r14
        movq    palookupoffse+24(%rip), %r15

        # Patch the four per-iteration increments (imm32 of each add, +3).
        movl    vince(%rip), %r8d
        movl    vince+4(%rip), %r9d
        movl    vince+8(%rip), %r10d
        movl    vince+12(%rip), %r11d
        movl    %r8d, step1+3(%rip)
        movl    %r9d, step2+3(%rip)
        movl    %r10d, step3+3(%rip)
        movl    %r11d, step4+3(%rip)

        # rdi = dc_dest + count*pitch, rcx = -(count*pitch): the loop
        # addresses (%rdi,%rcx) and runs rcx upward until it is no longer
        # negative.
        addq    %rcx, %rdi
        negq    %rcx

        movl    vplce(%rip), %r8d
        movl    vplce+4(%rip), %r9d
        movl    vplce+8(%rip), %r10d
        movl    vplce+12(%rip), %r11d
#       selfmod loopit, vltepilog
        jmp     loopit

        .align 16
# Per byte k: index = position >> shift; index = source_k[index];
# result = palookupoffse_k[index]; position += step_k.
# The four results are stored together near the end of the iteration.
# The 0x44444444 operands are placeholders overwritten above.
loopit:
        movl    %r8d, %edx
shifter1: shrl  $24, %edx
step1:  addl    $0x44444444, %r8d
        movzbl  (%rax,%rdx), %edx
        movl    %r9d, %ebx
        movb    (%r12,%rdx), %dl
shifter2: shrl  $24, %ebx
step2:  addl    $0x44444444, %r9d
source2: movzbl 0x44444444(%rax,%rbx), %ebx
        movl    %r10d, %ebp
        movb    (%r13,%rbx), %bl
shifter3: shrl  $24, %ebp
step3:  addl    $0x44444444, %r10d
source3: movzbl 0x44444444(%rax,%rbp), %ebp
        movl    %r11d, %esi
        movb    (%r14,%rbp), %bpl
shifter4: shrl  $24, %esi
step4:  addl    $0x44444444, %r11d
source4: movzbl 0x44444444(%rax,%rsi), %esi
        movb    %dl, (%rdi,%rcx)
        movb    %bl, 1(%rdi,%rcx)
        movb    (%r15,%rsi), %sil
        movb    %bpl, 2(%rdi,%rcx)
        movb    %sil, 3(%rdi,%rcx)
        # Advance by the patched pitch; loop while rcx is still negative.
vltpitch: addq  $320, %rcx
        jl      loopit

        # Store the advanced positions back for the caller.
        movl    %r8d, vplce(%rip)
        movl    %r9d, vplce+4(%rip)
        movl    %r10d, vplce+8(%rip)
        movl    %r11d, vplce+12(%rip)

# Common exit: also reached directly when dc_count <= 0. The frame layout
# is the same on both paths, so the CFI state above applies to either.
vltepilog:
        addq    $8, %rsp
        .cfi_adjust_cfa_offset -8
        pop     %rsi
        .cfi_adjust_cfa_offset -8
        pop     %rbp
        .cfi_adjust_cfa_offset -8
        .cfi_restore %rbp
        pop     %r12
        .cfi_adjust_cfa_offset -8
        .cfi_restore %r12
        pop     %r13
        .cfi_adjust_cfa_offset -8
        .cfi_restore %r13
        pop     %r14
        .cfi_adjust_cfa_offset -8
        .cfi_restore %r14
        pop     %r15
        .cfi_adjust_cfa_offset -8
        .cfi_restore %r15
        pop     %rdi
        .cfi_adjust_cfa_offset -8
        pop     %rbx
        .cfi_adjust_cfa_offset -8
        .cfi_restore %rbx
        ret
        .cfi_endproc
.align 16