mirror of
https://github.com/ZDoom/qzdoom-gpl.git
synced 2025-03-05 08:20:57 +00:00
registers AMD64 provides, this routine still needs to be written as self-modifying code for maximum performance. The additional registers do allow for further optimization over the x86 version by allowing all four pixels to be in flight at the same time. The end result is that AMD64 ASM is about 2.18 times faster than AMD64 C and about 1.06 times faster than x86 ASM. (For further comparison, AMD64 C and x86 C are practically the same for this function.) Should I port any more assembly to AMD64, mvlineasm4 is the most likely candidate, but it's not used enough at this point to bother. Also, this may or may not work with Linux at the moment, since it doesn't have the eh_handler metadata. Win64 is easier, since I just need to structure the function prologue and epilogue properly and use some assembler directives/macros to automatically generate the metadata. And that brings up another point: You need YASM to assemble the AMD64 code, because NASM doesn't support the Win64 metadata directives. - Added an SSE version of DoBlending. This is strictly C intrinsics. VC++ still throws around unnecessary register moves. GCC seems to be pretty close to optimal, requiring only about 2 cycles/color. They're both faster than my hand-written MMX routine, so I don't need to feel bad about not hand-optimizing this for x64 builds. - Removed an extra instruction from DoBlending_MMX, transposed two instructions, and unrolled it once, shaving off about 80 cycles from the time required to blend 256 palette entries. Why? Because I tried writing a C version of the routine using compiler intrinsics and was appalled by all the extra movq's VC++ added to the code. GCC was better, but still generated extra instructions. I only wanted a C version because I can't use inline assembly with VC++'s x64 compiler, and x64 assembly is a bit of a pain. (It's a pain because Linux and Windows have different calling conventions, and you need to maintain extra metadata for functions.)
So, the assembly version stays and the C version stays out. - Removed all the pixel doubling r_detail modes, since the one platform they were intended to assist (486) actually sees very little benefit from them. - Rewrote CheckMMX in C and renamed it to CheckCPU. - Fixed: CPUID function 0x80000005 is specified to return detailed L1 cache only for AMD processors, so we must not use it on other architectures, or we end up overwriting the L1 cache line size with 0 or some other number we don't actually understand. SVN r1134 (trunk)
630 lines
16 KiB
NASM
630 lines
16 KiB
NASM
;*
|
|
;* tmap2.nas
|
|
;* The tilted plane inner loop.
|
|
;*
|
|
;*---------------------------------------------------------------------------
|
|
;* Copyright 1998-2006 Randy Heit
|
|
;* All rights reserved.
|
|
;*
|
|
;* Redistribution and use in source and binary forms, with or without
|
|
;* modification, are permitted provided that the following conditions
|
|
;* are met:
|
|
;*
|
|
;* 1. Redistributions of source code must retain the above copyright
|
|
;* notice, this list of conditions and the following disclaimer.
|
|
;* 2. Redistributions in binary form must reproduce the above copyright
|
|
;* notice, this list of conditions and the following disclaimer in the
|
|
;* documentation and/or other materials provided with the distribution.
|
|
;* 3. The name of the author may not be used to endorse or promote products
|
|
;* derived from this software without specific prior written permission.
|
|
;*
|
|
;* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
|
|
;* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
|
;* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
|
;* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
|
|
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
|
;* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
;* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
;* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
;* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
|
;* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
;*---------------------------------------------------------------------------
|
|
;*
|
|
;* I tried doing the ROL trick that R_DrawSpanP_ASM uses, and it was
|
|
;* actually slightly slower than the more straight-forward approach
|
|
;* used here, probably because the trick requires too much setup time.
|
|
;*
|
|
|
|
; Assembler setup and symbol plumbing for the tilted-plane span code.
; Everything in this file is assembled as 32-bit x86.
BITS 32
|
|
|
|
; NOTE(review): presumably supplies the `selfmod` macro that the patching
; routines invoke; that macro is not defined in this file -- confirm.
%include "valgrind.inc"
|
|
|
|
; Placeholder dword for operands that are overwritten at run time: it is the
; initial value of ds_curtiltedsource and the displacement assembled into
; every fetchN instruction.
%define SPACEFILLER4 (0x44444444)
|
|
|
|
; Unless M_TARGET_LINUX is defined, every shared symbol is referred to by its
; underscore-prefixed name.
; NOTE(review): SetTiltedSpanSize is declared GLOBAL further down but has no
; alias in this list -- confirm whether that is intentional.
%ifndef M_TARGET_LINUX
|
|
|
|
%define plane_sz _plane_sz
|
|
%define plane_su _plane_su
|
|
%define plane_sv _plane_sv
|
|
%define plane_shade _plane_shade
|
|
%define planelightfloat _planelightfloat
|
|
%define spanend _spanend
|
|
%define ylookup _ylookup
|
|
%define dc_destorg _dc_destorg
|
|
%define ds_colormap _ds_colormap
|
|
%define ds_source _ds_source
|
|
%define centery _centery
|
|
%define centerx _centerx
|
|
%define ds_curtiltedsource _ds_curtiltedsource
|
|
%define pviewx _pviewx
|
|
%define pviewy _pviewy
|
|
%define tiltlighting _tiltlighting
|
|
|
|
%define R_DrawTiltedPlane_ASM _R_DrawTiltedPlane_ASM
|
|
%define R_SetTiltedSpanSource_ASM _R_SetTiltedSpanSource_ASM
|
|
%define R_CalcTiltedLighting _R_CalcTiltedLighting
|
|
|
|
%endif
|
|
|
|
; Symbols defined outside this file.  ds_colormap and ds_source are imported
; but not referenced by any instruction visible in this file.
EXTERN plane_sz
|
|
EXTERN plane_su
|
|
EXTERN plane_sv
|
|
EXTERN planelightfloat
|
|
EXTERN spanend
|
|
EXTERN ylookup
|
|
EXTERN dc_destorg
|
|
EXTERN ds_colormap
|
|
EXTERN centery
|
|
EXTERN centerx
|
|
EXTERN ds_source
|
|
EXTERN plane_shade
|
|
EXTERN pviewx
|
|
EXTERN pviewy
|
|
EXTERN tiltlighting
|
|
EXTERN R_CalcTiltedLighting
|
|
|
|
; Defined in this file's .data section and exported.
GLOBAL ds_curtiltedsource
|
|
|
|
; plane_sv, plane_su and plane_sz are each read as three consecutive dwords
; (loaded with `fld dword`, i.e. single-precision floats); the aliases below
; name them i, j and k.
%define sv_i plane_sv
|
|
%define sv_j plane_sv+4
|
|
%define sv_k plane_sv+8
|
|
|
|
%define su_i plane_su
|
|
%define su_j plane_su+4
|
|
%define su_k plane_su+8
|
|
|
|
%define sz_i plane_sz
|
|
%define sz_j plane_sz+4
|
|
%define sz_k plane_sz+8
|
|
|
|
; Not referenced by the code visible in this file, which hard-codes the
; 8-pixel span length (8, 7, 15) instead.
%define SPANBITS 3
|
|
|
|
; Scratch storage shared by the span code.  Being static, it is shared by
; every invocation of R_DrawTiltedPlane_ASM.
section .bss
|
|
|
|
; Texture coordinates at the start of the current span, written with
; `fistp qword`; the drawing code reads only the low dword of each.
start_u: resq 1
|
|
start_v: resq 1
|
|
; Per-pixel coordinate increments, also written as 64-bit integers and read
; back as dwords.
step_u: resq 1
|
|
step_v: resq 1
|
|
|
|
; sz.i*8, su.i*8 and sv.i*8 stored as doubles: the amount added to 1/z, u/z
; and v/z to advance one full 8-pixel span.
step_iz: resq 1
|
|
step_uz: resq 1
|
|
step_vz: resq 1
|
|
|
|
; Receives z at the end of each span; nothing visible in this file reads it.
end_z: resd 1
|
|
|
|
; Initialised data: the exported source pointer and two single-precision
; lookup tables used instead of run-time int->float conversion and division.
section .data
|
|
|
|
; Last address stored by R_SetTiltedSpanSource_ASM; holds the placeholder
; value until that routine has been called.
ds_curtiltedsource: dd SPACEFILLER4
|
|
|
|
; spanrecips[k] = 1/(k+1) as an IEEE single, for k = 0..14.
; fp_1 labels the first entry (1.0) and fp_8recip the eighth (1/8).
fp_1:
|
|
spanrecips: dd 0x3f800000 ; 1/1
|
|
dd 0x3f000000 ; 1/2
|
|
dd 0x3eaaaaab ; 1/3
|
|
dd 0x3e800000 ; 1/4
|
|
dd 0x3e4ccccd ; 1/5
|
|
dd 0x3e2aaaab ; 1/6
|
|
dd 0x3e124925 ; 1/7
|
|
fp_8recip: dd 0x3e000000 ; 1/8
|
|
dd 0x3de38e39 ; 1/9
|
|
dd 0x3dcccccd ; 1/10
|
|
dd 0x3dba2e8c ; 1/11
|
|
dd 0x3daaaaab ; 1/12
|
|
dd 0x3d9d89d9 ; 1/13
|
|
dd 0x3d924925 ; 1/14
|
|
dd 0x3d888889 ; 1/15
|
|
|
|
; fp_quickint[k] = (float)(k+1) for k = 0..7; fp_8 labels the last entry.
fp_quickint: dd 0x3f800000 ; 1
|
|
dd 0x40000000 ; 2
|
|
dd 0x40400000 ; 3
|
|
dd 0x40800000 ; 4
|
|
dd 0x40a00000 ; 5
|
|
dd 0x40c00000 ; 6
|
|
dd 0x40e00000 ; 7
|
|
fp_8: dd 0x41000000 ; 8
|
|
|
|
section .text

;-----------------------------------------------------------------------
; R_SetTiltedSpanSource_ASM / @R_SetTiltedSpanSource_ASM@4
;
; Retargets the span drawing code at a new source address by rewriting
; the 32-bit displacement of every fetchN instruction, and records the
; address in ds_curtiltedsource.
;
; Entry points:
;   R_SetTiltedSpanSource_ASM      address is read from [esp+4]
;   @R_SetTiltedSpanSource_ASM@4   address is already in ecx
; Out:   fetch1..fetch10 patched, ds_curtiltedsource updated
; Clobb: ecx (stack entry only), plus whatever `selfmod` touches
;-----------------------------------------------------------------------

; Every fetchN is `mov al,[reg+reg+disp32]`: opcode, ModRM and SIB come
; first, so the displacement starts 3 bytes into the instruction.
%define FETCH_DISP_OFFSET 3

; Store ecx into the displacement field of each listed fetch label.
%macro  RETARGET_FETCHES 1-*
  %rep %0
        mov     [%1+FETCH_DISP_OFFSET], ecx
    %rotate 1
  %endrep
%endmacro

        GLOBAL  R_SetTiltedSpanSource_ASM
        GLOBAL  @R_SetTiltedSpanSource_ASM@4

R_SetTiltedSpanSource_ASM:
        mov     ecx, [esp+4]            ; stack-argument entry: fetch the address

@R_SetTiltedSpanSource_ASM@4:                   ; register entry: address in ecx
        RETARGET_FETCHES fetch1, fetch2, fetch3, fetch4, fetch5
        RETARGET_FETCHES fetch6, fetch7, fetch8, fetch9, fetch10
        mov     [ds_curtiltedsource], ecx
        selfmod rtext_start, rtext_end  ; report the patched code range
        ret
|
|
|
|
;-----------------------------------------------------------------------
; SetTiltedSpanSize
;
; Rewrites the immediates of the patched shift/mask instructions in the
; span drawing code (labels x1..x10, m1..m10, y1..y10).
;
; In:    cl, dl = two byte-sized parameters passed in registers
;        (presumably the texture dimensions in bits -- confirm with the
;        caller)
; Out:   xN shift count = -cl
;        yN shift count = -cl - dl
;        mN mask        = ~((1 << (-dl & 31)) - 1)
;        eax            = that mask; ecx is restored; edx is untouched
; Note:  the counts are stored as raw bytes; the CPU uses only the low
;        five bits of a shift count.
;-----------------------------------------------------------------------

; In `shr reg,imm8` the count, and in `and reg,imm32` the mask, both
; start 2 bytes into the instruction (after opcode and ModRM).
%define SPAN_IMM_OFFSET 2

; Store cl into the imm8 of each listed shift instruction.
%macro  PATCH_SPAN_SHIFTS 1-*
  %rep %0
        mov     [%1+SPAN_IMM_OFFSET], cl
    %rotate 1
  %endrep
%endmacro

; Store eax into the imm32 of each listed mask instruction.
%macro  PATCH_SPAN_MASKS 1-*
  %rep %0
        mov     [%1+SPAN_IMM_OFFSET], eax
    %rotate 1
  %endrep
%endmacro

        GLOBAL  SetTiltedSpanSize

SetTiltedSpanSize:
        push    ecx                     ; cl is about to be reused as a shift count
        mov     cl, dl
        neg     cl
        mov     eax, 1
        shl     eax, cl                 ; eax = 1 << (-dl & 31)
        mov     cl, [esp]               ; reload the caller's cl from the saved ecx
        neg     cl                      ; cl = x shift count
        PATCH_SPAN_SHIFTS x1, x2, x3, x4, x5, x6, x7, x8, x9, x10

        sub     cl, dl                  ; cl = y shift count
        dec     eax                     ; eax = low-bit mask ...
        PATCH_SPAN_SHIFTS y1, y2, y3, y4, y5, y6, y7, y8, y9, y10
        not     eax                     ; ... inverted to keep the high bits
        pop     ecx

        PATCH_SPAN_MASKS m1, m2, m3, m4, m5, m6, m7, m8, m9, m10

        selfmod rtext_start, rtext_end  ; report the patched code range

        ret
|
|
|
|
; The span code lives in its own section, declared both executable and
; writable because R_SetTiltedSpanSource_ASM and SetTiltedSpanSize patch
; instruction operands between rtext_start and rtext_end.
SECTION .rtext progbits alloc exec write align=64
|
|
|
|
rtext_start:
|
|
|
|
;-----------------------------------------------------------------------
; R_DrawTiltedPlane_ASM / @R_DrawTiltedPlane_ASM@8
;
; Draws one horizontal run of a tilted plane.  Texture coordinates are
; computed exactly (u = (u/z)*z, v = (v/z)*z) at span boundaries and
; stepped linearly across the pixels in between.
;
; Entry points:
;   R_DrawTiltedPlane_ASM      y is read from [esp+4], x from [esp+8]
;   @R_DrawTiltedPlane_ASM@8   ecx = y, edx = x
; Saved: ebx, esi, edi, ebp are pushed here and popped on every exit path
; Clobb: eax, ecx, edx, flags; start_*/step_*/end_z scratch variables
;
; Register roles once set up:
;   ebx = pixels remaining minus one (counts down)
;   edi = destination pointer
;   eax = zero except for al, so al can be used as a table index
;-----------------------------------------------------------------------
GLOBAL R_DrawTiltedPlane_ASM
|
|
GLOBAL @R_DrawTiltedPlane_ASM@8
|
|
|
|
R_DrawTiltedPlane_ASM:
|
|
mov ecx,[esp+4]
|
|
mov edx,[esp+8]
|
|
|
|
; ecx = y
|
|
; edx = x
|
|
|
|
@R_DrawTiltedPlane_ASM@8:
|
|
push ebx
|
|
push esi
|
|
push edi
|
|
push ebp
|
|
|
|
mov eax,[centery]
|
|
; spanend is indexed as an array of 16-bit words, one per row
movzx ebx,word [spanend+ecx*2]
|
|
sub eax,ecx ; eax = centery-y
|
|
sub ebx,edx ; ebx = span length - 1
|
|
mov edi,[ylookup+ecx*4]
|
|
; centery-y and x-centerx are pushed only so that fild can load them; both
; dwords are released again at .litup
push eax
|
|
add edi,[dc_destorg]
|
|
add edi,edx ; edi = frame buffer pointer
|
|
sub edx,[centerx] ; edx = x-centerx
|
|
push edx
|
|
xor eax,eax
|
|
|
|
; Evaluate the three gradients at this pixel:
;   u/z = su.i*xmul + su.j*ymul + su.k   (likewise v/z from sv, 1/z from sz)
; The trailing comments track the FPU stack, top of stack first.
fild dword [esp+4] ; ymul
|
|
fild dword [esp] ; xmul | ymul
|
|
fld dword [sv_j] ; sv.j | xmul | ymul
|
|
fmul st0,st2 ; sv.j*ymul | xmul | ymul
|
|
fld dword [su_j] ; su.j | sv.j*ymul | xmul | ymul
|
|
fmul st0,st3 ; su.j*ymul | sv.j*ymul | xmul | ymul
|
|
fld dword [sz_j] ; sz.j | su.j*ymul | sv.j*ymul | xmul | ymul
|
|
fmulp st4,st0 ; su.j*ymul | sv.j*ymul | xmul | sz.j*ymul
|
|
fld dword [sv_i] ; sv.i | su.j*ymul | sv.j*ymul | xmul | sz.j*ymul
|
|
fmul st0,st3 ; sv.i*xmul | su.j*ymul | sv.j*ymul | xmul | sz.j*ymul
|
|
fld dword [su_i] ; su.i | sv.i*xmul | su.j*ymul | sv.j*ymul | xmul | sz.j*ymul
|
|
fmul st0,st4 ; su.i*xmul | sv.i*xmul | su.j*ymul | sv.j*ymul | xmul | sz.j*ymul
|
|
fld dword [sz_i] ; sz.i | su.i*xmul | sv.i*xmul | su.j*ymul | sv.j*ymul | xmul | sz.j*ymul
|
|
fmulp st5,st0 ; su.i*xmul | sv.i*xmul | su.j*ymul | sv.j*ymul | sz.i*xmul | sz.j*ymul
|
|
fxch st1 ; sv.i*xmul | su.i*xmul | su.j*ymul | sv.j*ymul | sz.i*xmul | sz.j*ymul
|
|
faddp st3,st0 ; su.i*xmul | su.j*ymul | sv.i*xmul+sv.j*ymul | sz.i*xmul | sz.j*ymul
|
|
faddp st1,st0 ; su.i*xmul+su.j*ymul | sv.i*xmul+sv.j*ymul | sz.i*xmul | sz.j*ymul
|
|
fxch st3 ; sz.j*ymul | sv.i*xmul+sv.j*ymul | sz.i*xmul | su.i*xmul+su.j*ymul
|
|
faddp st2,st0 ; sv.i*xmul+sv.j*ymul | sz.i*xmul+sz.j*ymul | su.i*xmul+su.j*ymul
|
|
fadd dword [sv_k] ; v/z | sz.i*xmul+sz.j*ymul | su.i*xmul+su.j*ymul
|
|
fxch st1 ; sz.i*xmul+sz.j*ymul | v/z | su.i*xmul+su.j*ymul
|
|
fadd dword [sz_k] ; 1/z | v/z | su.i*xmul+su.j*ymul
|
|
fxch st2 ; su.i*xmul+su.j*ymul | v/z | 1/z
|
|
fadd dword [su_k] ; u/z | v/z | 1/z
|
|
fxch st2 ; 1/z | v/z | u/z
|
|
fxch st1 ; v/z | 1/z | u/z
|
|
|
|
; if lighting is on, fill out the light table
|
|
mov al,[plane_shade]
|
|
test al,al
|
|
jz .litup
|
|
|
|
; Build three stack arguments for R_CalcTiltedLighting:
;   [esp]   = (1/z + width*sz.i) * planelightfloat, i.e. the far end
;   [esp+4] = (1/z) * planelightfloat, i.e. this pixel
;   [esp+8] = ebx (span length - 1)
; The three gradients stay on the FPU stack across the call.
; NOTE(review): esp is not realigned before the call -- confirm the callee
; does not need a 16-byte aligned stack.
push ebx
|
|
fild dword [esp] ; width | v/z | 1/z | u/z
|
|
fmul dword [sz_i] ; width*sz.i | v/z | 1/z | u/z
|
|
fadd st0,st2 ; 1/endz | v/z | 1/z | u/z
|
|
fld st2 ; 1/z | 1/endz | v/z | 1/z | u/z
|
|
fmul dword [planelightfloat]
|
|
fxch st1
|
|
fmul dword [planelightfloat]
|
|
sub esp,8
|
|
fistp dword [esp]
|
|
fistp dword [esp+4]
|
|
call R_CalcTiltedLighting
|
|
; caller removes the three argument dwords
add esp, 12
|
|
; re-zero eax after the call so al can again be used as a table index
xor eax, eax
|
|
|
|
; drop the two temporaries (centery-y, x-centerx) pushed above
.litup add esp, 8
|
|
|
|
; calculate initial z, u, and v values
|
|
fld st1 ; 1/z | v/z | 1/z | u/z
|
|
fdivr dword [fp_1] ; z | v/z | 1/z | u/z
|
|
|
|
fld st3 ; u/z | z | v/z | 1/z | u/z
|
|
fmul st0,st1 ; u | z | v/z | 1/z | u/z
|
|
fld st2 ; v/z | u | z | v/z | 1/z | u/z
|
|
fmulp st2,st0 ; u | v | v/z | 1/z | u/z
|
|
; store integer copies of u and v without disturbing the FPU stack
fld st0
|
|
fistp qword [start_u]
|
|
fld st1
|
|
fistp qword [start_v]
|
|
|
|
cmp ebx,7 ; Do we have at least 8 pixels to plot?
|
|
jl near ShortStrip
|
|
|
|
; yes, we do, so figure out tex coords at end of this span
|
|
|
|
; multiply i values by span length (8)
|
|
fld dword [su_i] ; su.i
|
|
fmul dword [fp_8] ; su.i*8
|
|
fld dword [sv_i] ; sv.i | su.i*8
|
|
fmul dword [fp_8] ; sv.i*8 | su.i*8
|
|
fld dword [sz_i] ; sz.i | sv.i*8 | su.i*8
|
|
fmul dword [fp_8] ; sz.i*8 | sv.i*8 | su.i*8
|
|
fxch st2 ; su.i*8 | sv.i*8 | sz.i*8
|
|
fstp qword [step_uz] ; sv.i*8 | sz.i*8
|
|
fstp qword [step_vz] ; sz.i*8
|
|
; fst (no pop): sz.i*8 stays on the stack and is added to 1/z next
fst qword [step_iz] ; sz.i*8
|
|
|
|
; find tex coords at start of next span
|
|
; stack here: sz.i*8 | u | v | v/z | 1/z | u/z, so st4 is 1/z
faddp st4
|
|
fld qword [step_vz]
|
|
faddp st3
|
|
fld qword [step_uz]
|
|
faddp st5
|
|
|
|
fld st3 ; 1/z | u | v | v/z | 1/z | u/z
|
|
fdivr dword [fp_1] ; z | u | v | v/z | 1/z | u/z
|
|
fst dword [end_z]
|
|
fld st5 ; u/z | z | u | v | v/z | 1/z | u/z
|
|
fmul st0,st1 ; u' | z | u | v | v/z | 1/z | u/z
|
|
fxch st1 ; z | u' | u | v | v/z | 1/z | u/z
|
|
fmul st0,st4 ; v' | u' | u | v | v/z | 1/z | u/z
|
|
fxch st3 ; v | u' | u | v' | v/z | 1/z | u/z
|
|
|
|
; now subtract to get stepping values for this span
|
|
fsubr st0,st3 ; v'-v | u' | u | v' | v/z | 1/z | u/z
|
|
fxch st2 ; u | u' | v'-v | v' | v/z | 1/z | u/z
|
|
fsubr st0,st1 ; u'-u | u' | v'-v | v' | v/z | 1/z | u/z
|
|
fxch st2 ; v'-v | u' | u'-u | v' | v/z | 1/z | u/z
|
|
fmul dword [fp_8recip] ; vstep | u' | u'-u | v' | v/z | 1/z | u/z
|
|
fxch st1 ; u' | vstep | u'-u | v' | v/z | 1/z | u/z
|
|
fxch st2 ; u'-u | vstep | u' | v' | v/z | 1/z | u/z
|
|
fmul dword [fp_8recip] ; ustep | vstep | u' | v' | v/z | 1/z | u/z
|
|
fxch st1 ; vstep | ustep | u' | v' | v/z | 1/z | u/z
|
|
fistp qword [step_v] ; ustep | u' | v' | v/z | 1/z | u/z
|
|
; from here on the "u | v" entries hold the coordinates at the START OF THE
; NEXT span (u', v'); execution falls through into FullSpan
fistp qword [step_u] ; u | v | v/z | 1/z | u/z
|
|
|
|
; FullSpan: entered with at least 8 pixels left (ebx >= 7), step_u/step_v set
; for the span about to be drawn, and the FPU stack holding
;   u | v | v/z | 1/z | u/z
; where u, v are the coordinates at the start of the NEXT span.  Before
; drawing, the gradients are advanced to the end of the next span and the
; 1/z division is started, unless the next span is at most one pixel.
FullSpan:
|
|
; clear eax so that al can be used as a zero-extended table index
xor eax,eax
|
|
cmp ebx,15 ; is there another complete span after this one?
|
|
jl NextIsShort
|
|
|
|
; there is a complete span after this one
|
|
; advance 1/z, v/z and u/z by one whole span using the stored steps
fld qword [step_iz]
|
|
faddp st4,st0
|
|
fld qword [step_vz]
|
|
faddp st3,st0
|
|
fld qword [step_uz]
|
|
faddp st5,st0
|
|
jmp StartDiv
|
|
|
|
; NextIsShort: ebx is 7..14 here, so 0..7 pixels follow the current span.
NextIsShort:
|
|
cmp ebx,8 ; if next span is no more than 1 pixel, then we already
|
|
jle DrawFullSpan ; know everything we need to draw it
|
|
|
|
; ebx is 9..14: fp_quickint-8*4+ebx*4 addresses entry ebx-8 of the table,
; which holds the float ebx-7; the i gradients are scaled by that amount
fld dword [sz_i] ; sz.i | u | v | v/z | 1/z | u/z
|
|
fmul dword [fp_quickint-8*4+ebx*4]
|
|
fld dword [sv_i] ; sv.i | sz.i | u | v | v/z | 1/z | u/z
|
|
fmul dword [fp_quickint-8*4+ebx*4]
|
|
fld dword [su_i] ; su.i | sv.i | sz.i | u | v | v/z | 1/z | u/z
|
|
fmul dword [fp_quickint-8*4+ebx*4]
|
|
fxch st2 ; sz.i | sv.i | su.i | u | v | v/z | 1/z | u/z
|
|
faddp st6,st0 ; sv.i | su.i | u | v | v/z | 1/z | u/z
|
|
faddp st4,st0 ; su.i | u | v | v/z | 1/z | u/z
|
|
faddp st5,st0 ; u | v | v/z | 1/z | u/z
|
|
|
|
; StartDiv: compute z for the end of the next span.  The quotient is left on
; top of the FPU stack and is not consumed until after the eight pixels of
; DrawFullSpan have been written.
StartDiv:
|
|
fld st3 ; 1/z | u | v | v/z | 1/z | u/z
|
|
fdivr dword [fp_1] ; z | u | v | v/z | 1/z | u/z
|
|
|
|
; DrawFullSpan: write eight pixels, fully unrolled, then set up the next span.
;
; Registers during the unrolled body:
;   ecx / edx = running v / u coordinate (start value plus pviewy / pviewx)
;   ebp / esi = working copies that are shifted and masked into a source index
;   eax       = zero apart from al (cleared at FullSpan)
;
; Each pixel does:
;   xN  shift the v copy right
;   mN  mask the u copy
;   yN  shift the u copy right
;   fetchN  al = source[ebp+esi]
;   al = table[al], where the table pointer comes from tiltlighting[ebx-(N-1)]
;   store al to [edi+(N-1)]
;
; The operands shown on the xN/mN/yN lines (26, 0xfc000000, 20) and the
; SPACEFILLER4 displacement on fetchN are assemble-time placeholders:
; SetTiltedSpanSize and R_SetTiltedSpanSource_ASM overwrite them in place, so
; the encodings of those instructions must not change.
DrawFullSpan:
|
|
mov ecx,[start_v]
|
|
mov edx,[start_u]
|
|
|
|
add ecx,[pviewy]
|
|
add edx,[pviewx]
|
|
|
|
mov esi,edx
|
|
mov ebp,ecx
|
|
; ---- pixel 1 -> [edi+0], light table tiltlighting[ebx] ----
x1 shr ebp,26
|
|
m1 and esi,0xfc000000
|
|
y1 shr esi,20
|
|
; step the running coordinates now; the copies for the next pixel are taken
; from them once ebp/esi are free again
add ecx,[step_v]
|
|
add edx,[step_u]
|
|
fetch1 mov al,[ebp+esi+SPACEFILLER4]
|
|
mov ebp,[tiltlighting+ebx*4]
|
|
mov esi,edx
|
|
mov al,[ebp+eax]
|
|
mov ebp,ecx
|
|
mov [edi+0],al
|
|
|
|
; ---- pixel 2 -> [edi+1], light table tiltlighting[ebx-1] ----
x2 shr ebp,26
|
|
m2 and esi,0xfc000000
|
|
y2 shr esi,20
|
|
add ecx,[step_v]
|
|
add edx,[step_u]
|
|
fetch2 mov al,[ebp+esi+SPACEFILLER4]
|
|
mov ebp,[tiltlighting+ebx*4-4]
|
|
mov esi,edx
|
|
mov al,[ebp+eax]
|
|
mov ebp,ecx
|
|
mov [edi+1],al
|
|
|
|
; ---- pixel 3 -> [edi+2], light table tiltlighting[ebx-2] ----
x3 shr ebp,26
|
|
m3 and esi,0xfc000000
|
|
y3 shr esi,20
|
|
add ecx,[step_v]
|
|
add edx,[step_u]
|
|
fetch3 mov al,[ebp+esi+SPACEFILLER4]
|
|
mov ebp,[tiltlighting+ebx*4-8]
|
|
mov esi,edx
|
|
mov al,[ebp+eax]
|
|
mov ebp,ecx
|
|
mov [edi+2],al
|
|
|
|
; ---- pixel 4 -> [edi+3], light table tiltlighting[ebx-3] ----
x4 shr ebp,26
|
|
m4 and esi,0xfc000000
|
|
y4 shr esi,20
|
|
add ecx,[step_v]
|
|
add edx,[step_u]
|
|
fetch4 mov al,[ebp+esi+SPACEFILLER4]
|
|
mov ebp,[tiltlighting+ebx*4-12]
|
|
mov esi,edx
|
|
mov al,[ebp+eax]
|
|
mov ebp,ecx
|
|
mov [edi+3],al
|
|
|
|
; ---- pixel 5 -> [edi+4], light table tiltlighting[ebx-4] ----
x5 shr ebp,26
|
|
m5 and esi,0xfc000000
|
|
y5 shr esi,20
|
|
add ecx,[step_v]
|
|
add edx,[step_u]
|
|
fetch5 mov al,[ebp+esi+SPACEFILLER4]
|
|
mov ebp,[tiltlighting+ebx*4-16]
|
|
mov esi,edx
|
|
mov al,[ebp+eax]
|
|
mov ebp,ecx
|
|
mov [edi+4],al
|
|
|
|
; ---- pixel 6 -> [edi+5], light table tiltlighting[ebx-5] ----
x6 shr ebp,26
|
|
m6 and esi,0xfc000000
|
|
y6 shr esi,20
|
|
add ecx,[step_v]
|
|
add edx,[step_u]
|
|
fetch6 mov al,[ebp+esi+SPACEFILLER4]
|
|
mov ebp,[tiltlighting+ebx*4-20]
|
|
mov esi,edx
|
|
mov al,[ebp+eax]
|
|
mov ebp,ecx
|
|
mov [edi+5],al
|
|
|
|
; ---- pixel 7 -> [edi+6], light table tiltlighting[ebx-6] ----
; Pixel 8's address computation is interleaved with pixel 7's table lookup
; and works directly on ecx/edx, since no further stepping is needed.
x7 shr ebp,26
|
|
m7 and esi,0xfc000000
|
|
y7 shr esi,20
|
|
add ecx,[step_v]
|
|
add edx,[step_u]
|
|
fetch7 mov al,[ebp+esi+SPACEFILLER4]
|
|
mov ebp,[tiltlighting+ebx*4-24]
|
|
x8 shr ecx,26
|
|
mov al,[ebp+eax]
|
|
m8 and edx,0xfc000000
|
|
mov [edi+6],al
|
|
|
|
; ---- pixel 8 -> [edi+7], light table tiltlighting[ebx-7] ----
y8 shr edx,20
|
|
mov ebp,[tiltlighting+ebx*4-28]
|
|
fetch8 mov al,[edx+ecx+SPACEFILLER4]
|
|
mov al,[ebp+eax]
|
|
mov [edi+7],al
|
|
add edi,8
|
|
|
|
; eight pixels consumed; a negative count means the run is complete
sub ebx,8
|
|
jl near Done
|
|
|
|
; The next span starts where this one ended, so save its start coordinates.
; NOTE(review): these two loads assume z is on top of the FPU stack (six
; entries).  When DrawFullSpan was reached straight from NextIsShort without
; passing StartDiv, only five entries are present and st1/st2 are v and v/z
; instead.  On that path ebx is 0 here and OnlyOnePixelAtEnd stores
; start_u/start_v again from st0/st1, so the values written here appear to be
; unused -- confirm.
fld st1
|
|
fistp qword [start_u]
|
|
fld st2
|
|
fistp qword [start_v]
|
|
|
|
cmp ebx,7
|
|
jl near EndIsShort
|
|
|
|
; Another full span follows: z for its end is already on top of the stack.
; Derive the end coordinates (u', v') and the per-pixel steps, as was done
; for the first span.
fst dword [end_z]
|
|
fld st5 ; u/z | z | u | v | v/z | 1/z | u/z
|
|
fmul st0,st1 ; u' | z | u | v | v/z | 1/z | u/z
|
|
fxch st1 ; z | u' | u | v | v/z | 1/z | u/z
|
|
fmul st0,st4 ; v' | u' | u | v | v/z | 1/z | u/z
|
|
fxch st3 ; v | u' | u | v' | v/z | 1/z | u/z
|
|
fsubr st0,st3 ; v'-v | u' | u | v' | v/z | 1/z | u/z
|
|
fxch st2 ; u | u' | v'-v | v' | v/z | 1/z | u/z
|
|
fsubr st0,st1 ; u'-u | u' | v'-v | v' | v/z | 1/z | u/z
|
|
fxch st2 ; v'-v | u' | u'-u | v' | v/z | 1/z | u/z
|
|
fmul dword [fp_8recip] ; vstep | u' | u'-u | v' | v/z | 1/z | u/z
|
|
fxch st1 ; u' | vstep | u'-u | v' | v/z | 1/z | u/z
|
|
fxch st2 ; u'-u | vstep | u' | v' | v/z | 1/z | u/z
|
|
fmul dword [fp_8recip] ; ustep | vstep | u' | v' | v/z | 1/z | u/z
|
|
fxch st1 ; vstep | ustep | u' | v' | v/z | 1/z | u/z
|
|
fistp qword [step_v] ; ustep | u' | v' | v/z | 1/z | u/z
|
|
fistp qword [step_u] ; u | v | v/z | 1/z | u/z
|
|
jmp FullSpan
|
|
|
|
; OnlyOnePixelAtEnd: exactly one pixel is left after a full span.  The FPU
; stack is u | v | v/z | 1/z | u/z (no z on top), so the pixel's coordinates
; are taken from st0 and st1.
OnlyOnePixelAtEnd:
|
|
fld st0
|
|
fistp qword [start_u]
|
|
fld st1
|
|
fistp qword [start_v]
|
|
|
|
; OnlyOnePixel: draw a single pixel at edi from start_u/start_v.  Also
; reached directly from ShortStrip when the whole run is one pixel.
; x9/m9/y9/fetch9 carry placeholder operands that are patched at run time.
OnlyOnePixel:
|
|
mov edx,[start_v]
|
|
mov ecx,[start_u]
|
|
add edx,[pviewy]
|
|
add ecx,[pviewx]
|
|
x9 shr edx,26
|
|
m9 and ecx,0xfc000000
|
|
y9 shr ecx,20
|
|
; the last pixel uses light table entry 0
mov ebp,[tiltlighting]
|
|
fetch9 mov al,[ecx+edx+SPACEFILLER4]
|
|
mov al,[ebp+eax]
|
|
mov [edi],al
|
|
|
|
; Done: common exit for paths that leave five values on the FPU stack.
Done:
|
|
; fcompp is used only for its double pop; the comparison result is ignored.
; Two fcompp plus one fstp discard all five entries.
fcompp
|
|
fcompp
|
|
fstp st0
|
|
|
|
; restore the registers saved at entry, in reverse order
pop ebp
|
|
pop edi
|
|
pop esi
|
|
pop ebx
|
|
ret
|
|
|
|
; ShortStrip: the whole run is shorter than 8 pixels (ebx < 7 on entry).
; FPU stack: u | v | v/z | 1/z | u/z, with start_u/start_v already stored.
ShortStrip:
|
|
cmp ebx,0
|
|
jle near OnlyOnePixel
|
|
|
|
; ebx is 1..6 here.  fp_quickint[ebx] holds the float ebx+1; the i gradients
; are scaled by it and added to 1/z, v/z and u/z to reach the end point used
; for this strip.
MoreThanOnePixel:
|
|
fld dword [sz_i] ; sz.i | u | v | v/z | 1/z | u/z
|
|
fmul dword [fp_quickint+ebx*4]
|
|
fld dword [sv_i] ; sv.i | sz.i | u | v | v/z | 1/z | u/z
|
|
fmul dword [fp_quickint+ebx*4]
|
|
fld dword [su_i] ; su.i | sv.i | sz.i | u | v | v/z | 1/z | u/z
|
|
fmul dword [fp_quickint+ebx*4]
|
|
fxch st2 ; sz.i | sv.i | su.i | u | v | v/z | 1/z | u/z
|
|
faddp st6,st0 ; sv.i | su.i | u | v | v/z | 1/z | u/z
|
|
faddp st4,st0 ; su.i | u | v | v/z | 1/z | u/z
|
|
faddp st5,st0 ; u | v | v/z | 1/z | u/z
|
|
fld st3 ; 1/z | u | v | v/z | 1/z | u/z
|
|
fdivr dword [fp_1] ; z | u | v | v/z | 1/z | u/z
|
|
; join the shared partial-span code with z on top of the stack
jmp CalcPartialSteps
|
|
|
|
; EndIsShort: fewer than 8 pixels remain after a full span (ebx is 0..6).
EndIsShort:
|
|
cmp ebx,0
|
|
je near OnlyOnePixelAtEnd
|
|
|
|
; CalcPartialSteps: ebx is 1..6 and the FPU stack is
;   z | u | v | v/z | 1/z | u/z
; with z belonging to the end point of the partial span.  Computes the end
; coordinates, turns the differences into per-pixel steps, then draws ebx+1
; pixels in a loop.
CalcPartialSteps:
|
|
fst dword [end_z]
|
|
fld st5 ; u/z | z | u | v | v/z | 1/z | u/z
|
|
fmul st0,st1 ; u' | z | u | v | v/z | 1/z | u/z
|
|
fxch st1 ; z | u' | u | v | v/z | 1/z | u/z
|
|
fmul st0,st4 ; v' | u' | u | v | v/z | 1/z | u/z
|
|
fxch st1 ; u' | v' | u | v | v/z | 1/z | u/z
|
|
fsubrp st2,st0 ; v' | u'-u | v | v/z | 1/z | u/z
|
|
fsubrp st2,st0 ; u'-u | v'-v | v/z | 1/z | u/z
|
|
; spanrecips[ebx] holds 1/(ebx+1)
fmul dword [spanrecips+ebx*4] ;ustep | v'-v | v/z | 1/z | u/z
|
|
fxch st1 ; v'-v | ustep | v/z | 1/z | u/z
|
|
fmul dword [spanrecips+ebx*4] ;vstep | ustep | v/z | 1/z | u/z
|
|
fxch st1 ; ustep | vstep | v/z | 1/z | u/z
|
|
fistp qword [step_u] ; vstep | v/z | 1/z | u/z
|
|
fistp qword [step_v] ; v/z | 1/z | u/z
|
|
|
|
; ecx/edx = running v/u coordinates, ebp/esi = working copies for addressing
mov ecx,[start_v]
|
|
mov edx,[start_u]
|
|
|
|
add ecx,[pviewy]
|
|
add edx,[pviewx]
|
|
|
|
mov esi,edx
|
|
mov ebp,ecx
|
|
; One pixel per iteration.  x10/m10/y10/fetch10 carry placeholder operands
; that are patched at run time, so their encodings must not change.
endloop:
|
|
x10 shr ebp,26
|
|
m10 and esi,0xfc000000
|
|
|
|
y10 shr esi,20
|
|
; edi is advanced early; the store at the bottom compensates with [edi-1]
inc edi
|
|
|
|
add ecx,[step_v]
|
|
add edx,[step_u]
|
|
|
|
fetch10 mov al,[ebp+esi+SPACEFILLER4]
|
|
; light table pointer for this pixel, read before ebx is decremented
mov ebp,[tiltlighting+ebx*4]
|
|
|
|
mov esi,edx
|
|
; this dec sets the flags tested by `jge endloop`; only mov instructions
; follow, and mov does not change flags
dec ebx
|
|
|
|
mov al,[ebp+eax]
|
|
mov ebp,ecx
|
|
|
|
mov [edi-1],al
|
|
; loop until ebx has gone negative, giving ebx+1 iterations in total
jge endloop
|
|
|
|
; only three values (v/z | 1/z | u/z) are left on the FPU stack on this path
fcompp
|
|
fstp st0
|
|
|
|
; restore the registers saved at entry, in reverse order
pop ebp
|
|
pop edi
|
|
pop esi
|
|
pop ebx
|
|
ret
|
|
|
|
; end of the patched code range passed to `selfmod`
rtext_end:
|