qzdoom-gpl/src/asm_ia32/tmap2.asm
Randy Heit dda5ddd3c2 - Ported vlinetallasm4 to AMD64 assembly. Even with the increased number of
registers AMD64 provides, this routine still needs to be written as self-
  modifying code for maximum performance. The additional registers do allow
  for further optimization over the x86 version by allowing all four pixels
  to be in flight at the same time. The end result is that AMD64 ASM is about
  2.18 times faster than AMD64 C and about 1.06 times faster than x86 ASM.
  (For further comparison, AMD64 C and x86 C are practically the same for
  this function.) Should I port any more assembly to AMD64, mvlineasm4 is the
  most likely candidate, but it's not used enough at this point to bother.
  Also, this may or may not work with Linux at the moment, since it doesn't
  have the eh_handler metadata. Win64 is easier, since I just need to
  structure the function prologue and epilogue properly and use some
  assembler directives/macros to automatically generate the metadata. And
  that brings up another point: You need YASM to assemble the AMD64 code,
  because NASM doesn't support the Win64 metadata directives.
- Added an SSE version of DoBlending. This is strictly C intrinsics.
  VC++ still throws around unnecessary register moves. GCC seems to be
  pretty close to optimal, requiring only about 2 cycles/color. They're
  both faster than my hand-written MMX routine, so I don't need to feel
  bad about not hand-optimizing this for x64 builds.
- Removed an extra instruction from DoBlending_MMX, transposed two
  instructions, and unrolled it once, shaving off about 80 cycles from the
  time required to blend 256 palette entries. Why? Because I tried writing
  a C version of the routine using compiler intrinsics and was appalled by
  all the extra movq's VC++ added to the code. GCC was better, but still
  generated extra instructions. I only wanted a C version because I can't
  use inline assembly with VC++'s x64 compiler, and x64 assembly is a bit
  of a pain. (It's a pain because Linux and Windows have different calling
  conventions, and you need to maintain extra metadata for functions.) So,
  the assembly version stays and the C version stays out.
- Removed all the pixel doubling r_detail modes, since the one platform they
  were intended to assist (486) actually sees very little benefit from them.
- Rewrote CheckMMX in C and renamed it to CheckCPU.
- Fixed: CPUID function 0x80000005 is specified to return detailed L1 cache
  only for AMD processors, so we must not use it on other architectures, or
  we end up overwriting the L1 cache line size with 0 or some other number
  we don't actually understand.


SVN r1134 (trunk)
2008-08-09 03:13:43 +00:00

630 lines
16 KiB
NASM

;*
;* tmap2.nas
;* The tilted plane inner loop.
;*
;*---------------------------------------------------------------------------
;* Copyright 1998-2006 Randy Heit
;* All rights reserved.
;*
;* Redistribution and use in source and binary forms, with or without
;* modification, are permitted provided that the following conditions
;* are met:
;*
;* 1. Redistributions of source code must retain the above copyright
;* notice, this list of conditions and the following disclaimer.
;* 2. Redistributions in binary form must reproduce the above copyright
;* notice, this list of conditions and the following disclaimer in the
;* documentation and/or other materials provided with the distribution.
;* 3. The name of the author may not be used to endorse or promote products
;* derived from this software without specific prior written permission.
;*
;* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
;* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
;* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
;* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
;* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
;* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;*---------------------------------------------------------------------------
;*
;* I tried doing the ROL trick that R_DrawSpanP_ASM uses, and it was
;* actually slightly slower than the more straight-forward approach
;* used here, probably because the trick requires too much setup time.
;*
; ---------------------------------------------------------------------------
; Assembler setup, symbol aliases and imports.
; ---------------------------------------------------------------------------
; Everything in this file is assembled as 32-bit code.
BITS 32
; NOTE(review): the `selfmod` macro invoked later in this file is not defined
; here; presumably it comes from valgrind.inc -- confirm.
%include "valgrind.inc"
; Placeholder 32-bit value. It is assembled into the displacement of every
; fetchN instruction and into ds_curtiltedsource, and is overwritten at run
; time by R_SetTiltedSpanSource_ASM.
%define SPACEFILLER4 (0x44444444)
; Unless M_TARGET_LINUX is defined, every shared symbol is referred to by its
; underscore-prefixed name.
%ifndef M_TARGET_LINUX
%define plane_sz _plane_sz
%define plane_su _plane_su
%define plane_sv _plane_sv
%define plane_shade _plane_shade
%define planelightfloat _planelightfloat
%define spanend _spanend
%define ylookup _ylookup
%define dc_destorg _dc_destorg
%define ds_colormap _ds_colormap
%define ds_source _ds_source
%define centery _centery
%define centerx _centerx
%define ds_curtiltedsource _ds_curtiltedsource
%define pviewx _pviewx
%define pviewy _pviewy
%define tiltlighting _tiltlighting
%define R_DrawTiltedPlane_ASM _R_DrawTiltedPlane_ASM
%define R_SetTiltedSpanSource_ASM _R_SetTiltedSpanSource_ASM
%define R_CalcTiltedLighting _R_CalcTiltedLighting
%endif
; Symbols defined outside this file.
; NOTE(review): ds_colormap and ds_source are declared here but no
; instruction in this file references them.
EXTERN plane_sz
EXTERN plane_su
EXTERN plane_sv
EXTERN planelightfloat
EXTERN spanend
EXTERN ylookup
EXTERN dc_destorg
EXTERN ds_colormap
EXTERN centery
EXTERN centerx
EXTERN ds_source
EXTERN plane_shade
EXTERN pviewx
EXTERN pviewy
EXTERN tiltlighting
EXTERN R_CalcTiltedLighting
; Defined in this file's .data section and exported.
GLOBAL ds_curtiltedsource
; plane_sv, plane_su and plane_sz are each read as three consecutive dwords
; (loaded with `fld dword` / used with `fmul dword`) at byte offsets 0, 4 and
; 8. The aliases below name those three slots i, j and k.
%define sv_i plane_sv
%define sv_j plane_sv+4
%define sv_k plane_sv+8
%define su_i plane_su
%define su_j plane_su+4
%define su_k plane_su+8
%define sz_i plane_sz
%define sz_j plane_sz+4
%define sz_k plane_sz+8
; NOTE(review): SPANBITS is not referenced anywhere else in this file; the
; 8-pixel span length appears in the drawing code as literal constants
; (cmp ebx,7 / sub ebx,8 / fp_8 / fp_8recip).
%define SPANBITS 3
; ---------------------------------------------------------------------------
; Scratch storage used by R_DrawTiltedPlane_ASM.
; ---------------------------------------------------------------------------
section .bss
; u and v at the start of the span being drawn. Written as 64-bit integers
; with `fistp qword`; the drawing code reads back only the low dword.
start_u: resq 1
start_v: resq 1
; Per-pixel u and v increments for the span being drawn. Written with
; `fistp qword`; only the low dword is added to the running coordinates.
step_u: resq 1
step_v: resq 1
; Increments of 1/z, u/z and v/z across one full 8-pixel span, stored as
; doubles (`fst`/`fstp qword`) and reloaded with `fld qword`.
step_iz: resq 1
step_uz: resq 1
step_vz: resq 1
; Single-precision z at the end of the span, written with `fst dword`.
; NOTE(review): no instruction in this file reads end_z back.
end_z: resd 1
; ---------------------------------------------------------------------------
; Initialised data: the exported source pointer and the float lookup tables.
; All table entries are IEEE single-precision bit patterns.
; ---------------------------------------------------------------------------
section .data
; Last pointer passed to R_SetTiltedSpanSource_ASM; holds the placeholder
; value until that routine has been called.
ds_curtiltedsource: dd SPACEFILLER4
; spanrecips[n] = 1/(n+1). Indexed as [spanrecips+ebx*4] to turn a coordinate
; difference over a partial span into a per-pixel step.
; fp_1 is the same address as spanrecips[0] (1.0) and is used as the
; numerator when computing z from 1/z.
fp_1:
spanrecips: dd 0x3f800000 ; 1/1
dd 0x3f000000 ; 1/2
dd 0x3eaaaaab ; 1/3
dd 0x3e800000 ; 1/4
dd 0x3e4ccccd ; 1/5
dd 0x3e2aaaab ; 1/6
dd 0x3e124925 ; 1/7
; fp_8recip labels the 1/8 entry of the same table; it is the per-pixel
; divisor for full 8-pixel spans.
fp_8recip: dd 0x3e000000 ; 1/8
dd 0x3de38e39 ; 1/9
dd 0x3dcccccd ; 1/10
dd 0x3dba2e8c ; 1/11
dd 0x3daaaaab ; 1/12
dd 0x3d9d89d9 ; 1/13
dd 0x3d924925 ; 1/14
dd 0x3d888889 ; 1/15
; fp_quickint[n] = n+1 as a float. Indexed as [fp_quickint+ebx*4] (or with a
; -8*4 bias) to scale the per-pixel gradients by the length of a partial span.
fp_quickint: dd 0x3f800000 ; 1
dd 0x40000000 ; 2
dd 0x40400000 ; 3
dd 0x40800000 ; 4
dd 0x40a00000 ; 5
dd 0x40c00000 ; 6
dd 0x40e00000 ; 7
; fp_8 labels the last entry (8.0), the multiplier for a full span.
fp_8: dd 0x41000000 ; 8
;-----------------------------------------------------------------------
; R_SetTiltedSpanSource_ASM
;
; Records a new texture source pointer and patches it into the drawing
; code.
;
; Entry points:
;   R_SetTiltedSpanSource_ASM      pointer is read from [esp+4]
;   @R_SetTiltedSpanSource_ASM@4   pointer is already in ecx
;
; Effects:
;   ds_curtiltedsource = pointer
;   The dword at byte offset 3 of each of fetch1..fetch10 is overwritten
;   with the pointer. Those instructions are assembled as
;   `mov al,[reg+reg+SPACEFILLER4]`, so offset 3 is their 32-bit
;   displacement.
;
; Clobbers: ecx (stack entry point only). No other register is written
; by the instructions visible here.
;-----------------------------------------------------------------------
section .text

GLOBAL R_SetTiltedSpanSource_ASM
GLOBAL @R_SetTiltedSpanSource_ASM@4

R_SetTiltedSpanSource_ASM:
        mov     ecx, [esp+4]                    ; stack-argument entry: fetch the pointer

@R_SetTiltedSpanSource_ASM@4:
        ; Remember the pointer in the exported variable.
        mov     [ds_curtiltedsource], ecx

        ; Patch the displacement of every texel fetch in the span drawer.
        mov     [fetch1+3], ecx
        mov     [fetch2+3], ecx
        mov     [fetch3+3], ecx
        mov     [fetch4+3], ecx
        mov     [fetch5+3], ecx
        mov     [fetch6+3], ecx
        mov     [fetch7+3], ecx
        mov     [fetch8+3], ecx
        mov     [fetch9+3], ecx
        mov     [fetch10+3], ecx

        ; Report the modified code range (rtext_start..rtext_end) through
        ; the selfmod macro.
        selfmod rtext_start, rtext_end
        ret
;-----------------------------------------------------------------------
; SetTiltedSpanSize
;
; Patches the shift counts and the bit mask used by the texel address
; computation in the span drawer.
;
; In:    cl, dl = the two size parameters. They are read straight from
;        the registers; nothing is loaded from the stack.
;        NOTE(review): presumably cl/dl are the log2 texture dimensions
;        and callers use a register calling convention -- confirm.
; Out:   eax = the mask written to m1..m10 (left there incidentally)
; Keeps: ecx (saved and restored), edx (never written)
;
; Values written (the CPU only uses the low 5 bits of a shift count):
;   x1..x10 + 2  (imm8  of `shr reg,imm8`)  = -cl
;   y1..y10 + 2  (imm8  of `shr reg,imm8`)  = -cl - dl
;   m1..m10 + 2  (imm32 of `and reg,imm32`) = ~((1 << (-dl & 31)) - 1)
; With cl = dl = 6 these evaluate to shifts of 26 and 20 and a mask of
; 0xfc000000, the values the drawing code is assembled with.
;-----------------------------------------------------------------------
GLOBAL SetTiltedSpanSize

SetTiltedSpanSize:
        push    ecx                             ; cl is needed again after the shl below

        ; eax = 1 << (-dl); turned into the mask further down.
        mov     eax, 1
        mov     cl, dl
        neg     cl
        shl     eax, cl

        ; cl = -(original cl): shift count for the x patch points.
        mov     cl, [esp]
        neg     cl
        mov     [x1+2], cl
        mov     [x2+2], cl
        mov     [x3+2], cl
        mov     [x4+2], cl
        mov     [x5+2], cl
        mov     [x6+2], cl
        mov     [x7+2], cl
        mov     [x8+2], cl
        mov     [x9+2], cl
        mov     [x10+2], cl

        ; cl = -(original cl) - dl: shift count for the y patch points.
        ; eax = ~((1 << -dl) - 1): mask for the m patch points.
        sub     cl, dl
        dec     eax
        not     eax
        mov     [y1+2], cl
        mov     [y2+2], cl
        mov     [y3+2], cl
        mov     [y4+2], cl
        mov     [y5+2], cl
        mov     [y6+2], cl
        mov     [y7+2], cl
        mov     [y8+2], cl
        mov     [y9+2], cl
        mov     [y10+2], cl

        mov     [m1+2], eax
        mov     [m2+2], eax
        mov     [m3+2], eax
        mov     [m4+2], eax
        mov     [m5+2], eax
        mov     [m6+2], eax
        mov     [m7+2], eax
        mov     [m8+2], eax
        mov     [m9+2], eax
        mov     [m10+2], eax

        pop     ecx                             ; restore the caller's ecx

        ; Report the modified code range (rtext_start..rtext_end) through
        ; the selfmod macro.
        selfmod rtext_start, rtext_end
        ret
;-----------------------------------------------------------------------
; R_DrawTiltedPlane_ASM
;
; Draws one horizontal run of pixels of a tilted plane. u/z, v/z and 1/z
; are evaluated from the plane_su / plane_sv / plane_sz gradients; u and
; v are recovered with a true divide once per 8 pixels and stepped
; linearly in between.
;
; Entry points:
;   R_DrawTiltedPlane_ASM      loads ecx and edx from [esp+4] and [esp+8]
;   @R_DrawTiltedPlane_ASM@8   expects them already in ecx and edx
;
; In:    ecx = y, edx = x of the first pixel
;        ebx is derived as (word spanend[y]) - x = pixel count - 1
; Out:   pixel count bytes written starting at ylookup[y]+dc_destorg+x
; Keeps: ebx, esi, edi, ebp (pushed on entry, popped on every exit)
; Uses:  eax, ecx, edx and up to 8 x87 registers; every exit path pops
;        all the x87 values this routine pushed.
;
; Per pixel, as assembled (before any patching):
;   texel = byte [source + (v >> 26) + ((u & 0xfc000000) >> 20)]
;   pixel = byte [tiltlighting[remaining] + texel]
; where `remaining` counts down from ebx to 0 along the run, and u/v are
; the low 32 bits of the integer coordinates plus pviewx/pviewy.
;
; Patch points (this is why the section is writable and executable):
;   xN / yN   shift counts  written by SetTiltedSpanSize
;   mN        and-mask      written by SetTiltedSpanSize
;   fetchN    source addr   written by R_SetTiltedSpanSource_ASM
;-----------------------------------------------------------------------
SECTION .rtext progbits alloc exec write align=64
; rtext_start/rtext_end bracket the patched code for the selfmod macro.
rtext_start:
GLOBAL R_DrawTiltedPlane_ASM
GLOBAL @R_DrawTiltedPlane_ASM@8
R_DrawTiltedPlane_ASM:
mov ecx,[esp+4]
mov edx,[esp+8]
; ecx = y
; edx = x
@R_DrawTiltedPlane_ASM@8:
push ebx
push esi
push edi
push ebp
mov eax,[centery]
movzx ebx,word [spanend+ecx*2]
sub eax,ecx ; eax = centery-y
sub ebx,edx ; ebx = span length - 1
mov edi,[ylookup+ecx*4]
; (centery-y) and (x-centerx) are pushed so that fild can load them.
push eax
add edi,[dc_destorg]
add edi,edx ; edi = frame buffer pointer
sub edx,[centerx] ; edx = x-centerx
push edx
; eax is kept with its upper 24 bits clear: the pixel code loads texels
; into al and then uses eax as an index.
xor eax,eax
; Evaluate  g.i*xmul + g.j*ymul + g.k  for g = su, sv, sz.
fild dword [esp+4] ; ymul
fild dword [esp] ; xmul | ymul
fld dword [sv_j] ; sv.j | xmul | ymul
fmul st0,st2 ; sv.j*ymul | xmul | ymul
fld dword [su_j] ; su.j | sv.j*ymul | xmul | ymul
fmul st0,st3 ; su.j*ymul | sv.j*ymul | xmul | ymul
fld dword [sz_j] ; sz.j | su.j*ymul | sv.j*ymul | xmul | ymul
fmulp st4,st0 ; su.j*ymul | sv.j*ymul | xmul | sz.j*ymul
fld dword [sv_i] ; sv.i | su.j*ymul | sv.j*ymul | xmul | sz.j*ymul
fmul st0,st3 ; sv.i*xmul | su.j*ymul | sv.j*ymul | xmul | sz.j*ymul
fld dword [su_i] ; su.i | sv.i*xmul | su.j*ymul | sv.j*ymul | xmul | sz.j*ymul
fmul st0,st4 ; su.i*xmul | sv.i*xmul | su.j*ymul | sv.j*ymul | xmul | sz.j*ymul
fld dword [sz_i] ; sz.i | su.i*xmul | sv.i*xmul | su.j*ymul | sv.j*ymul | xmul | sz.j*ymul
fmulp st5,st0 ; su.i*xmul | sv.i*xmul | su.j*ymul | sv.j*ymul | sz.i*xmul | sz.j*ymul
fxch st1 ; sv.i*xmul | su.i*xmul | su.j*ymul | sv.j*ymul | sz.i*xmul | sz.j*ymul
faddp st3,st0 ; su.i*xmul | su.j*ymul | sv.i*xmul+sv.j*ymul | sz.i*xmul | sz.j*ymul
faddp st1,st0 ; su.i*xmul+su.j*ymul | sv.i*xmul+sv.j*ymul | sz.i*xmul | sz.j*ymul
fxch st3 ; sz.j*ymul | sv.i*xmul+sv.j*ymul | sz.i*xmul | su.i*xmul+su.j*ymul
faddp st2,st0 ; sv.i*xmul+sv.j*ymul | sz.i*xmul+sz.j*ymul | su.i*xmul+su.j*ymul
fadd dword [sv_k] ; v/z | sz.i*xmul+sz.j*ymul | su.i*xmul+su.j*ymul
fxch st1 ; sz.i*xmul+sz.j*ymul | v/z | su.i*xmul+su.j*ymul
fadd dword [sz_k] ; 1/z | v/z | su.i*xmul+su.j*ymul
fxch st2 ; su.i*xmul+su.j*ymul | v/z | 1/z
fadd dword [su_k] ; u/z | v/z | 1/z
fxch st2 ; 1/z | v/z | u/z
fxch st1 ; v/z | 1/z | u/z
; if lighting is on, fill out the light table
; (skipped when the byte at plane_shade is zero)
mov al,[plane_shade]
test al,al
jz .litup
; Build three dword stack arguments for R_CalcTiltedLighting:
;   [esp]   = int(planelightfloat * (1/z + width*sz.i))
;   [esp+4] = int(planelightfloat * 1/z)
;   [esp+8] = ebx (width = pixel count - 1)
push ebx
fild dword [esp] ; width | v/z | 1/z | u/z
fmul dword [sz_i] ; width*sz.i | v/z | 1/z | u/z
fadd st0,st2 ; 1/endz | v/z | 1/z | u/z
fld st2 ; 1/z | 1/endz | v/z | 1/z | u/z
fmul dword [planelightfloat]
fxch st1
fmul dword [planelightfloat]
sub esp,8
fistp dword [esp]
fistp dword [esp+4]
; NOTE(review): v/z, 1/z and u/z stay on the x87 stack across this call, so
; the callee must leave them intact -- confirm.
call R_CalcTiltedLighting
; Remove the three arguments and re-clear eax after the call.
add esp, 12
xor eax, eax
; Drop the two integers (xmul, ymul) pushed before the fild instructions.
.litup add esp, 8
; calculate initial z, u, and v values
fld st1 ; 1/z | v/z | 1/z | u/z
fdivr dword [fp_1] ; z | v/z | 1/z | u/z
fld st3 ; u/z | z | v/z | 1/z | u/z
fmul st0,st1 ; u | z | v/z | 1/z | u/z
fld st2 ; v/z | u | z | v/z | 1/z | u/z
fmulp st2,st0 ; u | v | v/z | 1/z | u/z
; Store integer copies of u and v; the float values stay on the stack.
fld st0
fistp qword [start_u]
fld st1
fistp qword [start_v]
cmp ebx,7 ; Do we have at least 8 pixels to plot?
jl near ShortStrip
; yes, we do, so figure out tex coords at end of this span
; multiply i values by span length (8)
fld dword [su_i] ; su.i
fmul dword [fp_8] ; su.i*8
fld dword [sv_i] ; sv.i | su.i*8
fmul dword [fp_8] ; sv.i*8 | su.i*8
fld dword [sz_i] ; sz.i | sv.i*8 | su.i*8
fmul dword [fp_8] ; sz.i*8 | sv.i*8 | su.i*8
fxch st2 ; su.i*8 | sv.i*8 | sz.i*8
fstp qword [step_uz] ; sv.i*8 | sz.i*8
fstp qword [step_vz] ; sz.i*8
fst qword [step_iz] ; sz.i*8
; find tex coords at start of next span
; Stack here: sz.i*8 | u | v | v/z | 1/z | u/z. Each faddp adds st0 into
; the named register and pops: 1/z, then v/z, then u/z advance by 8 pixels.
faddp st4
fld qword [step_vz]
faddp st3
fld qword [step_uz]
faddp st5
fld st3 ; 1/z | u | v | v/z | 1/z | u/z
fdivr dword [fp_1] ; z | u | v | v/z | 1/z | u/z
; NOTE(review): end_z is written here and below but never read in this file.
fst dword [end_z]
fld st5 ; u/z | z | u | v | v/z | 1/z | u/z
fmul st0,st1 ; u' | z | u | v | v/z | 1/z | u/z
fxch st1 ; z | u' | u | v | v/z | 1/z | u/z
fmul st0,st4 ; v' | u' | u | v | v/z | 1/z | u/z
fxch st3 ; v | u' | u | v' | v/z | 1/z | u/z
; now subtract to get stepping values for this span
fsubr st0,st3 ; v'-v | u' | u | v' | v/z | 1/z | u/z
fxch st2 ; u | u' | v'-v | v' | v/z | 1/z | u/z
fsubr st0,st1 ; u'-u | u' | v'-v | v' | v/z | 1/z | u/z
fxch st2 ; v'-v | u' | u'-u | v' | v/z | 1/z | u/z
fmul dword [fp_8recip] ; vstep | u' | u'-u | v' | v/z | 1/z | u/z
fxch st1 ; u' | vstep | u'-u | v' | v/z | 1/z | u/z
fxch st2 ; u'-u | vstep | u' | v' | v/z | 1/z | u/z
fmul dword [fp_8recip] ; ustep | vstep | u' | v' | v/z | 1/z | u/z
fxch st1 ; vstep | ustep | u' | v' | v/z | 1/z | u/z
fistp qword [step_v] ; ustep | u' | v' | v/z | 1/z | u/z
fistp qword [step_u] ; u | v | v/z | 1/z | u/z
; From here on the u and v on the stack are the values at the end of the
; span about to be drawn, i.e. the start of the one after it.
FullSpan:
xor eax,eax
cmp ebx,15 ; is there another complete span after this one?
jl NextIsShort
; there is a complete span after this one
; Advance 1/z, v/z and u/z by another full span.
fld qword [step_iz]
faddp st4,st0
fld qword [step_vz]
faddp st3,st0
fld qword [step_uz]
faddp st5,st0
jmp StartDiv
NextIsShort:
cmp ebx,8 ; if next span is no more than 1 pixel, then we already
jle DrawFullSpan ; know everything we need to draw it
; ebx is 9..14 here: advance the gradients by fp_quickint[ebx-8] = ebx-7.
fld dword [sz_i] ; sz.i | u | v | v/z | 1/z | u/z
fmul dword [fp_quickint-8*4+ebx*4]
fld dword [sv_i] ; sv.i | sz.i | u | v | v/z | 1/z | u/z
fmul dword [fp_quickint-8*4+ebx*4]
fld dword [su_i] ; su.i | sv.i | sz.i | u | v | v/z | 1/z | u/z
fmul dword [fp_quickint-8*4+ebx*4]
fxch st2 ; sz.i | sv.i | su.i | u | v | v/z | 1/z | u/z
faddp st6,st0 ; sv.i | su.i | u | v | v/z | 1/z | u/z
faddp st4,st0 ; su.i | u | v | v/z | 1/z | u/z
faddp st5,st0 ; u | v | v/z | 1/z | u/z
; Start the divide for the end of the following span; the integer pixel
; code below does not touch the x87 stack until the result is used.
StartDiv:
fld st3 ; 1/z | u | v | v/z | 1/z | u/z
fdivr dword [fp_1] ; z | u | v | v/z | 1/z | u/z
; Draw 8 pixels. The x87 stack is z | u | v | v/z | 1/z | u/z when arriving
; through StartDiv, or u | v | v/z | 1/z | u/z when NextIsShort jumped here
; directly (ebx = 7 or 8).
; ecx = v, edx = u (low dwords, offset by pviewy / pviewx); ebp and esi are
; the working copies from which the texel offset is formed. The work for
; consecutive pixels is interleaved.
DrawFullSpan:
mov ecx,[start_v]
mov edx,[start_u]
add ecx,[pviewy]
add edx,[pviewx]
mov esi,edx
mov ebp,ecx
; pixel 0
x1 shr ebp,26
m1 and esi,0xfc000000
y1 shr esi,20
add ecx,[step_v]
add edx,[step_u]
fetch1 mov al,[ebp+esi+SPACEFILLER4]
mov ebp,[tiltlighting+ebx*4]
mov esi,edx
mov al,[ebp+eax]
mov ebp,ecx
mov [edi+0],al
; pixel 1
x2 shr ebp,26
m2 and esi,0xfc000000
y2 shr esi,20
add ecx,[step_v]
add edx,[step_u]
fetch2 mov al,[ebp+esi+SPACEFILLER4]
mov ebp,[tiltlighting+ebx*4-4]
mov esi,edx
mov al,[ebp+eax]
mov ebp,ecx
mov [edi+1],al
; pixel 2
x3 shr ebp,26
m3 and esi,0xfc000000
y3 shr esi,20
add ecx,[step_v]
add edx,[step_u]
fetch3 mov al,[ebp+esi+SPACEFILLER4]
mov ebp,[tiltlighting+ebx*4-8]
mov esi,edx
mov al,[ebp+eax]
mov ebp,ecx
mov [edi+2],al
; pixel 3
x4 shr ebp,26
m4 and esi,0xfc000000
y4 shr esi,20
add ecx,[step_v]
add edx,[step_u]
fetch4 mov al,[ebp+esi+SPACEFILLER4]
mov ebp,[tiltlighting+ebx*4-12]
mov esi,edx
mov al,[ebp+eax]
mov ebp,ecx
mov [edi+3],al
; pixel 4
x5 shr ebp,26
m5 and esi,0xfc000000
y5 shr esi,20
add ecx,[step_v]
add edx,[step_u]
fetch5 mov al,[ebp+esi+SPACEFILLER4]
mov ebp,[tiltlighting+ebx*4-16]
mov esi,edx
mov al,[ebp+eax]
mov ebp,ecx
mov [edi+4],al
; pixel 5
x6 shr ebp,26
m6 and esi,0xfc000000
y6 shr esi,20
add ecx,[step_v]
add edx,[step_u]
fetch6 mov al,[ebp+esi+SPACEFILLER4]
mov ebp,[tiltlighting+ebx*4-20]
mov esi,edx
mov al,[ebp+eax]
mov ebp,ecx
mov [edi+5],al
; pixel 6
x7 shr ebp,26
m7 and esi,0xfc000000
y7 shr esi,20
add ecx,[step_v]
add edx,[step_u]
fetch7 mov al,[ebp+esi+SPACEFILLER4]
mov ebp,[tiltlighting+ebx*4-24]
; pixel 7 works directly on ecx/edx, which are not needed afterwards; its
; address computation is interleaved with the end of pixel 6.
x8 shr ecx,26
mov al,[ebp+eax]
m8 and edx,0xfc000000
mov [edi+6],al
y8 shr edx,20
mov ebp,[tiltlighting+ebx*4-28]
fetch8 mov al,[edx+ecx+SPACEFILLER4]
mov al,[ebp+eax]
mov [edi+7],al
; Advance the destination and the remaining count; a negative count means
; the run is finished.
add edi,8
sub ebx,8
jl near Done
; The end-of-span u and v become the start of the next span. With z on top
; of the stack, st1 = u and st2 = v.
; NOTE(review): when ebx is 0 here no z was pushed (this span was entered
; from NextIsShort with ebx = 8), so these two stores hold other values;
; OnlyOnePixelAtEnd overwrites both before they are read.
fld st1
fistp qword [start_u]
fld st2
fistp qword [start_v]
cmp ebx,7
jl near EndIsShort
; Another full span follows: turn z (already on the stack) into u', v' and
; per-pixel steps, exactly as for the first span.
fst dword [end_z]
fld st5 ; u/z | z | u | v | v/z | 1/z | u/z
fmul st0,st1 ; u' | z | u | v | v/z | 1/z | u/z
fxch st1 ; z | u' | u | v | v/z | 1/z | u/z
fmul st0,st4 ; v' | u' | u | v | v/z | 1/z | u/z
fxch st3 ; v | u' | u | v' | v/z | 1/z | u/z
fsubr st0,st3 ; v'-v | u' | u | v' | v/z | 1/z | u/z
fxch st2 ; u | u' | v'-v | v' | v/z | 1/z | u/z
fsubr st0,st1 ; u'-u | u' | v'-v | v' | v/z | 1/z | u/z
fxch st2 ; v'-v | u' | u'-u | v' | v/z | 1/z | u/z
fmul dword [fp_8recip] ; vstep | u' | u'-u | v' | v/z | 1/z | u/z
fxch st1 ; u' | vstep | u'-u | v' | v/z | 1/z | u/z
fxch st2 ; u'-u | vstep | u' | v' | v/z | 1/z | u/z
fmul dword [fp_8recip] ; ustep | vstep | u' | v' | v/z | 1/z | u/z
fxch st1 ; vstep | ustep | u' | v' | v/z | 1/z | u/z
fistp qword [step_v] ; ustep | u' | v' | v/z | 1/z | u/z
fistp qword [step_u] ; u | v | v/z | 1/z | u/z
jmp FullSpan
; One pixel is left after a full span. The stack is u | v | v/z | 1/z | u/z
; (no z), so st0 and st1 are the coordinates of that pixel.
OnlyOnePixelAtEnd:
fld st0
fistp qword [start_u]
fld st1
fistp qword [start_v]
; Draw a single pixel from start_u/start_v using tiltlighting[0].
OnlyOnePixel:
mov edx,[start_v]
mov ecx,[start_u]
add edx,[pviewy]
add ecx,[pviewx]
x9 shr edx,26
m9 and ecx,0xfc000000
y9 shr ecx,20
mov ebp,[tiltlighting]
fetch9 mov al,[ecx+edx+SPACEFILLER4]
mov al,[ebp+eax]
mov [edi],al
; Common exit: discard the five values left on the x87 stack (each fcompp
; pops two, fstp st0 pops one) and restore the saved registers.
Done:
fcompp
fcompp
fstp st0
pop ebp
pop edi
pop esi
pop ebx
ret
; The whole run is shorter than 8 pixels (ebx < 7 on entry).
ShortStrip:
cmp ebx,0
jle near OnlyOnePixel
; ebx is 1..6: advance the gradients by fp_quickint[ebx] = ebx+1 and divide.
MoreThanOnePixel:
fld dword [sz_i] ; sz.i | u | v | v/z | 1/z | u/z
fmul dword [fp_quickint+ebx*4]
fld dword [sv_i] ; sv.i | sz.i | u | v | v/z | 1/z | u/z
fmul dword [fp_quickint+ebx*4]
fld dword [su_i] ; su.i | sv.i | sz.i | u | v | v/z | 1/z | u/z
fmul dword [fp_quickint+ebx*4]
fxch st2 ; sz.i | sv.i | su.i | u | v | v/z | 1/z | u/z
faddp st6,st0 ; sv.i | su.i | u | v | v/z | 1/z | u/z
faddp st4,st0 ; su.i | u | v | v/z | 1/z | u/z
faddp st5,st0 ; u | v | v/z | 1/z | u/z
fld st3 ; 1/z | u | v | v/z | 1/z | u/z
fdivr dword [fp_1] ; z | u | v | v/z | 1/z | u/z
jmp CalcPartialSteps
; A partial span follows one or more full spans (ebx < 7 after sub ebx,8).
EndIsShort:
cmp ebx,0
je near OnlyOnePixelAtEnd
; Stack: z | u | v | v/z | 1/z | u/z. Compute the end coordinates and
; multiply the differences by spanrecips[ebx] = 1/(ebx+1) to get the
; per-pixel steps.
CalcPartialSteps:
fst dword [end_z]
fld st5 ; u/z | z | u | v | v/z | 1/z | u/z
fmul st0,st1 ; u' | z | u | v | v/z | 1/z | u/z
fxch st1 ; z | u' | u | v | v/z | 1/z | u/z
fmul st0,st4 ; v' | u' | u | v | v/z | 1/z | u/z
fxch st1 ; u' | v' | u | v | v/z | 1/z | u/z
fsubrp st2,st0 ; v' | u'-u | v | v/z | 1/z | u/z
fsubrp st2,st0 ; u'-u | v'-v | v/z | 1/z | u/z
fmul dword [spanrecips+ebx*4] ;ustep | v'-v | v/z | 1/z | u/z
fxch st1 ; v'-v | ustep | v/z | 1/z | u/z
fmul dword [spanrecips+ebx*4] ;vstep | ustep | v/z | 1/z | u/z
fxch st1 ; ustep | vstep | v/z | 1/z | u/z
fistp qword [step_u] ; vstep | v/z | 1/z | u/z
fistp qword [step_v] ; v/z | 1/z | u/z
mov ecx,[start_v]
mov edx,[start_u]
add ecx,[pviewy]
add edx,[pviewx]
mov esi,edx
mov ebp,ecx
; Rolled loop for the remaining ebx+1 pixels. The flags tested by jge come
; from `dec ebx`; the mov instructions between them leave flags unchanged.
endloop:
x10 shr ebp,26
m10 and esi,0xfc000000
y10 shr esi,20
inc edi
add ecx,[step_v]
add edx,[step_u]
fetch10 mov al,[ebp+esi+SPACEFILLER4]
mov ebp,[tiltlighting+ebx*4]
mov esi,edx
dec ebx
mov al,[ebp+eax]
mov ebp,ecx
mov [edi-1],al
jge endloop
; Discard the three values left on the x87 stack (v/z, 1/z, u/z) and
; restore the saved registers.
fcompp
fstp st0
pop ebp
pop edi
pop esi
pop ebx
ret
rtext_end: