;* ;* tmap2.nas ;* The tilted plane inner loop. ;* ;*--------------------------------------------------------------------------- ;* Copyright 1998-2006 Randy Heit ;* All rights reserved. ;* ;* Redistribution and use in source and binary forms, with or without ;* modification, are permitted provided that the following conditions ;* are met: ;* ;* 1. Redistributions of source code must retain the above copyright ;* notice, this list of conditions and the following disclaimer. ;* 2. Redistributions in binary form must reproduce the above copyright ;* notice, this list of conditions and the following disclaimer in the ;* documentation and/or other materials provided with the distribution. ;* 3. The name of the author may not be used to endorse or promote products ;* derived from this software without specific prior written permission. ;* ;* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR ;* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES ;* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. ;* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, ;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT ;* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ;* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ;* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ;* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF ;* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;*--------------------------------------------------------------------------- ;* ;* I tried doing the ROL trick that R_DrawSpanP_ASM uses, and it was ;* actually slightly slower than the more straight-forward approach ;* used here, probably because the trick requires too much setup time. ;* BITS 32 %include "valgrind.inc" %define SPACEFILLER4 (0x44444444) %ifndef M_TARGET_LINUX %define plane_sz _plane_sz %define plane_su _plane_su %define plane_sv _plane_sv %define plane_shade _plane_shade %define planelightfloat _planelightfloat %define spanend _spanend %define ylookup _ylookup %define dc_destorg _dc_destorg %define ds_colormap _ds_colormap %define ds_source _ds_source %define centery _centery %define centerx _centerx %define ds_curtiltedsource _ds_curtiltedsource %define pviewx _pviewx %define pviewy _pviewy %define tiltlighting _tiltlighting %define R_DrawTiltedPlane_ASM _R_DrawTiltedPlane_ASM %define R_SetTiltedSpanSource_ASM _R_SetTiltedSpanSource_ASM %define R_CalcTiltedLighting _R_CalcTiltedLighting %endif EXTERN plane_sz EXTERN plane_su EXTERN plane_sv EXTERN planelightfloat EXTERN spanend EXTERN ylookup EXTERN dc_destorg EXTERN ds_colormap EXTERN centery EXTERN centerx EXTERN ds_source EXTERN plane_shade EXTERN pviewx EXTERN pviewy EXTERN tiltlighting EXTERN R_CalcTiltedLighting GLOBAL ds_curtiltedsource %define sv_i plane_sv %define sv_j plane_sv+4 %define sv_k plane_sv+8 %define su_i plane_su %define su_j plane_su+4 %define su_k plane_su+8 %define sz_i plane_sz %define sz_j plane_sz+4 %define sz_k plane_sz+8 %define SPANBITS 3 section .bss start_u: resq 1 start_v: resq 1 step_u: resq 1 step_v: resq 1 step_iz: resq 1 step_uz: resq 1 step_vz: resq 1 end_z: resd 1 section .data ds_curtiltedsource: dd SPACEFILLER4 fp_1: spanrecips: dd 0x3f800000 ; 1/1 dd 0x3f000000 ; 1/2 dd 0x3eaaaaab ; 1/3 dd 0x3e800000 ; 1/4 dd 0x3e4ccccd ; 1/5 dd 0x3e2aaaab ; 1/6 dd 0x3e124925 ; 1/7 fp_8recip: dd 0x3e000000 ; 1/8 dd 0x3de38e39 ; 1/9 dd 0x3dcccccd ; 1/10 dd 0x3dba2e8c ; 1/11 dd 0x3daaaaab ; 1/12 dd 0x3d9d89d9 ; 1/13 dd 0x3d924925 ; 1/14 dd 0x3d888889 ; 1/15 fp_quickint: dd 0x3f800000 ; 1 dd 0x40000000 ; 2 dd 0x40400000 ; 3 dd 0x40800000 ; 4 dd 0x40a00000 ; 5 dd 0x40c00000 ; 6 dd 0x40e00000 ; 7 fp_8: dd 0x41000000 ; 8 section .text GLOBAL R_SetTiltedSpanSource_ASM GLOBAL @R_SetTiltedSpanSource_ASM@4 R_SetTiltedSpanSource_ASM: mov ecx,[esp+4] @R_SetTiltedSpanSource_ASM@4: mov [fetch1+3],ecx mov [fetch2+3],ecx mov [fetch3+3],ecx mov [fetch4+3],ecx mov [fetch5+3],ecx mov [fetch6+3],ecx mov [fetch7+3],ecx mov [fetch8+3],ecx mov [fetch9+3],ecx mov [fetch10+3],ecx mov [ds_curtiltedsource],ecx selfmod rtext_start, rtext_end ret GLOBAL SetTiltedSpanSize SetTiltedSpanSize: push ecx mov cl,dl neg cl mov eax,1 shl eax,cl mov cl,[esp] neg cl mov [x1+2],cl mov [x2+2],cl mov [x3+2],cl mov [x4+2],cl mov [x5+2],cl mov [x6+2],cl mov [x7+2],cl mov [x8+2],cl mov [x9+2],cl mov [x10+2],cl sub cl,dl dec eax mov [y1+2],cl mov [y2+2],cl mov [y3+2],cl mov [y4+2],cl mov [y5+2],cl mov [y6+2],cl mov [y7+2],cl mov [y8+2],cl mov [y9+2],cl mov [y10+2],cl not eax pop ecx mov [m1+2],eax mov [m2+2],eax mov [m3+2],eax mov [m4+2],eax mov [m5+2],eax mov [m6+2],eax mov [m7+2],eax mov [m8+2],eax mov [m9+2],eax mov [m10+2],eax selfmod rtext_start, rtext_end ret SECTION .rtext progbits alloc exec write align=64 rtext_start: GLOBAL R_DrawTiltedPlane_ASM GLOBAL @R_DrawTiltedPlane_ASM@8 R_DrawTiltedPlane_ASM: mov ecx,[esp+4] mov edx,[esp+8] ; ecx = y ; edx = x @R_DrawTiltedPlane_ASM@8: push ebx push esi push edi push ebp mov eax,[centery] movzx ebx,word [spanend+ecx*2] sub eax,ecx ; eax = centery-y sub ebx,edx ; ebx = span length - 1 mov edi,[ylookup+ecx*4] push eax add edi,[dc_destorg] add edi,edx ; edi = frame buffer pointer sub edx,[centerx] ; edx = x-centerx push edx xor eax,eax fild dword [esp+4] ; ymul fild dword [esp] ; xmul | ymul fld dword [sv_j] ; sv.j | xmul | ymul fmul st0,st2 ; sv.j*ymul | xmul | ymul fld dword [su_j] ; su.j | sv.j*ymul | xmul | ymul fmul st0,st3 ; su.j*ymul | sv.j*ymul | xmul | ymul fld dword [sz_j] ; sz.j | su.j*ymul | sv.j*ymul | xmul | ymul fmulp st4,st0 ; su.j*ymul | sv.j*ymul | xmul | sz.j*ymul fld dword [sv_i] ; sv.i | su.j*ymul | sv.j*ymul | xmul | sz.j*ymul fmul st0,st3 ; sv.i*xmul | su.j*ymul | sv.j*ymul | xmul | sz.j*ymul fld dword [su_i] ; su.i | sv.i*xmul | su.j*ymul | sv.j*ymul | xmul | sz.j*ymul fmul st0,st4 ; su.i*xmul | sv.i*xmul | su.j*ymul | sv.j*ymul | xmul | sz.j*ymul fld dword [sz_i] ; sz.i | su.i*xmul | sv.i*xmul | su.j*ymul | sv.j*ymul | xmul | sz.j*ymul fmulp st5,st0 ; su.i*xmul | sv.i*xmul | su.j*ymul | sv.j*ymul | sz.i*xmul | sz.j*ymul fxch st1 ; sv.i*xmul | su.i*xmul | su.j*ymul | sv.j*ymul | sz.i*xmul | sz.j*ymul faddp st3,st0 ; su.i*xmul | su.j*ymul | sv.i*xmul+sv.j*ymul | sz.i*xmul | sz.j*ymul faddp st1,st0 ; su.i*xmul+su.j*ymul | sv.i*xmul+sv.j*ymul | sz.i*xmul | sz.j*ymul fxch st3 ; sz.j*ymul | sv.i*xmul+sv.j*ymul | sz.i*xmul | su.i*xmul+su.j*ymul faddp st2,st0 ; sv.i*xmul+sv.j*ymul | sz.i*xmul+sz.j*ymul | su.i*xmul+su.j*ymul fadd dword [sv_k] ; v/z | sz.i*xmul+sz.j*ymul | su.i*xmul+su.j*ymul fxch st1 ; sz.i*xmul+sz.j*ymul | v/z | su.i*xmul+su.j*ymul fadd dword [sz_k] ; 1/z | v/z | su.i*xmul+su.j*ymul fxch st2 ; su.i*xmul+su.j*ymul | v/z | 1/z fadd dword [su_k] ; u/z | v/z | 1/z fxch st2 ; 1/z | v/z | u/z fxch st1 ; v/z | 1/z | u/z ; if lighting is on, fill out the light table mov al,[plane_shade] test al,al jz .litup push ebx fild dword [esp] ; width | v/z | 1/z | u/z fmul dword [sz_i] ; width*sz.i | v/z | 1/z | u/z fadd st0,st2 ; 1/endz | v/z | 1/z | u/z fld st2 ; 1/z | 1/endz | v/z | 1/z | u/z fmul dword [planelightfloat] fxch st1 fmul dword [planelightfloat] sub esp,8 fistp dword [esp] fistp dword [esp+4] call R_CalcTiltedLighting add esp, 12 xor eax, eax .litup add esp, 8 ; calculate initial z, u, and v values fld st1 ; 1/z | v/z | 1/z | u/z fdivr dword [fp_1] ; z | v/z | 1/z | u/z fld st3 ; u/z | z | v/z | 1/z | u/z fmul st0,st1 ; u | z | v/z | 1/z | u/z fld st2 ; v/z | u | z | v/z | 1/z | u/z fmulp st2,st0 ; u | v | v/z | 1/z | u/z fld st0 fistp qword [start_u] fld st1 fistp qword [start_v] cmp ebx,7 ; Do we have at least 8 pixels to plot? jl near ShortStrip ; yes, we do, so figure out tex coords at end of this span ; multiply i values by span length (8) fld dword [su_i] ; su.i fmul dword [fp_8] ; su.i*8 fld dword [sv_i] ; sv.i | su.i*8 fmul dword [fp_8] ; sv.i*8 | su.i*8 fld dword [sz_i] ; sz.i | sv.i*8 | su.i*8 fmul dword [fp_8] ; sz.i*8 | sv.i*8 | su.i*8 fxch st2 ; su.i*8 | sv.i*8 | sz.i*8 fstp qword [step_uz] ; sv.i*8 | sz.i*8 fstp qword [step_vz] ; sz.i*8 fst qword [step_iz] ; sz.i*8 ; find tex coords at start of next span faddp st4 fld qword [step_vz] faddp st3 fld qword [step_uz] faddp st5 fld st3 ; 1/z | u | v | v/z | 1/z | u/z fdivr dword [fp_1] ; z | u | v | v/z | 1/z | u/z fst dword [end_z] fld st5 ; u/z | z | u | v | v/z | 1/z | u/z fmul st0,st1 ; u' | z | u | v | v/z | 1/z | u/z fxch st1 ; z | u' | u | v | v/z | 1/z | u/z fmul st0,st4 ; v' | u' | u | v | v/z | 1/z | u/z fxch st3 ; v | u' | u | v' | v/z | 1/z | u/z ; now subtract to get stepping values for this span fsubr st0,st3 ; v'-v | u' | u | v' | v/z | 1/z | u/z fxch st2 ; u | u' | v'-v | v' | v/z | 1/z | u/z fsubr st0,st1 ; u'-u | u' | v'-v | v' | v/z | 1/z | u/z fxch st2 ; v'-v | u' | u'-u | v' | v/z | 1/z | u/z fmul dword [fp_8recip] ; vstep | u' | u'-u | v' | v/z | 1/z | u/z fxch st1 ; u' | vstep | u'-u | v' | v/z | 1/z | u/z fxch st2 ; u'-u | vstep | u' | v' | v/z | 1/z | u/z fmul dword [fp_8recip] ; ustep | vstep | u' | v' | v/z | 1/z | u/z fxch st1 ; vstep | ustep | u' | v' | v/z | 1/z | u/z fistp qword [step_v] ; ustep | u' | v' | v/z | 1/z | u/z fistp qword [step_u] ; u | v | v/z | 1/z | u/z FullSpan: xor eax,eax cmp ebx,15 ; is there another complete span after this one? jl NextIsShort ; there is a complete span after this one fld qword [step_iz] faddp st4,st0 fld qword [step_vz] faddp st3,st0 fld qword [step_uz] faddp st5,st0 jmp StartDiv NextIsShort: cmp ebx,8 ; if next span is no more than 1 pixel, then we already jle DrawFullSpan ; know everything we need to draw it fld dword [sz_i] ; sz.i | u | v | v/z | 1/z | u/z fmul dword [fp_quickint-8*4+ebx*4] fld dword [sv_i] ; sv.i | sz.i | u | v | v/z | 1/z | u/z fmul dword [fp_quickint-8*4+ebx*4] fld dword [su_i] ; su.i | sv.i | sz.i | u | v | v/z | 1/z | u/z fmul dword [fp_quickint-8*4+ebx*4] fxch st2 ; sz.i | sv.i | su.i | u | v | v/z | 1/z | u/z faddp st6,st0 ; sv.i | su.i | u | v | v/z | 1/z | u/z faddp st4,st0 ; su.i | u | v | v/z | 1/z | u/z faddp st5,st0 ; u | v | v/z | 1/z | u/z StartDiv: fld st3 ; 1/z | u | v | v/z | 1/z | u/z fdivr dword [fp_1] ; z | u | v | v/z | 1/z | u/z DrawFullSpan: mov ecx,[start_v] mov edx,[start_u] add ecx,[pviewy] add edx,[pviewx] mov esi,edx mov ebp,ecx x1 shr ebp,26 m1 and esi,0xfc000000 y1 shr esi,20 add ecx,[step_v] add edx,[step_u] fetch1 mov al,[ebp+esi+SPACEFILLER4] mov ebp,[tiltlighting+ebx*4] mov esi,edx mov al,[ebp+eax] mov ebp,ecx mov [edi+0],al x2 shr ebp,26 m2 and esi,0xfc000000 y2 shr esi,20 add ecx,[step_v] add edx,[step_u] fetch2 mov al,[ebp+esi+SPACEFILLER4] mov ebp,[tiltlighting+ebx*4-4] mov esi,edx mov al,[ebp+eax] mov ebp,ecx mov [edi+1],al x3 shr ebp,26 m3 and esi,0xfc000000 y3 shr esi,20 add ecx,[step_v] add edx,[step_u] fetch3 mov al,[ebp+esi+SPACEFILLER4] mov ebp,[tiltlighting+ebx*4-8] mov esi,edx mov al,[ebp+eax] mov ebp,ecx mov [edi+2],al x4 shr ebp,26 m4 and esi,0xfc000000 y4 shr esi,20 add ecx,[step_v] add edx,[step_u] fetch4 mov al,[ebp+esi+SPACEFILLER4] mov ebp,[tiltlighting+ebx*4-12] mov esi,edx mov al,[ebp+eax] mov ebp,ecx mov [edi+3],al x5 shr ebp,26 m5 and esi,0xfc000000 y5 shr esi,20 add ecx,[step_v] add edx,[step_u] fetch5 mov al,[ebp+esi+SPACEFILLER4] mov ebp,[tiltlighting+ebx*4-16] mov esi,edx mov al,[ebp+eax] mov ebp,ecx mov [edi+4],al x6 shr ebp,26 m6 and esi,0xfc000000 y6 shr esi,20 add ecx,[step_v] add edx,[step_u] fetch6 mov al,[ebp+esi+SPACEFILLER4] mov ebp,[tiltlighting+ebx*4-20] mov esi,edx mov al,[ebp+eax] mov ebp,ecx mov [edi+5],al x7 shr ebp,26 m7 and esi,0xfc000000 y7 shr esi,20 add ecx,[step_v] add edx,[step_u] fetch7 mov al,[ebp+esi+SPACEFILLER4] mov ebp,[tiltlighting+ebx*4-24] x8 shr ecx,26 mov al,[ebp+eax] m8 and edx,0xfc000000 mov [edi+6],al y8 shr edx,20 mov ebp,[tiltlighting+ebx*4-28] fetch8 mov al,[edx+ecx+SPACEFILLER4] mov al,[ebp+eax] mov [edi+7],al add edi,8 sub ebx,8 jl near Done fld st1 fistp qword [start_u] fld st2 fistp qword [start_v] cmp ebx,7 jl near EndIsShort fst dword [end_z] fld st5 ; u/z | z | u | v | v/z | 1/z | u/z fmul st0,st1 ; u' | z | u | v | v/z | 1/z | u/z fxch st1 ; z | u' | u | v | v/z | 1/z | u/z fmul st0,st4 ; v' | u' | u | v | v/z | 1/z | u/z fxch st3 ; v | u' | u | v' | v/z | 1/z | u/z fsubr st0,st3 ; v'-v | u' | u | v' | v/z | 1/z | u/z fxch st2 ; u | u' | v'-v | v' | v/z | 1/z | u/z fsubr st0,st1 ; u'-u | u' | v'-v | v' | v/z | 1/z | u/z fxch st2 ; v'-v | u' | u'-u | v' | v/z | 1/z | u/z fmul dword [fp_8recip] ; vstep | u' | u'-u | v' | v/z | 1/z | u/z fxch st1 ; u' | vstep | u'-u | v' | v/z | 1/z | u/z fxch st2 ; u'-u | vstep | u' | v' | v/z | 1/z | u/z fmul dword [fp_8recip] ; ustep | vstep | u' | v' | v/z | 1/z | u/z fxch st1 ; vstep | ustep | u' | v' | v/z | 1/z | u/z fistp qword [step_v] ; ustep | u' | v' | v/z | 1/z | u/z fistp qword [step_u] ; u | v | v/z | 1/z | u/z jmp FullSpan OnlyOnePixelAtEnd: fld st0 fistp qword [start_u] fld st1 fistp qword [start_v] OnlyOnePixel: mov edx,[start_v] mov ecx,[start_u] add edx,[pviewy] add ecx,[pviewx] x9 shr edx,26 m9 and ecx,0xfc000000 y9 shr ecx,20 mov ebp,[tiltlighting] fetch9 mov al,[ecx+edx+SPACEFILLER4] mov al,[ebp+eax] mov [edi],al Done: fcompp fcompp fstp st0 pop ebp pop edi pop esi pop ebx ret ShortStrip: cmp ebx,0 jle near OnlyOnePixel MoreThanOnePixel: fld dword [sz_i] ; sz.i | u | v | v/z | 1/z | u/z fmul dword [fp_quickint+ebx*4] fld dword [sv_i] ; sv.i | sz.i | u | v | v/z | 1/z | u/z fmul dword [fp_quickint+ebx*4] fld dword [su_i] ; su.i | sv.i | sz.i | u | v | v/z | 1/z | u/z fmul dword [fp_quickint+ebx*4] fxch st2 ; sz.i | sv.i | su.i | u | v | v/z | 1/z | u/z faddp st6,st0 ; sv.i | su.i | u | v | v/z | 1/z | u/z faddp st4,st0 ; su.i | u | v | v/z | 1/z | u/z faddp st5,st0 ; u | v | v/z | 1/z | u/z fld st3 ; 1/z | u | v | v/z | 1/z | u/z fdivr dword [fp_1] ; z | u | v | v/z | 1/z | u/z jmp CalcPartialSteps EndIsShort: cmp ebx,0 je near OnlyOnePixelAtEnd CalcPartialSteps: fst dword [end_z] fld st5 ; u/z | z | u | v | v/z | 1/z | u/z fmul st0,st1 ; u' | z | u | v | v/z | 1/z | u/z fxch st1 ; z | u' | u | v | v/z | 1/z | u/z fmul st0,st4 ; v' | u' | u | v | v/z | 1/z | u/z fxch st1 ; u' | v' | u | v | v/z | 1/z | u/z fsubrp st2,st0 ; v' | u'-u | v | v/z | 1/z | u/z fsubrp st2,st0 ; u'-u | v'-v | v/z | 1/z | u/z fmul dword [spanrecips+ebx*4] ;ustep | v'-v | v/z | 1/z | u/z fxch st1 ; v'-v | ustep | v/z | 1/z | u/z fmul dword [spanrecips+ebx*4] ;vstep | ustep | v/z | 1/z | u/z fxch st1 ; ustep | vstep | v/z | 1/z | u/z fistp qword [step_u] ; vstep | v/z | 1/z | u/z fistp qword [step_v] ; v/z | 1/z | u/z mov ecx,[start_v] mov edx,[start_u] add ecx,[pviewy] add edx,[pviewx] mov esi,edx mov ebp,ecx endloop: x10 shr ebp,26 m10 and esi,0xfc000000 y10 shr esi,20 inc edi add ecx,[step_v] add edx,[step_u] fetch10 mov al,[ebp+esi+SPACEFILLER4] mov ebp,[tiltlighting+ebx*4] mov esi,edx dec ebx mov al,[ebp+eax] mov ebp,ecx mov [edi-1],al jge endloop fcompp fstp st0 pop ebp pop edi pop esi pop ebx ret rtext_end: