;*
;* tmap.nas
;* The texture-mapping inner loops in pure assembly language.
;*
;*---------------------------------------------------------------------------
;* Copyright 1998-2005 Randy Heit
;* All rights reserved.
;*
;* Redistribution and use in source and binary forms, with or without
;* modification, are permitted provided that the following conditions
;* are met:
;*
;* 1. Redistributions of source code must retain the above copyright
;*    notice, this list of conditions and the following disclaimer.
;* 2. Redistributions in binary form must reproduce the above copyright
;*    notice, this list of conditions and the following disclaimer in the
;*    documentation and/or other materials provided with the distribution.
;* 3. The name of the author may not be used to endorse or promote products
;*    derived from this software without specific prior written permission.
;*
;* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
;* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
;* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
;* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
;* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
;* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;*---------------------------------------------------------------------------
;*

BITS 32

; Segment/section definition macros.

	SECTION .data

; Placeholder for every 32-bit immediate/displacement that is rewritten at
; run time by the patch routines in this file.
%define SPACEFILLER4	(0x44444444)

; If you change this in r_draw.c, be sure to change it here, too!
FUZZTABLE	equ	50

%ifdef M_TARGET_LINUX

EXTERN ylookup
EXTERN centery
EXTERN fuzzpos
EXTERN fuzzoffset
EXTERN NormalLight
EXTERN realviewheight
EXTERN fuzzviewheight
EXTERN CPU

EXTERN dc_pitch
EXTERN dc_colormap
EXTERN dc_iscale
EXTERN dc_texturefrac
EXTERN dc_source
EXTERN dc_yl
EXTERN dc_yh
EXTERN dc_x
EXTERN dc_count
EXTERN dc_dest
EXTERN dc_destorg

EXTERN dc_ctspan
EXTERN dc_temp

EXTERN ds_xstep
EXTERN ds_ystep
EXTERN ds_colormap
EXTERN ds_source
EXTERN ds_x1
EXTERN ds_x2
EXTERN ds_xfrac
EXTERN ds_yfrac
EXTERN ds_y

GLOBAL ds_cursource
GLOBAL ds_curcolormap

%else

EXTERN _ylookup
EXTERN _centery
EXTERN _fuzzpos
EXTERN _fuzzoffset
EXTERN _NormalLight
EXTERN _realviewheight
EXTERN _fuzzviewheight
EXTERN _CPU

EXTERN _dc_pitch
EXTERN _dc_colormap
EXTERN _dc_iscale
EXTERN _dc_texturefrac
EXTERN _dc_source
EXTERN _dc_yl
EXTERN _dc_yh
EXTERN _dc_x
EXTERN _dc_count
EXTERN _dc_dest
EXTERN _dc_destorg

EXTERN _dc_ctspan
EXTERN _dc_temp

EXTERN _ds_xstep
EXTERN _ds_ystep
EXTERN _ds_colormap
EXTERN _ds_source
EXTERN _ds_x1
EXTERN _ds_x2
EXTERN _ds_xfrac
EXTERN _ds_yfrac
EXTERN _ds_y

GLOBAL _ds_cursource
GLOBAL _ds_curcolormap

; Map the undecorated names used by the code below onto the
; underscore-prefixed symbols declared above.
%define ylookup			_ylookup
%define centery			_centery
%define fuzzpos			_fuzzpos
%define fuzzoffset		_fuzzoffset
%define NormalLight		_NormalLight
%define realviewheight		_realviewheight
%define fuzzviewheight		_fuzzviewheight
%define CPU			_CPU

%define dc_pitch		_dc_pitch
%define dc_colormap		_dc_colormap
%define dc_iscale		_dc_iscale
%define dc_texturefrac		_dc_texturefrac
%define dc_source		_dc_source
%define dc_yl			_dc_yl
%define dc_yh			_dc_yh
%define dc_x			_dc_x
%define dc_count		_dc_count
%define dc_dest			_dc_dest
%define dc_destorg		_dc_destorg

%define dc_ctspan		_dc_ctspan
%define dc_temp			_dc_temp

%define ds_xstep		_ds_xstep
%define ds_ystep		_ds_ystep
%define ds_colormap		_ds_colormap
%define ds_source		_ds_source
%define ds_x1			_ds_x1
%define ds_x2			_ds_x2
%define ds_xfrac		_ds_xfrac
%define ds_yfrac		_ds_yfrac
%define ds_y			_ds_y

%define R_SetSpanSource_ASM	_R_SetSpanSource_ASM
%define R_SetSpanSize_ASM	_R_SetSpanSize_ASM
%define R_SetSpanColormap_ASM	_R_SetSpanColormap_ASM

%endif

; Last values handed to R_SetSpanSource_ASM / R_SetSpanColormap_ASM.
_ds_cursource:
ds_cursource:
	DD	0

_ds_curcolormap:
ds_curcolormap:
	DD	0

; Local stuff:
lastAddress	DD	0
pixelcount	DD	0

	SECTION .text

;*----------------------------------------------------------------------
;*
;* R_SetSpanSource_ASM
;*
;* Writes the span source pointer into the 32-bit displacement of every
;* texel-read instruction (spread*/mspread*) of the two span drawers and
;* records it in ds_cursource.
;*
;* R_SetSpanSource_ASM     : pointer at [esp+4]
;* @R_SetSpanSource_ASM@4  : pointer already in ecx
;*
;*----------------------------------------------------------------------

GLOBAL	@R_SetSpanSource_ASM@4
GLOBAL	R_SetSpanSource_ASM

R_SetSpanSource_ASM:
	mov	ecx,[esp+4]

@R_SetSpanSource_ASM@4:
	; +2 skips the opcode and ModRM bytes of "mov bl,[reg+disp32]"
	mov	[spreada+2],ecx
	mov	[spreadb+2],ecx
	mov	[spreadc+2],ecx
	mov	[spreadd+2],ecx
	mov	[spreade+2],ecx
	mov	[spreadf+2],ecx
	mov	[spreadg+2],ecx

	mov	[mspreada+2],ecx
	mov	[mspreadb+2],ecx
	mov	[mspreadc+2],ecx
	mov	[mspreadd+2],ecx
	mov	[mspreade+2],ecx
	mov	[mspreadf+2],ecx
	mov	[mspreadg+2],ecx

	mov	[ds_cursource],ecx
	ret

;*----------------------------------------------------------------------
;*
;* R_SetSpanColormap_ASM
;*
;* Same as R_SetSpanSource_ASM, but patches the colormap-lookup
;* instructions (spmap*/mspmap*) and records the value in ds_curcolormap.
;*
;* R_SetSpanColormap_ASM     : pointer at [esp+4]
;* @R_SetSpanColormap_ASM@4  : pointer already in ecx
;*
;*----------------------------------------------------------------------

GLOBAL	@R_SetSpanColormap_ASM@4
GLOBAL	R_SetSpanColormap_ASM

R_SetSpanColormap_ASM:
	mov	ecx,[esp+4]

@R_SetSpanColormap_ASM@4:
	mov	[spmapa+2],ecx
	mov	[spmapb+2],ecx
	mov	[spmapc+2],ecx
	mov	[spmapd+2],ecx
	mov	[spmape+2],ecx
	mov	[spmapf+2],ecx
	mov	[spmapg+2],ecx

	mov	[mspmapa+2],ecx
	mov	[mspmapb+2],ecx
	mov	[mspmapc+2],ecx
	mov	[mspmapd+2],ecx
	mov	[mspmape+2],ecx
	mov	[mspmapf+2],ecx
	mov	[mspmapg+2],ecx

	mov	[ds_curcolormap],ecx
	ret

;*----------------------------------------------------------------------
;*
;* R_SetSpanSize_ASM
;*
;* [esp+4] -> edx, [esp+8] -> ecx.  Patches the shift counts, rotate
;* counts and AND masks of both span drawers:
;*
;*   dl        -> dsy1/dsy2 (shl counts)
;*   -dl       -> dsy3/dsy4 (shr counts; only the low 5 bits of a shift
;*                count are used by the CPU)
;*   cl        -> dsx1..dsx7 (rol counts)
;*   (1<<(cl+dl))-1           -> dsm1, dsm5, dsm6, dsm7
;*   that mask ror cl         -> dsm2, dsm3, dsm4
;*   ~(rotated mask & 0xffff) -> dsm8, dsm9
;*
;* and the same for the dms* labels of the masked drawer.
;*
;* NOTE(review): the code uses ecx/edx after the call to
;* SetTiltedSpanSize, so that routine (defined elsewhere) has to
;* preserve them -- confirm.
;*
;*----------------------------------------------------------------------

GLOBAL R_SetSpanSize_ASM

EXTERN SetTiltedSpanSize

R_SetSpanSize_ASM:
	mov	edx,[esp+4]
	mov	ecx,[esp+8]
	call	SetTiltedSpanSize

	; shift/rotate counts: the imm8 is the third byte of the instruction
	mov	[dsy1+2],dl
	mov	[dsy2+2],dl

	mov	[dsx1+2],cl
	mov	[dsx2+2],cl
	mov	[dsx3+2],cl
	mov	[dsx4+2],cl
	mov	[dsx5+2],cl
	mov	[dsx6+2],cl
	mov	[dsx7+2],cl

	mov	[dmsy1+2],dl
	mov	[dmsy2+2],dl

	mov	[dmsx1+2],cl
	mov	[dmsx2+2],cl
	mov	[dmsx3+2],cl
	mov	[dmsx4+2],cl
	mov	[dmsx5+2],cl
	mov	[dmsx6+2],cl
	mov	[dmsx7+2],cl

	push	ecx
	add	ecx,edx
	mov	eax,1
	shl	eax,cl
	dec	eax			; eax = (1 << (cl+dl)) - 1

	; "and ebp,imm32" keeps its immediate at +2, "and eax,imm32" at +1
	mov	[dsm1+2],eax
	mov	[dsm5+1],eax
	mov	[dsm6+1],eax
	mov	[dsm7+1],eax

	mov	[dmsm1+2],eax
	mov	[dmsm5+1],eax
	mov	[dmsm6+1],eax
	mov	[dmsm7+1],eax

	pop	ecx
	ror	eax,cl			; mask for use before the rol in the drawers

	mov	[dsm2+2],eax
	mov	[dsm3+2],eax
	mov	[dsm4+2],eax

	mov	[dmsm2+2],eax
	mov	[dmsm3+2],eax
	mov	[dmsm4+2],eax

	and	eax,0xffff
	not	eax

	mov	[dsm8+2],eax
	mov	[dsm9+2],eax

	mov	[dmsm8+2],eax
	mov	[dmsm9+2],eax

	neg	dl
	mov	[dsy3+2],dl
	mov	[dsy4+2],dl

	mov	[dmsy3+2],dl
	mov	[dmsy4+2],dl

; Shared "nothing to do" return; R_DrawSpanP_ASM also jumps here.
aret:	ret

; The drawers below are patched in place, so they live in a section that
; is both executable and writable.
	SECTION .rtext	progbits alloc exec write align=64

;*----------------------------------------------------------------------
;*
;* R_DrawSpanP_ASM
;*
;* Draws the horizontal span ds_x1..ds_x2 on row ds_y.  Each pixel is a
;* texel read (spread*) followed by a colormap lookup (spmap*).  The
;* source/colormap addresses, shift counts and masks in this routine are
;* placeholders that R_SetSpanSource_ASM, R_SetSpanColormap_ASM and
;* R_SetSpanSize_ASM overwrite.
;*
;* Side effect: ds_xstep and ds_ystep are overwritten with their
;* repacked forms.
;*
;*----------------------------------------------------------------------

GLOBAL	@R_DrawSpanP_ASM@0
GLOBAL	_R_DrawSpanP_ASM
GLOBAL	R_DrawSpanP_ASM

; eax: scratch
; ebx: zero
; ecx: yfrac at top end, xfrac int part at low end
; edx: xfrac frac part at top end
; edi: dest
; ebp: scratch
; esi: count

	align 16

@R_DrawSpanP_ASM@0:
_R_DrawSpanP_ASM:
R_DrawSpanP_ASM:
	mov	eax,[ds_x2]
	mov	ecx,[ds_x1]
	sub	eax,ecx
	jl	aret			; count < 0: nothing to do, so leave

	push	ebx
	push	edi
	push	ebp
	push	esi

	mov	edi,ecx
	add	edi,[dc_destorg]
	mov	ecx,[ds_y]
	add	edi,[ylookup+ecx*4]	; edi = dc_destorg + ds_x1 + ylookup[ds_y]
	mov	edx,[ds_xstep]
dsy1:	shl	edx,6
	mov	ebp,[ds_xstep]
dsy3:	shr	ebp,26
	xor	ebx,ebx
	lea	esi,[eax+1]		; esi = pixel count
	mov	[ds_xstep],edx
	mov	edx,[ds_ystep]
	mov	ecx,[ds_xfrac]
dsy4:	shr	ecx,26
	; strict dword: R_SetSpanSize_ASM stores a full dword at dsm8+2 and
	; dsm9+2, so the assembler must not pick the sign-extended imm8 form.
dsm8:	and	edx,strict dword 0xffffffc0
	or	ebp,edx
	mov	[ds_ystep],ebp
	mov	ebp,[ds_yfrac]
	mov	edx,[ds_xfrac]
dsy2:	shl	edx,6
dsm9:	and	ebp,strict dword 0xffffffc0
	or	ecx,ebp
	shr	esi,1
	jnc	dseven1

; do odd pixel

	mov	ebp,ecx
dsx1:	rol	ebp,6
dsm1:	and	ebp,0xfff
	add	edx,[ds_xstep]
	adc	ecx,[ds_ystep]
spreada:	mov	bl,[ebp+SPACEFILLER4]
spmapa:	mov	bl,[ebx+SPACEFILLER4]
	mov	[edi],bl
	inc	edi

dseven1:	shr	esi,1
	jnc	dsrest

; do two more pixels
	mov	ebp,ecx
	add	edx,[ds_xstep]
	adc	ecx,[ds_ystep]
dsm2:	and	ebp,0xfc00003f
dsx2:	rol	ebp,6
	mov	eax,ecx
	add	edx,[ds_xstep]
	adc	ecx,[ds_ystep]
spreadb:	mov	bl,[ebp+SPACEFILLER4]	;read texel1
dsx3:	rol	eax,6
dsm6:	and	eax,0xfff
spmapb:	mov	bl,[ebx+SPACEFILLER4]	;map texel1
	mov	[edi],bl		;store texel1
	add	edi,2
spreadc:	mov	bl,[eax+SPACEFILLER4]	;read texel2
spmapc:	mov	bl,[ebx+SPACEFILLER4]	;map texel2
	mov	[edi-1],bl		;store texel2

; do the rest

dsrest:	test	esi,esi
	jz near	dsdone

	align 16

; four pixels per iteration; esi counts groups of four
dsloop:	mov	ebp,ecx
spstep1d:	add	edx,[ds_xstep]
spstep2d:	adc	ecx,[ds_ystep]
dsm3:	and	ebp,0xfc00003f
dsx4:	rol	ebp,6
	mov	eax,ecx
spstep1e:	add	edx,[ds_xstep]
spstep2e:	adc	ecx,[ds_ystep]
spreadd:	mov	bl,[ebp+SPACEFILLER4]	;read texel1
dsx5:	rol	eax,6
dsm5:	and	eax,0xfff
spmapd:	mov	bl,[ebx+SPACEFILLER4]	;map texel1
	mov	[edi],bl		;store texel1
	mov	ebp,ecx
spreade:	mov	bl,[eax+SPACEFILLER4]	;read texel2
spstep1f:	add	edx,[ds_xstep]
spstep2f:	adc	ecx,[ds_ystep]
dsm4:	and	ebp,0xfc00003f
dsx6:	rol	ebp,6
spmape:	mov	bl,[ebx+SPACEFILLER4]	;map texel2
	mov	eax,ecx
	mov	[edi+1],bl		;store texel2
spreadf:	mov	bl,[ebp+SPACEFILLER4]	;read texel3
spmapf:	mov	bl,[ebx+SPACEFILLER4]	;map texel3
	add	edi,4
dsx7:	rol	eax,6
dsm7:	and	eax,0xfff
	mov	[edi-2],bl		;store texel3
spreadg:	mov	bl,[eax+SPACEFILLER4]	;read texel4
spstep1g:	add	edx,[ds_xstep]
spstep2g:	adc	ecx,[ds_ystep]
spmapg:	mov	bl,[ebx+SPACEFILLER4]	;map texel4
	dec	esi
	mov	[edi-1],bl		;store texel4
	jnz near dsloop

dsdone:	pop	esi
	pop	ebp
	pop	edi
	pop	ebx

; R_DrawSpanMaskedP_ASM also uses this ret for its early exit.
rdspret:	ret

; This is the same as the previous routine, except it doesn't draw pixels
; where the texture's color value is 0.

GLOBAL	@R_DrawSpanMaskedP_ASM@0
GLOBAL	_R_DrawSpanMaskedP_ASM
GLOBAL	R_DrawSpanMaskedP_ASM

; eax: scratch
; ebx: zero
; ecx: yfrac at top end, xfrac int part at low end
; edx: xfrac frac part at top end
; edi: dest
; ebp: scratch
; esi: count

	align 16

@R_DrawSpanMaskedP_ASM@0:
_R_DrawSpanMaskedP_ASM:
R_DrawSpanMaskedP_ASM:
	mov	eax,[ds_x2]
	mov	ecx,[ds_x1]
	sub	eax,ecx
	jl	rdspret			; count < 0: nothing to do, so leave

	push	ebx
	push	edi
	push	ebp
	push	esi

	mov	edi,ecx
	add	edi,[dc_destorg]
	mov	ecx,[ds_y]
	add	edi,[ylookup+ecx*4]
	mov	edx,[ds_xstep]
dmsy1:	shl	edx,6
	mov	ebp,[ds_xstep]
dmsy3:	shr	ebp,26
	xor	ebx,ebx
	lea	esi,[eax+1]
	mov	[ds_xstep],edx
	mov	edx,[ds_ystep]
	mov	ecx,[ds_xfrac]
dmsy4:	shr	ecx,26
	; strict dword: patched as a full dword at +2 by R_SetSpanSize_ASM
dmsm8:	and	edx,strict dword 0xffffffc0
	or	ebp,edx
	mov	[ds_ystep],ebp
	mov	ebp,[ds_yfrac]
	mov	edx,[ds_xfrac]
dmsy2:	shl	edx,6
dmsm9:	and	ebp,strict dword 0xffffffc0
	or	ecx,ebp
	shr	esi,1
	jnc	dmseven1

; do odd pixel

	mov	ebp,ecx
dmsx1:	rol	ebp,6
dmsm1:	and	ebp,0xfff
	add	edx,[ds_xstep]
	adc	ecx,[ds_ystep]
mspreada:	mov	bl,[ebp+SPACEFILLER4]
	cmp	bl,0
	je	mspskipa		; texel 0: leave the destination alone
mspmapa:	mov	bl,[ebx+SPACEFILLER4]
	mov	[edi],bl
mspskipa:	inc	edi

dmseven1:	shr	esi,1
	jnc	dmsrest

; do two more pixels
	mov	ebp,ecx
	add	edx,[ds_xstep]
	adc	ecx,[ds_ystep]
dmsm2:	and	ebp,0xfc00003f
dmsx2:	rol	ebp,6
	mov	eax,ecx
	add	edx,[ds_xstep]
	adc	ecx,[ds_ystep]
mspreadb:	mov	bl,[ebp+SPACEFILLER4]	;read texel1
dmsx3:	rol	eax,6
dmsm6:	and	eax,0xfff
	cmp	bl,0
	je	mspskipb
mspmapb:	mov	bl,[ebx+SPACEFILLER4]	;map texel1
	mov	[edi],bl		;store texel1
mspskipb:	add	edi,2
mspreadc:	mov	bl,[eax+SPACEFILLER4]	;read texel2
	cmp	bl,0
	je	dmsrest
mspmapc:	mov	bl,[ebx+SPACEFILLER4]	;map texel2
	mov	[edi-1],bl		;store texel2

; do the rest

dmsrest:	test	esi,esi
	jz near	dmsdone

	align 16

dmsloop:	mov	ebp,ecx
mspstep1d:	add	edx,[ds_xstep]
mspstep2d:	adc	ecx,[ds_ystep]
dmsm3:	and	ebp,0xfc00003f
dmsx4:	rol	ebp,6
	mov	eax,ecx
mspstep1e:	add	edx,[ds_xstep]
mspstep2e:	adc	ecx,[ds_ystep]
mspreadd:	mov	bl,[ebp+SPACEFILLER4]	;read texel1
dmsx5:	rol	eax,6
dmsm5:	and	eax,0xfff
	cmp	bl,0
	mov	ebp,ecx			; mov leaves the cmp flags intact
	je	mspreade
mspmapd:	mov	bl,[ebx+SPACEFILLER4]	;map texel1
	mov	[edi],bl		;store texel1
mspreade:	mov	bl,[eax+SPACEFILLER4]	;read texel2
mspstep1f:	add	edx,[ds_xstep]
mspstep2f:	adc	ecx,[ds_ystep]
dmsm4:	and	ebp,0xfc00003f
dmsx6:	rol	ebp,6
	cmp	bl,0
	mov	eax,ecx
	je	mspreadf
mspmape:	mov	bl,[ebx+SPACEFILLER4]	;map texel2
	mov	[edi+1],bl		;store texel2
mspreadf:	mov	bl,[ebp+SPACEFILLER4]	;read texel3
	add	edi,4
dmsx7:	rol	eax,6
dmsm7:	and	eax,0xfff
	cmp	bl,0
	je	mspreadg
mspmapf:	mov	bl,[ebx+SPACEFILLER4]	;map texel3
	mov	[edi-2],bl		;store texel3
mspreadg:	mov	bl,[eax+SPACEFILLER4]	;read texel4
mspstep1g:	add	edx,[ds_xstep]
mspstep2g:	adc	ecx,[ds_ystep]
	cmp	bl,0
	je	mspskipg
mspmapg:	mov	bl,[ebx+SPACEFILLER4]	;map texel4
	mov	[edi-1],bl		;store texel4
mspskipg:	dec	esi
	jnz near dmsloop

dmsdone:	pop	esi
	pop	ebp
	pop	edi
	pop	ebx

	ret

;*----------------------------------------------------------------------
;*
;* R_DrawColumnP
;*
;* Draws dc_count pixels of a colormapped texture column starting one
;* pitch step after (dc_dest - pitch).  The pitch immediates at rdcp1,
;* rdcp2 and rdcp3 are placeholders.
;* NOTE(review): presumably filled in by ASM_PatchPitch -- confirm.
;*
;*----------------------------------------------------------------------

GLOBAL	@R_DrawColumnP_ASM@0
GLOBAL	_R_DrawColumnP_ASM
GLOBAL	R_DrawColumnP_ASM

	align 16

R_DrawColumnP_ASM:
_R_DrawColumnP_ASM:
@R_DrawColumnP_ASM@0:

; the pixel count comes precomputed in dc_count
	mov	ecx,[dc_count]
	test	ecx,ecx
	jle near rdcpret		; count <= 0: nothing to do, so leave

	push	ebp			; save registers
	push	ebx
	push	edi
	push	esi

; the destination pointer comes precomputed in dc_dest
	mov	edi,[dc_dest]
	mov	ebp,ecx
	mov	ebx,[dc_texturefrac]	; ebx = frac
rdcp1:	sub	edi,SPACEFILLER4	; back up one step; the loops add before storing
	mov	ecx,ebx
	shr	ecx,16
	mov	esi,[dc_source]
	mov	edx,[dc_iscale]
	mov	eax,[dc_colormap]

	cmp	BYTE [CPU+66],byte 5
	jg	rdcploop2

	; need 12 bytes of filler to make it aligned
	db	0x8D,0x80,0,0,0,0	; lea eax,[eax+00000000]
	db	0x8D,0xBF,0,0,0,0	; lea edi,[edi+00000000]

	align 16

; The registers should now look like this:
;
;	[31  ..  16][15 .. 8][7 .. 0]
; eax	[colormap		    ]
; ebx	[yi	    ][yf	    ]
; ecx	[scratch		    ]
; edx	[dyi	    ][dyf	    ]
; esi	[source texture column	    ]
; edi	[destination screen pointer ]
; ebp	[counter		    ]
;
; Note the partial register stalls on anything better than a Pentium
; That's why there are two versions of this loop.

rdcploop:
	mov	cl,[esi+ecx]		; Fetch texel
	xor	ch,ch
	add	ebx,edx			; increment frac
rdcp2:	add	edi,SPACEFILLER4	; increment destination pointer
	mov	cl,[eax+ecx]		; colormap texel
	mov	[edi],cl		; Store texel
	mov	ecx,ebx
	shr	ecx,16
	dec	ebp
	jnz	rdcploop		; loop

	pop	esi
	pop	edi
	pop	ebx
	pop	ebp
rdcpret:
	ret

	align 16

rdcploop2:
	movzx	ecx,byte [esi+ecx]	; Fetch texel
	add	ebx,edx			; increment frac
	mov	cl,[eax+ecx]		; colormap texel
rdcp3:	add	edi,SPACEFILLER4	; increment destination pointer
	mov	[edi],cl		; Store texel
	mov	ecx,ebx
	shr	ecx,16
	dec	ebp
	jnz	rdcploop2		; loop

	pop	esi
	pop	edi
	pop	ebx
	pop	ebp
	ret

;*----------------------------------------------------------------------
;*
;* R_DrawFuzzColumnP
;*
;* For each row from dc_yl (raised to 1 if <= 0) through dc_yh (capped
;* at fuzzviewheight) in column dc_x: read the screen byte at
;* dest + fuzzoffset[fuzzpos], look it up in the table at
;* [NormalLight] + 256*6 and store the result at dest.  fuzzpos wraps
;* at FUZZTABLE and is written back on exit.
;*
;* The loops read fuzzoffset[ecx] after incrementing ecx, so index
;* FUZZTABLE itself gets read.
;* NOTE(review): fuzzoffset presumably has FUZZTABLE+1 entries -- confirm.
;*
;*----------------------------------------------------------------------

GLOBAL	@R_DrawFuzzColumnP_ASM@0
GLOBAL	_R_DrawFuzzColumnP_ASM
GLOBAL	R_DrawFuzzColumnP_ASM

	align 16

R_DrawFuzzColumnP_ASM:
_R_DrawFuzzColumnP_ASM:
@R_DrawFuzzColumnP_ASM@0:

; Adjust borders. Low...
	mov	eax,[dc_yl]
	push	ebx
	push	esi
	push	edi
	push	ebp
	cmp	eax,0
	jg	.ylok
	mov	eax,1
	nop

; ...and high.
.ylok:	mov	edx,[fuzzviewheight]
	mov	esi,[dc_yh]
	cmp	esi,edx
	jle	.yhok
	mov	esi,edx
	nop

.yhok:	mov	edx,[dc_x]
	sub	esi,eax			; esi = count
	js near	.dfcdone		; Zero length (or less)

	mov	edi,[ylookup+eax*4]
	; ebx indexes the colormap through bl in all three loops, so its
	; upper 24 bits must be zero on every path, including the jump
	; straight to .fuzz0 below.  It used to hold a copy of dc_x here.
	xor	ebx,ebx
	add	edi,[dc_destorg]
	mov	eax,[NormalLight]
	mov	ecx,[fuzzpos]
	add	edi,edx			; edx still holds dc_x
	add	eax,256*6
	inc	esi
	mov	ebp,[dc_pitch]
	mov	edx,FUZZTABLE
	test	ecx,ecx
	je	.fuzz0

;
; esi = count
; edi = dest
; ecx = fuzzpos
; eax = colormap 6
;

; first loop: end with fuzzpos or count 0, whichever happens first
	sub	edx,ecx			; edx = # of entries left in fuzzoffset
	mov	ebx,esi
	cmp	esi,edx
	jle	.enuf
	mov	esi,edx
.enuf:	sub	ebx,esi			; ebx = pixels left after this loop
	mov	edx,[fuzzoffset+ecx*4]
	push	ebx
	xor	ebx,ebx

.loop1:	inc	ecx
	mov	bl,[edi+edx]
	dec	esi
	mov	bl,[eax+ebx]
	mov	[edi],bl
	lea	edi,[edi+ebp]		; lea/mov keep the flags from dec
	mov	edx,[fuzzoffset+ecx*4]
	jnz	.loop1

; second loop: Chunk it into groups of FUZZTABLE-sized spans and do those
	pop	esi
	cmp	ecx,FUZZTABLE
	jl	.savefuzzpos
	xor	ecx,ecx
	nop

.fuzz0:	cmp	esi,FUZZTABLE
	jl	.chunked

.oloop:	lea	edx,[esi-FUZZTABLE]
	mov	esi,FUZZTABLE
	push	edx
	mov	edx,[fuzzoffset+ecx*4]

.iloop:	inc	ecx
	mov	bl,[edi+edx]
	dec	esi
	mov	bl,[eax+ebx]
	mov	[edi],bl
	lea	edi,[edi+ebp]
	mov	edx,[fuzzoffset+ecx*4]
	jnz	.iloop

	pop	esi
	xor	ecx,ecx
	cmp	esi,FUZZTABLE
	jge	.oloop

; third loop: Do whatever is left
.chunked:	test	esi,esi
	jle	.savefuzzpos

	mov	edx,[fuzzoffset+ecx*4]
	nop

.loop3:	inc	ecx
	mov	bl,[edi+edx]
	dec	esi
	mov	bl,[eax+ebx]
	mov	[edi],bl
	lea	edi,[edi+ebp]
	mov	edx,[fuzzoffset+ecx*4]
	jnz	.loop3

.savefuzzpos:
	mov	[fuzzpos],ecx
.dfcdone:
	pop	ebp
	pop	edi
	pop	esi
	pop	ebx
	ret

;*----------------------------------------------------------------------
;*
;* R_DrawColumnHorizP_ASM
;*
;* Copies unmapped texels for rows dc_yl..dc_yh into dc_temp, 4 bytes
;* apart, starting at dc_temp + dc_yl*4 + (dc_x & 3).  Also appends the
;* pair (dc_yl, dc_yh) at the pointer held in dc_ctspan[dc_x & 3] and
;* advances that pointer by 8.
;*
;*----------------------------------------------------------------------

GLOBAL	@R_DrawColumnHorizP_ASM@0
GLOBAL	_R_DrawColumnHorizP_ASM
GLOBAL	R_DrawColumnHorizP_ASM

	align 16

@R_DrawColumnHorizP_ASM@0:
R_DrawColumnHorizP_ASM:
_R_DrawColumnHorizP_ASM:

; count = dc_yh - dc_yl;

	mov	eax,[dc_yh]
	mov	ecx,[dc_yl]
	sub	eax,ecx
	mov	edx,[dc_x]

	jl near	.leave			; count < 0: nothing to do, so leave

	push	ebp			; save registers
	push	ebx
	push	edi
	push	esi

	inc	eax			; make 0 count mean 0 pixels
	and	edx,3
	push	eax			; [esp] = count
	mov	esi,[dc_ctspan+edx*4]
	lea	eax,[dc_temp+ecx*4+edx]	; eax = top of column in buffer
	mov	ebp,[dc_yh]
	mov	[esi],ecx
	mov	[esi+4],ebp
	add	esi,8
	mov	edi,[dc_source]
	mov	[dc_ctspan+edx*4],esi
	mov	esi,[dc_iscale]
	mov	ecx,[dc_texturefrac]	; ecx = frac
	mov	dl,[edi]		; load cache
	mov	ebx,[esp]
	and	ebx,0xfffffff8
	jnz	.mthan8

; Register usage in the following code is:
;
; eax: dest
; edi: source
; ecx: frac (16.16)
; esi: fracstep (16.16)
; ebx: add1
; ebp: add2
;  dl: texel1
;  dh: texel2
;[esp] count

; there are fewer than 8 pixels to draw

	mov	ebx,[esp]
.lthan8:	shr	ebx,1
	jnc	.even

; do one pixel before loop (little opportunity for pairing)

	mov	ebp,ecx			; copy frac to ebp
	add	ecx,esi			; increment frac
	shr	ebp,16			; shift frac over to low end
	add	eax,4
	mov	dl,[edi+ebp]
	mov	[eax-4],dl

.even:	test	ebx,ebx
	jz near	.done

.loop2:	mov	[esp],ebx		; save counter
	mov	ebx,ecx			; copy frac for texel1 to ebx
	shr	ebx,16			; shift frac for texel1 to low end
	add	ecx,esi			; increment frac
	mov	ebp,ecx			; copy frac for texel2 to ebp
	shr	ebp,16			; shift frac for texel2 to low end
	add	ecx,esi			; increment frac
	mov	dl,[edi+ebx]		; read texel1
	mov	ebx,[esp]		; fetch counter
	mov	dh,[edi+ebp]		; read texel2
	mov	[eax],dl		; write texel1
	mov	[eax+4],dh		; write texel2
	add	eax,8			; increment dest
	dec	ebx			; decrement counter
	jnz	.loop2			; loop until it hits 0

	jmp	.done

; there are more than 8 pixels to draw. position eax as close to a 32 byte
; boundary as possible, then do whatever is left.

.mthan8:	test	eax,4
	jz	.try2

	mov	ebp,ecx			; frac: in ebp
	add	ecx,esi			; step
	shr	ebp,16			; frac: shift
	add	eax,4			; increment dest
	mov	ebx,[esp]		; fetch counter
	mov	dl,[edi+ebp]		; tex: read
	dec	ebx			; decrement counter
	mov	[eax-4],dl		; tex: write
	mov	[esp],ebx		; store counter

.try2:	test	eax,8
	jz	.try4

	mov	ebx,ecx			; frac1: in ebx
	add	ecx,esi			; step
	shr	ebx,16			; frac1: shift
	mov	ebp,ecx			; frac2: in ebp
	shr	ebp,16			; frac2: shift
	add	ecx,esi			; step
	mov	dl,[edi+ebx]		; tex1: read
	mov	ebx,[esp]		; fetch counter
	mov	dh,[edi+ebp]		; tex2: read
	mov	[eax],dl		; tex1: write
	mov	[eax+4],dh		; tex2: write
	sub	ebx,2			; decrement counter
	add	eax,8			; increment dest
	mov	[esp],ebx		; store counter

.try4:	test	eax,16
	jz	.try8

	mov	ebx,ecx			; frac1: in ebx
	add	ecx,esi			; step
	shr	ebx,16			; frac1: shift
	mov	ebp,ecx			; frac2: in ebp
	shr	ebp,16			; frac2: shift
	add	ecx,esi			; step
	mov	dl,[edi+ebx]		; tex1: read
	mov	ebx,ecx			; frac3: in ebx
	shr	ebx,16			; frac3: shift
	mov	dh,[edi+ebp]		; tex2: read
	add	ecx,esi			; step
	mov	[eax],dl		; tex1: write
	mov	[eax+4],dh		; tex2: write
	mov	ebp,ecx			; frac4: in ebp
	shr	ebp,16			; frac4: shift
	add	ecx,esi			; step
	mov	dl,[edi+ebx]		; tex3: read
	mov	ebx,[esp]		; fetch counter
	mov	dh,[edi+ebp]		; tex4: read
	sub	ebx,4			; decrement counter
	mov	[esp],ebx		; store counter
	mov	[eax+8],dl		; tex3: write
	mov	[eax+12],dh		; tex4: write
	add	eax,16			; increment dest

.try8:	mov	ebx,[esp]		; make counter count groups of 8
	sub	esp,4			; scratch slot for the group counter
	shr	ebx,3
	jmp	.tail8			; jnz at .tail8 tests the result of shr

	align 16

.loop8:	mov	[esp],ebx		; save counter
	mov	ebx,ecx			; frac1: in ebx
	shr	ebx,16			; frac1: shift
	add	ecx,esi			; step
	mov	ebp,ecx			; frac2: in ebp
	shr	ebp,16			; frac2: shift
	add	ecx,esi			; step
	mov	dl,[edi+ebx]		; tex1: read
	mov	ebx,ecx			; frac3: in ebx
	mov	dh,[edi+ebp]		; tex2: read
	shr	ebx,16			; frac3: shift
	add	ecx,esi			; step
	mov	[eax],dl		; tex1: write
	mov	[eax+4],dh		; tex2: write
	mov	ebp,ecx			; frac4: in ebp
	shr	ebp,16			; frac4: shift
	add	ecx,esi			; step
	mov	dl,[edi+ebx]		; tex3: read
	mov	ebx,ecx			; frac5: in ebx
	mov	dh,[edi+ebp]		; tex4: read
	shr	ebx,16			; frac5: shift
	mov	[eax+8],dl		; tex3: write
	mov	[eax+12],dh		; tex4: write
	add	ecx,esi			; step
	mov	ebp,ecx			; frac6: in ebp
	shr	ebp,16			; frac6: shift
	mov	dl,[edi+ebx]		; tex5: read
	add	ecx,esi			; step
	mov	ebx,ecx			; frac7: in ebx
	mov	[eax+16],dl		; tex5: write
	shr	ebx,16			; frac7: shift
	mov	dh,[edi+ebp]		; tex6: read
	add	ecx,esi			; step
	mov	ebp,ecx			; frac8: in ebp
	mov	[eax+20],dh		; tex6: write
	shr	ebp,16			; frac8: shift
	add	eax,32			; increment dest pointer
	mov	dl,[edi+ebx]		; tex7: read
	mov	ebx,[esp]		; fetch counter
	mov	[eax-8],dl		; tex7: write
	mov	dh,[edi+ebp]		; tex8: read
	add	ecx,esi			; step
	mov	[eax-4],dh		; tex8: write
	mov	dl,[eax]		; load cache
	dec	ebx			; decrement counter
.tail8:	jnz near .loop8			; loop if more to do

	pop	ebp			; drop the scratch slot
	mov	ebx,[esp]
	and	ebx,7			; pixels left over after the groups of 8
	jnz near .lthan8

.done:	pop	eax
	pop	esi
	pop	edi
	pop	ebx
	pop	ebp
.leave:	ret

;*----------------------------------------------------------------------
;*
;* rt_copy1col_asm
;*
;* ecx = hx
;* edx = sx
;* [esp+4] = yl
;* [esp+8] = yh
;*
;* Copies rows yl..yh of column hx of dc_temp (4 bytes per row) to the
;* screen at dc_destorg + ylookup[yl] + sx, dc_pitch bytes per row.
;*
;* The undecorated entry takes all four arguments on the stack: it
;* re-pushes yl/yh, loads hx/sx into ecx/edx and falls into the register
;* entry, whose "ret 8" removes the re-pushed copies.
;*
;*----------------------------------------------------------------------

GLOBAL	@rt_copy1col_asm@16
GLOBAL	_rt_copy1col_asm
GLOBAL	rt_copy1col_asm

	align 16

rt_copy1col_asm:
_rt_copy1col_asm:
	pop	eax			; eax = return address
	mov	edx,[esp+4*3]		; yh
	mov	ecx,[esp+4*2]		; yl
	push	edx
	push	ecx
	mov	ecx,[esp+4*2]		; hx
	mov	edx,[esp+4*3]		; sx
	push	eax

@rt_copy1col_asm@16:
	mov	eax, [esp+4]		; eax = yl
	push	ebx
	mov	ebx, [esp+12]		; ebx = yh
	push	esi
	sub	ebx, eax
	push	edi
	js	.done			; yh < yl: nothing to copy

	lea	esi,[eax*4]
	inc	ebx			; ebx = count
	mov	eax,edx
	lea	ecx,[dc_temp+ecx+esi]	; ecx = source
	mov	edi,[ylookup+esi]
	mov	esi,[dc_pitch]		; esi = pitch
	add	eax,edi			; eax = dest
	add	eax,[dc_destorg]

	shr	ebx,1
	jnc	.even

	mov	dl,[ecx]
	add	ecx,4
	mov	[eax],dl
	add	eax,esi

.even:	and	ebx,ebx
	jz	.done

; two rows per iteration
.loop:	mov	dl,[ecx]
	mov	dh,[ecx+4]
	mov	[eax],dl
	mov	[eax+esi],dh
	add	ecx,8
	lea	eax,[eax+esi*2]
	dec	ebx
	jnz	.loop

.done:	pop	edi
	pop	esi
	pop	ebx
	ret	8

;*----------------------------------------------------------------------
;*
;* rt_copy4cols_asm
;*
;* ecx = sx
;* edx = yl
;* [esp+4] = yh
;*
;*----------------------------------------------------------------------

GLOBAL	@rt_copy4cols_asm@12
GLOBAL	_rt_copy4cols_asm
GLOBAL	rt_copy4cols_asm

	align 16

; Stack-argument entry (sx, yl, yh all on the stack): re-push yh, load
; sx/yl into ecx/edx and fall into the register entry.  The "ret 4" at
; the end removes the re-pushed copy.
rt_copy4cols_asm:
_rt_copy4cols_asm:
	pop	eax			; return address
	mov	ecx,[esp+8]		; yh
	mov	edx,[esp+4]		; yl
	push	ecx
	mov	ecx,[esp+4]		; sx
	push	eax

; Register entry: copies one dword (four columns) per row from
; dc_temp + yl*4 to dc_destorg + ylookup[yl] + sx for rows yl..yh.
@rt_copy4cols_asm@12:
	push	ebx
	mov	ebx,[esp+8]		; yh
	push	esi
	sub	ebx,edx			; yh - yl
	push	edi
	js	.exit			; negative span: nothing to do

	inc	ebx			; row count
	mov	eax,ecx
	mov	esi,[ylookup+edx*4]
	lea	ecx,[dc_temp+edx*4]	; source pointer
	mov	edx,[dc_pitch]		; bytes per screen row
	add	eax,esi
	add	eax,[dc_destorg]	; destination pointer

	shr	ebx,1			; ebx = row pairs, CF = odd row
	jnc	.pairs

	; single leading row
	mov	esi,[ecx]
	add	ecx,4
	mov	[eax],esi
	add	eax,edx

.pairs:
	test	ebx,ebx
	jz	.exit

.rows:
	mov	esi,[ecx]
	mov	edi,[ecx+4]
	mov	[eax],esi
	mov	[eax+edx],edi
	add	ecx,8
	lea	eax,[eax+edx*2]
	dec	ebx
	jnz	.rows

.exit:
	pop	edi
	pop	esi
	pop	ebx
	ret	4

;*----------------------------------------------------------------------
;*
;* rt_map1col_asm
;*
;* ecx = hx
;* edx = sx
;* [esp+4] = yl
;* [esp+8] = yh
;*
;*----------------------------------------------------------------------

GLOBAL	@rt_map1col_asm@16
GLOBAL	_rt_map1col_asm
GLOBAL	rt_map1col_asm

	align 16

; Stack-argument entry: re-push yl/yh, load hx/sx into ecx/edx and fall
; into the register entry, whose "ret 8" pops the re-pushed copies.
rt_map1col_asm:
_rt_map1col_asm:
	pop	eax			; return address
	mov	edx,[esp+4*3]		; yh
	mov	ecx,[esp+4*2]		; yl
	push	edx
	push	ecx
	mov	ecx,[esp+4*2]		; hx
	mov	edx,[esp+4*3]		; sx
	push	eax

; Register entry: like rt_copy1col_asm, but every byte taken from
; dc_temp is translated through the table at [dc_colormap] before it is
; stored to the screen.
@rt_map1col_asm@16:
	mov	eax,[esp+4]		; yl
	push	ebx
	mov	ebx,[esp+12]		; yh
	push	ebp
	push	esi
	sub	ebx, eax		; yh - yl
	push	edi
	js	.exit			; negative span: nothing to do

	lea	edi,[eax*4]		; yl * 4
	mov	esi,[dc_colormap]	; translation table
	inc	ebx			; row count
	mov	eax,edx
	lea	ebp,[dc_temp+ecx+edi]	; source pointer
	mov	ecx,[ylookup+edi]
	mov	edi,[dc_pitch]		; bytes per screen row
	add	eax,ecx
	xor	ecx,ecx			; ecx/edx become zero-extended byte indices
	xor	edx,edx
	add	eax,[dc_destorg]	; destination pointer

	shr	ebx,1			; ebx = row pairs, CF = odd row
	jnc	.pairs

	; single leading row
	mov	dl,[ebp]
	add	ebp,4
	mov	dl,[esi+edx]
	mov	[eax],dl
	add	eax,edi

.pairs:
	test	ebx,ebx
	jz	.exit

.rows:
	mov	dl,[ebp]
	mov	cl,[ebp+4]
	add	ebp,8
	mov	dl,[esi+edx]
	mov	cl,[esi+ecx]
	mov	[eax],dl
	mov	[eax+edi],cl
	dec	ebx
	lea	eax,[eax+edi*2]		; lea leaves the flags from dec alone
	jnz	.rows

.exit:
	pop	edi
	pop	esi
	pop	ebp
	pop	ebx
	ret	8

;*----------------------------------------------------------------------
;*
;* rt_map4cols_asm
;*
;*
;* rt_map4cols_asm1 is for PPro and above
;* rt_map4cols_asm2 is for Pentium and below
;*
;* ecx = sx
;* edx = yl
;* [esp+4] = yh
;*
;*----------------------------------------------------------------------

GLOBAL	@rt_map4cols_asm1@12
GLOBAL	_rt_map4cols_asm1
GLOBAL	rt_map4cols_asm1

	align 16

; Stack-argument entry: re-push yh, load sx/yl into ecx/edx and fall
; into the register entry, whose "ret 4" pops the re-pushed copy.
rt_map4cols_asm1:
_rt_map4cols_asm1:
	pop	eax			; return address
	mov	ecx,[esp+8]		; yh
	mov	edx,[esp+4]		; yl
	push	ecx
	mov	ecx,[esp+4]		; sx
	push	eax

; Register entry: for rows yl..yh, translate the four bytes of each
; dc_temp row through [dc_colormap] and store them byte by byte at
; dc_destorg + ylookup[yl] + sx.
@rt_map4cols_asm1@12:
	push	ebx
	mov	ebx,[esp+8]		; yh
	push	ebp
	push	esi
	sub	ebx,edx			; yh - yl
	push	edi
	js near	.exit			; negative span: nothing to do

	mov	esi,[dc_colormap]	; translation table
	shl	edx,2			; yl * 4
	mov	eax,ecx
	inc	ebx			; row count
	mov	edi,[ylookup+edx]
	lea	ebp,[dc_temp+edx]	; source pointer
	add	eax,edi
	mov	edi,[dc_pitch]		; bytes per screen row
	add	eax,[dc_destorg]	; destination pointer
	xor	ecx,ecx			; ecx/edx become zero-extended byte indices
	xor	edx,edx

	shr	ebx,1			; ebx = row pairs, CF = odd row
	jnc	.pairs

	; single leading row
	mov	dl,[ebp]
	mov	cl,[ebp+1]
	add	ebp,4
	mov	dl,[esi+edx]
	mov	cl,[esi+ecx]
	mov	[eax],dl
	mov	[eax+1],cl
	mov	dl,[ebp-2]
	mov	cl,[ebp-1]
	mov	dl,[esi+edx]
	mov	cl,[esi+ecx]
	mov	[eax+2],dl
	mov	[eax+3],cl
	add	eax,edi

.pairs:
	test	ebx,ebx
	jz	.exit

.rows:
	; first row of the pair
	mov	dl,[ebp]
	mov	cl,[ebp+1]
	add	ebp,8
	mov	dl,[esi+edx]
	mov	cl,[esi+ecx]
	mov	[eax],dl
	mov	[eax+1],cl
	mov	dl,[ebp-6]
	mov	cl,[ebp-5]
	mov	dl,[esi+edx]
	mov	cl,[esi+ecx]
	mov	[eax+2],dl
	mov	[eax+3],cl
	; second row of the pair
	mov	dl,[ebp-4]
	mov	cl,[ebp-3]
	mov	dl,[esi+edx]
	mov	cl,[esi+ecx]
	mov	[eax+edi],dl
	mov	[eax+edi+1],cl
	mov	dl,[ebp-2]
	mov	cl,[ebp-1]
	mov	dl,[esi+edx]
	mov	cl,[esi+ecx]
	mov	[eax+edi+2],dl
	mov	[eax+edi+3],cl
	lea	eax,[eax+edi*2]
	dec	ebx
	jnz	.rows

.exit:
	pop	edi
	pop	esi
	pop	ebp
	pop	ebx
	ret	4

GLOBAL	@rt_map4cols_asm2@12
GLOBAL	_rt_map4cols_asm2
GLOBAL	rt_map4cols_asm2

	align 16

; Stack-argument entry: same shuffle as rt_map4cols_asm1.
rt_map4cols_asm2:
_rt_map4cols_asm2:
	pop	eax			; return address
	mov	ecx,[esp+8]		; yh
	mov	edx,[esp+4]		; yl
	push	ecx
	mov	ecx,[esp+4]		; sx
	push	eax

; Register entry: same job as rt_map4cols_asm1, but the main loop packs
; the four translated bytes of a row into ecx and writes them with a
; single dword store.
@rt_map4cols_asm2@12:
	push	ebx
	mov	ebx,[esp+8]		; yh
	push	ebp
	push	esi
	sub	ebx,edx			; yh - yl
	push	edi
	js near	.exit			; negative span: nothing to do

	mov	esi,[dc_colormap]	; translation table
	shl	edx,2			; yl * 4
	mov	eax,ecx
	inc	ebx			; row count
	mov	edi,[ylookup+edx]
	lea	ebp,[dc_temp+edx]	; source pointer
	add	eax,edi
	mov	edi,[dc_pitch]		; bytes per screen row
	add	eax,[dc_destorg]	; destination pointer
	xor	ecx,ecx
	xor	edx,edx			; edx = zero-extended byte index

	shr	ebx,1			; ebx = row pairs, CF = odd row
	jnc	.pairs

	; single leading row, stored byte by byte
	mov	dl,[ebp]
	mov	cl,[ebp+1]
	add	ebp,4
	mov	dl,[esi+edx]
	mov	cl,[esi+ecx]
	mov	[eax],dl
	mov	[eax+1],cl
	mov	dl,[ebp-2]
	mov	cl,[ebp-1]
	mov	dl,[esi+edx]
	mov	cl,[esi+ecx]
	mov	[eax+2],dl
	mov	[eax+3],cl
	add	eax,edi

.pairs:
	test	ebx,ebx
	jz	.exit

.rows:
	; first row: build bytes 3,2 in cx, shift them up, then bytes 1,0
	mov	dl,[ebp+3]
	mov	ch,[esi+edx]
	mov	dl,[ebp+2]
	mov	cl,[esi+edx]
	shl	ecx,16
	mov	dl,[ebp+1]
	mov	ch,[esi+edx]
	mov	dl,[ebp]
	mov	cl,[esi+edx]
	mov	[eax],ecx
	add	eax,edi

	; second row
	mov	dl,[ebp+7]
	mov	ch,[esi+edx]
	mov	dl,[ebp+6]
	mov	cl,[esi+edx]
	shl	ecx,16
	mov	dl,[ebp+5]
	mov	ch,[esi+edx]
	mov	dl,[ebp+4]
	mov	cl,[esi+edx]
	mov	[eax],ecx
	add	eax,edi
	add	ebp,8
	dec	ebx
	jnz	.rows

.exit:
	pop	edi
	pop	esi
	pop	ebp
	pop	ebx
	ret	4

;************************

	SECTION .text

EXTERN setvlinebpl_
EXTERN setpitch3

GLOBAL	@ASM_PatchPitch@0
GLOBAL	_ASM_PatchPitch
GLOBAL	ASM_PatchPitch

; Stores the current dc_pitch into the 32-bit immediates of the
; instructions labelled rdcp1, rdcp2 and rdcp3 (defined elsewhere in
; this file), calls setpitch3, then tail-jumps to setvlinebpl_.
; eax still holds dc_pitch at the call and is not reloaded before the
; jump.
; NOTE(review): both externs presumably take the pitch in eax -- confirm
; against their definitions.
ASM_PatchPitch:
_ASM_PatchPitch:
@ASM_PatchPitch@0:
	mov	eax,[dc_pitch]
	mov	[rdcp1+2],eax		; +2 skips opcode and ModRM bytes
	mov	[rdcp2+2],eax
	mov	[rdcp3+2],eax
	call	setpitch3
	jmp	setvlinebpl_