quakeforge/libs/video/renderer/sw/d_draw.S

1049 lines
24 KiB
ArmAsm

/*
d_draw.S
x86 assembly-language horizontal 8-bpp span-drawing code.
Copyright (C) 1996-1997 Id Software, Inc.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to:
Free Software Foundation, Inc.
59 Temple Place - Suite 330
Boston, MA 02111-1307, USA
$Id$
*/
#ifdef HAVE_CONFIG_H
# include <config.h>
#endif
#include "asm_i386.h"
#include "quakeasm.h"
#include "asm_draw.h"
#include "d_ifacea.h"
#ifdef PIC
#undef USE_INTEL_ASM //XXX asm pic hack
#endif
#ifdef USE_INTEL_ASM
//----------------------------------------------------------------------
// 8-bpp horizontal span drawing code for polygons, with no transparency.
//
// Assumes there is at least one span in pspans, and that every span
// contains at least one pixel
//----------------------------------------------------------------------
.text
// out-of-line, rarely-needed clamping code
LClampHigh0:
movl C(bbextents),%esi
jmp LClampReentry0
LClampHighOrLow0:
jg LClampHigh0
xorl %esi,%esi
jmp LClampReentry0
LClampHigh1:
movl C(bbextentt),%edx
jmp LClampReentry1
LClampHighOrLow1:
jg LClampHigh1
xorl %edx,%edx
jmp LClampReentry1
LClampLow2:
movl $2048,%ebp
jmp LClampReentry2
LClampHigh2:
movl C(bbextents),%ebp
jmp LClampReentry2
LClampLow3:
movl $2048,%ecx
jmp LClampReentry3
LClampHigh3:
movl C(bbextentt),%ecx
jmp LClampReentry3
LClampLow4:
movl $2048,%eax
jmp LClampReentry4
LClampHigh4:
movl C(bbextents),%eax
jmp LClampReentry4
LClampLow5:
movl $2048,%ebx
jmp LClampReentry5
LClampHigh5:
movl C(bbextentt),%ebx
jmp LClampReentry5
#define pspans 4+16
.align 4
.globl C(D_DrawSpans8)
C(D_DrawSpans8):
pushl %ebp // preserve caller's stack frame
pushl %edi
pushl %esi // preserve register variables
pushl %ebx
//
// set up scaled-by-8 steps, for 8-long segments; also set up cacheblock
// and span list pointers
//
// TODO: any overlap from rearranging?
flds C(d_sdivzstepu)
fmuls C(fp_8)
movl C(cacheblock),%edx
flds C(d_tdivzstepu)
fmuls C(fp_8)
movl pspans(%esp),%ebx // point to the first span descriptor
flds C(d_zistepu)
fmuls C(fp_8)
movl %edx,C(pbase) // pbase = cacheblock
fstps C(zi8stepu)
fstps C(tdivz8stepu)
fstps C(sdivz8stepu)
LSpanLoop:
//
// set up the initial s/z, t/z, and 1/z on the FP stack, and generate the
// initial s and t values
//
// FIXME: pipeline FILD?
fildl espan_t_v(%ebx)
fildl espan_t_u(%ebx)
fld %st(1) // dv | du | dv
fmuls C(d_sdivzstepv) // dv*d_sdivzstepv | du | dv
fld %st(1) // du | dv*d_sdivzstepv | du | dv
fmuls C(d_sdivzstepu) // du*d_sdivzstepu | dv*d_sdivzstepv | du | dv
fld %st(2) // du | du*d_sdivzstepu | dv*d_sdivzstepv | du | dv
fmuls C(d_tdivzstepu) // du*d_tdivzstepu | du*d_sdivzstepu |
// dv*d_sdivzstepv | du | dv
fxch %st(1) // du*d_sdivzstepu | du*d_tdivzstepu |
// dv*d_sdivzstepv | du | dv
faddp %st(0),%st(2) // du*d_tdivzstepu |
// du*d_sdivzstepu + dv*d_sdivzstepv | du | dv
fxch %st(1) // du*d_sdivzstepu + dv*d_sdivzstepv |
// du*d_tdivzstepu | du | dv
fld %st(3) // dv | du*d_sdivzstepu + dv*d_sdivzstepv |
// du*d_tdivzstepu | du | dv
fmuls C(d_tdivzstepv) // dv*d_tdivzstepv |
// du*d_sdivzstepu + dv*d_sdivzstepv |
// du*d_tdivzstepu | du | dv
fxch %st(1) // du*d_sdivzstepu + dv*d_sdivzstepv |
// dv*d_tdivzstepv | du*d_tdivzstepu | du | dv
fadds C(d_sdivzorigin) // sdivz = d_sdivzorigin + dv*d_sdivzstepv +
// du*d_sdivzstepu; stays in %st(2) at end
fxch %st(4) // dv | dv*d_tdivzstepv | du*d_tdivzstepu | du |
// s/z
fmuls C(d_zistepv) // dv*d_zistepv | dv*d_tdivzstepv |
// du*d_tdivzstepu | du | s/z
fxch %st(1) // dv*d_tdivzstepv | dv*d_zistepv |
// du*d_tdivzstepu | du | s/z
faddp %st(0),%st(2) // dv*d_zistepv |
// dv*d_tdivzstepv + du*d_tdivzstepu | du | s/z
fxch %st(2) // du | dv*d_tdivzstepv + du*d_tdivzstepu |
// dv*d_zistepv | s/z
fmuls C(d_zistepu) // du*d_zistepu |
// dv*d_tdivzstepv + du*d_tdivzstepu |
// dv*d_zistepv | s/z
fxch %st(1) // dv*d_tdivzstepv + du*d_tdivzstepu |
// du*d_zistepu | dv*d_zistepv | s/z
fadds C(d_tdivzorigin) // tdivz = d_tdivzorigin + dv*d_tdivzstepv +
// du*d_tdivzstepu; stays in %st(1) at end
fxch %st(2) // dv*d_zistepv | du*d_zistepu | t/z | s/z
faddp %st(0),%st(1) // dv*d_zistepv + du*d_zistepu | t/z | s/z
flds C(fp_64k) // fp_64k | dv*d_zistepv + du*d_zistepu | t/z | s/z
fxch %st(1) // dv*d_zistepv + du*d_zistepu | fp_64k | t/z | s/z
fadds C(d_ziorigin) // zi = d_ziorigin + dv*d_zistepv +
// du*d_zistepu; stays in %st(0) at end
// 1/z | fp_64k | t/z | s/z
//
// calculate and clamp s & t
//
fdivr %st(0),%st(1) // 1/z | z*64k | t/z | s/z
//
// point %edi to the first pixel in the span
//
movl C(d_viewbuffer),%ecx
movl espan_t_v(%ebx),%eax
movl %ebx,C(pspantemp) // preserve spans pointer
movl C(tadjust),%edx
movl C(sadjust),%esi
movl C(d_scantable)(,%eax,4),%edi // v * screenwidth
addl %ecx,%edi
movl espan_t_u(%ebx),%ecx
addl %ecx,%edi // pdest = &pdestspan[scans->u];
movl espan_t_count(%ebx),%ecx
//
// now start the FDIV for the end of the span
//
cmpl $8,%ecx
ja LSetupNotLast1
decl %ecx
jz LCleanup1 // if only one pixel, no need to start an FDIV
movl %ecx,C(spancountminus1)
// finish up the s and t calcs
fxch %st(1) // z*64k | 1/z | t/z | s/z
fld %st(0) // z*64k | z*64k | 1/z | t/z | s/z
fmul %st(4),%st(0) // s | z*64k | 1/z | t/z | s/z
fxch %st(1) // z*64k | s | 1/z | t/z | s/z
fmul %st(3),%st(0) // t | s | 1/z | t/z | s/z
fxch %st(1) // s | t | 1/z | t/z | s/z
fistpl C(s) // 1/z | t | t/z | s/z
fistpl C(t) // 1/z | t/z | s/z
fildl C(spancountminus1)
flds C(d_tdivzstepu) // C(d_tdivzstepu) | spancountminus1
flds C(d_zistepu) // C(d_zistepu) | C(d_tdivzstepu) | spancountminus1
fmul %st(2),%st(0) // C(d_zistepu)*scm1 | C(d_tdivzstepu) | scm1
fxch %st(1) // C(d_tdivzstepu) | C(d_zistepu)*scm1 | scm1
fmul %st(2),%st(0) // C(d_tdivzstepu)*scm1 | C(d_zistepu)*scm1 | scm1
fxch %st(2) // scm1 | C(d_zistepu)*scm1 | C(d_tdivzstepu)*scm1
fmuls C(d_sdivzstepu) // C(d_sdivzstepu)*scm1 | C(d_zistepu)*scm1 |
// C(d_tdivzstepu)*scm1
fxch %st(1) // C(d_zistepu)*scm1 | C(d_sdivzstepu)*scm1 |
// C(d_tdivzstepu)*scm1
faddp %st(0),%st(3) // C(d_sdivzstepu)*scm1 | C(d_tdivzstepu)*scm1
fxch %st(1) // C(d_tdivzstepu)*scm1 | C(d_sdivzstepu)*scm1
faddp %st(0),%st(3) // C(d_sdivzstepu)*scm1
faddp %st(0),%st(3)
flds C(fp_64k)
fdiv %st(1),%st(0) // this is what we've gone to all this trouble to
// overlap
jmp LFDIVInFlight1
LCleanup1:
// finish up the s and t calcs
fxch %st(1) // z*64k | 1/z | t/z | s/z
fld %st(0) // z*64k | z*64k | 1/z | t/z | s/z
fmul %st(4),%st(0) // s | z*64k | 1/z | t/z | s/z
fxch %st(1) // z*64k | s | 1/z | t/z | s/z
fmul %st(3),%st(0) // t | s | 1/z | t/z | s/z
fxch %st(1) // s | t | 1/z | t/z | s/z
fistpl C(s) // 1/z | t | t/z | s/z
fistpl C(t) // 1/z | t/z | s/z
jmp LFDIVInFlight1
.align 4
LSetupNotLast1:
// finish up the s and t calcs
fxch %st(1) // z*64k | 1/z | t/z | s/z
fld %st(0) // z*64k | z*64k | 1/z | t/z | s/z
fmul %st(4),%st(0) // s | z*64k | 1/z | t/z | s/z
fxch %st(1) // z*64k | s | 1/z | t/z | s/z
fmul %st(3),%st(0) // t | s | 1/z | t/z | s/z
fxch %st(1) // s | t | 1/z | t/z | s/z
fistpl C(s) // 1/z | t | t/z | s/z
fistpl C(t) // 1/z | t/z | s/z
fadds C(zi8stepu)
fxch %st(2)
fadds C(sdivz8stepu)
fxch %st(2)
flds C(tdivz8stepu)
faddp %st(0),%st(2)
flds C(fp_64k)
fdiv %st(1),%st(0) // z = 1/1/z
// this is what we've gone to all this trouble to
// overlap
LFDIVInFlight1:
addl C(s),%esi
addl C(t),%edx
movl C(bbextents),%ebx
movl C(bbextentt),%ebp
cmpl %ebx,%esi
ja LClampHighOrLow0
LClampReentry0:
movl %esi,C(s)
movl C(pbase),%ebx
shll $16,%esi
cmpl %ebp,%edx
movl %esi,C(sfracf)
ja LClampHighOrLow1
LClampReentry1:
movl %edx,C(t)
movl C(s),%esi // sfrac = scans->sfrac;
shll $16,%edx
movl C(t),%eax // tfrac = scans->tfrac;
sarl $16,%esi
movl %edx,C(tfracf)
//
// calculate the texture starting address
//
sarl $16,%eax
movl C(cachewidth),%edx
imull %edx,%eax // (tfrac >> 16) * cachewidth
addl %ebx,%esi
addl %eax,%esi // psource = pbase + (sfrac >> 16) +
// ((tfrac >> 16) * cachewidth);
//
// determine whether last span or not
//
cmpl $8,%ecx
jna LLastSegment
//
// not the last segment; do full 8-wide segment
//
LNotLastSegment:
//
// advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to
// get there
//
// pick up after the FDIV that was left in flight previously
fld %st(0) // duplicate it
fmul %st(4),%st(0) // s = s/z * z
fxch %st(1)
fmul %st(3),%st(0) // t = t/z * z
fxch %st(1)
fistpl C(snext)
fistpl C(tnext)
movl C(snext),%eax
movl C(tnext),%edx
movb (%esi),%bl // get first source texel
subl $8,%ecx // count off this segments' pixels
movl C(sadjust),%ebp
movl %ecx,C(counttemp) // remember count of remaining pixels
movl C(tadjust),%ecx
movb %bl,(%edi) // store first dest pixel
addl %eax,%ebp
addl %edx,%ecx
movl C(bbextents),%eax
movl C(bbextentt),%edx
cmpl $2048,%ebp
jl LClampLow2
cmpl %eax,%ebp
ja LClampHigh2
LClampReentry2:
cmpl $2048,%ecx
jl LClampLow3
cmpl %edx,%ecx
ja LClampHigh3
LClampReentry3:
movl %ebp,C(snext)
movl %ecx,C(tnext)
subl C(s),%ebp
subl C(t),%ecx
//
// set up advancetable
//
movl %ecx,%eax
movl %ebp,%edx
sarl $19,%eax // tstep >>= 16;
jz LZero
sarl $19,%edx // sstep >>= 16;
movl C(cachewidth),%ebx
imull %ebx,%eax
jmp LSetUp1
LZero:
sarl $19,%edx // sstep >>= 16;
movl C(cachewidth),%ebx
LSetUp1:
addl %edx,%eax // add in sstep
// (tstep >> 16) * cachewidth + (sstep >> 16);
movl C(tfracf),%edx
movl %eax,C(advancetable)+4 // advance base in t
addl %ebx,%eax // ((tstep >> 16) + 1) * cachewidth +
// (sstep >> 16);
shll $13,%ebp // left-justify sstep fractional part
movl C(sfracf),%ebx
shll $13,%ecx // left-justify tstep fractional part
movl %eax,C(advancetable) // advance extra in t
movl %ecx,C(tstep)
addl %ecx,%edx // advance tfrac fractional part by tstep frac
sbbl %ecx,%ecx // turn tstep carry into -1 (0 if none)
addl %ebp,%ebx // advance sfrac fractional part by sstep frac
adcl C(advancetable)+4(,%ecx,4),%esi // point to next source texel
addl C(tstep),%edx
sbbl %ecx,%ecx
movb (%esi),%al
addl %ebp,%ebx
movb %al,1(%edi)
adcl C(advancetable)+4(,%ecx,4),%esi
addl C(tstep),%edx
sbbl %ecx,%ecx
addl %ebp,%ebx
movb (%esi),%al
adcl C(advancetable)+4(,%ecx,4),%esi
addl C(tstep),%edx
sbbl %ecx,%ecx
movb %al,2(%edi)
addl %ebp,%ebx
movb (%esi),%al
adcl C(advancetable)+4(,%ecx,4),%esi
addl C(tstep),%edx
sbbl %ecx,%ecx
movb %al,3(%edi)
addl %ebp,%ebx
movb (%esi),%al
adcl C(advancetable)+4(,%ecx,4),%esi
//
// start FDIV for end of next segment in flight, so it can overlap
//
movl C(counttemp),%ecx
cmpl $8,%ecx // more than one segment after this?
ja LSetupNotLast2 // yes
decl %ecx
jz LFDIVInFlight2 // if only one pixel, no need to start an FDIV
movl %ecx,C(spancountminus1)
fildl C(spancountminus1)
flds C(d_zistepu) // C(d_zistepu) | spancountminus1
fmul %st(1),%st(0) // C(d_zistepu)*scm1 | scm1
flds C(d_tdivzstepu) // C(d_tdivzstepu) | C(d_zistepu)*scm1 | scm1
fmul %st(2),%st(0) // C(d_tdivzstepu)*scm1 | C(d_zistepu)*scm1 | scm1
fxch %st(1) // C(d_zistepu)*scm1 | C(d_tdivzstepu)*scm1 | scm1
faddp %st(0),%st(3) // C(d_tdivzstepu)*scm1 | scm1
fxch %st(1) // scm1 | C(d_tdivzstepu)*scm1
fmuls C(d_sdivzstepu) // C(d_sdivzstepu)*scm1 | C(d_tdivzstepu)*scm1
fxch %st(1) // C(d_tdivzstepu)*scm1 | C(d_sdivzstepu)*scm1
faddp %st(0),%st(3) // C(d_sdivzstepu)*scm1
flds C(fp_64k) // 64k | C(d_sdivzstepu)*scm1
fxch %st(1) // C(d_sdivzstepu)*scm1 | 64k
faddp %st(0),%st(4) // 64k
fdiv %st(1),%st(0) // this is what we've gone to all this trouble to
// overlap
jmp LFDIVInFlight2
.align 4
LSetupNotLast2:
fadds C(zi8stepu)
fxch %st(2)
fadds C(sdivz8stepu)
fxch %st(2)
flds C(tdivz8stepu)
faddp %st(0),%st(2)
flds C(fp_64k)
fdiv %st(1),%st(0) // z = 1/1/z
// this is what we've gone to all this trouble to
// overlap
LFDIVInFlight2:
movl %ecx,C(counttemp)
addl C(tstep),%edx
sbbl %ecx,%ecx
movb %al,4(%edi)
addl %ebp,%ebx
movb (%esi),%al
adcl C(advancetable)+4(,%ecx,4),%esi
addl C(tstep),%edx
sbbl %ecx,%ecx
movb %al,5(%edi)
addl %ebp,%ebx
movb (%esi),%al
adcl C(advancetable)+4(,%ecx,4),%esi
addl C(tstep),%edx
sbbl %ecx,%ecx
movb %al,6(%edi)
addl %ebp,%ebx
movb (%esi),%al
adcl C(advancetable)+4(,%ecx,4),%esi
addl $8,%edi
movl %edx,C(tfracf)
movl C(snext),%edx
movl %ebx,C(sfracf)
movl C(tnext),%ebx
movl %edx,C(s)
movl %ebx,C(t)
movl C(counttemp),%ecx // retrieve count
//
// determine whether last span or not
//
cmpl $8,%ecx // are there multiple segments remaining?
movb %al,-1(%edi)
ja LNotLastSegment // yes
//
// last segment of scan
//
LLastSegment:
//
// advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to
// get there. The number of pixels left is variable, and we want to land on the
// last pixel, not step one past it, so we can't run into arithmetic problems
//
testl %ecx,%ecx
jz LNoSteps // just draw the last pixel and we're done
// pick up after the FDIV that was left in flight previously
fld %st(0) // duplicate it
fmul %st(4),%st(0) // s = s/z * z
fxch %st(1)
fmul %st(3),%st(0) // t = t/z * z
fxch %st(1)
fistpl C(snext)
fistpl C(tnext)
movb (%esi),%al // load first texel in segment
movl C(tadjust),%ebx
movb %al,(%edi) // store first pixel in segment
movl C(sadjust),%eax
addl C(snext),%eax
addl C(tnext),%ebx
movl C(bbextents),%ebp
movl C(bbextentt),%edx
cmpl $2048,%eax
jl LClampLow4
cmpl %ebp,%eax
ja LClampHigh4
LClampReentry4:
movl %eax,C(snext)
cmpl $2048,%ebx
jl LClampLow5
cmpl %edx,%ebx
ja LClampHigh5
LClampReentry5:
cmpl $1,%ecx // don't bother
je LOnlyOneStep // if two pixels in segment, there's only one step,
// of the segment length
subl C(s),%eax
subl C(t),%ebx
addl %eax,%eax // convert to 15.17 format so multiply by 1.31
addl %ebx,%ebx // reciprocal yields 16.48
imull C(reciprocal_table)-8(,%ecx,4) // sstep = (snext - s) / (spancount-1)
movl %edx,%ebp
movl %ebx,%eax
imull C(reciprocal_table)-8(,%ecx,4) // tstep = (tnext - t) / (spancount-1)
LSetEntryvec:
//
// set up advancetable
//
movl C(entryvec_table)(,%ecx,4),%ebx
movl %edx,%eax
movl %ebx,C(jumptemp) // entry point into code for RET later
movl %ebp,%ecx
sarl $16,%edx // tstep >>= 16;
movl C(cachewidth),%ebx
sarl $16,%ecx // sstep >>= 16;
imull %ebx,%edx
addl %ecx,%edx // add in sstep
// (tstep >> 16) * cachewidth + (sstep >> 16);
movl C(tfracf),%ecx
movl %edx,C(advancetable)+4 // advance base in t
addl %ebx,%edx // ((tstep >> 16) + 1) * cachewidth +
// (sstep >> 16);
shll $16,%ebp // left-justify sstep fractional part
movl C(sfracf),%ebx
shll $16,%eax // left-justify tstep fractional part
movl %edx,C(advancetable) // advance extra in t
movl %eax,C(tstep)
movl %ecx,%edx
addl %eax,%edx
sbbl %ecx,%ecx
addl %ebp,%ebx
adcl C(advancetable)+4(,%ecx,4),%esi
jmp *C(jumptemp) // jump to the number-of-pixels handler
//----------------------------------------
LNoSteps:
movb (%esi),%al // load first texel in segment
subl $7,%edi // adjust for hardwired offset
jmp LEndSpan
LOnlyOneStep:
subl C(s),%eax
subl C(t),%ebx
movl %eax,%ebp
movl %ebx,%edx
jmp LSetEntryvec
//----------------------------------------
.globl C(Entry2_8)
C(Entry2_8):
subl $6,%edi // adjust for hardwired offsets
movb (%esi),%al
jmp LLEntry2_8
//----------------------------------------
.globl C(Entry3_8)
C(Entry3_8):
subl $5,%edi // adjust for hardwired offsets
addl %eax,%edx
movb (%esi),%al
sbbl %ecx,%ecx
addl %ebp,%ebx
adcl C(advancetable)+4(,%ecx,4),%esi
jmp LLEntry3_8
//----------------------------------------
.globl C(Entry4_8)
C(Entry4_8):
subl $4,%edi // adjust for hardwired offsets
addl %eax,%edx
movb (%esi),%al
sbbl %ecx,%ecx
addl %ebp,%ebx
adcl C(advancetable)+4(,%ecx,4),%esi
addl C(tstep),%edx
jmp LLEntry4_8
//----------------------------------------
.globl C(Entry5_8)
C(Entry5_8):
subl $3,%edi // adjust for hardwired offsets
addl %eax,%edx
movb (%esi),%al
sbbl %ecx,%ecx
addl %ebp,%ebx
adcl C(advancetable)+4(,%ecx,4),%esi
addl C(tstep),%edx
jmp LLEntry5_8
//----------------------------------------
.globl C(Entry6_8)
C(Entry6_8):
subl $2,%edi // adjust for hardwired offsets
addl %eax,%edx
movb (%esi),%al
sbbl %ecx,%ecx
addl %ebp,%ebx
adcl C(advancetable)+4(,%ecx,4),%esi
addl C(tstep),%edx
jmp LLEntry6_8
//----------------------------------------
.globl C(Entry7_8)
C(Entry7_8):
decl %edi // adjust for hardwired offsets
addl %eax,%edx
movb (%esi),%al
sbbl %ecx,%ecx
addl %ebp,%ebx
adcl C(advancetable)+4(,%ecx,4),%esi
addl C(tstep),%edx
jmp LLEntry7_8
//----------------------------------------
.globl C(Entry8_8)
C(Entry8_8):
addl %eax,%edx
movb (%esi),%al
sbbl %ecx,%ecx
addl %ebp,%ebx
adcl C(advancetable)+4(,%ecx,4),%esi
addl C(tstep),%edx
sbbl %ecx,%ecx
movb %al,1(%edi)
addl %ebp,%ebx
movb (%esi),%al
adcl C(advancetable)+4(,%ecx,4),%esi
addl C(tstep),%edx
LLEntry7_8:
sbbl %ecx,%ecx
movb %al,2(%edi)
addl %ebp,%ebx
movb (%esi),%al
adcl C(advancetable)+4(,%ecx,4),%esi
addl C(tstep),%edx
LLEntry6_8:
sbbl %ecx,%ecx
movb %al,3(%edi)
addl %ebp,%ebx
movb (%esi),%al
adcl C(advancetable)+4(,%ecx,4),%esi
addl C(tstep),%edx
LLEntry5_8:
sbbl %ecx,%ecx
movb %al,4(%edi)
addl %ebp,%ebx
movb (%esi),%al
adcl C(advancetable)+4(,%ecx,4),%esi
addl C(tstep),%edx
LLEntry4_8:
sbbl %ecx,%ecx
movb %al,5(%edi)
addl %ebp,%ebx
movb (%esi),%al
adcl C(advancetable)+4(,%ecx,4),%esi
LLEntry3_8:
movb %al,6(%edi)
movb (%esi),%al
LLEntry2_8:
LEndSpan:
//
// clear s/z, t/z, 1/z from FP stack
//
fstp %st(0)
fstp %st(0)
fstp %st(0)
movl C(pspantemp),%ebx // restore spans pointer
movl espan_t_pnext(%ebx),%ebx // point to next span
testl %ebx,%ebx // any more spans?
movb %al,7(%edi)
jnz LSpanLoop // more spans
popl %ebx // restore register variables
popl %esi
popl %edi
popl %ebp // restore the caller's stack frame
ret
//----------------------------------------------------------------------
// 8-bpp horizontal span z drawing codefor polygons, with no transparency.
//
// Assumes there is at least one span in pzspans, and that every span
// contains at least one pixel
//----------------------------------------------------------------------
.text
// z-clamp on a non-negative gradient span
LClamp:
movl $0x40000000,%edx
xorl %ebx,%ebx
fstp %st(0)
jmp LZDraw
// z-clamp on a negative gradient span
LClampNeg:
movl $0x40000000,%edx
xorl %ebx,%ebx
fstp %st(0)
jmp LZDrawNeg
#define pzspans 4+16
.globl C(D_DrawZSpans)
C(D_DrawZSpans):
pushl %ebp // preserve caller's stack frame
pushl %edi
pushl %esi // preserve register variables
pushl %ebx
flds C(d_zistepu)
movl C(d_zistepu),%eax
movl pzspans(%esp),%esi
testl %eax,%eax
jz LFNegSpan
fmuls C(Float2ToThe31nd)
fistpl C(izistep) // note: we are relying on FP exceptions being turned
// off here to avoid range problems
movl C(izistep),%ebx // remains loaded for all spans
LFSpanLoop:
// set up the initial 1/z value
fildl espan_t_v(%esi)
fildl espan_t_u(%esi)
movl espan_t_v(%esi),%ecx
movl C(d_pzbuffer),%edi
fmuls C(d_zistepu)
fxch %st(1)
fmuls C(d_zistepv)
fxch %st(1)
fadds C(d_ziorigin)
imull C(d_zrowbytes),%ecx
faddp %st(0),%st(1)
// clamp if z is nearer than 2 (1/z > 0.5)
fcoms C(float_point5)
addl %ecx,%edi
movl espan_t_u(%esi),%edx
addl %edx,%edx // word count
movl espan_t_count(%esi),%ecx
addl %edx,%edi // pdest = &pdestspan[scans->u];
pushl %esi // preserve spans pointer
fnstsw %ax
testb $0x45,%ah
jz LClamp
fmuls C(Float2ToThe31nd)
fistpl C(izi) // note: we are relying on FP exceptions being turned
// off here to avoid problems when the span is closer
// than 1/(2**31)
movl C(izi),%edx
// at this point:
// %ebx = izistep
// %ecx = count
// %edx = izi
// %edi = pdest
LZDraw:
// do a single pixel up front, if necessary to dword align the destination
testl $2,%edi
jz LFMiddle
movl %edx,%eax
addl %ebx,%edx
shrl $16,%eax
decl %ecx
movw %ax,(%edi)
addl $2,%edi
// do middle a pair of aligned dwords at a time
LFMiddle:
pushl %ecx
shrl $1,%ecx // count / 2
jz LFLast // no aligned dwords to do
shrl $1,%ecx // (count / 2) / 2
jnc LFMiddleLoop // even number of aligned dwords to do
movl %edx,%eax
addl %ebx,%edx
shrl $16,%eax
movl %edx,%esi
addl %ebx,%edx
andl $0xFFFF0000,%esi
orl %esi,%eax
movl %eax,(%edi)
addl $4,%edi
andl %ecx,%ecx
jz LFLast
LFMiddleLoop:
movl %edx,%eax
addl %ebx,%edx
shrl $16,%eax
movl %edx,%esi
addl %ebx,%edx
andl $0xFFFF0000,%esi
orl %esi,%eax
movl %edx,%ebp
movl %eax,(%edi)
addl %ebx,%edx
shrl $16,%ebp
movl %edx,%esi
addl %ebx,%edx
andl $0xFFFF0000,%esi
orl %esi,%ebp
movl %ebp,4(%edi) // FIXME: eliminate register contention
addl $8,%edi
decl %ecx
jnz LFMiddleLoop
LFLast:
popl %ecx // retrieve count
popl %esi // retrieve span pointer
// do the last, unaligned pixel, if there is one
andl $1,%ecx // is there an odd pixel left to do?
jz LFSpanDone // no
shrl $16,%edx
movw %dx,(%edi) // do the final pixel's z
LFSpanDone:
movl espan_t_pnext(%esi),%esi
testl %esi,%esi
jnz LFSpanLoop
jmp LFDone
LFNegSpan:
fmuls C(FloatMinus2ToThe31nd)
fistpl C(izistep) // note: we are relying on FP exceptions being turned
// off here to avoid range problems
movl C(izistep),%ebx // remains loaded for all spans
LFNegSpanLoop:
// set up the initial 1/z value
fildl espan_t_v(%esi)
fildl espan_t_u(%esi)
movl espan_t_v(%esi),%ecx
movl C(d_pzbuffer),%edi
fmuls C(d_zistepu)
fxch %st(1)
fmuls C(d_zistepv)
fxch %st(1)
fadds C(d_ziorigin)
imull C(d_zrowbytes),%ecx
faddp %st(0),%st(1)
// clamp if z is nearer than 2 (1/z > 0.5)
fcoms C(float_point5)
addl %ecx,%edi
movl espan_t_u(%esi),%edx
addl %edx,%edx // word count
movl espan_t_count(%esi),%ecx
addl %edx,%edi // pdest = &pdestspan[scans->u];
pushl %esi // preserve spans pointer
fnstsw %ax
testb $0x45,%ah
jz LClampNeg
fmuls C(Float2ToThe31nd)
fistpl C(izi) // note: we are relying on FP exceptions being turned
// off here to avoid problems when the span is closer
// than 1/(2**31)
movl C(izi),%edx
// at this point:
// %ebx = izistep
// %ecx = count
// %edx = izi
// %edi = pdest
LZDrawNeg:
// do a single pixel up front, if necessary to dword align the destination
testl $2,%edi
jz LFNegMiddle
movl %edx,%eax
subl %ebx,%edx
shrl $16,%eax
decl %ecx
movw %ax,(%edi)
addl $2,%edi
// do middle a pair of aligned dwords at a time
LFNegMiddle:
pushl %ecx
shrl $1,%ecx // count / 2
jz LFNegLast // no aligned dwords to do
shrl $1,%ecx // (count / 2) / 2
jnc LFNegMiddleLoop // even number of aligned dwords to do
movl %edx,%eax
subl %ebx,%edx
shrl $16,%eax
movl %edx,%esi
subl %ebx,%edx
andl $0xFFFF0000,%esi
orl %esi,%eax
movl %eax,(%edi)
addl $4,%edi
andl %ecx,%ecx
jz LFNegLast
LFNegMiddleLoop:
movl %edx,%eax
subl %ebx,%edx
shrl $16,%eax
movl %edx,%esi
subl %ebx,%edx
andl $0xFFFF0000,%esi
orl %esi,%eax
movl %edx,%ebp
movl %eax,(%edi)
subl %ebx,%edx
shrl $16,%ebp
movl %edx,%esi
subl %ebx,%edx
andl $0xFFFF0000,%esi
orl %esi,%ebp
movl %ebp,4(%edi) // FIXME: eliminate register contention
addl $8,%edi
decl %ecx
jnz LFNegMiddleLoop
LFNegLast:
popl %ecx // retrieve count
popl %esi // retrieve span pointer
// do the last, unaligned pixel, if there is one
andl $1,%ecx // is there an odd pixel left to do?
jz LFNegSpanDone // no
shrl $16,%edx
movw %dx,(%edi) // do the final pixel's z
LFNegSpanDone:
movl espan_t_pnext(%esi),%esi
testl %esi,%esi
jnz LFNegSpanLoop
LFDone:
popl %ebx // restore register variables
popl %esi
popl %edi
popl %ebp // restore the caller's stack frame
ret
#endif // USE_INTEL_ASM