quake2forge/ref_soft/r_draw16.asm
2001-12-22 04:27:19 +00:00

1234 lines
31 KiB
NASM

.386P
.model FLAT
;
; d_draw16.s
; x86 assembly-language horizontal 8-bpp span-drawing code, with 16-pixel
; subdivision.
;
include qasm.inc
include d_if.inc
if id386
;----------------------------------------------------------------------
; 8-bpp horizontal span drawing code for polygons, with no transparency and
; 16-pixel subdivision.
;
; Assumes there is at least one span in pspans, and that every span
; contains at least one pixel
;----------------------------------------------------------------------
_DATA SEGMENT
_DATA ENDS
_TEXT SEGMENT
; out-of-line, rarely-needed clamping code
LClampHigh0:
mov esi,ds:dword ptr[_bbextents]
jmp LClampReentry0
LClampHighOrLow0:
jg LClampHigh0
xor esi,esi
jmp LClampReentry0
LClampHigh1:
mov edx,ds:dword ptr[_bbextentt]
jmp LClampReentry1
LClampHighOrLow1:
jg LClampHigh1
xor edx,edx
jmp LClampReentry1
LClampLow2:
mov ebp,4096
jmp LClampReentry2
LClampHigh2:
mov ebp,ds:dword ptr[_bbextents]
jmp LClampReentry2
LClampLow3:
mov ecx,4096
jmp LClampReentry3
LClampHigh3:
mov ecx,ds:dword ptr[_bbextentt]
jmp LClampReentry3
LClampLow4:
mov eax,4096
jmp LClampReentry4
LClampHigh4:
mov eax,ds:dword ptr[_bbextents]
jmp LClampReentry4
LClampLow5:
mov ebx,4096
jmp LClampReentry5
LClampHigh5:
mov ebx,ds:dword ptr[_bbextentt]
jmp LClampReentry5
pspans equ 4+16
align 4
public _D_DrawSpans16
_D_DrawSpans16:
push ebp ; preserve caller's stack frame
push edi
push esi ; preserve register variables
push ebx
;
; set up scaled-by-16 steps, for 16-long segments; also set up cacheblock
; and span list pointers
;
; TODO: any overlap from rearranging?
fld ds:dword ptr[_d_sdivzstepu]
fmul ds:dword ptr[fp_16]
mov edx,ds:dword ptr[_cacheblock]
fld ds:dword ptr[_d_tdivzstepu]
fmul ds:dword ptr[fp_16]
mov ebx,ds:dword ptr[pspans+esp] ; point to the first span descriptor
fld ds:dword ptr[_d_zistepu]
fmul ds:dword ptr[fp_16]
mov ds:dword ptr[pbase],edx ; pbase = cacheblock
fstp ds:dword ptr[zi16stepu]
fstp ds:dword ptr[tdivz16stepu]
fstp ds:dword ptr[sdivz16stepu]
LSpanLoop:
;
; set up the initial s/z, t/z, and 1/z on the FP stack, and generate the
; initial s and t values
;
; FIXME: pipeline FILD?
fild ds:dword ptr[espan_t_v+ebx]
fild ds:dword ptr[espan_t_u+ebx]
fld st(1) ; dv | du | dv
fmul ds:dword ptr[_d_sdivzstepv] ; dv*d_sdivzstepv | du | dv
fld st(1) ; du | dv*d_sdivzstepv | du | dv
fmul ds:dword ptr[_d_sdivzstepu] ; du*d_sdivzstepu | dv*d_sdivzstepv | du | dv
fld st(2) ; du | du*d_sdivzstepu | dv*d_sdivzstepv | du | dv
fmul ds:dword ptr[_d_tdivzstepu] ; du*d_tdivzstepu | du*d_sdivzstepu |
; dv*d_sdivzstepv | du | dv
fxch st(1) ; du*d_sdivzstepu | du*d_tdivzstepu |
; dv*d_sdivzstepv | du | dv
faddp st(2),st(0) ; du*d_tdivzstepu |
; du*d_sdivzstepu + dv*d_sdivzstepv | du | dv
fxch st(1) ; du*d_sdivzstepu + dv*d_sdivzstepv |
; du*d_tdivzstepu | du | dv
fld st(3) ; dv | du*d_sdivzstepu + dv*d_sdivzstepv |
; du*d_tdivzstepu | du | dv
fmul ds:dword ptr[_d_tdivzstepv] ; dv*d_tdivzstepv |
; du*d_sdivzstepu + dv*d_sdivzstepv |
; du*d_tdivzstepu | du | dv
fxch st(1) ; du*d_sdivzstepu + dv*d_sdivzstepv |
; dv*d_tdivzstepv | du*d_tdivzstepu | du | dv
fadd ds:dword ptr[_d_sdivzorigin] ; sdivz = d_sdivzorigin + dv*d_sdivzstepv +
; du*d_sdivzstepu; stays in %st(2) at end
fxch st(4) ; dv | dv*d_tdivzstepv | du*d_tdivzstepu | du |
; s/z
fmul ds:dword ptr[_d_zistepv] ; dv*d_zistepv | dv*d_tdivzstepv |
; du*d_tdivzstepu | du | s/z
fxch st(1) ; dv*d_tdivzstepv | dv*d_zistepv |
; du*d_tdivzstepu | du | s/z
faddp st(2),st(0) ; dv*d_zistepv |
; dv*d_tdivzstepv + du*d_tdivzstepu | du | s/z
fxch st(2) ; du | dv*d_tdivzstepv + du*d_tdivzstepu |
; dv*d_zistepv | s/z
fmul ds:dword ptr[_d_zistepu] ; du*d_zistepu |
; dv*d_tdivzstepv + du*d_tdivzstepu |
; dv*d_zistepv | s/z
fxch st(1) ; dv*d_tdivzstepv + du*d_tdivzstepu |
; du*d_zistepu | dv*d_zistepv | s/z
fadd ds:dword ptr[_d_tdivzorigin] ; tdivz = d_tdivzorigin + dv*d_tdivzstepv +
; du*d_tdivzstepu; stays in %st(1) at end
fxch st(2) ; dv*d_zistepv | du*d_zistepu | t/z | s/z
faddp st(1),st(0) ; dv*d_zistepv + du*d_zistepu | t/z | s/z
fld ds:dword ptr[fp_64k] ; fp_64k | dv*d_zistepv + du*d_zistepu | t/z | s/z
fxch st(1) ; dv*d_zistepv + du*d_zistepu | fp_64k | t/z | s/z
fadd ds:dword ptr[_d_ziorigin] ; zi = d_ziorigin + dv*d_zistepv +
; du*d_zistepu; stays in %st(0) at end
; 1/z | fp_64k | t/z | s/z
;
; calculate and clamp s & t
;
fdiv st(1),st(0) ; 1/z | z*64k | t/z | s/z
;
; point %edi to the first pixel in the span
;
mov ecx,ds:dword ptr[_d_viewbuffer]
mov eax,ds:dword ptr[espan_t_v+ebx]
mov ds:dword ptr[pspantemp],ebx ; preserve spans pointer
mov edx,ds:dword ptr[_tadjust]
mov esi,ds:dword ptr[_sadjust]
mov edi,ds:dword ptr[_d_scantable+eax*4] ; v * screenwidth
add edi,ecx
mov ecx,ds:dword ptr[espan_t_u+ebx]
add edi,ecx ; pdest = &pdestspan[scans->u];
mov ecx,ds:dword ptr[espan_t_count+ebx]
;
; now start the FDIV for the end of the span
;
cmp ecx,16
ja LSetupNotLast1
dec ecx
jz LCleanup1 ; if only one pixel, no need to start an FDIV
mov ds:dword ptr[spancountminus1],ecx
; finish up the s and t calcs
fxch st(1) ; z*64k | 1/z | t/z | s/z
fld st(0) ; z*64k | z*64k | 1/z | t/z | s/z
fmul st(0),st(4) ; s | z*64k | 1/z | t/z | s/z
fxch st(1) ; z*64k | s | 1/z | t/z | s/z
fmul st(0),st(3) ; t | s | 1/z | t/z | s/z
fxch st(1) ; s | t | 1/z | t/z | s/z
fistp ds:dword ptr[s] ; 1/z | t | t/z | s/z
fistp ds:dword ptr[t] ; 1/z | t/z | s/z
fild ds:dword ptr[spancountminus1]
fld ds:dword ptr[_d_tdivzstepu] ; C(d_tdivzstepu) | spancountminus1
fld ds:dword ptr[_d_zistepu] ; C(d_zistepu) | C(d_tdivzstepu) | spancountminus1
fmul st(0),st(2) ; C(d_zistepu)*scm1 | C(d_tdivzstepu) | scm1
fxch st(1) ; C(d_tdivzstepu) | C(d_zistepu)*scm1 | scm1
fmul st(0),st(2) ; C(d_tdivzstepu)*scm1 | C(d_zistepu)*scm1 | scm1
fxch st(2) ; scm1 | C(d_zistepu)*scm1 | C(d_tdivzstepu)*scm1
fmul ds:dword ptr[_d_sdivzstepu] ; C(d_sdivzstepu)*scm1 | C(d_zistepu)*scm1 |
; C(d_tdivzstepu)*scm1
fxch st(1) ; C(d_zistepu)*scm1 | C(d_sdivzstepu)*scm1 |
; C(d_tdivzstepu)*scm1
faddp st(3),st(0) ; C(d_sdivzstepu)*scm1 | C(d_tdivzstepu)*scm1
fxch st(1) ; C(d_tdivzstepu)*scm1 | C(d_sdivzstepu)*scm1
faddp st(3),st(0) ; C(d_sdivzstepu)*scm1
faddp st(3),st(0)
fld ds:dword ptr[fp_64k]
fdiv st(0),st(1) ; this is what we've gone to all this trouble to
; overlap
jmp LFDIVInFlight1
LCleanup1:
; finish up the s and t calcs
fxch st(1) ; z*64k | 1/z | t/z | s/z
fld st(0) ; z*64k | z*64k | 1/z | t/z | s/z
fmul st(0),st(4) ; s | z*64k | 1/z | t/z | s/z
fxch st(1) ; z*64k | s | 1/z | t/z | s/z
fmul st(0),st(3) ; t | s | 1/z | t/z | s/z
fxch st(1) ; s | t | 1/z | t/z | s/z
fistp ds:dword ptr[s] ; 1/z | t | t/z | s/z
fistp ds:dword ptr[t] ; 1/z | t/z | s/z
jmp LFDIVInFlight1
align 4
LSetupNotLast1:
; finish up the s and t calcs
fxch st(1) ; z*64k | 1/z | t/z | s/z
fld st(0) ; z*64k | z*64k | 1/z | t/z | s/z
fmul st(0),st(4) ; s | z*64k | 1/z | t/z | s/z
fxch st(1) ; z*64k | s | 1/z | t/z | s/z
fmul st(0),st(3) ; t | s | 1/z | t/z | s/z
fxch st(1) ; s | t | 1/z | t/z | s/z
fistp ds:dword ptr[s] ; 1/z | t | t/z | s/z
fistp ds:dword ptr[t] ; 1/z | t/z | s/z
fadd ds:dword ptr[zi16stepu]
fxch st(2)
fadd ds:dword ptr[sdivz16stepu]
fxch st(2)
fld ds:dword ptr[tdivz16stepu]
faddp st(2),st(0)
fld ds:dword ptr[fp_64k]
fdiv st(0),st(1) ; z = 1/1/z
; this is what we've gone to all this trouble to
; overlap
LFDIVInFlight1:
add esi,ds:dword ptr[s]
add edx,ds:dword ptr[t]
mov ebx,ds:dword ptr[_bbextents]
mov ebp,ds:dword ptr[_bbextentt]
cmp esi,ebx
ja LClampHighOrLow0
LClampReentry0:
mov ds:dword ptr[s],esi
mov ebx,ds:dword ptr[pbase]
shl esi,16
cmp edx,ebp
mov ds:dword ptr[sfracf],esi
ja LClampHighOrLow1
LClampReentry1:
mov ds:dword ptr[t],edx
mov esi,ds:dword ptr[s] ; sfrac = scans->sfrac;
shl edx,16
mov eax,ds:dword ptr[t] ; tfrac = scans->tfrac;
sar esi,16
mov ds:dword ptr[tfracf],edx
;
; calculate the texture starting address
;
sar eax,16
mov edx,ds:dword ptr[_cachewidth]
imul eax,edx ; (tfrac >> 16) * cachewidth
add esi,ebx
add esi,eax ; psource = pbase + (sfrac >> 16) +
; ((tfrac >> 16) * cachewidth);
;
; determine whether last span or not
;
cmp ecx,16
jna LLastSegment
;
; not the last segment; do full 16-wide segment
;
LNotLastSegment:
;
; advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to
; get there
;
; pick up after the FDIV that was left in flight previously
fld st(0) ; duplicate it
fmul st(0),st(4) ; s = s/z * z
fxch st(1)
fmul st(0),st(3) ; t = t/z * z
fxch st(1)
fistp ds:dword ptr[snext]
fistp ds:dword ptr[tnext]
mov eax,ds:dword ptr[snext]
mov edx,ds:dword ptr[tnext]
mov bl,ds:byte ptr[esi] ; get first source texel
sub ecx,16 ; count off this segments' pixels
mov ebp,ds:dword ptr[_sadjust]
mov ds:dword ptr[counttemp],ecx ; remember count of remaining pixels
mov ecx,ds:dword ptr[_tadjust]
mov ds:byte ptr[edi],bl ; store first dest pixel
add ebp,eax
add ecx,edx
mov eax,ds:dword ptr[_bbextents]
mov edx,ds:dword ptr[_bbextentt]
cmp ebp,4096
jl LClampLow2
cmp ebp,eax
ja LClampHigh2
LClampReentry2:
cmp ecx,4096
jl LClampLow3
cmp ecx,edx
ja LClampHigh3
LClampReentry3:
mov ds:dword ptr[snext],ebp
mov ds:dword ptr[tnext],ecx
sub ebp,ds:dword ptr[s]
sub ecx,ds:dword ptr[t]
;
; set up advancetable
;
mov eax,ecx
mov edx,ebp
sar eax,20 ; tstep >>= 16;
jz LZero
sar edx,20 ; sstep >>= 16;
mov ebx,ds:dword ptr[_cachewidth]
imul eax,ebx
jmp LSetUp1
LZero:
sar edx,20 ; sstep >>= 16;
mov ebx,ds:dword ptr[_cachewidth]
LSetUp1:
add eax,edx ; add in sstep
; (tstep >> 16) * cachewidth + (sstep >> 16);
mov edx,ds:dword ptr[tfracf]
mov ds:dword ptr[advancetable+4],eax ; advance base in t
add eax,ebx ; ((tstep >> 16) + 1) * cachewidth +
; (sstep >> 16);
shl ebp,12 ; left-justify sstep fractional part
mov ebx,ds:dword ptr[sfracf]
shl ecx,12 ; left-justify tstep fractional part
mov ds:dword ptr[advancetable],eax ; advance extra in t
mov ds:dword ptr[tstep],ecx
add edx,ecx ; advance tfrac fractional part by tstep frac
sbb ecx,ecx ; turn tstep carry into -1 (0 if none)
add ebx,ebp ; advance sfrac fractional part by sstep frac
adc esi,ds:dword ptr[advancetable+4+ecx*4] ; point to next source texel
add edx,ds:dword ptr[tstep]
sbb ecx,ecx
mov al,ds:byte ptr[esi]
add ebx,ebp
mov ds:byte ptr[1+edi],al
adc esi,ds:dword ptr[advancetable+4+ecx*4]
add edx,ds:dword ptr[tstep]
sbb ecx,ecx
add ebx,ebp
mov al,ds:byte ptr[esi]
adc esi,ds:dword ptr[advancetable+4+ecx*4]
add edx,ds:dword ptr[tstep]
sbb ecx,ecx
mov ds:byte ptr[2+edi],al
add ebx,ebp
mov al,ds:byte ptr[esi]
adc esi,ds:dword ptr[advancetable+4+ecx*4]
add edx,ds:dword ptr[tstep]
sbb ecx,ecx
mov ds:byte ptr[3+edi],al
add ebx,ebp
mov al,ds:byte ptr[esi]
adc esi,ds:dword ptr[advancetable+4+ecx*4]
add edx,ds:dword ptr[tstep]
sbb ecx,ecx
mov ds:byte ptr[4+edi],al
add ebx,ebp
mov al,ds:byte ptr[esi]
adc esi,ds:dword ptr[advancetable+4+ecx*4]
add edx,ds:dword ptr[tstep]
sbb ecx,ecx
mov ds:byte ptr[5+edi],al
add ebx,ebp
mov al,ds:byte ptr[esi]
adc esi,ds:dword ptr[advancetable+4+ecx*4]
add edx,ds:dword ptr[tstep]
sbb ecx,ecx
mov ds:byte ptr[6+edi],al
add ebx,ebp
mov al,ds:byte ptr[esi]
adc esi,ds:dword ptr[advancetable+4+ecx*4]
add edx,ds:dword ptr[tstep]
sbb ecx,ecx
mov ds:byte ptr[7+edi],al
add ebx,ebp
mov al,ds:byte ptr[esi]
adc esi,ds:dword ptr[advancetable+4+ecx*4]
;
; start FDIV for end of next segment in flight, so it can overlap
;
mov ecx,ds:dword ptr[counttemp]
cmp ecx,16 ; more than one segment after this?
ja LSetupNotLast2 ; yes
dec ecx
jz LFDIVInFlight2 ; if only one pixel, no need to start an FDIV
mov ds:dword ptr[spancountminus1],ecx
fild ds:dword ptr[spancountminus1]
fld ds:dword ptr[_d_zistepu] ; C(d_zistepu) | spancountminus1
fmul st(0),st(1) ; C(d_zistepu)*scm1 | scm1
fld ds:dword ptr[_d_tdivzstepu] ; C(d_tdivzstepu) | C(d_zistepu)*scm1 | scm1
fmul st(0),st(2) ; C(d_tdivzstepu)*scm1 | C(d_zistepu)*scm1 | scm1
fxch st(1) ; C(d_zistepu)*scm1 | C(d_tdivzstepu)*scm1 | scm1
faddp st(3),st(0) ; C(d_tdivzstepu)*scm1 | scm1
fxch st(1) ; scm1 | C(d_tdivzstepu)*scm1
fmul ds:dword ptr[_d_sdivzstepu] ; C(d_sdivzstepu)*scm1 | C(d_tdivzstepu)*scm1
fxch st(1) ; C(d_tdivzstepu)*scm1 | C(d_sdivzstepu)*scm1
faddp st(3),st(0) ; C(d_sdivzstepu)*scm1
fld ds:dword ptr[fp_64k] ; 64k | C(d_sdivzstepu)*scm1
fxch st(1) ; C(d_sdivzstepu)*scm1 | 64k
faddp st(4),st(0) ; 64k
fdiv st(0),st(1) ; this is what we've gone to all this trouble to
; overlap
jmp LFDIVInFlight2
align 4
LSetupNotLast2:
fadd ds:dword ptr[zi16stepu]
fxch st(2)
fadd ds:dword ptr[sdivz16stepu]
fxch st(2)
fld ds:dword ptr[tdivz16stepu]
faddp st(2),st(0)
fld ds:dword ptr[fp_64k]
fdiv st(0),st(1) ; z = 1/1/z
; this is what we've gone to all this trouble to
; overlap
LFDIVInFlight2:
mov ds:dword ptr[counttemp],ecx
add edx,ds:dword ptr[tstep]
sbb ecx,ecx
mov ds:byte ptr[8+edi],al
add ebx,ebp
mov al,ds:byte ptr[esi]
adc esi,ds:dword ptr[advancetable+4+ecx*4]
add edx,ds:dword ptr[tstep]
sbb ecx,ecx
mov ds:byte ptr[9+edi],al
add ebx,ebp
mov al,ds:byte ptr[esi]
adc esi,ds:dword ptr[advancetable+4+ecx*4]
add edx,ds:dword ptr[tstep]
sbb ecx,ecx
mov ds:byte ptr[10+edi],al
add ebx,ebp
mov al,ds:byte ptr[esi]
adc esi,ds:dword ptr[advancetable+4+ecx*4]
add edx,ds:dword ptr[tstep]
sbb ecx,ecx
mov ds:byte ptr[11+edi],al
add ebx,ebp
mov al,ds:byte ptr[esi]
adc esi,ds:dword ptr[advancetable+4+ecx*4]
add edx,ds:dword ptr[tstep]
sbb ecx,ecx
mov ds:byte ptr[12+edi],al
add ebx,ebp
mov al,ds:byte ptr[esi]
adc esi,ds:dword ptr[advancetable+4+ecx*4]
add edx,ds:dword ptr[tstep]
sbb ecx,ecx
mov ds:byte ptr[13+edi],al
add ebx,ebp
mov al,ds:byte ptr[esi]
adc esi,ds:dword ptr[advancetable+4+ecx*4]
add edx,ds:dword ptr[tstep]
sbb ecx,ecx
mov ds:byte ptr[14+edi],al
add ebx,ebp
mov al,ds:byte ptr[esi]
adc esi,ds:dword ptr[advancetable+4+ecx*4]
add edi,16
mov ds:dword ptr[tfracf],edx
mov edx,ds:dword ptr[snext]
mov ds:dword ptr[sfracf],ebx
mov ebx,ds:dword ptr[tnext]
mov ds:dword ptr[s],edx
mov ds:dword ptr[t],ebx
mov ecx,ds:dword ptr[counttemp] ; retrieve count
;
; determine whether last span or not
;
cmp ecx,16 ; are there multiple segments remaining?
mov ds:byte ptr[-1+edi],al
ja LNotLastSegment ; yes
;
; last segment of scan
;
LLastSegment:
;
; advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to
; get there. The number of pixels left is variable, and we want to land on the
; last pixel, not step one past it, so we can't run into arithmetic problems
;
test ecx,ecx
jz LNoSteps ; just draw the last pixel and we're done
; pick up after the FDIV that was left in flight previously
fld st(0) ; duplicate it
fmul st(0),st(4) ; s = s/z * z
fxch st(1)
fmul st(0),st(3) ; t = t/z * z
fxch st(1)
fistp ds:dword ptr[snext]
fistp ds:dword ptr[tnext]
mov al,ds:byte ptr[esi] ; load first texel in segment
mov ebx,ds:dword ptr[_tadjust]
mov ds:byte ptr[edi],al ; store first pixel in segment
mov eax,ds:dword ptr[_sadjust]
add eax,ds:dword ptr[snext]
add ebx,ds:dword ptr[tnext]
mov ebp,ds:dword ptr[_bbextents]
mov edx,ds:dword ptr[_bbextentt]
cmp eax,4096
jl LClampLow4
cmp eax,ebp
ja LClampHigh4
LClampReentry4:
mov ds:dword ptr[snext],eax
cmp ebx,4096
jl LClampLow5
cmp ebx,edx
ja LClampHigh5
LClampReentry5:
cmp ecx,1 ; don't bother
je LOnlyOneStep ; if two pixels in segment, there's only one step,
; of the segment length
sub eax,ds:dword ptr[s]
sub ebx,ds:dword ptr[t]
add eax,eax ; convert to 15.17 format so multiply by 1.31
add ebx,ebx ; reciprocal yields 16.48
imul ds:dword ptr[reciprocal_table_16-8+ecx*4] ; sstep = (snext - s) /
; (spancount-1)
mov ebp,edx
mov eax,ebx
imul ds:dword ptr[reciprocal_table_16-8+ecx*4] ; tstep = (tnext - t) /
; (spancount-1)
LSetEntryvec:
;
; set up advancetable
;
mov ebx,ds:dword ptr[entryvec_table_16+ecx*4]
mov eax,edx
mov ds:dword ptr[jumptemp],ebx ; entry point into code for RET later
mov ecx,ebp
sar edx,16 ; tstep >>= 16;
mov ebx,ds:dword ptr[_cachewidth]
sar ecx,16 ; sstep >>= 16;
imul edx,ebx
add edx,ecx ; add in sstep
; (tstep >> 16) * cachewidth + (sstep >> 16);
mov ecx,ds:dword ptr[tfracf]
mov ds:dword ptr[advancetable+4],edx ; advance base in t
add edx,ebx ; ((tstep >> 16) + 1) * cachewidth +
; (sstep >> 16);
shl ebp,16 ; left-justify sstep fractional part
mov ebx,ds:dword ptr[sfracf]
shl eax,16 ; left-justify tstep fractional part
mov ds:dword ptr[advancetable],edx ; advance extra in t
mov ds:dword ptr[tstep],eax
mov edx,ecx
add edx,eax
sbb ecx,ecx
add ebx,ebp
adc esi,ds:dword ptr[advancetable+4+ecx*4]
jmp dword ptr[jumptemp] ; jump to the number-of-pixels handler
;----------------------------------------
LNoSteps:
mov al,ds:byte ptr[esi] ; load first texel in segment
sub edi,15 ; adjust for hardwired offset
jmp LEndSpan
LOnlyOneStep:
sub eax,ds:dword ptr[s]
sub ebx,ds:dword ptr[t]
mov ebp,eax
mov edx,ebx
jmp LSetEntryvec
;----------------------------------------
public Entry2_16, Entry3_16, Entry4_16, Entry5_16
public Entry6_16, Entry7_16, Entry8_16, Entry9_16
public Entry10_16, Entry11_16, Entry12_16, Entry13_16
public Entry14_16, Entry15_16, Entry16_16
Entry2_16:
sub edi,14 ; adjust for hardwired offsets
mov al,ds:byte ptr[esi]
jmp LEntry2_16
;----------------------------------------
Entry3_16:
sub edi,13 ; adjust for hardwired offsets
add edx,eax
mov al,ds:byte ptr[esi]
sbb ecx,ecx
add ebx,ebp
adc esi,ds:dword ptr[advancetable+4+ecx*4]
jmp LEntry3_16
;----------------------------------------
Entry4_16:
sub edi,12 ; adjust for hardwired offsets
add edx,eax
mov al,ds:byte ptr[esi]
sbb ecx,ecx
add ebx,ebp
adc esi,ds:dword ptr[advancetable+4+ecx*4]
add edx,ds:dword ptr[tstep]
jmp LEntry4_16
;----------------------------------------
Entry5_16:
sub edi,11 ; adjust for hardwired offsets
add edx,eax
mov al,ds:byte ptr[esi]
sbb ecx,ecx
add ebx,ebp
adc esi,ds:dword ptr[advancetable+4+ecx*4]
add edx,ds:dword ptr[tstep]
jmp LEntry5_16
;----------------------------------------
Entry6_16:
sub edi,10 ; adjust for hardwired offsets
add edx,eax
mov al,ds:byte ptr[esi]
sbb ecx,ecx
add ebx,ebp
adc esi,ds:dword ptr[advancetable+4+ecx*4]
add edx,ds:dword ptr[tstep]
jmp LEntry6_16
;----------------------------------------
Entry7_16:
sub edi,9 ; adjust for hardwired offsets
add edx,eax
mov al,ds:byte ptr[esi]
sbb ecx,ecx
add ebx,ebp
adc esi,ds:dword ptr[advancetable+4+ecx*4]
add edx,ds:dword ptr[tstep]
jmp LEntry7_16
;----------------------------------------
Entry8_16:
sub edi,8 ; adjust for hardwired offsets
add edx,eax
mov al,ds:byte ptr[esi]
sbb ecx,ecx
add ebx,ebp
adc esi,ds:dword ptr[advancetable+4+ecx*4]
add edx,ds:dword ptr[tstep]
jmp LEntry8_16
;----------------------------------------
Entry9_16:
sub edi,7 ; adjust for hardwired offsets
add edx,eax
mov al,ds:byte ptr[esi]
sbb ecx,ecx
add ebx,ebp
adc esi,ds:dword ptr[advancetable+4+ecx*4]
add edx,ds:dword ptr[tstep]
jmp LEntry9_16
;----------------------------------------
Entry10_16:
sub edi,6 ; adjust for hardwired offsets
add edx,eax
mov al,ds:byte ptr[esi]
sbb ecx,ecx
add ebx,ebp
adc esi,ds:dword ptr[advancetable+4+ecx*4]
add edx,ds:dword ptr[tstep]
jmp LEntry10_16
;----------------------------------------
Entry11_16:
sub edi,5 ; adjust for hardwired offsets
add edx,eax
mov al,ds:byte ptr[esi]
sbb ecx,ecx
add ebx,ebp
adc esi,ds:dword ptr[advancetable+4+ecx*4]
add edx,ds:dword ptr[tstep]
jmp LEntry11_16
;----------------------------------------
Entry12_16:
sub edi,4 ; adjust for hardwired offsets
add edx,eax
mov al,ds:byte ptr[esi]
sbb ecx,ecx
add ebx,ebp
adc esi,ds:dword ptr[advancetable+4+ecx*4]
add edx,ds:dword ptr[tstep]
jmp LEntry12_16
;----------------------------------------
Entry13_16:
sub edi,3 ; adjust for hardwired offsets
add edx,eax
mov al,ds:byte ptr[esi]
sbb ecx,ecx
add ebx,ebp
adc esi,ds:dword ptr[advancetable+4+ecx*4]
add edx,ds:dword ptr[tstep]
jmp LEntry13_16
;----------------------------------------
Entry14_16:
sub edi,2 ; adjust for hardwired offsets
add edx,eax
mov al,ds:byte ptr[esi]
sbb ecx,ecx
add ebx,ebp
adc esi,ds:dword ptr[advancetable+4+ecx*4]
add edx,ds:dword ptr[tstep]
jmp LEntry14_16
;----------------------------------------
Entry15_16:
dec edi ; adjust for hardwired offsets
add edx,eax
mov al,ds:byte ptr[esi]
sbb ecx,ecx
add ebx,ebp
adc esi,ds:dword ptr[advancetable+4+ecx*4]
add edx,ds:dword ptr[tstep]
jmp LEntry15_16
;----------------------------------------
Entry16_16:
add edx,eax
mov al,ds:byte ptr[esi]
sbb ecx,ecx
add ebx,ebp
adc esi,ds:dword ptr[advancetable+4+ecx*4]
add edx,ds:dword ptr[tstep]
sbb ecx,ecx
mov ds:byte ptr[1+edi],al
add ebx,ebp
mov al,ds:byte ptr[esi]
adc esi,ds:dword ptr[advancetable+4+ecx*4]
add edx,ds:dword ptr[tstep]
LEntry15_16:
sbb ecx,ecx
mov ds:byte ptr[2+edi],al
add ebx,ebp
mov al,ds:byte ptr[esi]
adc esi,ds:dword ptr[advancetable+4+ecx*4]
add edx,ds:dword ptr[tstep]
LEntry14_16:
sbb ecx,ecx
mov ds:byte ptr[3+edi],al
add ebx,ebp
mov al,ds:byte ptr[esi]
adc esi,ds:dword ptr[advancetable+4+ecx*4]
add edx,ds:dword ptr[tstep]
LEntry13_16:
sbb ecx,ecx
mov ds:byte ptr[4+edi],al
add ebx,ebp
mov al,ds:byte ptr[esi]
adc esi,ds:dword ptr[advancetable+4+ecx*4]
add edx,ds:dword ptr[tstep]
LEntry12_16:
sbb ecx,ecx
mov ds:byte ptr[5+edi],al
add ebx,ebp
mov al,ds:byte ptr[esi]
adc esi,ds:dword ptr[advancetable+4+ecx*4]
add edx,ds:dword ptr[tstep]
LEntry11_16:
sbb ecx,ecx
mov ds:byte ptr[6+edi],al
add ebx,ebp
mov al,ds:byte ptr[esi]
adc esi,ds:dword ptr[advancetable+4+ecx*4]
add edx,ds:dword ptr[tstep]
LEntry10_16:
sbb ecx,ecx
mov ds:byte ptr[7+edi],al
add ebx,ebp
mov al,ds:byte ptr[esi]
adc esi,ds:dword ptr[advancetable+4+ecx*4]
add edx,ds:dword ptr[tstep]
LEntry9_16:
sbb ecx,ecx
mov ds:byte ptr[8+edi],al
add ebx,ebp
mov al,ds:byte ptr[esi]
adc esi,ds:dword ptr[advancetable+4+ecx*4]
add edx,ds:dword ptr[tstep]
LEntry8_16:
sbb ecx,ecx
mov ds:byte ptr[9+edi],al
add ebx,ebp
mov al,ds:byte ptr[esi]
adc esi,ds:dword ptr[advancetable+4+ecx*4]
add edx,ds:dword ptr[tstep]
LEntry7_16:
sbb ecx,ecx
mov ds:byte ptr[10+edi],al
add ebx,ebp
mov al,ds:byte ptr[esi]
adc esi,ds:dword ptr[advancetable+4+ecx*4]
add edx,ds:dword ptr[tstep]
LEntry6_16:
sbb ecx,ecx
mov ds:byte ptr[11+edi],al
add ebx,ebp
mov al,ds:byte ptr[esi]
adc esi,ds:dword ptr[advancetable+4+ecx*4]
add edx,ds:dword ptr[tstep]
LEntry5_16:
sbb ecx,ecx
mov ds:byte ptr[12+edi],al
add ebx,ebp
mov al,ds:byte ptr[esi]
adc esi,ds:dword ptr[advancetable+4+ecx*4]
add edx,ds:dword ptr[tstep]
LEntry4_16:
sbb ecx,ecx
mov ds:byte ptr[13+edi],al
add ebx,ebp
mov al,ds:byte ptr[esi]
adc esi,ds:dword ptr[advancetable+4+ecx*4]
LEntry3_16:
mov ds:byte ptr[14+edi],al
mov al,ds:byte ptr[esi]
LEntry2_16:
LEndSpan:
;
; clear s/z, t/z, 1/z from FP stack
;
fstp st(0)
fstp st(0)
fstp st(0)
mov ebx,ds:dword ptr[pspantemp] ; restore spans pointer
mov ebx,ds:dword ptr[espan_t_pnext+ebx] ; point to next span
test ebx,ebx ; any more spans?
mov ds:byte ptr[15+edi],al
jnz LSpanLoop ; more spans
pop ebx ; restore register variables
pop esi
pop edi
pop ebp ; restore the caller's stack frame
ret
;----------------------------------------------------------------------
; 8-bpp horizontal span z drawing codefor polygons, with no transparency.
;
; Assumes there is at least one span in pzspans, and that every span
; contains at least one pixel
;----------------------------------------------------------------------
; z-clamp on a non-negative gradient span
LClamp:
mov edx,040000000h
xor ebx,ebx
fstp st(0)
jmp LZDraw
; z-clamp on a negative gradient span
LClampNeg:
mov edx,040000000h
xor ebx,ebx
fstp st(0)
jmp LZDrawNeg
pzspans equ 4+16
public _D_DrawZSpans
_D_DrawZSpans:
push ebp ; preserve caller's stack frame
push edi
push esi ; preserve register variables
push ebx
fld ds:dword ptr[_d_zistepu]
mov eax,ds:dword ptr[_d_zistepu]
mov esi,ds:dword ptr[pzspans+esp]
test eax,eax
jz LFNegSpan
fmul ds:dword ptr[Float2ToThe31nd]
fistp ds:dword ptr[izistep] ; note: we are relying on FP exceptions being turned
; off here to avoid range problems
mov ebx,ds:dword ptr[izistep] ; remains loaded for all spans
LFSpanLoop:
; set up the initial 1/z value
fild ds:dword ptr[espan_t_v+esi]
fild ds:dword ptr[espan_t_u+esi]
mov ecx,ds:dword ptr[espan_t_v+esi]
mov edi,ds:dword ptr[_d_pzbuffer]
fmul ds:dword ptr[_d_zistepu]
fxch st(1)
fmul ds:dword ptr[_d_zistepv]
fxch st(1)
fadd ds:dword ptr[_d_ziorigin]
imul ecx,ds:dword ptr[_d_zrowbytes]
faddp st(1),st(0)
; clamp if z is nearer than 2 (1/z > 0.5)
fcom ds:dword ptr[float_point5]
add edi,ecx
mov edx,ds:dword ptr[espan_t_u+esi]
add edx,edx ; word count
mov ecx,ds:dword ptr[espan_t_count+esi]
add edi,edx ; pdest = &pdestspan[scans->u];
push esi ; preserve spans pointer
fnstsw ax
test ah,045h
jz LClamp
fmul ds:dword ptr[Float2ToThe31nd]
fistp ds:dword ptr[izi] ; note: we are relying on FP exceptions being turned
; off here to avoid problems when the span is closer
; than 1/(2**31)
mov edx,ds:dword ptr[izi]
; at this point:
; %ebx = izistep
; %ecx = count
; %edx = izi
; %edi = pdest
LZDraw:
; do a single pixel up front, if necessary to dword align the destination
test edi,2
jz LFMiddle
mov eax,edx
add edx,ebx
shr eax,16
dec ecx
mov ds:word ptr[edi],ax
add edi,2
; do middle a pair of aligned dwords at a time
LFMiddle:
push ecx
shr ecx,1 ; count / 2
jz LFLast ; no aligned dwords to do
shr ecx,1 ; (count / 2) / 2
jnc LFMiddleLoop ; even number of aligned dwords to do
mov eax,edx
add edx,ebx
shr eax,16
mov esi,edx
add edx,ebx
and esi,0FFFF0000h
or eax,esi
mov ds:dword ptr[edi],eax
add edi,4
and ecx,ecx
jz LFLast
LFMiddleLoop:
mov eax,edx
add edx,ebx
shr eax,16
mov esi,edx
add edx,ebx
and esi,0FFFF0000h
or eax,esi
mov ebp,edx
mov ds:dword ptr[edi],eax
add edx,ebx
shr ebp,16
mov esi,edx
add edx,ebx
and esi,0FFFF0000h
or ebp,esi
mov ds:dword ptr[4+edi],ebp ; FIXME: eliminate register contention
add edi,8
dec ecx
jnz LFMiddleLoop
LFLast:
pop ecx ; retrieve count
pop esi ; retrieve span pointer
; do the last, unaligned pixel, if there is one
and ecx,1 ; is there an odd pixel left to do?
jz LFSpanDone ; no
shr edx,16
mov ds:word ptr[edi],dx ; do the final pixel's z
LFSpanDone:
mov esi,ds:dword ptr[espan_t_pnext+esi]
test esi,esi
jnz LFSpanLoop
jmp LFDone
LFNegSpan:
fmul ds:dword ptr[FloatMinus2ToThe31nd]
fistp ds:dword ptr[izistep] ; note: we are relying on FP exceptions being turned
; off here to avoid range problems
mov ebx,ds:dword ptr[izistep] ; remains loaded for all spans
LFNegSpanLoop:
; set up the initial 1/z value
fild ds:dword ptr[espan_t_v+esi]
fild ds:dword ptr[espan_t_u+esi]
mov ecx,ds:dword ptr[espan_t_v+esi]
mov edi,ds:dword ptr[_d_pzbuffer]
fmul ds:dword ptr[_d_zistepu]
fxch st(1)
fmul ds:dword ptr[_d_zistepv]
fxch st(1)
fadd ds:dword ptr[_d_ziorigin]
imul ecx,ds:dword ptr[_d_zrowbytes]
faddp st(1),st(0)
; clamp if z is nearer than 2 (1/z > 0.5)
fcom ds:dword ptr[float_point5]
add edi,ecx
mov edx,ds:dword ptr[espan_t_u+esi]
add edx,edx ; word count
mov ecx,ds:dword ptr[espan_t_count+esi]
add edi,edx ; pdest = &pdestspan[scans->u];
push esi ; preserve spans pointer
fnstsw ax
test ah,045h
jz LClampNeg
fmul ds:dword ptr[Float2ToThe31nd]
fistp ds:dword ptr[izi] ; note: we are relying on FP exceptions being turned
; off here to avoid problems when the span is closer
; than 1/(2**31)
mov edx,ds:dword ptr[izi]
; at this point:
; %ebx = izistep
; %ecx = count
; %edx = izi
; %edi = pdest
LZDrawNeg:
; do a single pixel up front, if necessary to dword align the destination
test edi,2
jz LFNegMiddle
mov eax,edx
sub edx,ebx
shr eax,16
dec ecx
mov ds:word ptr[edi],ax
add edi,2
; do middle a pair of aligned dwords at a time
LFNegMiddle:
push ecx
shr ecx,1 ; count / 2
jz LFNegLast ; no aligned dwords to do
shr ecx,1 ; (count / 2) / 2
jnc LFNegMiddleLoop ; even number of aligned dwords to do
mov eax,edx
sub edx,ebx
shr eax,16
mov esi,edx
sub edx,ebx
and esi,0FFFF0000h
or eax,esi
mov ds:dword ptr[edi],eax
add edi,4
and ecx,ecx
jz LFNegLast
LFNegMiddleLoop:
mov eax,edx
sub edx,ebx
shr eax,16
mov esi,edx
sub edx,ebx
and esi,0FFFF0000h
or eax,esi
mov ebp,edx
mov ds:dword ptr[edi],eax
sub edx,ebx
shr ebp,16
mov esi,edx
sub edx,ebx
and esi,0FFFF0000h
or ebp,esi
mov ds:dword ptr[4+edi],ebp ; FIXME: eliminate register contention
add edi,8
dec ecx
jnz LFNegMiddleLoop
LFNegLast:
pop ecx ; retrieve count
pop esi ; retrieve span pointer
; do the last, unaligned pixel, if there is one
and ecx,1 ; is there an odd pixel left to do?
jz LFNegSpanDone ; no
shr edx,16
mov ds:word ptr[edi],dx ; do the final pixel's z
LFNegSpanDone:
mov esi,ds:dword ptr[espan_t_pnext+esi]
test esi,esi
jnz LFNegSpanLoop
LFDone:
pop ebx ; restore register variables
pop esi
pop edi
pop ebp ; restore the caller's stack frame
ret
_TEXT ENDS
endif ;id386
END