diff --git a/source/Makefile b/source/Makefile index dc3bd5c..67eeecd 100644 --- a/source/Makefile +++ b/source/Makefile @@ -4,10 +4,10 @@ SV_name=qw-server SV_libs= CL_SVGA_name=qw-client-svga -SV_libs= +CL_SVGA_libs=-lvga CL_X11_name=qw-client-x11 -SV_libs= +CL_X11_libs= DIRECTORIES= vpath %.a $(patsubst @%,%,$(DIRECTORIES)) /usr/lib @@ -65,6 +65,8 @@ SV_dependencies = $(patsubst %,%.d,$(basename $(SV_sources))) SV_objects = $(patsubst %.d,%.o,$(SV_dependencies)) CL_sources=\ + cl_cmd.c \ + cl_cvar.c \ cl_demo.c \ cl_ents.c \ cl_input.c \ @@ -127,7 +129,25 @@ CL_sources=\ zone.c \ cd_linux.c \ sys_linux.c \ - snd_linux.c + snd_linux.c \ + d_copy.S \ + d_draw.S \ + d_draw16.S \ + d_parta.S \ + d_polysa.S \ + d_scana.S \ + d_spr8.S \ + d_varsa.S \ + math.S \ + r_aclipa.S \ + r_aliasa.S \ + r_drawa.S \ + r_edgea.S \ + r_varsa.S \ + snd_mixa.S \ + surf16.S \ + surf8.S \ + sys_dosa.S CL_dependencies = $(patsubst %,%.d,$(basename $(CL_sources))) CL_objects = $(patsubst %.d,%.o,$(CL_dependencies)) diff --git a/source/cl_cvar.c b/source/cl_cvar.c index 3d26acf..14e17d9 100644 --- a/source/cl_cvar.c +++ b/source/cl_cvar.c @@ -29,7 +29,7 @@ Cvar_Info(cvar_t *var) if (cls.state >= ca_connected) { MSG_WriteByte (&cls.netchan.message, clc_stringcmd); - SZ_Print (&cls.netchan.message, va("setinfo \"%s\" \"%s\"\n", var->name, string)); + SZ_Print (&cls.netchan.message, va("setinfo \"%s\" \"%s\"\n", var->name, var->string)); } } } diff --git a/source/cmd.c b/source/cmd.c index 811d1d2..fbdd47a 100644 --- a/source/cmd.c +++ b/source/cmd.c @@ -595,10 +595,6 @@ char *Cmd_CompleteCommand (char *partial) return NULL; } -void Cmd_ForwardToServer (void) -{ -} - /* ============ Cmd_ExecuteString diff --git a/source/d_copy.S b/source/d_copy.S new file mode 100644 index 0000000..92e414a --- /dev/null +++ b/source/d_copy.S @@ -0,0 +1,149 @@ +// +// d_copy.s +// x86 assembly-language screen copying code. 
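+// In outline (a pseudo-C sketch; outportb() stands in for whatever port
+// I/O primitive the platform provides -- it is not defined in this file):
+// Mode X spreads the screen across four planes, pixel x living in plane
+// x&3, and the Sequence Controller's Map Mask register (index 2 at port
+// 0x3C4) selects which plane(s) a write lands in.  So the planar update
+// below amounts to:
+//
+//	outportb (0x3C4, 2);			// SC index: Map Mask
+//	for (plane = 0 ; plane < 4 ; plane++)
+//	{
+//		outportb (0x3C5, 1 << plane);	// write-enable one plane
+//		for (y = 0 ; y < height ; y++)
+//			for (x = plane ; x < width ; x += 4)
+//				vga[y*rowbytes + (x >> 2)] =
+//						buf[y*bufrowbytes + x];
+//	}
+//
+// done in small row blocks per plane so each block's source bytes stay
+// in cache across the four plane passes.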
+// + +#include "asm_i386.h" +#include "quakeasm.h" +#include "asm_draw.h" + + .data + +LCopyWidth: .long 0 +LBlockSrcStep: .long 0 +LBlockDestStep: .long 0 +LSrcDelta: .long 0 +LDestDelta: .long 0 + +#define bufptr 4+16 + +// copies 16 rows per plane at a pop; idea is that 16*512 = 8k, and since +// no Mode X mode is wider than 360, all the data should fit in the cache for +// the passes for the next 3 planes + + .text + +.globl C(VGA_UpdatePlanarScreen) +C(VGA_UpdatePlanarScreen): + pushl %ebp // preserve caller's stack frame + pushl %edi + pushl %esi // preserve register variables + pushl %ebx + + movl C(VGA_bufferrowbytes),%eax + shll $1,%eax + movl %eax,LBlockSrcStep + movl C(VGA_rowbytes),%eax + shll $1,%eax + movl %eax,LBlockDestStep + + movl $0x3C4,%edx + movb $2,%al + outb %al,%dx // point the SC to the Map Mask + incl %edx + + movl bufptr(%esp),%esi + movl C(VGA_pagebase),%edi + movl C(VGA_height),%ebp + shrl $1,%ebp + + movl C(VGA_width),%ecx + movl C(VGA_bufferrowbytes),%eax + subl %ecx,%eax + movl %eax,LSrcDelta + movl C(VGA_rowbytes),%eax + shll $2,%eax + subl %ecx,%eax + movl %eax,LDestDelta + shrl $4,%ecx + movl %ecx,LCopyWidth + +LRowLoop: + movb $1,%al + +LPlaneLoop: + outb %al,%dx + movb $2,%ah + + pushl %esi + pushl %edi +LRowSetLoop: + movl LCopyWidth,%ecx +LColumnLoop: + movb 12(%esi),%bh + movb 8(%esi),%bl + shll $16,%ebx + movb 4(%esi),%bh + movb (%esi),%bl + movl %ebx,(%edi) + addl $16,%esi + addl $4,%edi + decl %ecx + jnz LColumnLoop + + addl LDestDelta,%edi + addl LSrcDelta,%esi + decb %ah + jnz LRowSetLoop + + popl %edi + popl %esi + incl %esi + + shlb $1,%al + cmpb $16,%al + jnz LPlaneLoop + + subl $4,%esi + addl LBlockSrcStep,%esi + addl LBlockDestStep,%edi + decl %ebp + jnz LRowLoop + + popl %ebx // restore register variables + popl %esi + popl %edi + popl %ebp // restore the caller's stack frame + + ret + + +#define srcptr 4+16 +#define destptr 8+16 +#define width 12+16 +#define height 16+16 +#define srcrowbytes 20+16 +#define destrowbytes 24+16 + +.globl C(VGA_UpdateLinearScreen) +C(VGA_UpdateLinearScreen): + pushl %ebp // preserve caller's stack frame + pushl %edi + pushl %esi // preserve register variables + pushl %ebx + + cld + movl srcptr(%esp),%esi + movl destptr(%esp),%edi + movl width(%esp),%ebx + movl srcrowbytes(%esp),%eax + subl %ebx,%eax + movl destrowbytes(%esp),%edx + subl %ebx,%edx + shrl $2,%ebx + movl height(%esp),%ebp +LLRowLoop: + movl %ebx,%ecx + rep/movsl (%esi),(%edi) + addl %eax,%esi + addl %edx,%edi + decl %ebp + jnz LLRowLoop + + popl %ebx // restore register variables + popl %esi + popl %edi + popl %ebp // restore the caller's stack frame + + ret + diff --git a/source/d_draw.S b/source/d_draw.S new file mode 100644 index 0000000..d50b982 --- /dev/null +++ b/source/d_draw.S @@ -0,0 +1,1037 @@ +/* +Copyright (C) 1996-1997 Id Software, Inc. + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + +See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ +*/ +// +// d_draw.s +// x86 assembly-language horizontal 8-bpp span-drawing code. +// + +#include "asm_i386.h" +#include "quakeasm.h" +#include "asm_draw.h" +#include "d_ifacea.h" + +#if id386 + +//---------------------------------------------------------------------- +// 8-bpp horizontal span drawing code for polygons, with no transparency. +// +// Assumes there is at least one span in pspans, and that every span +// contains at least one pixel +//---------------------------------------------------------------------- + + .text + +// out-of-line, rarely-needed clamping code + +LClampHigh0: + movl C(bbextents),%esi + jmp LClampReentry0 +LClampHighOrLow0: + jg LClampHigh0 + xorl %esi,%esi + jmp LClampReentry0 + +LClampHigh1: + movl C(bbextentt),%edx + jmp LClampReentry1 +LClampHighOrLow1: + jg LClampHigh1 + xorl %edx,%edx + jmp LClampReentry1 + +LClampLow2: + movl $2048,%ebp + jmp LClampReentry2 +LClampHigh2: + movl C(bbextents),%ebp + jmp LClampReentry2 + +LClampLow3: + movl $2048,%ecx + jmp LClampReentry3 +LClampHigh3: + movl C(bbextentt),%ecx + jmp LClampReentry3 + +LClampLow4: + movl $2048,%eax + jmp LClampReentry4 +LClampHigh4: + movl C(bbextents),%eax + jmp LClampReentry4 + +LClampLow5: + movl $2048,%ebx + jmp LClampReentry5 +LClampHigh5: + movl C(bbextentt),%ebx + jmp LClampReentry5 + + +#define pspans 4+16 + + .align 4 +.globl C(D_DrawSpans8) +C(D_DrawSpans8): + pushl %ebp // preserve caller's stack frame + pushl %edi + pushl %esi // preserve register variables + pushl %ebx + +// +// set up scaled-by-8 steps, for 8-long segments; also set up cacheblock +// and span list pointers +// +// TODO: any overlap from rearranging? + flds C(d_sdivzstepu) + fmuls fp_8 + movl C(cacheblock),%edx + flds C(d_tdivzstepu) + fmuls fp_8 + movl pspans(%esp),%ebx // point to the first span descriptor + flds C(d_zistepu) + fmuls fp_8 + movl %edx,pbase // pbase = cacheblock + fstps zi8stepu + fstps tdivz8stepu + fstps sdivz8stepu + +LSpanLoop: +// +// set up the initial s/z, t/z, and 1/z on the FP stack, and generate the +// initial s and t values +// +// FIXME: pipeline FILD? 
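+// In C terms (compare the !id386 fallback in d_scan.c), this block
+// computes, for the span's left edge at screen coordinates (u,v):
+//
+//	sdivz = d_sdivzorigin + dv*d_sdivzstepv + du*d_sdivzstepu;
+//	tdivz = d_tdivzorigin + dv*d_tdivzstepv + du*d_tdivzstepu;
+//	zi    = d_ziorigin    + dv*d_zistepv    + du*d_zistepu;
+//	z = (float)0x10000 / zi;	// prescale to 16.16 fixed-point
+//	s = (int)(sdivz * z) + sadjust;	// then clamped to [0, bbextents]
+//	t = (int)(tdivz * z) + tadjust;	// then clamped to [0, bbextentt]
+//
+// with the operations interleaved so the long-latency FDIV can overlap
+// the integer setup that follows.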
+ fildl espan_t_v(%ebx) + fildl espan_t_u(%ebx) + + fld %st(1) // dv | du | dv + fmuls C(d_sdivzstepv) // dv*d_sdivzstepv | du | dv + fld %st(1) // du | dv*d_sdivzstepv | du | dv + fmuls C(d_sdivzstepu) // du*d_sdivzstepu | dv*d_sdivzstepv | du | dv + fld %st(2) // du | du*d_sdivzstepu | dv*d_sdivzstepv | du | dv + fmuls C(d_tdivzstepu) // du*d_tdivzstepu | du*d_sdivzstepu | + // dv*d_sdivzstepv | du | dv + fxch %st(1) // du*d_sdivzstepu | du*d_tdivzstepu | + // dv*d_sdivzstepv | du | dv + faddp %st(0),%st(2) // du*d_tdivzstepu | + // du*d_sdivzstepu + dv*d_sdivzstepv | du | dv + fxch %st(1) // du*d_sdivzstepu + dv*d_sdivzstepv | + // du*d_tdivzstepu | du | dv + fld %st(3) // dv | du*d_sdivzstepu + dv*d_sdivzstepv | + // du*d_tdivzstepu | du | dv + fmuls C(d_tdivzstepv) // dv*d_tdivzstepv | + // du*d_sdivzstepu + dv*d_sdivzstepv | + // du*d_tdivzstepu | du | dv + fxch %st(1) // du*d_sdivzstepu + dv*d_sdivzstepv | + // dv*d_tdivzstepv | du*d_tdivzstepu | du | dv + fadds C(d_sdivzorigin) // sdivz = d_sdivzorigin + dv*d_sdivzstepv + + // du*d_sdivzstepu; stays in %st(2) at end + fxch %st(4) // dv | dv*d_tdivzstepv | du*d_tdivzstepu | du | + // s/z + fmuls C(d_zistepv) // dv*d_zistepv | dv*d_tdivzstepv | + // du*d_tdivzstepu | du | s/z + fxch %st(1) // dv*d_tdivzstepv | dv*d_zistepv | + // du*d_tdivzstepu | du | s/z + faddp %st(0),%st(2) // dv*d_zistepv | + // dv*d_tdivzstepv + du*d_tdivzstepu | du | s/z + fxch %st(2) // du | dv*d_tdivzstepv + du*d_tdivzstepu | + // dv*d_zistepv | s/z + fmuls C(d_zistepu) // du*d_zistepu | + // dv*d_tdivzstepv + du*d_tdivzstepu | + // dv*d_zistepv | s/z + fxch %st(1) // dv*d_tdivzstepv + du*d_tdivzstepu | + // du*d_zistepu | dv*d_zistepv | s/z + fadds C(d_tdivzorigin) // tdivz = d_tdivzorigin + dv*d_tdivzstepv + + // du*d_tdivzstepu; stays in %st(1) at end + fxch %st(2) // dv*d_zistepv | du*d_zistepu | t/z | s/z + faddp %st(0),%st(1) // dv*d_zistepv + du*d_zistepu | t/z | s/z + + flds fp_64k // fp_64k | dv*d_zistepv + du*d_zistepu | t/z | s/z + fxch %st(1) // dv*d_zistepv + du*d_zistepu | fp_64k | t/z | s/z + fadds C(d_ziorigin) // zi = d_ziorigin + dv*d_zistepv + + // du*d_zistepu; stays in %st(0) at end + // 1/z | fp_64k | t/z | s/z +// +// calculate and clamp s & t +// + fdivr %st(0),%st(1) // 1/z | z*64k | t/z | s/z + +// +// point %edi to the first pixel in the span +// + movl C(d_viewbuffer),%ecx + movl espan_t_v(%ebx),%eax + movl %ebx,pspantemp // preserve spans pointer + + movl C(tadjust),%edx + movl C(sadjust),%esi + movl C(d_scantable)(,%eax,4),%edi // v * screenwidth + addl %ecx,%edi + movl espan_t_u(%ebx),%ecx + addl %ecx,%edi // pdest = &pdestspan[scans->u]; + movl espan_t_count(%ebx),%ecx + +// +// now start the FDIV for the end of the span +// + cmpl $8,%ecx + ja LSetupNotLast1 + + decl %ecx + jz LCleanup1 // if only one pixel, no need to start an FDIV + movl %ecx,spancountminus1 + +// finish up the s and t calcs + fxch %st(1) // z*64k | 1/z | t/z | s/z + + fld %st(0) // z*64k | z*64k | 1/z | t/z | s/z + fmul %st(4),%st(0) // s | z*64k | 1/z | t/z | s/z + fxch %st(1) // z*64k | s | 1/z | t/z | s/z + fmul %st(3),%st(0) // t | s | 1/z | t/z | s/z + fxch %st(1) // s | t | 1/z | t/z | s/z + fistpl s // 1/z | t | t/z | s/z + fistpl t // 1/z | t/z | s/z + + fildl spancountminus1 + + flds C(d_tdivzstepu) // C(d_tdivzstepu) | spancountminus1 + flds C(d_zistepu) // C(d_zistepu) | C(d_tdivzstepu) | spancountminus1 + fmul %st(2),%st(0) // C(d_zistepu)*scm1 | C(d_tdivzstepu) | scm1 + fxch %st(1) // C(d_tdivzstepu) | C(d_zistepu)*scm1 | scm1 + fmul 
%st(2),%st(0) // C(d_tdivzstepu)*scm1 | C(d_zistepu)*scm1 | scm1 + fxch %st(2) // scm1 | C(d_zistepu)*scm1 | C(d_tdivzstepu)*scm1 + fmuls C(d_sdivzstepu) // C(d_sdivzstepu)*scm1 | C(d_zistepu)*scm1 | + // C(d_tdivzstepu)*scm1 + fxch %st(1) // C(d_zistepu)*scm1 | C(d_sdivzstepu)*scm1 | + // C(d_tdivzstepu)*scm1 + faddp %st(0),%st(3) // C(d_sdivzstepu)*scm1 | C(d_tdivzstepu)*scm1 + fxch %st(1) // C(d_tdivzstepu)*scm1 | C(d_sdivzstepu)*scm1 + faddp %st(0),%st(3) // C(d_sdivzstepu)*scm1 + faddp %st(0),%st(3) + + flds fp_64k + fdiv %st(1),%st(0) // this is what we've gone to all this trouble to + // overlap + jmp LFDIVInFlight1 + +LCleanup1: +// finish up the s and t calcs + fxch %st(1) // z*64k | 1/z | t/z | s/z + + fld %st(0) // z*64k | z*64k | 1/z | t/z | s/z + fmul %st(4),%st(0) // s | z*64k | 1/z | t/z | s/z + fxch %st(1) // z*64k | s | 1/z | t/z | s/z + fmul %st(3),%st(0) // t | s | 1/z | t/z | s/z + fxch %st(1) // s | t | 1/z | t/z | s/z + fistpl s // 1/z | t | t/z | s/z + fistpl t // 1/z | t/z | s/z + jmp LFDIVInFlight1 + + .align 4 +LSetupNotLast1: +// finish up the s and t calcs + fxch %st(1) // z*64k | 1/z | t/z | s/z + + fld %st(0) // z*64k | z*64k | 1/z | t/z | s/z + fmul %st(4),%st(0) // s | z*64k | 1/z | t/z | s/z + fxch %st(1) // z*64k | s | 1/z | t/z | s/z + fmul %st(3),%st(0) // t | s | 1/z | t/z | s/z + fxch %st(1) // s | t | 1/z | t/z | s/z + fistpl s // 1/z | t | t/z | s/z + fistpl t // 1/z | t/z | s/z + + fadds zi8stepu + fxch %st(2) + fadds sdivz8stepu + fxch %st(2) + flds tdivz8stepu + faddp %st(0),%st(2) + flds fp_64k + fdiv %st(1),%st(0) // z = 1/1/z + // this is what we've gone to all this trouble to + // overlap +LFDIVInFlight1: + + addl s,%esi + addl t,%edx + movl C(bbextents),%ebx + movl C(bbextentt),%ebp + cmpl %ebx,%esi + ja LClampHighOrLow0 +LClampReentry0: + movl %esi,s + movl pbase,%ebx + shll $16,%esi + cmpl %ebp,%edx + movl %esi,sfracf + ja LClampHighOrLow1 +LClampReentry1: + movl %edx,t + movl s,%esi // sfrac = scans->sfrac; + shll $16,%edx + movl t,%eax // tfrac = scans->tfrac; + sarl $16,%esi + movl %edx,tfracf + +// +// calculate the texture starting address +// + sarl $16,%eax + movl C(cachewidth),%edx + imull %edx,%eax // (tfrac >> 16) * cachewidth + addl %ebx,%esi + addl %eax,%esi // psource = pbase + (sfrac >> 16) + + // ((tfrac >> 16) * cachewidth); + +// +// determine whether last span or not +// + cmpl $8,%ecx + jna LLastSegment + +// +// not the last segment; do full 8-wide segment +// +LNotLastSegment: + +// +// advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to +// get there +// + +// pick up after the FDIV that was left in flight previously + + fld %st(0) // duplicate it + fmul %st(4),%st(0) // s = s/z * z + fxch %st(1) + fmul %st(3),%st(0) // t = t/z * z + fxch %st(1) + fistpl snext + fistpl tnext + movl snext,%eax + movl tnext,%edx + + movb (%esi),%bl // get first source texel + subl $8,%ecx // count off this segments' pixels + movl C(sadjust),%ebp + movl %ecx,counttemp // remember count of remaining pixels + + movl C(tadjust),%ecx + movb %bl,(%edi) // store first dest pixel + + addl %eax,%ebp + addl %edx,%ecx + + movl C(bbextents),%eax + movl C(bbextentt),%edx + + cmpl $2048,%ebp + jl LClampLow2 + cmpl %eax,%ebp + ja LClampHigh2 +LClampReentry2: + + cmpl $2048,%ecx + jl LClampLow3 + cmpl %edx,%ecx + ja LClampHigh3 +LClampReentry3: + + movl %ebp,snext + movl %ecx,tnext + + subl s,%ebp + subl t,%ecx + +// +// set up advancetable +// + movl %ecx,%eax + movl %ebp,%edx + sarl $19,%eax // tstep >>= 16; + jz LZero + sarl 
$19,%edx // sstep >>= 16; + movl C(cachewidth),%ebx + imull %ebx,%eax + jmp LSetUp1 + +LZero: + sarl $19,%edx // sstep >>= 16; + movl C(cachewidth),%ebx + +LSetUp1: + + addl %edx,%eax // add in sstep + // (tstep >> 16) * cachewidth + (sstep >> 16); + movl tfracf,%edx + movl %eax,advancetable+4 // advance base in t + addl %ebx,%eax // ((tstep >> 16) + 1) * cachewidth + + // (sstep >> 16); + shll $13,%ebp // left-justify sstep fractional part + movl sfracf,%ebx + shll $13,%ecx // left-justify tstep fractional part + movl %eax,advancetable // advance extra in t + + movl %ecx,tstep + addl %ecx,%edx // advance tfrac fractional part by tstep frac + + sbbl %ecx,%ecx // turn tstep carry into -1 (0 if none) + addl %ebp,%ebx // advance sfrac fractional part by sstep frac + adcl advancetable+4(,%ecx,4),%esi // point to next source texel + + addl tstep,%edx + sbbl %ecx,%ecx + movb (%esi),%al + addl %ebp,%ebx + movb %al,1(%edi) + adcl advancetable+4(,%ecx,4),%esi + + addl tstep,%edx + sbbl %ecx,%ecx + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + + addl tstep,%edx + sbbl %ecx,%ecx + movb %al,2(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + + addl tstep,%edx + sbbl %ecx,%ecx + movb %al,3(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + + +// +// start FDIV for end of next segment in flight, so it can overlap +// + movl counttemp,%ecx + cmpl $8,%ecx // more than one segment after this? + ja LSetupNotLast2 // yes + + decl %ecx + jz LFDIVInFlight2 // if only one pixel, no need to start an FDIV + movl %ecx,spancountminus1 + fildl spancountminus1 + + flds C(d_zistepu) // C(d_zistepu) | spancountminus1 + fmul %st(1),%st(0) // C(d_zistepu)*scm1 | scm1 + flds C(d_tdivzstepu) // C(d_tdivzstepu) | C(d_zistepu)*scm1 | scm1 + fmul %st(2),%st(0) // C(d_tdivzstepu)*scm1 | C(d_zistepu)*scm1 | scm1 + fxch %st(1) // C(d_zistepu)*scm1 | C(d_tdivzstepu)*scm1 | scm1 + faddp %st(0),%st(3) // C(d_tdivzstepu)*scm1 | scm1 + fxch %st(1) // scm1 | C(d_tdivzstepu)*scm1 + fmuls C(d_sdivzstepu) // C(d_sdivzstepu)*scm1 | C(d_tdivzstepu)*scm1 + fxch %st(1) // C(d_tdivzstepu)*scm1 | C(d_sdivzstepu)*scm1 + faddp %st(0),%st(3) // C(d_sdivzstepu)*scm1 + flds fp_64k // 64k | C(d_sdivzstepu)*scm1 + fxch %st(1) // C(d_sdivzstepu)*scm1 | 64k + faddp %st(0),%st(4) // 64k + + fdiv %st(1),%st(0) // this is what we've gone to all this trouble to + // overlap + jmp LFDIVInFlight2 + + .align 4 +LSetupNotLast2: + fadds zi8stepu + fxch %st(2) + fadds sdivz8stepu + fxch %st(2) + flds tdivz8stepu + faddp %st(0),%st(2) + flds fp_64k + fdiv %st(1),%st(0) // z = 1/1/z + // this is what we've gone to all this trouble to + // overlap +LFDIVInFlight2: + movl %ecx,counttemp + + addl tstep,%edx + sbbl %ecx,%ecx + movb %al,4(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + + addl tstep,%edx + sbbl %ecx,%ecx + movb %al,5(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + + addl tstep,%edx + sbbl %ecx,%ecx + movb %al,6(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + + addl $8,%edi + movl %edx,tfracf + movl snext,%edx + movl %ebx,sfracf + movl tnext,%ebx + movl %edx,s + movl %ebx,t + + movl counttemp,%ecx // retrieve count + +// +// determine whether last span or not +// + cmpl $8,%ecx // are there multiple segments remaining? 
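+// (Each texel store in the unrolled segment above uses the same
+// carry-driven stepping idiom; one pixel of it reads roughly like this
+// in C, with the exact fixed-point widths elided and tstepfrac/sstepfrac
+// holding the left-justified fractional parts of the steps:
+//
+//	pix = *psource;				// texel for this pixel
+//	newtf = tfracf + tstepfrac;		// step the t fraction
+//	carried = (newtf < tfracf);		// 1 if it wrapped
+//	newsf = sfracf + sstepfrac;		// step the s fraction
+//	psource += advancetable[1 - carried] + (newsf < sfracf);
+//	tfracf = newtf;
+//	sfracf = newsf;
+//	*pdest++ = pix;
+//
+// advancetable[1] is the whole-pixel source stride for one step (the
+// whole part of tstep times cachewidth, plus the whole part of sstep),
+// and advancetable[0] is that plus one extra cachewidth row for when
+// the t fraction carries; sbbl materializes the carry as a 0/-1 index
+// and adcl folds the s-fraction carry in for free.)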
+ movb %al,-1(%edi) + ja LNotLastSegment // yes + +// +// last segment of scan +// +LLastSegment: + +// +// advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to +// get there. The number of pixels left is variable, and we want to land on the +// last pixel, not step one past it, so we can't run into arithmetic problems +// + testl %ecx,%ecx + jz LNoSteps // just draw the last pixel and we're done + +// pick up after the FDIV that was left in flight previously + + + fld %st(0) // duplicate it + fmul %st(4),%st(0) // s = s/z * z + fxch %st(1) + fmul %st(3),%st(0) // t = t/z * z + fxch %st(1) + fistpl snext + fistpl tnext + + movb (%esi),%al // load first texel in segment + movl C(tadjust),%ebx + movb %al,(%edi) // store first pixel in segment + movl C(sadjust),%eax + + addl snext,%eax + addl tnext,%ebx + + movl C(bbextents),%ebp + movl C(bbextentt),%edx + + cmpl $2048,%eax + jl LClampLow4 + cmpl %ebp,%eax + ja LClampHigh4 +LClampReentry4: + movl %eax,snext + + cmpl $2048,%ebx + jl LClampLow5 + cmpl %edx,%ebx + ja LClampHigh5 +LClampReentry5: + + cmpl $1,%ecx // don't bother + je LOnlyOneStep // if two pixels in segment, there's only one step, + // of the segment length + subl s,%eax + subl t,%ebx + + addl %eax,%eax // convert to 15.17 format so multiply by 1.31 + addl %ebx,%ebx // reciprocal yields 16.48 + + imull reciprocal_table-8(,%ecx,4) // sstep = (snext - s) / (spancount-1) + movl %edx,%ebp + + movl %ebx,%eax + imull reciprocal_table-8(,%ecx,4) // tstep = (tnext - t) / (spancount-1) + +LSetEntryvec: +// +// set up advancetable +// + movl entryvec_table(,%ecx,4),%ebx + movl %edx,%eax + movl %ebx,jumptemp // entry point into code for RET later + movl %ebp,%ecx + sarl $16,%edx // tstep >>= 16; + movl C(cachewidth),%ebx + sarl $16,%ecx // sstep >>= 16; + imull %ebx,%edx + + addl %ecx,%edx // add in sstep + // (tstep >> 16) * cachewidth + (sstep >> 16); + movl tfracf,%ecx + movl %edx,advancetable+4 // advance base in t + addl %ebx,%edx // ((tstep >> 16) + 1) * cachewidth + + // (sstep >> 16); + shll $16,%ebp // left-justify sstep fractional part + movl sfracf,%ebx + shll $16,%eax // left-justify tstep fractional part + movl %edx,advancetable // advance extra in t + + movl %eax,tstep + movl %ecx,%edx + addl %eax,%edx + sbbl %ecx,%ecx + addl %ebp,%ebx + adcl advancetable+4(,%ecx,4),%esi + + jmp *jumptemp // jump to the number-of-pixels handler + +//---------------------------------------- + +LNoSteps: + movb (%esi),%al // load first texel in segment + subl $7,%edi // adjust for hardwired offset + jmp LEndSpan + + +LOnlyOneStep: + subl s,%eax + subl t,%ebx + movl %eax,%ebp + movl %ebx,%edx + jmp LSetEntryvec + +//---------------------------------------- + +.globl Entry2_8 +Entry2_8: + subl $6,%edi // adjust for hardwired offsets + movb (%esi),%al + jmp LLEntry2_8 + +//---------------------------------------- + +.globl Entry3_8 +Entry3_8: + subl $5,%edi // adjust for hardwired offsets + addl %eax,%edx + movb (%esi),%al + sbbl %ecx,%ecx + addl %ebp,%ebx + adcl advancetable+4(,%ecx,4),%esi + jmp LLEntry3_8 + +//---------------------------------------- + +.globl Entry4_8 +Entry4_8: + subl $4,%edi // adjust for hardwired offsets + addl %eax,%edx + movb (%esi),%al + sbbl %ecx,%ecx + addl %ebp,%ebx + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx + jmp LLEntry4_8 + +//---------------------------------------- + +.globl Entry5_8 +Entry5_8: + subl $3,%edi // adjust for hardwired offsets + addl %eax,%edx + movb (%esi),%al + sbbl %ecx,%ecx + addl %ebp,%ebx + adcl 
advancetable+4(,%ecx,4),%esi + addl tstep,%edx + jmp LLEntry5_8 + +//---------------------------------------- + +.globl Entry6_8 +Entry6_8: + subl $2,%edi // adjust for hardwired offsets + addl %eax,%edx + movb (%esi),%al + sbbl %ecx,%ecx + addl %ebp,%ebx + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx + jmp LLEntry6_8 + +//---------------------------------------- + +.globl Entry7_8 +Entry7_8: + decl %edi // adjust for hardwired offsets + addl %eax,%edx + movb (%esi),%al + sbbl %ecx,%ecx + addl %ebp,%ebx + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx + jmp LLEntry7_8 + +//---------------------------------------- + +.globl Entry8_8 +Entry8_8: + addl %eax,%edx + movb (%esi),%al + sbbl %ecx,%ecx + addl %ebp,%ebx + adcl advancetable+4(,%ecx,4),%esi + + addl tstep,%edx + sbbl %ecx,%ecx + movb %al,1(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx +LLEntry7_8: + sbbl %ecx,%ecx + movb %al,2(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx +LLEntry6_8: + sbbl %ecx,%ecx + movb %al,3(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx +LLEntry5_8: + sbbl %ecx,%ecx + movb %al,4(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx +LLEntry4_8: + sbbl %ecx,%ecx + movb %al,5(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi +LLEntry3_8: + movb %al,6(%edi) + movb (%esi),%al +LLEntry2_8: + +LEndSpan: + +// +// clear s/z, t/z, 1/z from FP stack +// + fstp %st(0) + fstp %st(0) + fstp %st(0) + + movl pspantemp,%ebx // restore spans pointer + movl espan_t_pnext(%ebx),%ebx // point to next span + testl %ebx,%ebx // any more spans? + movb %al,7(%edi) + jnz LSpanLoop // more spans + + popl %ebx // restore register variables + popl %esi + popl %edi + popl %ebp // restore the caller's stack frame + ret + +//---------------------------------------------------------------------- +// 8-bpp horizontal span z drawing codefor polygons, with no transparency. 
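+// (In C terms -- compare the !id386 fallback in d_scan.c -- 1/z is
+// carried as a fixed-point value scaled by 2^31 so each 16-bit z-buffer
+// entry is simply its top half:
+//
+//	izistep = (int)(d_zistepu * 0x8000 * 0x10000);
+//	izi = (int)(zi * 0x8000 * 0x10000);
+//	while (count--)
+//	{
+//		*pdest++ = (short)(izi >> 16);
+//		izi += izistep;
+//	}
+//
+// The assembly below additionally pairs the 16-bit stores into aligned
+// dword writes, and keeps a separate copy of the loop that subtracts a
+// positive step when the 1/z gradient is negative.)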
+// +// Assumes there is at least one span in pzspans, and that every span +// contains at least one pixel +//---------------------------------------------------------------------- + + .text + +// z-clamp on a non-negative gradient span +LClamp: + movl $0x40000000,%edx + xorl %ebx,%ebx + fstp %st(0) + jmp LZDraw + +// z-clamp on a negative gradient span +LClampNeg: + movl $0x40000000,%edx + xorl %ebx,%ebx + fstp %st(0) + jmp LZDrawNeg + + +#define pzspans 4+16 + +.globl C(D_DrawZSpans) +C(D_DrawZSpans): + pushl %ebp // preserve caller's stack frame + pushl %edi + pushl %esi // preserve register variables + pushl %ebx + + flds C(d_zistepu) + movl C(d_zistepu),%eax + movl pzspans(%esp),%esi + testl %eax,%eax + jz LFNegSpan + + fmuls Float2ToThe31nd + fistpl izistep // note: we are relying on FP exceptions being turned + // off here to avoid range problems + movl izistep,%ebx // remains loaded for all spans + +LFSpanLoop: +// set up the initial 1/z value + fildl espan_t_v(%esi) + fildl espan_t_u(%esi) + movl espan_t_v(%esi),%ecx + movl C(d_pzbuffer),%edi + fmuls C(d_zistepu) + fxch %st(1) + fmuls C(d_zistepv) + fxch %st(1) + fadds C(d_ziorigin) + imull C(d_zrowbytes),%ecx + faddp %st(0),%st(1) + +// clamp if z is nearer than 2 (1/z > 0.5) + fcoms float_point5 + addl %ecx,%edi + movl espan_t_u(%esi),%edx + addl %edx,%edx // word count + movl espan_t_count(%esi),%ecx + addl %edx,%edi // pdest = &pdestspan[scans->u]; + pushl %esi // preserve spans pointer + fnstsw %ax + testb $0x45,%ah + jz LClamp + + fmuls Float2ToThe31nd + fistpl izi // note: we are relying on FP exceptions being turned + // off here to avoid problems when the span is closer + // than 1/(2**31) + movl izi,%edx + +// at this point: +// %ebx = izistep +// %ecx = count +// %edx = izi +// %edi = pdest + +LZDraw: + +// do a single pixel up front, if necessary to dword align the destination + testl $2,%edi + jz LFMiddle + movl %edx,%eax + addl %ebx,%edx + shrl $16,%eax + decl %ecx + movw %ax,(%edi) + addl $2,%edi + +// do middle a pair of aligned dwords at a time +LFMiddle: + pushl %ecx + shrl $1,%ecx // count / 2 + jz LFLast // no aligned dwords to do + shrl $1,%ecx // (count / 2) / 2 + jnc LFMiddleLoop // even number of aligned dwords to do + + movl %edx,%eax + addl %ebx,%edx + shrl $16,%eax + movl %edx,%esi + addl %ebx,%edx + andl $0xFFFF0000,%esi + orl %esi,%eax + movl %eax,(%edi) + addl $4,%edi + andl %ecx,%ecx + jz LFLast + +LFMiddleLoop: + movl %edx,%eax + addl %ebx,%edx + shrl $16,%eax + movl %edx,%esi + addl %ebx,%edx + andl $0xFFFF0000,%esi + orl %esi,%eax + movl %edx,%ebp + movl %eax,(%edi) + addl %ebx,%edx + shrl $16,%ebp + movl %edx,%esi + addl %ebx,%edx + andl $0xFFFF0000,%esi + orl %esi,%ebp + movl %ebp,4(%edi) // FIXME: eliminate register contention + addl $8,%edi + + decl %ecx + jnz LFMiddleLoop + +LFLast: + popl %ecx // retrieve count + popl %esi // retrieve span pointer + +// do the last, unaligned pixel, if there is one + andl $1,%ecx // is there an odd pixel left to do? 
+ jz LFSpanDone // no + shrl $16,%edx + movw %dx,(%edi) // do the final pixel's z + +LFSpanDone: + movl espan_t_pnext(%esi),%esi + testl %esi,%esi + jnz LFSpanLoop + + jmp LFDone + +LFNegSpan: + fmuls FloatMinus2ToThe31nd + fistpl izistep // note: we are relying on FP exceptions being turned + // off here to avoid range problems + movl izistep,%ebx // remains loaded for all spans + +LFNegSpanLoop: +// set up the initial 1/z value + fildl espan_t_v(%esi) + fildl espan_t_u(%esi) + movl espan_t_v(%esi),%ecx + movl C(d_pzbuffer),%edi + fmuls C(d_zistepu) + fxch %st(1) + fmuls C(d_zistepv) + fxch %st(1) + fadds C(d_ziorigin) + imull C(d_zrowbytes),%ecx + faddp %st(0),%st(1) + +// clamp if z is nearer than 2 (1/z > 0.5) + fcoms float_point5 + addl %ecx,%edi + movl espan_t_u(%esi),%edx + addl %edx,%edx // word count + movl espan_t_count(%esi),%ecx + addl %edx,%edi // pdest = &pdestspan[scans->u]; + pushl %esi // preserve spans pointer + fnstsw %ax + testb $0x45,%ah + jz LClampNeg + + fmuls Float2ToThe31nd + fistpl izi // note: we are relying on FP exceptions being turned + // off here to avoid problems when the span is closer + // than 1/(2**31) + movl izi,%edx + +// at this point: +// %ebx = izistep +// %ecx = count +// %edx = izi +// %edi = pdest + +LZDrawNeg: + +// do a single pixel up front, if necessary to dword align the destination + testl $2,%edi + jz LFNegMiddle + movl %edx,%eax + subl %ebx,%edx + shrl $16,%eax + decl %ecx + movw %ax,(%edi) + addl $2,%edi + +// do middle a pair of aligned dwords at a time +LFNegMiddle: + pushl %ecx + shrl $1,%ecx // count / 2 + jz LFNegLast // no aligned dwords to do + shrl $1,%ecx // (count / 2) / 2 + jnc LFNegMiddleLoop // even number of aligned dwords to do + + movl %edx,%eax + subl %ebx,%edx + shrl $16,%eax + movl %edx,%esi + subl %ebx,%edx + andl $0xFFFF0000,%esi + orl %esi,%eax + movl %eax,(%edi) + addl $4,%edi + andl %ecx,%ecx + jz LFNegLast + +LFNegMiddleLoop: + movl %edx,%eax + subl %ebx,%edx + shrl $16,%eax + movl %edx,%esi + subl %ebx,%edx + andl $0xFFFF0000,%esi + orl %esi,%eax + movl %edx,%ebp + movl %eax,(%edi) + subl %ebx,%edx + shrl $16,%ebp + movl %edx,%esi + subl %ebx,%edx + andl $0xFFFF0000,%esi + orl %esi,%ebp + movl %ebp,4(%edi) // FIXME: eliminate register contention + addl $8,%edi + + decl %ecx + jnz LFNegMiddleLoop + +LFNegLast: + popl %ecx // retrieve count + popl %esi // retrieve span pointer + +// do the last, unaligned pixel, if there is one + andl $1,%ecx // is there an odd pixel left to do? + jz LFNegSpanDone // no + shrl $16,%edx + movw %dx,(%edi) // do the final pixel's z + +LFNegSpanDone: + movl espan_t_pnext(%esi),%esi + testl %esi,%esi + jnz LFNegSpanLoop + +LFDone: + popl %ebx // restore register variables + popl %esi + popl %edi + popl %ebp // restore the caller's stack frame + ret + +#endif // id386 diff --git a/source/d_draw16.S b/source/d_draw16.S new file mode 100644 index 0000000..4cd0a17 --- /dev/null +++ b/source/d_draw16.S @@ -0,0 +1,974 @@ +/* +Copyright (C) 1996-1997 Id Software, Inc. + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + +See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +*/ +// +// d_draw16.s +// x86 assembly-language horizontal 8-bpp span-drawing code, with 16-pixel +// subdivision. +// + +#include "asm_i386.h" +#include "quakeasm.h" +#include "asm_draw.h" +#include "d_ifacea.h" + +#if id386 + +//---------------------------------------------------------------------- +// 8-bpp horizontal span drawing code for polygons, with no transparency and +// 16-pixel subdivision. +// +// Assumes there is at least one span in pspans, and that every span +// contains at least one pixel +//---------------------------------------------------------------------- + + .data + + .text + +// out-of-line, rarely-needed clamping code + +LClampHigh0: + movl C(bbextents),%esi + jmp LClampReentry0 +LClampHighOrLow0: + jg LClampHigh0 + xorl %esi,%esi + jmp LClampReentry0 + +LClampHigh1: + movl C(bbextentt),%edx + jmp LClampReentry1 +LClampHighOrLow1: + jg LClampHigh1 + xorl %edx,%edx + jmp LClampReentry1 + +LClampLow2: + movl $4096,%ebp + jmp LClampReentry2 +LClampHigh2: + movl C(bbextents),%ebp + jmp LClampReentry2 + +LClampLow3: + movl $4096,%ecx + jmp LClampReentry3 +LClampHigh3: + movl C(bbextentt),%ecx + jmp LClampReentry3 + +LClampLow4: + movl $4096,%eax + jmp LClampReentry4 +LClampHigh4: + movl C(bbextents),%eax + jmp LClampReentry4 + +LClampLow5: + movl $4096,%ebx + jmp LClampReentry5 +LClampHigh5: + movl C(bbextentt),%ebx + jmp LClampReentry5 + + +#define pspans 4+16 + + .align 4 +.globl C(D_DrawSpans16) +C(D_DrawSpans16): + pushl %ebp // preserve caller's stack frame + pushl %edi + pushl %esi // preserve register variables + pushl %ebx + +// +// set up scaled-by-16 steps, for 16-long segments; also set up cacheblock +// and span list pointers +// +// TODO: any overlap from rearranging? + flds C(d_sdivzstepu) + fmuls fp_16 + movl C(cacheblock),%edx + flds C(d_tdivzstepu) + fmuls fp_16 + movl pspans(%esp),%ebx // point to the first span descriptor + flds C(d_zistepu) + fmuls fp_16 + movl %edx,pbase // pbase = cacheblock + fstps zi16stepu + fstps tdivz16stepu + fstps sdivz16stepu + +LSpanLoop: +// +// set up the initial s/z, t/z, and 1/z on the FP stack, and generate the +// initial s and t values +// +// FIXME: pipeline FILD? 
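+// The structure here is identical to D_DrawSpans8 in d_draw.s; only the
+// subdivision length changes.  The perspective divide (the FDIV below)
+// runs once per 16-pixel segment, with s and t stepped linearly in
+// between, so this version issues half as many divides per span as the
+// 8-pixel one, at the cost of more affine texture warp across steep 1/z
+// gradients.  In outline (compare the 8-pixel C fallback in d_scan.c):
+//
+//	while (count > 0)
+//	{
+//		spancount = (count > 16) ? 16 : count;
+//		count -= spancount;
+//		// step s/z, t/z, 1/z to the segment end and start the
+//		// FDIV for the endpoint z so it overlaps the loop below
+//		do
+//		{
+//			*pdest++ = *(pbase + (s >> 16) +
+//					(t >> 16) * cachewidth);
+//			s += sstep;
+//			t += tstep;
+//		} while (--spancount > 0);
+//	}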
+ fildl espan_t_v(%ebx) + fildl espan_t_u(%ebx) + + fld %st(1) // dv | du | dv + fmuls C(d_sdivzstepv) // dv*d_sdivzstepv | du | dv + fld %st(1) // du | dv*d_sdivzstepv | du | dv + fmuls C(d_sdivzstepu) // du*d_sdivzstepu | dv*d_sdivzstepv | du | dv + fld %st(2) // du | du*d_sdivzstepu | dv*d_sdivzstepv | du | dv + fmuls C(d_tdivzstepu) // du*d_tdivzstepu | du*d_sdivzstepu | + // dv*d_sdivzstepv | du | dv + fxch %st(1) // du*d_sdivzstepu | du*d_tdivzstepu | + // dv*d_sdivzstepv | du | dv + faddp %st(0),%st(2) // du*d_tdivzstepu | + // du*d_sdivzstepu + dv*d_sdivzstepv | du | dv + fxch %st(1) // du*d_sdivzstepu + dv*d_sdivzstepv | + // du*d_tdivzstepu | du | dv + fld %st(3) // dv | du*d_sdivzstepu + dv*d_sdivzstepv | + // du*d_tdivzstepu | du | dv + fmuls C(d_tdivzstepv) // dv*d_tdivzstepv | + // du*d_sdivzstepu + dv*d_sdivzstepv | + // du*d_tdivzstepu | du | dv + fxch %st(1) // du*d_sdivzstepu + dv*d_sdivzstepv | + // dv*d_tdivzstepv | du*d_tdivzstepu | du | dv + fadds C(d_sdivzorigin) // sdivz = d_sdivzorigin + dv*d_sdivzstepv + + // du*d_sdivzstepu; stays in %st(2) at end + fxch %st(4) // dv | dv*d_tdivzstepv | du*d_tdivzstepu | du | + // s/z + fmuls C(d_zistepv) // dv*d_zistepv | dv*d_tdivzstepv | + // du*d_tdivzstepu | du | s/z + fxch %st(1) // dv*d_tdivzstepv | dv*d_zistepv | + // du*d_tdivzstepu | du | s/z + faddp %st(0),%st(2) // dv*d_zistepv | + // dv*d_tdivzstepv + du*d_tdivzstepu | du | s/z + fxch %st(2) // du | dv*d_tdivzstepv + du*d_tdivzstepu | + // dv*d_zistepv | s/z + fmuls C(d_zistepu) // du*d_zistepu | + // dv*d_tdivzstepv + du*d_tdivzstepu | + // dv*d_zistepv | s/z + fxch %st(1) // dv*d_tdivzstepv + du*d_tdivzstepu | + // du*d_zistepu | dv*d_zistepv | s/z + fadds C(d_tdivzorigin) // tdivz = d_tdivzorigin + dv*d_tdivzstepv + + // du*d_tdivzstepu; stays in %st(1) at end + fxch %st(2) // dv*d_zistepv | du*d_zistepu | t/z | s/z + faddp %st(0),%st(1) // dv*d_zistepv + du*d_zistepu | t/z | s/z + + flds fp_64k // fp_64k | dv*d_zistepv + du*d_zistepu | t/z | s/z + fxch %st(1) // dv*d_zistepv + du*d_zistepu | fp_64k | t/z | s/z + fadds C(d_ziorigin) // zi = d_ziorigin + dv*d_zistepv + + // du*d_zistepu; stays in %st(0) at end + // 1/z | fp_64k | t/z | s/z +// +// calculate and clamp s & t +// + fdivr %st(0),%st(1) // 1/z | z*64k | t/z | s/z + +// +// point %edi to the first pixel in the span +// + movl C(d_viewbuffer),%ecx + movl espan_t_v(%ebx),%eax + movl %ebx,pspantemp // preserve spans pointer + + movl C(tadjust),%edx + movl C(sadjust),%esi + movl C(d_scantable)(,%eax,4),%edi // v * screenwidth + addl %ecx,%edi + movl espan_t_u(%ebx),%ecx + addl %ecx,%edi // pdest = &pdestspan[scans->u]; + movl espan_t_count(%ebx),%ecx + +// +// now start the FDIV for the end of the span +// + cmpl $16,%ecx + ja LSetupNotLast1 + + decl %ecx + jz LCleanup1 // if only one pixel, no need to start an FDIV + movl %ecx,spancountminus1 + +// finish up the s and t calcs + fxch %st(1) // z*64k | 1/z | t/z | s/z + + fld %st(0) // z*64k | z*64k | 1/z | t/z | s/z + fmul %st(4),%st(0) // s | z*64k | 1/z | t/z | s/z + fxch %st(1) // z*64k | s | 1/z | t/z | s/z + fmul %st(3),%st(0) // t | s | 1/z | t/z | s/z + fxch %st(1) // s | t | 1/z | t/z | s/z + fistpl s // 1/z | t | t/z | s/z + fistpl t // 1/z | t/z | s/z + + fildl spancountminus1 + + flds C(d_tdivzstepu) // C(d_tdivzstepu) | spancountminus1 + flds C(d_zistepu) // C(d_zistepu) | C(d_tdivzstepu) | spancountminus1 + fmul %st(2),%st(0) // C(d_zistepu)*scm1 | C(d_tdivzstepu) | scm1 + fxch %st(1) // C(d_tdivzstepu) | C(d_zistepu)*scm1 | scm1 + fmul 
%st(2),%st(0) // C(d_tdivzstepu)*scm1 | C(d_zistepu)*scm1 | scm1 + fxch %st(2) // scm1 | C(d_zistepu)*scm1 | C(d_tdivzstepu)*scm1 + fmuls C(d_sdivzstepu) // C(d_sdivzstepu)*scm1 | C(d_zistepu)*scm1 | + // C(d_tdivzstepu)*scm1 + fxch %st(1) // C(d_zistepu)*scm1 | C(d_sdivzstepu)*scm1 | + // C(d_tdivzstepu)*scm1 + faddp %st(0),%st(3) // C(d_sdivzstepu)*scm1 | C(d_tdivzstepu)*scm1 + fxch %st(1) // C(d_tdivzstepu)*scm1 | C(d_sdivzstepu)*scm1 + faddp %st(0),%st(3) // C(d_sdivzstepu)*scm1 + faddp %st(0),%st(3) + + flds fp_64k + fdiv %st(1),%st(0) // this is what we've gone to all this trouble to + // overlap + jmp LFDIVInFlight1 + +LCleanup1: +// finish up the s and t calcs + fxch %st(1) // z*64k | 1/z | t/z | s/z + + fld %st(0) // z*64k | z*64k | 1/z | t/z | s/z + fmul %st(4),%st(0) // s | z*64k | 1/z | t/z | s/z + fxch %st(1) // z*64k | s | 1/z | t/z | s/z + fmul %st(3),%st(0) // t | s | 1/z | t/z | s/z + fxch %st(1) // s | t | 1/z | t/z | s/z + fistpl s // 1/z | t | t/z | s/z + fistpl t // 1/z | t/z | s/z + jmp LFDIVInFlight1 + + .align 4 +LSetupNotLast1: +// finish up the s and t calcs + fxch %st(1) // z*64k | 1/z | t/z | s/z + + fld %st(0) // z*64k | z*64k | 1/z | t/z | s/z + fmul %st(4),%st(0) // s | z*64k | 1/z | t/z | s/z + fxch %st(1) // z*64k | s | 1/z | t/z | s/z + fmul %st(3),%st(0) // t | s | 1/z | t/z | s/z + fxch %st(1) // s | t | 1/z | t/z | s/z + fistpl s // 1/z | t | t/z | s/z + fistpl t // 1/z | t/z | s/z + + fadds zi16stepu + fxch %st(2) + fadds sdivz16stepu + fxch %st(2) + flds tdivz16stepu + faddp %st(0),%st(2) + flds fp_64k + fdiv %st(1),%st(0) // z = 1/1/z + // this is what we've gone to all this trouble to + // overlap +LFDIVInFlight1: + + addl s,%esi + addl t,%edx + movl C(bbextents),%ebx + movl C(bbextentt),%ebp + cmpl %ebx,%esi + ja LClampHighOrLow0 +LClampReentry0: + movl %esi,s + movl pbase,%ebx + shll $16,%esi + cmpl %ebp,%edx + movl %esi,sfracf + ja LClampHighOrLow1 +LClampReentry1: + movl %edx,t + movl s,%esi // sfrac = scans->sfrac; + shll $16,%edx + movl t,%eax // tfrac = scans->tfrac; + sarl $16,%esi + movl %edx,tfracf + +// +// calculate the texture starting address +// + sarl $16,%eax + movl C(cachewidth),%edx + imull %edx,%eax // (tfrac >> 16) * cachewidth + addl %ebx,%esi + addl %eax,%esi // psource = pbase + (sfrac >> 16) + + // ((tfrac >> 16) * cachewidth); +// +// determine whether last span or not +// + cmpl $16,%ecx + jna LLastSegment + +// +// not the last segment; do full 16-wide segment +// +LNotLastSegment: + +// +// advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to +// get there +// + +// pick up after the FDIV that was left in flight previously + + fld %st(0) // duplicate it + fmul %st(4),%st(0) // s = s/z * z + fxch %st(1) + fmul %st(3),%st(0) // t = t/z * z + fxch %st(1) + fistpl snext + fistpl tnext + movl snext,%eax + movl tnext,%edx + + movb (%esi),%bl // get first source texel + subl $16,%ecx // count off this segments' pixels + movl C(sadjust),%ebp + movl %ecx,counttemp // remember count of remaining pixels + + movl C(tadjust),%ecx + movb %bl,(%edi) // store first dest pixel + + addl %eax,%ebp + addl %edx,%ecx + + movl C(bbextents),%eax + movl C(bbextentt),%edx + + cmpl $4096,%ebp + jl LClampLow2 + cmpl %eax,%ebp + ja LClampHigh2 +LClampReentry2: + + cmpl $4096,%ecx + jl LClampLow3 + cmpl %edx,%ecx + ja LClampHigh3 +LClampReentry3: + + movl %ebp,snext + movl %ecx,tnext + + subl s,%ebp + subl t,%ecx + +// +// set up advancetable +// + movl %ecx,%eax + movl %ebp,%edx + sarl $20,%eax // tstep >>= 16; + jz LZero + 
sarl $20,%edx // sstep >>= 16; + movl C(cachewidth),%ebx + imull %ebx,%eax + jmp LSetUp1 + +LZero: + sarl $20,%edx // sstep >>= 16; + movl C(cachewidth),%ebx + +LSetUp1: + + addl %edx,%eax // add in sstep + // (tstep >> 16) * cachewidth + (sstep >> 16); + movl tfracf,%edx + movl %eax,advancetable+4 // advance base in t + addl %ebx,%eax // ((tstep >> 16) + 1) * cachewidth + + // (sstep >> 16); + shll $12,%ebp // left-justify sstep fractional part + movl sfracf,%ebx + shll $12,%ecx // left-justify tstep fractional part + movl %eax,advancetable // advance extra in t + + movl %ecx,tstep + addl %ecx,%edx // advance tfrac fractional part by tstep frac + + sbbl %ecx,%ecx // turn tstep carry into -1 (0 if none) + addl %ebp,%ebx // advance sfrac fractional part by sstep frac + adcl advancetable+4(,%ecx,4),%esi // point to next source texel + + addl tstep,%edx + sbbl %ecx,%ecx + movb (%esi),%al + addl %ebp,%ebx + movb %al,1(%edi) + adcl advancetable+4(,%ecx,4),%esi + + addl tstep,%edx + sbbl %ecx,%ecx + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + + addl tstep,%edx + sbbl %ecx,%ecx + movb %al,2(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + + addl tstep,%edx + sbbl %ecx,%ecx + movb %al,3(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + + addl tstep,%edx + sbbl %ecx,%ecx + movb %al,4(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + + addl tstep,%edx + sbbl %ecx,%ecx + movb %al,5(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + + addl tstep,%edx + sbbl %ecx,%ecx + movb %al,6(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + + addl tstep,%edx + sbbl %ecx,%ecx + movb %al,7(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + + +// +// start FDIV for end of next segment in flight, so it can overlap +// + movl counttemp,%ecx + cmpl $16,%ecx // more than one segment after this? 
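+// (The payoff of starting the divide this early: on the Pentium an FDIV
+// is a long-latency operation -- tens of cycles -- but it executes in
+// the FP unit without stalling the integer pipes, so the reciprocal for
+// the next segment's endpoint is computed essentially for free while
+// the integer code above and below stores this segment's 16 texels.
+// The dependent multiplies that turn 64k/zi back into s and t are only
+// issued once the next segment begins.)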
+ ja LSetupNotLast2 // yes + + decl %ecx + jz LFDIVInFlight2 // if only one pixel, no need to start an FDIV + movl %ecx,spancountminus1 + fildl spancountminus1 + + flds C(d_zistepu) // C(d_zistepu) | spancountminus1 + fmul %st(1),%st(0) // C(d_zistepu)*scm1 | scm1 + flds C(d_tdivzstepu) // C(d_tdivzstepu) | C(d_zistepu)*scm1 | scm1 + fmul %st(2),%st(0) // C(d_tdivzstepu)*scm1 | C(d_zistepu)*scm1 | scm1 + fxch %st(1) // C(d_zistepu)*scm1 | C(d_tdivzstepu)*scm1 | scm1 + faddp %st(0),%st(3) // C(d_tdivzstepu)*scm1 | scm1 + fxch %st(1) // scm1 | C(d_tdivzstepu)*scm1 + fmuls C(d_sdivzstepu) // C(d_sdivzstepu)*scm1 | C(d_tdivzstepu)*scm1 + fxch %st(1) // C(d_tdivzstepu)*scm1 | C(d_sdivzstepu)*scm1 + faddp %st(0),%st(3) // C(d_sdivzstepu)*scm1 + flds fp_64k // 64k | C(d_sdivzstepu)*scm1 + fxch %st(1) // C(d_sdivzstepu)*scm1 | 64k + faddp %st(0),%st(4) // 64k + + fdiv %st(1),%st(0) // this is what we've gone to all this trouble to + // overlap + jmp LFDIVInFlight2 + + .align 4 +LSetupNotLast2: + fadds zi16stepu + fxch %st(2) + fadds sdivz16stepu + fxch %st(2) + flds tdivz16stepu + faddp %st(0),%st(2) + flds fp_64k + fdiv %st(1),%st(0) // z = 1/1/z + // this is what we've gone to all this trouble to + // overlap +LFDIVInFlight2: + movl %ecx,counttemp + + addl tstep,%edx + sbbl %ecx,%ecx + movb %al,8(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + + addl tstep,%edx + sbbl %ecx,%ecx + movb %al,9(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + + addl tstep,%edx + sbbl %ecx,%ecx + movb %al,10(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + + addl tstep,%edx + sbbl %ecx,%ecx + movb %al,11(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + + addl tstep,%edx + sbbl %ecx,%ecx + movb %al,12(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + + addl tstep,%edx + sbbl %ecx,%ecx + movb %al,13(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + + addl tstep,%edx + sbbl %ecx,%ecx + movb %al,14(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + + addl $16,%edi + movl %edx,tfracf + movl snext,%edx + movl %ebx,sfracf + movl tnext,%ebx + movl %edx,s + movl %ebx,t + + movl counttemp,%ecx // retrieve count + +// +// determine whether last span or not +// + cmpl $16,%ecx // are there multiple segments remaining? + movb %al,-1(%edi) + ja LNotLastSegment // yes + +// +// last segment of scan +// +LLastSegment: + +// +// advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to +// get there. 
The number of pixels left is variable, and we want to land on the +// last pixel, not step one past it, so we can't run into arithmetic problems +// + testl %ecx,%ecx + jz LNoSteps // just draw the last pixel and we're done + +// pick up after the FDIV that was left in flight previously + + + fld %st(0) // duplicate it + fmul %st(4),%st(0) // s = s/z * z + fxch %st(1) + fmul %st(3),%st(0) // t = t/z * z + fxch %st(1) + fistpl snext + fistpl tnext + + movb (%esi),%al // load first texel in segment + movl C(tadjust),%ebx + movb %al,(%edi) // store first pixel in segment + movl C(sadjust),%eax + + addl snext,%eax + addl tnext,%ebx + + movl C(bbextents),%ebp + movl C(bbextentt),%edx + + cmpl $4096,%eax + jl LClampLow4 + cmpl %ebp,%eax + ja LClampHigh4 +LClampReentry4: + movl %eax,snext + + cmpl $4096,%ebx + jl LClampLow5 + cmpl %edx,%ebx + ja LClampHigh5 +LClampReentry5: + + cmpl $1,%ecx // don't bother + je LOnlyOneStep // if two pixels in segment, there's only one step, + // of the segment length + subl s,%eax + subl t,%ebx + + addl %eax,%eax // convert to 15.17 format so multiply by 1.31 + addl %ebx,%ebx // reciprocal yields 16.48 + + imull reciprocal_table_16-8(,%ecx,4) // sstep = (snext - s) / + // (spancount-1) + movl %edx,%ebp + + movl %ebx,%eax + imull reciprocal_table_16-8(,%ecx,4) // tstep = (tnext - t) / + // (spancount-1) +LSetEntryvec: +// +// set up advancetable +// + movl entryvec_table_16(,%ecx,4),%ebx + movl %edx,%eax + movl %ebx,jumptemp // entry point into code for RET later + movl %ebp,%ecx + sarl $16,%edx // tstep >>= 16; + movl C(cachewidth),%ebx + sarl $16,%ecx // sstep >>= 16; + imull %ebx,%edx + + addl %ecx,%edx // add in sstep + // (tstep >> 16) * cachewidth + (sstep >> 16); + movl tfracf,%ecx + movl %edx,advancetable+4 // advance base in t + addl %ebx,%edx // ((tstep >> 16) + 1) * cachewidth + + // (sstep >> 16); + shll $16,%ebp // left-justify sstep fractional part + movl sfracf,%ebx + shll $16,%eax // left-justify tstep fractional part + movl %edx,advancetable // advance extra in t + + movl %eax,tstep + movl %ecx,%edx + addl %eax,%edx + sbbl %ecx,%ecx + addl %ebp,%ebx + adcl advancetable+4(,%ecx,4),%esi + + jmp *jumptemp // jump to the number-of-pixels handler + +//---------------------------------------- + +LNoSteps: + movb (%esi),%al // load first texel in segment + subl $15,%edi // adjust for hardwired offset + jmp LEndSpan + + +LOnlyOneStep: + subl s,%eax + subl t,%ebx + movl %eax,%ebp + movl %ebx,%edx + jmp LSetEntryvec + +//---------------------------------------- + +.globl Entry2_16, Entry3_16, Entry4_16, Entry5_16 +.globl Entry6_16, Entry7_16, Entry8_16, Entry9_16 +.globl Entry10_16, Entry11_16, Entry12_16, Entry13_16 +.globl Entry14_16, Entry15_16, Entry16_16 + +Entry2_16: + subl $14,%edi // adjust for hardwired offsets + movb (%esi),%al + jmp LEntry2_16 + +//---------------------------------------- + +Entry3_16: + subl $13,%edi // adjust for hardwired offsets + addl %eax,%edx + movb (%esi),%al + sbbl %ecx,%ecx + addl %ebp,%ebx + adcl advancetable+4(,%ecx,4),%esi + jmp LEntry3_16 + +//---------------------------------------- + +Entry4_16: + subl $12,%edi // adjust for hardwired offsets + addl %eax,%edx + movb (%esi),%al + sbbl %ecx,%ecx + addl %ebp,%ebx + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx + jmp LEntry4_16 + +//---------------------------------------- + +Entry5_16: + subl $11,%edi // adjust for hardwired offsets + addl %eax,%edx + movb (%esi),%al + sbbl %ecx,%ecx + addl %ebp,%ebx + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx + jmp 
LEntry5_16 + +//---------------------------------------- + +Entry6_16: + subl $10,%edi // adjust for hardwired offsets + addl %eax,%edx + movb (%esi),%al + sbbl %ecx,%ecx + addl %ebp,%ebx + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx + jmp LEntry6_16 + +//---------------------------------------- + +Entry7_16: + subl $9,%edi // adjust for hardwired offsets + addl %eax,%edx + movb (%esi),%al + sbbl %ecx,%ecx + addl %ebp,%ebx + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx + jmp LEntry7_16 + +//---------------------------------------- + +Entry8_16: + subl $8,%edi // adjust for hardwired offsets + addl %eax,%edx + movb (%esi),%al + sbbl %ecx,%ecx + addl %ebp,%ebx + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx + jmp LEntry8_16 + +//---------------------------------------- + +Entry9_16: + subl $7,%edi // adjust for hardwired offsets + addl %eax,%edx + movb (%esi),%al + sbbl %ecx,%ecx + addl %ebp,%ebx + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx + jmp LEntry9_16 + +//---------------------------------------- + +Entry10_16: + subl $6,%edi // adjust for hardwired offsets + addl %eax,%edx + movb (%esi),%al + sbbl %ecx,%ecx + addl %ebp,%ebx + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx + jmp LEntry10_16 + +//---------------------------------------- + +Entry11_16: + subl $5,%edi // adjust for hardwired offsets + addl %eax,%edx + movb (%esi),%al + sbbl %ecx,%ecx + addl %ebp,%ebx + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx + jmp LEntry11_16 + +//---------------------------------------- + +Entry12_16: + subl $4,%edi // adjust for hardwired offsets + addl %eax,%edx + movb (%esi),%al + sbbl %ecx,%ecx + addl %ebp,%ebx + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx + jmp LEntry12_16 + +//---------------------------------------- + +Entry13_16: + subl $3,%edi // adjust for hardwired offsets + addl %eax,%edx + movb (%esi),%al + sbbl %ecx,%ecx + addl %ebp,%ebx + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx + jmp LEntry13_16 + +//---------------------------------------- + +Entry14_16: + subl $2,%edi // adjust for hardwired offsets + addl %eax,%edx + movb (%esi),%al + sbbl %ecx,%ecx + addl %ebp,%ebx + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx + jmp LEntry14_16 + +//---------------------------------------- + +Entry15_16: + decl %edi // adjust for hardwired offsets + addl %eax,%edx + movb (%esi),%al + sbbl %ecx,%ecx + addl %ebp,%ebx + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx + jmp LEntry15_16 + +//---------------------------------------- + +Entry16_16: + addl %eax,%edx + movb (%esi),%al + sbbl %ecx,%ecx + addl %ebp,%ebx + adcl advancetable+4(,%ecx,4),%esi + + addl tstep,%edx + sbbl %ecx,%ecx + movb %al,1(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx +LEntry15_16: + sbbl %ecx,%ecx + movb %al,2(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx +LEntry14_16: + sbbl %ecx,%ecx + movb %al,3(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx +LEntry13_16: + sbbl %ecx,%ecx + movb %al,4(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx +LEntry12_16: + sbbl %ecx,%ecx + movb %al,5(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx +LEntry11_16: + sbbl %ecx,%ecx + movb %al,6(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx +LEntry10_16: + sbbl %ecx,%ecx + movb %al,7(%edi) + addl 
%ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx +LEntry9_16: + sbbl %ecx,%ecx + movb %al,8(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx +LEntry8_16: + sbbl %ecx,%ecx + movb %al,9(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx +LEntry7_16: + sbbl %ecx,%ecx + movb %al,10(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx +LEntry6_16: + sbbl %ecx,%ecx + movb %al,11(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx +LEntry5_16: + sbbl %ecx,%ecx + movb %al,12(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx +LEntry4_16: + sbbl %ecx,%ecx + movb %al,13(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi +LEntry3_16: + movb %al,14(%edi) + movb (%esi),%al +LEntry2_16: + +LEndSpan: + +// +// clear s/z, t/z, 1/z from FP stack +// + fstp %st(0) + fstp %st(0) + fstp %st(0) + + movl pspantemp,%ebx // restore spans pointer + movl espan_t_pnext(%ebx),%ebx // point to next span + testl %ebx,%ebx // any more spans? + movb %al,15(%edi) + jnz LSpanLoop // more spans + + popl %ebx // restore register variables + popl %esi + popl %edi + popl %ebp // restore the caller's stack frame + ret + +#endif // id386 diff --git a/source/d_parta.S b/source/d_parta.S new file mode 100644 index 0000000..560925d --- /dev/null +++ b/source/d_parta.S @@ -0,0 +1,477 @@ +/* +Copyright (C) 1996-1997 Id Software, Inc. + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + +See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +*/ +// +// d_parta.s +// x86 assembly-language 8-bpp particle-drawing code. +// + +#include "asm_i386.h" +#include "quakeasm.h" +#include "d_ifacea.h" +#include "asm_draw.h" + +#if id386 + +//---------------------------------------------------------------------- +// 8-bpp particle drawing code. +//---------------------------------------------------------------------- + +//FIXME: comments, full optimization + +//---------------------------------------------------------------------- +// 8-bpp particle queueing code. 
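+// In outline (compare the !id386 C version in d_part.c): each particle
+// is transformed into view space, z-clipped, projected, and drawn as a
+// small screen-aligned square whose side scales with 1/z:
+//
+//	VectorSubtract (p->org, r_origin, local);
+//	transformed[0] = DotProduct (local, r_pright);
+//	transformed[1] = DotProduct (local, r_pup);
+//	transformed[2] = DotProduct (local, r_ppn);
+//	if (transformed[2] < PARTICLE_Z_CLIP)
+//		return;
+//	zi = 1.0 / transformed[2];
+//	u = (int)(xcenter + zi * transformed[0] + 0.5);
+//	v = (int)(ycenter - zi * transformed[1] + 0.5);
+//	izi = (int)(zi * 0x8000);	// matches the z-buffer's scale
+//	pix = izi >> d_pix_shift;	// clamped to [d_pix_min, d_pix_max]
+//
+// Every covered pixel then does
+//
+//	if (pz[i] <= izi)
+//	{
+//		pz[i] = izi;
+//		pdest[i] = p->color;
+//	}
+//
+// with the 1x1 through 4x4 cases unrolled below and anything larger
+// handled by the generic loop at LDefault.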
+//---------------------------------------------------------------------- + + .text + +#define P 12+4 + + .align 4 +.globl C(D_DrawParticle) +C(D_DrawParticle): + pushl %ebp // preserve caller's stack frame + pushl %edi // preserve register variables + pushl %ebx + + movl P(%esp),%edi + +// FIXME: better FP overlap in general here + +// transform point +// VectorSubtract (p->org, r_origin, local); + flds C(r_origin) + fsubrs pt_org(%edi) + flds pt_org+4(%edi) + fsubs C(r_origin)+4 + flds pt_org+8(%edi) + fsubs C(r_origin)+8 + fxch %st(2) // local[0] | local[1] | local[2] + +// transformed[2] = DotProduct(local, r_ppn); + flds C(r_ppn) // r_ppn[0] | local[0] | local[1] | local[2] + fmul %st(1),%st(0) // dot0 | local[0] | local[1] | local[2] + flds C(r_ppn)+4 // r_ppn[1] | dot0 | local[0] | local[1] | local[2] + fmul %st(3),%st(0) // dot1 | dot0 | local[0] | local[1] | local[2] + flds C(r_ppn)+8 // r_ppn[2] | dot1 | dot0 | local[0] | + // local[1] | local[2] + fmul %st(5),%st(0) // dot2 | dot1 | dot0 | local[0] | local[1] | local[2] + fxch %st(2) // dot0 | dot1 | dot2 | local[0] | local[1] | local[2] + faddp %st(0),%st(1) // dot0 + dot1 | dot2 | local[0] | local[1] | + // local[2] + faddp %st(0),%st(1) // z | local[0] | local[1] | local[2] + fld %st(0) // z | z | local[0] | local[1] | + // local[2] + fdivrs float_1 // 1/z | z | local[0] | local[1] | local[2] + fxch %st(1) // z | 1/z | local[0] | local[1] | local[2] + +// if (transformed[2] < PARTICLE_Z_CLIP) +// return; + fcomps float_particle_z_clip // 1/z | local[0] | local[1] | local[2] + fxch %st(3) // local[2] | local[0] | local[1] | 1/z + + flds C(r_pup) // r_pup[0] | local[2] | local[0] | local[1] | 1/z + fmul %st(2),%st(0) // dot0 | local[2] | local[0] | local[1] | 1/z + flds C(r_pup)+4 // r_pup[1] | dot0 | local[2] | local[0] | + // local[1] | 1/z + + fnstsw %ax + testb $1,%ah + jnz LPop6AndDone + +// transformed[1] = DotProduct(local, r_pup); + fmul %st(4),%st(0) // dot1 | dot0 | local[2] | local[0] | local[1] | 1/z + flds C(r_pup)+8 // r_pup[2] | dot1 | dot0 | local[2] | + // local[0] | local[1] | 1/z + fmul %st(3),%st(0) // dot2 | dot1 | dot0 | local[2] | local[0] | + // local[1] | 1/z + fxch %st(2) // dot0 | dot1 | dot2 | local[2] | local[0] | + // local[1] | 1/z + faddp %st(0),%st(1) // dot0 + dot1 | dot2 | local[2] | local[0] | + // local[1] | 1/z + faddp %st(0),%st(1) // y | local[2] | local[0] | local[1] | 1/z + fxch %st(3) // local[1] | local[2] | local[0] | y | 1/z + +// transformed[0] = DotProduct(local, r_pright); + fmuls C(r_pright)+4 // dot1 | local[2] | local[0] | y | 1/z + fxch %st(2) // local[0] | local[2] | dot1 | y | 1/z + fmuls C(r_pright) // dot0 | local[2] | dot1 | y | 1/z + fxch %st(1) // local[2] | dot0 | dot1 | y | 1/z + fmuls C(r_pright)+8 // dot2 | dot0 | dot1 | y | 1/z + fxch %st(2) // dot1 | dot0 | dot2 | y | 1/z + faddp %st(0),%st(1) // dot1 + dot0 | dot2 | y | 1/z + + faddp %st(0),%st(1) // x | y | 1/z + fxch %st(1) // y | x | 1/z + +// project the point + fmul %st(2),%st(0) // y/z | x | 1/z + fxch %st(1) // x | y/z | 1/z + fmul %st(2),%st(0) // x/z | y/z | 1/z + fxch %st(1) // y/z | x/z | 1/z + fsubrs C(ycenter) // v | x/z | 1/z + fxch %st(1) // x/z | v | 1/z + fadds C(xcenter) // u | v | 1/z +// FIXME: preadjust xcenter and ycenter + fxch %st(1) // v | u | 1/z + fadds float_point5 // v | u | 1/z + fxch %st(1) // u | v | 1/z + fadds float_point5 // u | v | 1/z + fxch %st(2) // 1/z | v | u + fmuls DP_32768 // 1/z * 0x8000 | v | u + fxch %st(2) // u | v | 1/z * 0x8000 + +// FIXME: use Terje's fp->int 
trick here? +// FIXME: check we're getting proper rounding here + fistpl DP_u // v | 1/z * 0x8000 + fistpl DP_v // 1/z * 0x8000 + + movl DP_u,%eax + movl DP_v,%edx + +// if ((v > d_vrectbottom_particle) || +// (u > d_vrectright_particle) || +// (v < d_vrecty) || +// (u < d_vrectx)) +// { +// continue; +// } + + movl C(d_vrectbottom_particle),%ebx + movl C(d_vrectright_particle),%ecx + cmpl %ebx,%edx + jg LPop1AndDone + cmpl %ecx,%eax + jg LPop1AndDone + movl C(d_vrecty),%ebx + movl C(d_vrectx),%ecx + cmpl %ebx,%edx + jl LPop1AndDone + + cmpl %ecx,%eax + jl LPop1AndDone + + flds pt_color(%edi) // color | 1/z * 0x8000 +// FIXME: use Terje's fast fp->int trick? + fistpl DP_Color // 1/z * 0x8000 + + movl C(d_viewbuffer),%ebx + + addl %eax,%ebx + movl C(d_scantable)(,%edx,4),%edi // point to the pixel + + imull C(d_zrowbytes),%edx // point to the z pixel + + leal (%edx,%eax,2),%edx + movl C(d_pzbuffer),%eax + + fistpl izi + + addl %ebx,%edi + addl %eax,%edx + +// pix = izi >> d_pix_shift; + + movl izi,%eax + movl C(d_pix_shift),%ecx + shrl %cl,%eax + movl izi,%ebp + +// if (pix < d_pix_min) +// pix = d_pix_min; +// else if (pix > d_pix_max) +// pix = d_pix_max; + + movl C(d_pix_min),%ebx + movl C(d_pix_max),%ecx + cmpl %ebx,%eax + jnl LTestPixMax + movl %ebx,%eax + jmp LTestDone + +LTestPixMax: + cmpl %ecx,%eax + jng LTestDone + movl %ecx,%eax +LTestDone: + + movb DP_Color,%ch + + movl C(d_y_aspect_shift),%ebx + testl %ebx,%ebx + jnz LDefault + + cmpl $4,%eax + ja LDefault + + jmp DP_EntryTable-4(,%eax,4) + +// 1x1 +.globl DP_1x1 +DP_1x1: + cmpw %bp,(%edx) // just one pixel to do + jg LDone + movw %bp,(%edx) + movb %ch,(%edi) + jmp LDone + +// 2x2 +.globl DP_2x2 +DP_2x2: + pushl %esi + movl C(screenwidth),%ebx + movl C(d_zrowbytes),%esi + + cmpw %bp,(%edx) + jg L2x2_1 + movw %bp,(%edx) + movb %ch,(%edi) +L2x2_1: + cmpw %bp,2(%edx) + jg L2x2_2 + movw %bp,2(%edx) + movb %ch,1(%edi) +L2x2_2: + cmpw %bp,(%edx,%esi,1) + jg L2x2_3 + movw %bp,(%edx,%esi,1) + movb %ch,(%edi,%ebx,1) +L2x2_3: + cmpw %bp,2(%edx,%esi,1) + jg L2x2_4 + movw %bp,2(%edx,%esi,1) + movb %ch,1(%edi,%ebx,1) +L2x2_4: + + popl %esi + jmp LDone + +// 3x3 +.globl DP_3x3 +DP_3x3: + pushl %esi + movl C(screenwidth),%ebx + movl C(d_zrowbytes),%esi + + cmpw %bp,(%edx) + jg L3x3_1 + movw %bp,(%edx) + movb %ch,(%edi) +L3x3_1: + cmpw %bp,2(%edx) + jg L3x3_2 + movw %bp,2(%edx) + movb %ch,1(%edi) +L3x3_2: + cmpw %bp,4(%edx) + jg L3x3_3 + movw %bp,4(%edx) + movb %ch,2(%edi) +L3x3_3: + + cmpw %bp,(%edx,%esi,1) + jg L3x3_4 + movw %bp,(%edx,%esi,1) + movb %ch,(%edi,%ebx,1) +L3x3_4: + cmpw %bp,2(%edx,%esi,1) + jg L3x3_5 + movw %bp,2(%edx,%esi,1) + movb %ch,1(%edi,%ebx,1) +L3x3_5: + cmpw %bp,4(%edx,%esi,1) + jg L3x3_6 + movw %bp,4(%edx,%esi,1) + movb %ch,2(%edi,%ebx,1) +L3x3_6: + + cmpw %bp,(%edx,%esi,2) + jg L3x3_7 + movw %bp,(%edx,%esi,2) + movb %ch,(%edi,%ebx,2) +L3x3_7: + cmpw %bp,2(%edx,%esi,2) + jg L3x3_8 + movw %bp,2(%edx,%esi,2) + movb %ch,1(%edi,%ebx,2) +L3x3_8: + cmpw %bp,4(%edx,%esi,2) + jg L3x3_9 + movw %bp,4(%edx,%esi,2) + movb %ch,2(%edi,%ebx,2) +L3x3_9: + + popl %esi + jmp LDone + + +// 4x4 +.globl DP_4x4 +DP_4x4: + pushl %esi + movl C(screenwidth),%ebx + movl C(d_zrowbytes),%esi + + cmpw %bp,(%edx) + jg L4x4_1 + movw %bp,(%edx) + movb %ch,(%edi) +L4x4_1: + cmpw %bp,2(%edx) + jg L4x4_2 + movw %bp,2(%edx) + movb %ch,1(%edi) +L4x4_2: + cmpw %bp,4(%edx) + jg L4x4_3 + movw %bp,4(%edx) + movb %ch,2(%edi) +L4x4_3: + cmpw %bp,6(%edx) + jg L4x4_4 + movw %bp,6(%edx) + movb %ch,3(%edi) +L4x4_4: + + cmpw %bp,(%edx,%esi,1) + jg L4x4_5 + movw 
%bp,(%edx,%esi,1) + movb %ch,(%edi,%ebx,1) +L4x4_5: + cmpw %bp,2(%edx,%esi,1) + jg L4x4_6 + movw %bp,2(%edx,%esi,1) + movb %ch,1(%edi,%ebx,1) +L4x4_6: + cmpw %bp,4(%edx,%esi,1) + jg L4x4_7 + movw %bp,4(%edx,%esi,1) + movb %ch,2(%edi,%ebx,1) +L4x4_7: + cmpw %bp,6(%edx,%esi,1) + jg L4x4_8 + movw %bp,6(%edx,%esi,1) + movb %ch,3(%edi,%ebx,1) +L4x4_8: + + leal (%edx,%esi,2),%edx + leal (%edi,%ebx,2),%edi + + cmpw %bp,(%edx) + jg L4x4_9 + movw %bp,(%edx) + movb %ch,(%edi) +L4x4_9: + cmpw %bp,2(%edx) + jg L4x4_10 + movw %bp,2(%edx) + movb %ch,1(%edi) +L4x4_10: + cmpw %bp,4(%edx) + jg L4x4_11 + movw %bp,4(%edx) + movb %ch,2(%edi) +L4x4_11: + cmpw %bp,6(%edx) + jg L4x4_12 + movw %bp,6(%edx) + movb %ch,3(%edi) +L4x4_12: + + cmpw %bp,(%edx,%esi,1) + jg L4x4_13 + movw %bp,(%edx,%esi,1) + movb %ch,(%edi,%ebx,1) +L4x4_13: + cmpw %bp,2(%edx,%esi,1) + jg L4x4_14 + movw %bp,2(%edx,%esi,1) + movb %ch,1(%edi,%ebx,1) +L4x4_14: + cmpw %bp,4(%edx,%esi,1) + jg L4x4_15 + movw %bp,4(%edx,%esi,1) + movb %ch,2(%edi,%ebx,1) +L4x4_15: + cmpw %bp,6(%edx,%esi,1) + jg L4x4_16 + movw %bp,6(%edx,%esi,1) + movb %ch,3(%edi,%ebx,1) +L4x4_16: + + popl %esi + jmp LDone + +// default case, handling any size particle +LDefault: + +// count = pix << d_y_aspect_shift; + + movl %eax,%ebx + movl %eax,DP_Pix + movb C(d_y_aspect_shift),%cl + shll %cl,%ebx + +// for ( ; count ; count--, pz += d_zwidth, pdest += screenwidth) +// { +// for (i=0 ; i> 16) + +// (r_sstepx >> 16); + + movl C(r_sstepx),%eax + movl C(r_tstepx),%edx + shll $16,%eax + shll $16,%edx + movl %eax,C(a_sstepxfrac) + movl %edx,C(a_tstepxfrac) + + movl C(r_sstepx),%ecx + movl C(r_tstepx),%eax + sarl $16,%ecx + sarl $16,%eax + imull skinwidth(%esp) + addl %ecx,%eax + movl %eax,C(a_ststepxwhole) + + ret + + +//---------------------------------------------------------------------- +// recursive subdivision affine triangle drawing code +// +// not C-callable because of stdcall return +//---------------------------------------------------------------------- + +#define lp1 4+16 +#define lp2 8+16 +#define lp3 12+16 + +.globl C(D_PolysetRecursiveTriangle) +C(D_PolysetRecursiveTriangle): + pushl %ebp // preserve caller stack frame pointer + pushl %esi // preserve register variables + pushl %edi + pushl %ebx + +// int *temp; +// int d; +// int new[6]; +// int i; +// int z; +// short *zbuf; + movl lp2(%esp),%esi + movl lp1(%esp),%ebx + movl lp3(%esp),%edi + +// d = lp2[0] - lp1[0]; +// if (d < -1 || d > 1) +// goto split; + movl 0(%esi),%eax + + movl 0(%ebx),%edx + movl 4(%esi),%ebp + + subl %edx,%eax + movl 4(%ebx),%ecx + + subl %ecx,%ebp + incl %eax + + cmpl $2,%eax + ja LSplit + +// d = lp2[1] - lp1[1]; +// if (d < -1 || d > 1) +// goto split; + movl 0(%edi),%eax + incl %ebp + + cmpl $2,%ebp + ja LSplit + +// d = lp3[0] - lp2[0]; +// if (d < -1 || d > 1) +// goto split2; + movl 0(%esi),%edx + movl 4(%edi),%ebp + + subl %edx,%eax + movl 4(%esi),%ecx + + subl %ecx,%ebp + incl %eax + + cmpl $2,%eax + ja LSplit2 + +// d = lp3[1] - lp2[1]; +// if (d < -1 || d > 1) +// goto split2; + movl 0(%ebx),%eax + incl %ebp + + cmpl $2,%ebp + ja LSplit2 + +// d = lp1[0] - lp3[0]; +// if (d < -1 || d > 1) +// goto split3; + movl 0(%edi),%edx + movl 4(%ebx),%ebp + + subl %edx,%eax + movl 4(%edi),%ecx + + subl %ecx,%ebp + incl %eax + + incl %ebp + movl %ebx,%edx + + cmpl $2,%eax + ja LSplit3 + +// d = lp1[1] - lp3[1]; +// if (d < -1 || d > 1) +// { +//split3: +// temp = lp1; +// lp3 = lp2; +// lp1 = lp3; +// lp2 = temp; +// goto split; +// } +// +// return; // entire tri is filled +// + cmpl 
$2,%ebp + jna LDone + +LSplit3: + movl %edi,%ebx + movl %esi,%edi + movl %edx,%esi + jmp LSplit + +//split2: +LSplit2: + +// temp = lp1; +// lp1 = lp2; +// lp2 = lp3; +// lp3 = temp; + movl %ebx,%eax + movl %esi,%ebx + movl %edi,%esi + movl %eax,%edi + +//split: +LSplit: + + subl $24,%esp // allocate space for a new vertex + +//// split this edge +// new[0] = (lp1[0] + lp2[0]) >> 1; +// new[1] = (lp1[1] + lp2[1]) >> 1; +// new[2] = (lp1[2] + lp2[2]) >> 1; +// new[3] = (lp1[3] + lp2[3]) >> 1; +// new[5] = (lp1[5] + lp2[5]) >> 1; + movl 8(%ebx),%eax + + movl 8(%esi),%edx + movl 12(%ebx),%ecx + + addl %edx,%eax + movl 12(%esi),%edx + + sarl $1,%eax + addl %edx,%ecx + + movl %eax,8(%esp) + movl 20(%ebx),%eax + + sarl $1,%ecx + movl 20(%esi),%edx + + movl %ecx,12(%esp) + addl %edx,%eax + + movl 0(%ebx),%ecx + movl 0(%esi),%edx + + sarl $1,%eax + addl %ecx,%edx + + movl %eax,20(%esp) + movl 4(%ebx),%eax + + sarl $1,%edx + movl 4(%esi),%ebp + + movl %edx,0(%esp) + addl %eax,%ebp + + sarl $1,%ebp + movl %ebp,4(%esp) + +//// draw the point if splitting a leading edge +// if (lp2[1] > lp1[1]) +// goto nodraw; + cmpl %eax,4(%esi) + jg LNoDraw + +// if ((lp2[1] == lp1[1]) && (lp2[0] < lp1[0])) +// goto nodraw; + movl 0(%esi),%edx + jnz LDraw + + cmpl %ecx,%edx + jl LNoDraw + +LDraw: + +// z = new[5] >> 16; + movl 20(%esp),%edx + movl 4(%esp),%ecx + + sarl $16,%edx + movl 0(%esp),%ebp + +// zbuf = zspantable[new[1]] + new[0]; + movl C(zspantable)(,%ecx,4),%eax + +// if (z >= *zbuf) +// { + cmpw (%eax,%ebp,2),%dx + jnge LNoDraw + +// int pix; +// +// *zbuf = z; + movw %dx,(%eax,%ebp,2) + +// pix = d_pcolormap[skintable[new[3]>>16][new[2]>>16]]; + movl 12(%esp),%eax + + sarl $16,%eax + movl 8(%esp),%edx + + sarl $16,%edx + subl %ecx,%ecx + + movl C(skintable)(,%eax,4),%eax + movl 4(%esp),%ebp + + movb (%eax,%edx,),%cl + movl C(d_pcolormap),%edx + + movb (%edx,%ecx,),%dl + movl 0(%esp),%ecx + +// d_viewbuffer[d_scantable[new[1]] + new[0]] = pix; + movl C(d_scantable)(,%ebp,4),%eax + addl %eax,%ecx + movl C(d_viewbuffer),%eax + movb %dl,(%eax,%ecx,1) + +// } +// +//nodraw: +LNoDraw: + +//// recursively continue +// D_PolysetRecursiveTriangle (lp3, lp1, new); + pushl %esp + pushl %ebx + pushl %edi + call C(D_PolysetRecursiveTriangle) + +// D_PolysetRecursiveTriangle (lp3, new, lp2); + movl %esp,%ebx + pushl %esi + pushl %ebx + pushl %edi + call C(D_PolysetRecursiveTriangle) + addl $24,%esp + +LDone: + popl %ebx // restore register variables + popl %edi + popl %esi + popl %ebp // restore caller stack frame pointer + ret $12 + + +//---------------------------------------------------------------------- +// 8-bpp horizontal span drawing code for affine polygons, with smooth +// shading and no transparency +//---------------------------------------------------------------------- + +#define pspans 4+8 + +.globl C(D_PolysetAff8Start) +C(D_PolysetAff8Start): + +.globl C(D_PolysetDrawSpans8) +C(D_PolysetDrawSpans8): + pushl %esi // preserve register variables + pushl %ebx + + movl pspans(%esp),%esi // point to the first span descriptor + movl C(r_zistepx),%ecx + + pushl %ebp // preserve caller's stack frame + pushl %edi + + rorl $16,%ecx // put high 16 bits of 1/z step in low word + movl spanpackage_t_count(%esi),%edx + + movl %ecx,lzistepx + +LSpanLoop: + +// lcount = d_aspancount - pspanpackage->count; +// +// errorterm += erroradjustup; +// if (errorterm >= 0) +// { +// d_aspancount += d_countextrastep; +// errorterm -= erroradjustdown; +// } +// else +// { +// d_aspancount += ubasestep; +// } + movl 
C(d_aspancount),%eax + subl %edx,%eax + + movl C(erroradjustup),%edx + movl C(errorterm),%ebx + addl %edx,%ebx + js LNoTurnover + + movl C(erroradjustdown),%edx + movl C(d_countextrastep),%edi + subl %edx,%ebx + movl C(d_aspancount),%ebp + movl %ebx,C(errorterm) + addl %edi,%ebp + movl %ebp,C(d_aspancount) + jmp LRightEdgeStepped + +LNoTurnover: + movl C(d_aspancount),%edi + movl C(ubasestep),%edx + movl %ebx,C(errorterm) + addl %edx,%edi + movl %edi,C(d_aspancount) + +LRightEdgeStepped: + cmpl $1,%eax + + jl LNextSpan + jz LExactlyOneLong + +// +// set up advancetable +// + movl C(a_ststepxwhole),%ecx + movl C(r_affinetridesc)+atd_skinwidth,%edx + + movl %ecx,advancetable+4 // advance base in t + addl %edx,%ecx + + movl %ecx,advancetable // advance extra in t + movl C(a_tstepxfrac),%ecx + + movw C(r_lstepx),%cx + movl %eax,%edx // count + + movl %ecx,tstep + addl $7,%edx + + shrl $3,%edx // count of full and partial loops + movl spanpackage_t_sfrac(%esi),%ebx + + movw %dx,%bx + movl spanpackage_t_pz(%esi),%ecx + + negl %eax + + movl spanpackage_t_pdest(%esi),%edi + andl $7,%eax // 0->0, 1->7, 2->6, ... , 7->1 + + subl %eax,%edi // compensate for hardwired offsets + subl %eax,%ecx + + subl %eax,%ecx + movl spanpackage_t_tfrac(%esi),%edx + + movw spanpackage_t_light(%esi),%dx + movl spanpackage_t_zi(%esi),%ebp + + rorl $16,%ebp // put high 16 bits of 1/z in low word + pushl %esi + + movl spanpackage_t_ptex(%esi),%esi + jmp aff8entryvec_table(,%eax,4) + +// %bx = count of full and partial loops +// %ebx high word = sfrac +// %ecx = pz +// %dx = light +// %edx high word = tfrac +// %esi = ptex +// %edi = pdest +// %ebp = 1/z +// tstep low word = C(r_lstepx) +// tstep high word = C(a_tstepxfrac) +// C(a_sstepxfrac) low word = 0 +// C(a_sstepxfrac) high word = C(a_sstepxfrac) + +LDrawLoop: + +// FIXME: do we need to clamp light? 
We may need at least a buffer bit to +// keep it from poking into tfrac and causing problems + +LDraw8: + cmpw (%ecx),%bp + jl Lp1 + xorl %eax,%eax + movb %dh,%ah + movb (%esi),%al + movw %bp,(%ecx) + movb 0x12345678(%eax),%al +LPatch8: + movb %al,(%edi) +Lp1: + addl tstep,%edx + sbbl %eax,%eax + addl lzistepx,%ebp + adcl $0,%ebp + addl C(a_sstepxfrac),%ebx + adcl advancetable+4(,%eax,4),%esi + +LDraw7: + cmpw 2(%ecx),%bp + jl Lp2 + xorl %eax,%eax + movb %dh,%ah + movb (%esi),%al + movw %bp,2(%ecx) + movb 0x12345678(%eax),%al +LPatch7: + movb %al,1(%edi) +Lp2: + addl tstep,%edx + sbbl %eax,%eax + addl lzistepx,%ebp + adcl $0,%ebp + addl C(a_sstepxfrac),%ebx + adcl advancetable+4(,%eax,4),%esi + +LDraw6: + cmpw 4(%ecx),%bp + jl Lp3 + xorl %eax,%eax + movb %dh,%ah + movb (%esi),%al + movw %bp,4(%ecx) + movb 0x12345678(%eax),%al +LPatch6: + movb %al,2(%edi) +Lp3: + addl tstep,%edx + sbbl %eax,%eax + addl lzistepx,%ebp + adcl $0,%ebp + addl C(a_sstepxfrac),%ebx + adcl advancetable+4(,%eax,4),%esi + +LDraw5: + cmpw 6(%ecx),%bp + jl Lp4 + xorl %eax,%eax + movb %dh,%ah + movb (%esi),%al + movw %bp,6(%ecx) + movb 0x12345678(%eax),%al +LPatch5: + movb %al,3(%edi) +Lp4: + addl tstep,%edx + sbbl %eax,%eax + addl lzistepx,%ebp + adcl $0,%ebp + addl C(a_sstepxfrac),%ebx + adcl advancetable+4(,%eax,4),%esi + +LDraw4: + cmpw 8(%ecx),%bp + jl Lp5 + xorl %eax,%eax + movb %dh,%ah + movb (%esi),%al + movw %bp,8(%ecx) + movb 0x12345678(%eax),%al +LPatch4: + movb %al,4(%edi) +Lp5: + addl tstep,%edx + sbbl %eax,%eax + addl lzistepx,%ebp + adcl $0,%ebp + addl C(a_sstepxfrac),%ebx + adcl advancetable+4(,%eax,4),%esi + +LDraw3: + cmpw 10(%ecx),%bp + jl Lp6 + xorl %eax,%eax + movb %dh,%ah + movb (%esi),%al + movw %bp,10(%ecx) + movb 0x12345678(%eax),%al +LPatch3: + movb %al,5(%edi) +Lp6: + addl tstep,%edx + sbbl %eax,%eax + addl lzistepx,%ebp + adcl $0,%ebp + addl C(a_sstepxfrac),%ebx + adcl advancetable+4(,%eax,4),%esi + +LDraw2: + cmpw 12(%ecx),%bp + jl Lp7 + xorl %eax,%eax + movb %dh,%ah + movb (%esi),%al + movw %bp,12(%ecx) + movb 0x12345678(%eax),%al +LPatch2: + movb %al,6(%edi) +Lp7: + addl tstep,%edx + sbbl %eax,%eax + addl lzistepx,%ebp + adcl $0,%ebp + addl C(a_sstepxfrac),%ebx + adcl advancetable+4(,%eax,4),%esi + +LDraw1: + cmpw 14(%ecx),%bp + jl Lp8 + xorl %eax,%eax + movb %dh,%ah + movb (%esi),%al + movw %bp,14(%ecx) + movb 0x12345678(%eax),%al +LPatch1: + movb %al,7(%edi) +Lp8: + addl tstep,%edx + sbbl %eax,%eax + addl lzistepx,%ebp + adcl $0,%ebp + addl C(a_sstepxfrac),%ebx + adcl advancetable+4(,%eax,4),%esi + + addl $8,%edi + addl $16,%ecx + + decw %bx + jnz LDrawLoop + + popl %esi // restore spans pointer +LNextSpan: + addl $(spanpackage_t_size),%esi // point to next span +LNextSpanESISet: + movl spanpackage_t_count(%esi),%edx + cmpl $-999999,%edx // any more spans? 
+ jnz LSpanLoop // yes + + popl %edi + popl %ebp // restore the caller's stack frame + popl %ebx // restore register variables + popl %esi + ret + + +// draw a one-long span + +LExactlyOneLong: + + movl spanpackage_t_pz(%esi),%ecx + movl spanpackage_t_zi(%esi),%ebp + + rorl $16,%ebp // put high 16 bits of 1/z in low word + movl spanpackage_t_ptex(%esi),%ebx + + cmpw (%ecx),%bp + jl LNextSpan + xorl %eax,%eax + movl spanpackage_t_pdest(%esi),%edi + movb spanpackage_t_light+1(%esi),%ah + addl $(spanpackage_t_size),%esi // point to next span + movb (%ebx),%al + movw %bp,(%ecx) + movb 0x12345678(%eax),%al +LPatch9: + movb %al,(%edi) + + jmp LNextSpanESISet + +.globl C(D_PolysetAff8End) +C(D_PolysetAff8End): + + +#define pcolormap 4 + +.globl C(D_Aff8Patch) +C(D_Aff8Patch): + movl pcolormap(%esp),%eax + movl %eax,LPatch1-4 + movl %eax,LPatch2-4 + movl %eax,LPatch3-4 + movl %eax,LPatch4-4 + movl %eax,LPatch5-4 + movl %eax,LPatch6-4 + movl %eax,LPatch7-4 + movl %eax,LPatch8-4 + movl %eax,LPatch9-4 + + ret + + +//---------------------------------------------------------------------- +// Alias model polygon dispatching code, combined with subdivided affine +// triangle drawing code +//---------------------------------------------------------------------- + +.globl C(D_PolysetDraw) +C(D_PolysetDraw): + +// spanpackage_t spans[DPS_MAXSPANS + 1 + +// ((CACHE_SIZE - 1) / sizeof(spanpackage_t)) + 1]; +// // one extra because of cache line pretouching +// +// a_spans = (spanpackage_t *) +// (((long)&spans[0] + CACHE_SIZE - 1) & ~(CACHE_SIZE - 1)); + subl $(SPAN_SIZE),%esp + movl %esp,%eax + addl $(CACHE_SIZE - 1),%eax + andl $(~(CACHE_SIZE - 1)),%eax + movl %eax,C(a_spans) + +// if (r_affinetridesc.drawtype) +// D_DrawSubdiv (); +// else +// D_DrawNonSubdiv (); + movl C(r_affinetridesc)+atd_drawtype,%eax + testl %eax,%eax + jz C(D_DrawNonSubdiv) + + pushl %ebp // preserve caller stack frame pointer + +// lnumtriangles = r_affinetridesc.numtriangles; + movl C(r_affinetridesc)+atd_numtriangles,%ebp + + pushl %esi // preserve register variables + shll $4,%ebp + + pushl %ebx +// ptri = r_affinetridesc.ptriangles; + movl C(r_affinetridesc)+atd_ptriangles,%ebx + + pushl %edi + +// mtriangle_t *ptri; +// finalvert_t *pfv, *index0, *index1, *index2; +// int i; +// int lnumtriangles; +// int s0, s1, s2; + +// pfv = r_affinetridesc.pfinalverts; + movl C(r_affinetridesc)+atd_pfinalverts,%edi + +// for (i=0 ; iv[1]-index1->v[1]) * +// (index0->v[0]-index2->v[0]) - +// (index0->v[0]-index1->v[0])*(index0->v[1]-index2->v[1])) >= 0) +// { +// continue; +// } +// +// d_pcolormap = &((byte *)acolormap)[index0->v[4] & 0xFF00]; + fildl fv_v+4(%ecx) // i0v1 + fildl fv_v+4(%esi) // i1v1 | i0v1 + fildl fv_v+0(%ecx) // i0v0 | i1v1 | i0v1 + fildl fv_v+0(%edx) // i2v0 | i0v0 | i1v1 | i0v1 + fxch %st(2) // i1v1 | i0v0 | i2v0 | i0v1 + fsubr %st(3),%st(0) // i0v1-i1v1 | i0v0 | i2v0 | i0v1 + fildl fv_v+0(%esi) // i1v0 | i0v1-i1v1 | i0v0 | i2v0 | i0v1 + fxch %st(2) // i0v0 | i0v1-i1v1 | i1v0 | i2v0 | i0v1 + fsub %st(0),%st(3) // i0v0 | i0v1-i1v1 | i1v0 | i0v0-i2v0 | i0v1 + fildl fv_v+4(%edx) // i2v1 | i0v0 | i0v1-i1v1 | i1v0 | i0v0-i2v0| i0v1 + fxch %st(1) // i0v0 | i2v1 | i0v1-i1v1 | i1v0 | i0v0-i2v0| i0v1 + fsubp %st(0),%st(3) // i2v1 | i0v1-i1v1 | i0v0-i1v0 | i0v0-i2v0 | i0v1 + fxch %st(1) // i0v1-i1v1 | i2v1 | i0v0-i1v0 | i0v0-i2v0 | i0v1 + fmulp %st(0),%st(3) // i2v1 | i0v0-i1v0 | i0v1-i1v1*i0v0-i2v0 | i0v1 + fsubrp %st(0),%st(3) // i0v0-i1v0 | i0v1-i1v1*i0v0-i2v0 | i0v1-i2v1 + movl fv_v+16(%ecx),%eax + andl $0xFF00,%eax + fmulp 
%st(0),%st(2) // i0v1-i1v1*i0v0-i2v0 | i0v0-i1v0*i0v1-i2v1 + addl C(acolormap),%eax + fsubp %st(0),%st(1) // (i0v1-i1v1)*(i0v0-i2v0)-(i0v0-i1v0)*(i0v1-i2v1) + movl %eax,C(d_pcolormap) + fstps Ltemp + movl Ltemp,%eax + subl $0x80000001,%eax + jc Lskip + +// if (ptri[i].facesfront) +// { +// D_PolysetRecursiveTriangle(index0->v, index1->v, index2->v); + movl mtri_facesfront-16(%ebx,%ebp,),%eax + testl %eax,%eax + jz Lfacesback + + pushl %edx + pushl %esi + pushl %ecx + call C(D_PolysetRecursiveTriangle) + + subl $16,%ebp + jnz Llooptop + jmp Ldone2 + +// } +// else +// { +Lfacesback: + +// s0 = index0->v[2]; +// s1 = index1->v[2]; +// s2 = index2->v[2]; + movl fv_v+8(%ecx),%eax + pushl %eax + movl fv_v+8(%esi),%eax + pushl %eax + movl fv_v+8(%edx),%eax + pushl %eax + pushl %ecx + pushl %edx + +// if (index0->flags & ALIAS_ONSEAM) +// index0->v[2] += r_affinetridesc.seamfixupX16; + movl C(r_affinetridesc)+atd_seamfixupX16,%eax + testl $(ALIAS_ONSEAM),fv_flags(%ecx) + jz Lp11 + addl %eax,fv_v+8(%ecx) +Lp11: + +// if (index1->flags & ALIAS_ONSEAM) +// index1->v[2] += r_affinetridesc.seamfixupX16; + testl $(ALIAS_ONSEAM),fv_flags(%esi) + jz Lp12 + addl %eax,fv_v+8(%esi) +Lp12: + +// if (index2->flags & ALIAS_ONSEAM) +// index2->v[2] += r_affinetridesc.seamfixupX16; + testl $(ALIAS_ONSEAM),fv_flags(%edx) + jz Lp13 + addl %eax,fv_v+8(%edx) +Lp13: + +// D_PolysetRecursiveTriangle(index0->v, index1->v, index2->v); + pushl %edx + pushl %esi + pushl %ecx + call C(D_PolysetRecursiveTriangle) + +// index0->v[2] = s0; +// index1->v[2] = s1; +// index2->v[2] = s2; + popl %edx + popl %ecx + popl %eax + movl %eax,fv_v+8(%edx) + popl %eax + movl %eax,fv_v+8(%esi) + popl %eax + movl %eax,fv_v+8(%ecx) + +// } +// } +Lskip: + subl $16,%ebp + jnz Llooptop + +Ldone2: + popl %edi // restore the caller's stack frame + popl %ebx + popl %esi // restore register variables + popl %ebp + + addl $(SPAN_SIZE),%esp + + ret + + +//---------------------------------------------------------------------- +// Alias model triangle left-edge scanning code +//---------------------------------------------------------------------- + +#define height 4+16 + +.globl C(D_PolysetScanLeftEdge) +C(D_PolysetScanLeftEdge): + pushl %ebp // preserve caller stack frame pointer + pushl %esi // preserve register variables + pushl %edi + pushl %ebx + + movl height(%esp),%eax + movl C(d_sfrac),%ecx + andl $0xFFFF,%eax + movl C(d_ptex),%ebx + orl %eax,%ecx + movl C(d_pedgespanpackage),%esi + movl C(d_tfrac),%edx + movl C(d_light),%edi + movl C(d_zi),%ebp + +// %eax: scratch +// %ebx: d_ptex +// %ecx: d_sfrac in high word, count in low word +// %edx: d_tfrac +// %esi: d_pedgespanpackage, errorterm, scratch alternately +// %edi: d_light +// %ebp: d_zi + +// do +// { + +LScanLoop: + +// d_pedgespanpackage->ptex = ptex; +// d_pedgespanpackage->pdest = d_pdest; +// d_pedgespanpackage->pz = d_pz; +// d_pedgespanpackage->count = d_aspancount; +// d_pedgespanpackage->light = d_light; +// d_pedgespanpackage->zi = d_zi; +// d_pedgespanpackage->sfrac = d_sfrac << 16; +// d_pedgespanpackage->tfrac = d_tfrac << 16; + movl %ebx,spanpackage_t_ptex(%esi) + movl C(d_pdest),%eax + movl %eax,spanpackage_t_pdest(%esi) + movl C(d_pz),%eax + movl %eax,spanpackage_t_pz(%esi) + movl C(d_aspancount),%eax + movl %eax,spanpackage_t_count(%esi) + movl %edi,spanpackage_t_light(%esi) + movl %ebp,spanpackage_t_zi(%esi) + movl %ecx,spanpackage_t_sfrac(%esi) + movl %edx,spanpackage_t_tfrac(%esi) + +// pretouch the next cache line + movb spanpackage_t_size(%esi),%al + +// 
d_pedgespanpackage++; + addl $(spanpackage_t_size),%esi + movl C(erroradjustup),%eax + movl %esi,C(d_pedgespanpackage) + +// errorterm += erroradjustup; + movl C(errorterm),%esi + addl %eax,%esi + movl C(d_pdest),%eax + +// if (errorterm >= 0) +// { + js LNoLeftEdgeTurnover + +// errorterm -= erroradjustdown; +// d_pdest += d_pdestextrastep; + subl C(erroradjustdown),%esi + addl C(d_pdestextrastep),%eax + movl %esi,C(errorterm) + movl %eax,C(d_pdest) + +// d_pz += d_pzextrastep; +// d_aspancount += d_countextrastep; +// d_ptex += d_ptexextrastep; +// d_sfrac += d_sfracextrastep; +// d_ptex += d_sfrac >> 16; +// d_sfrac &= 0xFFFF; +// d_tfrac += d_tfracextrastep; + movl C(d_pz),%eax + movl C(d_aspancount),%esi + addl C(d_pzextrastep),%eax + addl C(d_sfracextrastep),%ecx + adcl C(d_ptexextrastep),%ebx + addl C(d_countextrastep),%esi + movl %eax,C(d_pz) + movl C(d_tfracextrastep),%eax + movl %esi,C(d_aspancount) + addl %eax,%edx + +// if (d_tfrac & 0x10000) +// { + jnc LSkip1 + +// d_ptex += r_affinetridesc.skinwidth; +// d_tfrac &= 0xFFFF; + addl C(r_affinetridesc)+atd_skinwidth,%ebx + +// } + +LSkip1: + +// d_light += d_lightextrastep; +// d_zi += d_ziextrastep; + addl C(d_lightextrastep),%edi + addl C(d_ziextrastep),%ebp + +// } + movl C(d_pedgespanpackage),%esi + decl %ecx + testl $0xFFFF,%ecx + jnz LScanLoop + + popl %ebx + popl %edi + popl %esi + popl %ebp + ret + +// else +// { + +LNoLeftEdgeTurnover: + movl %esi,C(errorterm) + +// d_pdest += d_pdestbasestep; + addl C(d_pdestbasestep),%eax + movl %eax,C(d_pdest) + +// d_pz += d_pzbasestep; +// d_aspancount += ubasestep; +// d_ptex += d_ptexbasestep; +// d_sfrac += d_sfracbasestep; +// d_ptex += d_sfrac >> 16; +// d_sfrac &= 0xFFFF; + movl C(d_pz),%eax + movl C(d_aspancount),%esi + addl C(d_pzbasestep),%eax + addl C(d_sfracbasestep),%ecx + adcl C(d_ptexbasestep),%ebx + addl C(ubasestep),%esi + movl %eax,C(d_pz) + movl %esi,C(d_aspancount) + +// d_tfrac += d_tfracbasestep; + movl C(d_tfracbasestep),%esi + addl %esi,%edx + +// if (d_tfrac & 0x10000) +// { + jnc LSkip2 + +// d_ptex += r_affinetridesc.skinwidth; +// d_tfrac &= 0xFFFF; + addl C(r_affinetridesc)+atd_skinwidth,%ebx + +// } + +LSkip2: + +// d_light += d_lightbasestep; +// d_zi += d_zibasestep; + addl C(d_lightbasestep),%edi + addl C(d_zibasestep),%ebp + +// } +// } while (--height); + movl C(d_pedgespanpackage),%esi + decl %ecx + testl $0xFFFF,%ecx + jnz LScanLoop + + popl %ebx + popl %edi + popl %esi + popl %ebp + ret + + +//---------------------------------------------------------------------- +// Alias model vertex drawing code +//---------------------------------------------------------------------- + +#define fv 4+8 +#define numverts 8+8 + +.globl C(D_PolysetDrawFinalVerts) +C(D_PolysetDrawFinalVerts): + pushl %ebp // preserve caller stack frame pointer + pushl %ebx + +// int i, z; +// short *zbuf; + + movl numverts(%esp),%ecx + movl fv(%esp),%ebx + + pushl %esi // preserve register variables + pushl %edi + +LFVLoop: + +// for (i=0 ; iv[0] < r_refdef.vrectright) && +// (fv->v[1] < r_refdef.vrectbottom)) +// { + movl fv_v+0(%ebx),%eax + movl C(r_refdef)+rd_vrectright,%edx + cmpl %edx,%eax + jge LNextVert + movl fv_v+4(%ebx),%esi + movl C(r_refdef)+rd_vrectbottom,%edx + cmpl %edx,%esi + jge LNextVert + +// zbuf = zspantable[fv->v[1]] + fv->v[0]; + movl C(zspantable)(,%esi,4),%edi + +// z = fv->v[5]>>16; + movl fv_v+20(%ebx),%edx + shrl $16,%edx + +// if (z >= *zbuf) +// { +// int pix; + cmpw (%edi,%eax,2),%dx + jl LNextVert + +// *zbuf = z; + movw %dx,(%edi,%eax,2) + +// pix 
= skintable[fv->v[3]>>16][fv->v[2]>>16]; + movl fv_v+12(%ebx),%edi + shrl $16,%edi + movl C(skintable)(,%edi,4),%edi + movl fv_v+8(%ebx),%edx + shrl $16,%edx + movb (%edi,%edx),%dl + +// pix = ((byte *)acolormap)[pix + (fv->v[4] & 0xFF00)]; + movl fv_v+16(%ebx),%edi + andl $0xFF00,%edi + andl $0x00FF,%edx + addl %edx,%edi + movl C(acolormap),%edx + movb (%edx,%edi,1),%dl + +// d_viewbuffer[d_scantable[fv->v[1]] + fv->v[0]] = pix; + movl C(d_scantable)(,%esi,4),%edi + movl C(d_viewbuffer),%esi + addl %eax,%edi + movb %dl,(%esi,%edi) + +// } +// } +// } +LNextVert: + addl $(fv_size),%ebx + decl %ecx + jnz LFVLoop + + popl %edi + popl %esi + popl %ebx + popl %ebp + ret + + +//---------------------------------------------------------------------- +// Alias model non-subdivided polygon dispatching code +// +// not C-callable because of stack buffer cleanup +//---------------------------------------------------------------------- + +.globl C(D_DrawNonSubdiv) +C(D_DrawNonSubdiv): + pushl %ebp // preserve caller stack frame pointer + movl C(r_affinetridesc)+atd_numtriangles,%ebp + pushl %ebx + shll $(mtri_shift),%ebp + pushl %esi // preserve register variables + movl C(r_affinetridesc)+atd_ptriangles,%esi + pushl %edi + +// mtriangle_t *ptri; +// finalvert_t *pfv, *index0, *index1, *index2; +// int i; +// int lnumtriangles; + +// pfv = r_affinetridesc.pfinalverts; +// ptri = r_affinetridesc.ptriangles; +// lnumtriangles = r_affinetridesc.numtriangles; + +LNDLoop: + +// for (i=0 ; ivertindex[0]; +// index1 = pfv + ptri->vertindex[1]; +// index2 = pfv + ptri->vertindex[2]; + movl C(r_affinetridesc)+atd_pfinalverts,%edi + movl mtri_vertindex+0-mtri_size(%esi,%ebp,1),%ecx + shll $(fv_shift),%ecx + movl mtri_vertindex+4-mtri_size(%esi,%ebp,1),%edx + shll $(fv_shift),%edx + movl mtri_vertindex+8-mtri_size(%esi,%ebp,1),%ebx + shll $(fv_shift),%ebx + addl %edi,%ecx + addl %edi,%edx + addl %edi,%ebx + +// d_xdenom = (index0->v[1]-index1->v[1]) * +// (index0->v[0]-index2->v[0]) - +// (index0->v[0]-index1->v[0])*(index0->v[1]-index2->v[1]); + movl fv_v+4(%ecx),%eax + movl fv_v+0(%ecx),%esi + subl fv_v+4(%edx),%eax + subl fv_v+0(%ebx),%esi + imull %esi,%eax + movl fv_v+0(%ecx),%esi + movl fv_v+4(%ecx),%edi + subl fv_v+0(%edx),%esi + subl fv_v+4(%ebx),%edi + imull %esi,%edi + subl %edi,%eax + +// if (d_xdenom >= 0) +// { +// continue; + jns LNextTri + +// } + + movl %eax,C(d_xdenom) + fildl C(d_xdenom) + +// r_p0[0] = index0->v[0]; // u +// r_p0[1] = index0->v[1]; // v +// r_p0[2] = index0->v[2]; // s +// r_p0[3] = index0->v[3]; // t +// r_p0[4] = index0->v[4]; // light +// r_p0[5] = index0->v[5]; // iz + movl fv_v+0(%ecx),%eax + movl fv_v+4(%ecx),%esi + movl %eax,C(r_p0)+0 + movl %esi,C(r_p0)+4 + movl fv_v+8(%ecx),%eax + movl fv_v+12(%ecx),%esi + movl %eax,C(r_p0)+8 + movl %esi,C(r_p0)+12 + movl fv_v+16(%ecx),%eax + movl fv_v+20(%ecx),%esi + movl %eax,C(r_p0)+16 + movl %esi,C(r_p0)+20 + + fdivrs float_1 + +// r_p1[0] = index1->v[0]; +// r_p1[1] = index1->v[1]; +// r_p1[2] = index1->v[2]; +// r_p1[3] = index1->v[3]; +// r_p1[4] = index1->v[4]; +// r_p1[5] = index1->v[5]; + movl fv_v+0(%edx),%eax + movl fv_v+4(%edx),%esi + movl %eax,C(r_p1)+0 + movl %esi,C(r_p1)+4 + movl fv_v+8(%edx),%eax + movl fv_v+12(%edx),%esi + movl %eax,C(r_p1)+8 + movl %esi,C(r_p1)+12 + movl fv_v+16(%edx),%eax + movl fv_v+20(%edx),%esi + movl %eax,C(r_p1)+16 + movl %esi,C(r_p1)+20 + +// r_p2[0] = index2->v[0]; +// r_p2[1] = index2->v[1]; +// r_p2[2] = index2->v[2]; +// r_p2[3] = index2->v[3]; +// r_p2[4] = index2->v[4]; +// r_p2[5] = 
index2->v[5]; + movl fv_v+0(%ebx),%eax + movl fv_v+4(%ebx),%esi + movl %eax,C(r_p2)+0 + movl %esi,C(r_p2)+4 + movl fv_v+8(%ebx),%eax + movl fv_v+12(%ebx),%esi + movl %eax,C(r_p2)+8 + movl %esi,C(r_p2)+12 + movl fv_v+16(%ebx),%eax + movl fv_v+20(%ebx),%esi + movl %eax,C(r_p2)+16 + movl C(r_affinetridesc)+atd_ptriangles,%edi + movl %esi,C(r_p2)+20 + movl mtri_facesfront-mtri_size(%edi,%ebp,1),%eax + +// if (!ptri->facesfront) +// { + testl %eax,%eax + jnz LFacesFront + +// if (index0->flags & ALIAS_ONSEAM) +// r_p0[2] += r_affinetridesc.seamfixupX16; + movl fv_flags(%ecx),%eax + movl fv_flags(%edx),%esi + movl fv_flags(%ebx),%edi + testl $(ALIAS_ONSEAM),%eax + movl C(r_affinetridesc)+atd_seamfixupX16,%eax + jz LOnseamDone0 + addl %eax,C(r_p0)+8 +LOnseamDone0: + +// if (index1->flags & ALIAS_ONSEAM) +// r_p1[2] += r_affinetridesc.seamfixupX16; + testl $(ALIAS_ONSEAM),%esi + jz LOnseamDone1 + addl %eax,C(r_p1)+8 +LOnseamDone1: + +// if (index2->flags & ALIAS_ONSEAM) +// r_p2[2] += r_affinetridesc.seamfixupX16; + testl $(ALIAS_ONSEAM),%edi + jz LOnseamDone2 + addl %eax,C(r_p2)+8 +LOnseamDone2: + +// } + +LFacesFront: + + fstps C(d_xdenom) + +// D_PolysetSetEdgeTable (); +// D_RasterizeAliasPolySmooth (); + call C(D_PolysetSetEdgeTable) + call C(D_RasterizeAliasPolySmooth) + +LNextTri: + movl C(r_affinetridesc)+atd_ptriangles,%esi + subl $16,%ebp + jnz LNDLoop +// } + + popl %edi + popl %esi + popl %ebx + popl %ebp + + addl $(SPAN_SIZE),%esp + + ret + + +#endif // id386 + diff --git a/source/d_scana.S b/source/d_scana.S new file mode 100644 index 0000000..3f4b91d --- /dev/null +++ b/source/d_scana.S @@ -0,0 +1,89 @@ +/* +Copyright (C) 1996-1997 Id Software, Inc. + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + +See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ +*/ +// +// d_scana.s +// x86 assembly-language turbulent texture mapping code +// + +#include "asm_i386.h" +#include "quakeasm.h" +#include "asm_draw.h" +#include "d_ifacea.h" + +#if id386 + + .data + + .text + +//---------------------------------------------------------------------- +// turbulent texture mapping code +//---------------------------------------------------------------------- + + .align 4 +.globl C(D_DrawTurbulent8Span) +C(D_DrawTurbulent8Span): + pushl %ebp // preserve caller's stack frame pointer + pushl %esi // preserve register variables + pushl %edi + pushl %ebx + + movl C(r_turb_s),%esi + movl C(r_turb_t),%ecx + movl C(r_turb_pdest),%edi + movl C(r_turb_spancount),%ebx + +Llp: + movl %ecx,%eax + movl %esi,%edx + sarl $16,%eax + movl C(r_turb_turb),%ebp + sarl $16,%edx + andl $(CYCLE-1),%eax + andl $(CYCLE-1),%edx + movl (%ebp,%eax,4),%eax + movl (%ebp,%edx,4),%edx + addl %esi,%eax + sarl $16,%eax + addl %ecx,%edx + sarl $16,%edx + andl $(TURB_TEX_SIZE-1),%eax + andl $(TURB_TEX_SIZE-1),%edx + shll $6,%edx + movl C(r_turb_pbase),%ebp + addl %eax,%edx + incl %edi + addl C(r_turb_sstep),%esi + addl C(r_turb_tstep),%ecx + movb (%ebp,%edx,1),%dl + decl %ebx + movb %dl,-1(%edi) + jnz Llp + + movl %edi,C(r_turb_pdest) + + popl %ebx // restore register variables + popl %edi + popl %esi + popl %ebp // restore caller's stack frame pointer + ret + +#endif // id386 + diff --git a/source/d_spr8.S b/source/d_spr8.S new file mode 100644 index 0000000..42ccb62 --- /dev/null +++ b/source/d_spr8.S @@ -0,0 +1,900 @@ +/* +Copyright (C) 1996-1997 Id Software, Inc. + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + +See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +*/ +// +// d_spr8.s +// x86 assembly-language horizontal 8-bpp transparent span-drawing code. +// + +#include "asm_i386.h" +#include "quakeasm.h" +#include "asm_draw.h" + +#if id386 + +//---------------------------------------------------------------------- +// 8-bpp horizontal span drawing code for polygons, with transparency. 
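The transparent spans below apply two per-pixel tests — a signed 16-bit compare of the z-buffer entry against the top half of the 0.32 fixed-point 1/z, and a skip when the texel is the reserved transparent palette index — then step s and t with the carry-selected `advancetable` idiom (the `sbbl`/`adcl` pairs). A C sketch of one span, under stated assumptions: the function name and parameter list are hypothetical, the assembly keeps all of this state in registers and unrolls the loop eightfold, and the whole-pixel parts of the s/t steps are pre-folded into the two table entries, with `advance_extra = advance_base + cachewidth`.

```c
#define TRANSPARENT_COLOR 0xFF      // reserved palette index

// Inner loop of the transparent sprite span, one pixel per pass.
static void sprite_span (unsigned char *src, unsigned char *dest,
                         short *pz, int count,
                         unsigned izi, unsigned izistep,
                         unsigned sfrac, unsigned sstep,
                         unsigned tfrac, unsigned tstep,
                         int advance_base, int advance_extra)
{
    while (count--)
    {
        unsigned char pix = *src;

        if ((short)(izi >> 16) >= *pz && pix != TRANSPARENT_COLOR)
        {
            *pz   = (short)(izi >> 16);   // 1/z passes: write depth...
            *dest = pix;                  // ...and the texel
        }
        izi += izistep;
        dest++;
        pz++;

        unsigned oldt = tfrac, olds = sfrac;
        tfrac += tstep;                   // carry out of t selects the
        sfrac += sstep;                   // row-advancing table entry
        src   += (tfrac < oldt) ? advance_extra : advance_base;
        src   += (sfrac < olds);          // carry out of s: one texel
    }
}
```

The rotated-by-16 storage of `izi` in the assembly exists so that one register can serve both roles: the `cmpw` depth test reads the integer half in the low word, while the `addl`/`adcl $0` pair lets fraction carries wrap around into that same integer half.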
+//---------------------------------------------------------------------- + + .text + +// out-of-line, rarely-needed clamping code + +LClampHigh0: + movl C(bbextents),%esi + jmp LClampReentry0 +LClampHighOrLow0: + jg LClampHigh0 + xorl %esi,%esi + jmp LClampReentry0 + +LClampHigh1: + movl C(bbextentt),%edx + jmp LClampReentry1 +LClampHighOrLow1: + jg LClampHigh1 + xorl %edx,%edx + jmp LClampReentry1 + +LClampLow2: + movl $2048,%ebp + jmp LClampReentry2 +LClampHigh2: + movl C(bbextents),%ebp + jmp LClampReentry2 + +LClampLow3: + movl $2048,%ecx + jmp LClampReentry3 +LClampHigh3: + movl C(bbextentt),%ecx + jmp LClampReentry3 + +LClampLow4: + movl $2048,%eax + jmp LClampReentry4 +LClampHigh4: + movl C(bbextents),%eax + jmp LClampReentry4 + +LClampLow5: + movl $2048,%ebx + jmp LClampReentry5 +LClampHigh5: + movl C(bbextentt),%ebx + jmp LClampReentry5 + + +#define pspans 4+16 + + .align 4 +.globl C(D_SpriteDrawSpans) +C(D_SpriteDrawSpans): + pushl %ebp // preserve caller's stack frame + pushl %edi + pushl %esi // preserve register variables + pushl %ebx + +// +// set up scaled-by-8 steps, for 8-long segments; also set up cacheblock +// and span list pointers, and 1/z step in 0.32 fixed-point +// +// FIXME: any overlap from rearranging? + flds C(d_sdivzstepu) + fmuls fp_8 + movl C(cacheblock),%edx + flds C(d_tdivzstepu) + fmuls fp_8 + movl pspans(%esp),%ebx // point to the first span descriptor + flds C(d_zistepu) + fmuls fp_8 + movl %edx,pbase // pbase = cacheblock + flds C(d_zistepu) + fmuls fp_64kx64k + fxch %st(3) + fstps sdivz8stepu + fstps zi8stepu + fstps tdivz8stepu + fistpl izistep + movl izistep,%eax + rorl $16,%eax // put upper 16 bits in low word + movl sspan_t_count(%ebx),%ecx + movl %eax,izistep + + cmpl $0,%ecx + jle LNextSpan + +LSpanLoop: + +// +// set up the initial s/z, t/z, and 1/z on the FP stack, and generate the +// initial s and t values +// +// FIXME: pipeline FILD? 
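The FP-stack sequence that follows computes the standard perspective-correct span setup; the interleaved stack-state comments track it term by term. Written out in C, using the step/origin globals defined in d_varsa.S — the function wrapper is hypothetical, since the assembly's real contribution is scheduling the slow FDIV so that it overlaps the integer pixel work:

```c
// Per-span setup evaluated by the FILD/FMUL/FADD chain below.
// du/dv are the span's screen coordinates (sspan_t_u / sspan_t_v);
// 65536.0 is fp_64k, and 2^32 is the fp_64kx64k scale.
static void span_setup (int du, int dv, int *s, int *t, unsigned *izi)
{
    float sdivz = d_sdivzorigin + dv*d_sdivzstepv + du*d_sdivzstepu;
    float tdivz = d_tdivzorigin + dv*d_tdivzstepv + du*d_tdivzstepu;
    float zi    = d_ziorigin    + dv*d_zistepv    + du*d_zistepu;

    float z = 65536.0f / zi;               // the FDIV, started early

    *izi = (unsigned)(zi * 4294967296.0);  // 1/z in 0.32 fixed point
    *s   = (int)(sdivz * z) + sadjust;     // then clamped to bbextents
    *t   = (int)(tdivz * z) + tadjust;     // then clamped to bbextentt
}
```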
+ fildl sspan_t_v(%ebx) + fildl sspan_t_u(%ebx) + + fld %st(1) // dv | du | dv + fmuls C(d_sdivzstepv) // dv*d_sdivzstepv | du | dv + fld %st(1) // du | dv*d_sdivzstepv | du | dv + fmuls C(d_sdivzstepu) // du*d_sdivzstepu | dv*d_sdivzstepv | du | dv + fld %st(2) // du | du*d_sdivzstepu | dv*d_sdivzstepv | du | dv + fmuls C(d_tdivzstepu) // du*d_tdivzstepu | du*d_sdivzstepu | + // dv*d_sdivzstepv | du | dv + fxch %st(1) // du*d_sdivzstepu | du*d_tdivzstepu | + // dv*d_sdivzstepv | du | dv + faddp %st(0),%st(2) // du*d_tdivzstepu | + // du*d_sdivzstepu + dv*d_sdivzstepv | du | dv + fxch %st(1) // du*d_sdivzstepu + dv*d_sdivzstepv | + // du*d_tdivzstepu | du | dv + fld %st(3) // dv | du*d_sdivzstepu + dv*d_sdivzstepv | + // du*d_tdivzstepu | du | dv + fmuls C(d_tdivzstepv) // dv*d_tdivzstepv | + // du*d_sdivzstepu + dv*d_sdivzstepv | + // du*d_tdivzstepu | du | dv + fxch %st(1) // du*d_sdivzstepu + dv*d_sdivzstepv | + // dv*d_tdivzstepv | du*d_tdivzstepu | du | dv + fadds C(d_sdivzorigin) // sdivz = d_sdivzorigin + dv*d_sdivzstepv + + // du*d_sdivzstepu; stays in %st(2) at end + fxch %st(4) // dv | dv*d_tdivzstepv | du*d_tdivzstepu | du | + // s/z + fmuls C(d_zistepv) // dv*d_zistepv | dv*d_tdivzstepv | + // du*d_tdivzstepu | du | s/z + fxch %st(1) // dv*d_tdivzstepv | dv*d_zistepv | + // du*d_tdivzstepu | du | s/z + faddp %st(0),%st(2) // dv*d_zistepv | + // dv*d_tdivzstepv + du*d_tdivzstepu | du | s/z + fxch %st(2) // du | dv*d_tdivzstepv + du*d_tdivzstepu | + // dv*d_zistepv | s/z + fmuls C(d_zistepu) // du*d_zistepu | + // dv*d_tdivzstepv + du*d_tdivzstepu | + // dv*d_zistepv | s/z + fxch %st(1) // dv*d_tdivzstepv + du*d_tdivzstepu | + // du*d_zistepu | dv*d_zistepv | s/z + fadds C(d_tdivzorigin) // tdivz = d_tdivzorigin + dv*d_tdivzstepv + + // du*d_tdivzstepu; stays in %st(1) at end + fxch %st(2) // dv*d_zistepv | du*d_zistepu | t/z | s/z + faddp %st(0),%st(1) // dv*d_zistepv + du*d_zistepu | t/z | s/z + + flds fp_64k // fp_64k | dv*d_zistepv + du*d_zistepu | t/z | s/z + fxch %st(1) // dv*d_zistepv + du*d_zistepu | fp_64k | t/z | s/z + fadds C(d_ziorigin) // zi = d_ziorigin + dv*d_zistepv + + // du*d_zistepu; stays in %st(0) at end + // 1/z | fp_64k | t/z | s/z + + fld %st(0) // FIXME: get rid of stall on FMUL? 
+ fmuls fp_64kx64k + fxch %st(1) + +// +// calculate and clamp s & t +// + fdivr %st(0),%st(2) // 1/z | z*64k | t/z | s/z + fxch %st(1) + + fistpl izi // 0.32 fixed-point 1/z + movl izi,%ebp + +// +// set pz to point to the first z-buffer pixel in the span +// + rorl $16,%ebp // put upper 16 bits in low word + movl sspan_t_v(%ebx),%eax + movl %ebp,izi + movl sspan_t_u(%ebx),%ebp + imull C(d_zrowbytes) + shll $1,%ebp // a word per pixel + addl C(d_pzbuffer),%eax + addl %ebp,%eax + movl %eax,pz + +// +// point %edi to the first pixel in the span +// + movl C(d_viewbuffer),%ebp + movl sspan_t_v(%ebx),%eax + pushl %ebx // preserve spans pointer + movl C(tadjust),%edx + movl C(sadjust),%esi + movl C(d_scantable)(,%eax,4),%edi // v * screenwidth + addl %ebp,%edi + movl sspan_t_u(%ebx),%ebp + addl %ebp,%edi // pdest = &pdestspan[scans->u]; + +// +// now start the FDIV for the end of the span +// + cmpl $8,%ecx + ja LSetupNotLast1 + + decl %ecx + jz LCleanup1 // if only one pixel, no need to start an FDIV + movl %ecx,spancountminus1 + +// finish up the s and t calcs + fxch %st(1) // z*64k | 1/z | t/z | s/z + + fld %st(0) // z*64k | z*64k | 1/z | t/z | s/z + fmul %st(4),%st(0) // s | z*64k | 1/z | t/z | s/z + fxch %st(1) // z*64k | s | 1/z | t/z | s/z + fmul %st(3),%st(0) // t | s | 1/z | t/z | s/z + fxch %st(1) // s | t | 1/z | t/z | s/z + fistpl s // 1/z | t | t/z | s/z + fistpl t // 1/z | t/z | s/z + + fildl spancountminus1 + + flds C(d_tdivzstepu) // _d_tdivzstepu | spancountminus1 + flds C(d_zistepu) // _d_zistepu | _d_tdivzstepu | spancountminus1 + fmul %st(2),%st(0) // _d_zistepu*scm1 | _d_tdivzstepu | scm1 + fxch %st(1) // _d_tdivzstepu | _d_zistepu*scm1 | scm1 + fmul %st(2),%st(0) // _d_tdivzstepu*scm1 | _d_zistepu*scm1 | scm1 + fxch %st(2) // scm1 | _d_zistepu*scm1 | _d_tdivzstepu*scm1 + fmuls C(d_sdivzstepu) // _d_sdivzstepu*scm1 | _d_zistepu*scm1 | + // _d_tdivzstepu*scm1 + fxch %st(1) // _d_zistepu*scm1 | _d_sdivzstepu*scm1 | + // _d_tdivzstepu*scm1 + faddp %st(0),%st(3) // _d_sdivzstepu*scm1 | _d_tdivzstepu*scm1 + fxch %st(1) // _d_tdivzstepu*scm1 | _d_sdivzstepu*scm1 + faddp %st(0),%st(3) // _d_sdivzstepu*scm1 + faddp %st(0),%st(3) + + flds fp_64k + fdiv %st(1),%st(0) // this is what we've gone to all this trouble to + // overlap + jmp LFDIVInFlight1 + +LCleanup1: +// finish up the s and t calcs + fxch %st(1) // z*64k | 1/z | t/z | s/z + + fld %st(0) // z*64k | z*64k | 1/z | t/z | s/z + fmul %st(4),%st(0) // s | z*64k | 1/z | t/z | s/z + fxch %st(1) // z*64k | s | 1/z | t/z | s/z + fmul %st(3),%st(0) // t | s | 1/z | t/z | s/z + fxch %st(1) // s | t | 1/z | t/z | s/z + fistpl s // 1/z | t | t/z | s/z + fistpl t // 1/z | t/z | s/z + jmp LFDIVInFlight1 + + .align 4 +LSetupNotLast1: +// finish up the s and t calcs + fxch %st(1) // z*64k | 1/z | t/z | s/z + + fld %st(0) // z*64k | z*64k | 1/z | t/z | s/z + fmul %st(4),%st(0) // s | z*64k | 1/z | t/z | s/z + fxch %st(1) // z*64k | s | 1/z | t/z | s/z + fmul %st(3),%st(0) // t | s | 1/z | t/z | s/z + fxch %st(1) // s | t | 1/z | t/z | s/z + fistpl s // 1/z | t | t/z | s/z + fistpl t // 1/z | t/z | s/z + + fadds zi8stepu + fxch %st(2) + fadds sdivz8stepu + fxch %st(2) + flds tdivz8stepu + faddp %st(0),%st(2) + flds fp_64k + fdiv %st(1),%st(0) // z = 1/1/z + // this is what we've gone to all this trouble to + // overlap +LFDIVInFlight1: + + addl s,%esi + addl t,%edx + movl C(bbextents),%ebx + movl C(bbextentt),%ebp + cmpl %ebx,%esi + ja LClampHighOrLow0 +LClampReentry0: + movl %esi,s + movl pbase,%ebx + shll $16,%esi + cmpl %ebp,%edx + movl 
%esi,sfracf + ja LClampHighOrLow1 +LClampReentry1: + movl %edx,t + movl s,%esi // sfrac = scans->sfrac; + shll $16,%edx + movl t,%eax // tfrac = scans->tfrac; + sarl $16,%esi + movl %edx,tfracf + +// +// calculate the texture starting address +// + sarl $16,%eax + addl %ebx,%esi + imull C(cachewidth),%eax // (tfrac >> 16) * cachewidth + addl %eax,%esi // psource = pbase + (sfrac >> 16) + + // ((tfrac >> 16) * cachewidth); + +// +// determine whether last span or not +// + cmpl $8,%ecx + jna LLastSegment + +// +// not the last segment; do full 8-wide segment +// +LNotLastSegment: + +// +// advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to +// get there +// + +// pick up after the FDIV that was left in flight previously + + fld %st(0) // duplicate it + fmul %st(4),%st(0) // s = s/z * z + fxch %st(1) + fmul %st(3),%st(0) // t = t/z * z + fxch %st(1) + fistpl snext + fistpl tnext + movl snext,%eax + movl tnext,%edx + + subl $8,%ecx // count off this segments' pixels + movl C(sadjust),%ebp + pushl %ecx // remember count of remaining pixels + movl C(tadjust),%ecx + + addl %eax,%ebp + addl %edx,%ecx + + movl C(bbextents),%eax + movl C(bbextentt),%edx + + cmpl $2048,%ebp + jl LClampLow2 + cmpl %eax,%ebp + ja LClampHigh2 +LClampReentry2: + + cmpl $2048,%ecx + jl LClampLow3 + cmpl %edx,%ecx + ja LClampHigh3 +LClampReentry3: + + movl %ebp,snext + movl %ecx,tnext + + subl s,%ebp + subl t,%ecx + +// +// set up advancetable +// + movl %ecx,%eax + movl %ebp,%edx + sarl $19,%edx // sstep >>= 16; + movl C(cachewidth),%ebx + sarl $19,%eax // tstep >>= 16; + jz LIsZero + imull %ebx,%eax // (tstep >> 16) * cachewidth; +LIsZero: + addl %edx,%eax // add in sstep + // (tstep >> 16) * cachewidth + (sstep >> 16); + movl tfracf,%edx + movl %eax,advancetable+4 // advance base in t + addl %ebx,%eax // ((tstep >> 16) + 1) * cachewidth + + // (sstep >> 16); + shll $13,%ebp // left-justify sstep fractional part + movl %ebp,sstep + movl sfracf,%ebx + shll $13,%ecx // left-justify tstep fractional part + movl %eax,advancetable // advance extra in t + movl %ecx,tstep + + movl pz,%ecx + movl izi,%ebp + + cmpw (%ecx),%bp + jl Lp1 + movb (%esi),%al // get first source texel + cmpb $(TRANSPARENT_COLOR),%al + jz Lp1 + movw %bp,(%ecx) + movb %al,(%edi) // store first dest pixel +Lp1: + addl izistep,%ebp + adcl $0,%ebp + addl tstep,%edx // advance tfrac fractional part by tstep frac + + sbbl %eax,%eax // turn tstep carry into -1 (0 if none) + addl sstep,%ebx // advance sfrac fractional part by sstep frac + adcl advancetable+4(,%eax,4),%esi // point to next source texel + + cmpw 2(%ecx),%bp + jl Lp2 + movb (%esi),%al + cmpb $(TRANSPARENT_COLOR),%al + jz Lp2 + movw %bp,2(%ecx) + movb %al,1(%edi) +Lp2: + addl izistep,%ebp + adcl $0,%ebp + addl tstep,%edx + sbbl %eax,%eax + addl sstep,%ebx + adcl advancetable+4(,%eax,4),%esi + + cmpw 4(%ecx),%bp + jl Lp3 + movb (%esi),%al + cmpb $(TRANSPARENT_COLOR),%al + jz Lp3 + movw %bp,4(%ecx) + movb %al,2(%edi) +Lp3: + addl izistep,%ebp + adcl $0,%ebp + addl tstep,%edx + sbbl %eax,%eax + addl sstep,%ebx + adcl advancetable+4(,%eax,4),%esi + + cmpw 6(%ecx),%bp + jl Lp4 + movb (%esi),%al + cmpb $(TRANSPARENT_COLOR),%al + jz Lp4 + movw %bp,6(%ecx) + movb %al,3(%edi) +Lp4: + addl izistep,%ebp + adcl $0,%ebp + addl tstep,%edx + sbbl %eax,%eax + addl sstep,%ebx + adcl advancetable+4(,%eax,4),%esi + + cmpw 8(%ecx),%bp + jl Lp5 + movb (%esi),%al + cmpb $(TRANSPARENT_COLOR),%al + jz Lp5 + movw %bp,8(%ecx) + movb %al,4(%edi) +Lp5: + addl izistep,%ebp + adcl $0,%ebp + addl 
tstep,%edx + sbbl %eax,%eax + addl sstep,%ebx + adcl advancetable+4(,%eax,4),%esi + +// +// start FDIV for end of next segment in flight, so it can overlap +// + popl %eax + cmpl $8,%eax // more than one segment after this? + ja LSetupNotLast2 // yes + + decl %eax + jz LFDIVInFlight2 // if only one pixel, no need to start an FDIV + movl %eax,spancountminus1 + fildl spancountminus1 + + flds C(d_zistepu) // _d_zistepu | spancountminus1 + fmul %st(1),%st(0) // _d_zistepu*scm1 | scm1 + flds C(d_tdivzstepu) // _d_tdivzstepu | _d_zistepu*scm1 | scm1 + fmul %st(2),%st(0) // _d_tdivzstepu*scm1 | _d_zistepu*scm1 | scm1 + fxch %st(1) // _d_zistepu*scm1 | _d_tdivzstepu*scm1 | scm1 + faddp %st(0),%st(3) // _d_tdivzstepu*scm1 | scm1 + fxch %st(1) // scm1 | _d_tdivzstepu*scm1 + fmuls C(d_sdivzstepu) // _d_sdivzstepu*scm1 | _d_tdivzstepu*scm1 + fxch %st(1) // _d_tdivzstepu*scm1 | _d_sdivzstepu*scm1 + faddp %st(0),%st(3) // _d_sdivzstepu*scm1 + flds fp_64k // 64k | _d_sdivzstepu*scm1 + fxch %st(1) // _d_sdivzstepu*scm1 | 64k + faddp %st(0),%st(4) // 64k + + fdiv %st(1),%st(0) // this is what we've gone to all this trouble to + // overlap + jmp LFDIVInFlight2 + + .align 4 +LSetupNotLast2: + fadds zi8stepu + fxch %st(2) + fadds sdivz8stepu + fxch %st(2) + flds tdivz8stepu + faddp %st(0),%st(2) + flds fp_64k + fdiv %st(1),%st(0) // z = 1/1/z + // this is what we've gone to all this trouble to + // overlap +LFDIVInFlight2: + pushl %eax + + cmpw 10(%ecx),%bp + jl Lp6 + movb (%esi),%al + cmpb $(TRANSPARENT_COLOR),%al + jz Lp6 + movw %bp,10(%ecx) + movb %al,5(%edi) +Lp6: + addl izistep,%ebp + adcl $0,%ebp + addl tstep,%edx + sbbl %eax,%eax + addl sstep,%ebx + adcl advancetable+4(,%eax,4),%esi + + cmpw 12(%ecx),%bp + jl Lp7 + movb (%esi),%al + cmpb $(TRANSPARENT_COLOR),%al + jz Lp7 + movw %bp,12(%ecx) + movb %al,6(%edi) +Lp7: + addl izistep,%ebp + adcl $0,%ebp + addl tstep,%edx + sbbl %eax,%eax + addl sstep,%ebx + adcl advancetable+4(,%eax,4),%esi + + cmpw 14(%ecx),%bp + jl Lp8 + movb (%esi),%al + cmpb $(TRANSPARENT_COLOR),%al + jz Lp8 + movw %bp,14(%ecx) + movb %al,7(%edi) +Lp8: + addl izistep,%ebp + adcl $0,%ebp + addl tstep,%edx + sbbl %eax,%eax + addl sstep,%ebx + adcl advancetable+4(,%eax,4),%esi + + addl $8,%edi + addl $16,%ecx + movl %edx,tfracf + movl snext,%edx + movl %ebx,sfracf + movl tnext,%ebx + movl %edx,s + movl %ebx,t + + movl %ecx,pz + movl %ebp,izi + + popl %ecx // retrieve count + +// +// determine whether last span or not +// + cmpl $8,%ecx // are there multiple segments remaining? + ja LNotLastSegment // yes + +// +// last segment of scan +// +LLastSegment: + +// +// advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to +// get there. 
The number of pixels left is variable, and we want to land on the +// last pixel, not step one past it, so we can't run into arithmetic problems +// + testl %ecx,%ecx + jz LNoSteps // just draw the last pixel and we're done + +// pick up after the FDIV that was left in flight previously + + + fld %st(0) // duplicate it + fmul %st(4),%st(0) // s = s/z * z + fxch %st(1) + fmul %st(3),%st(0) // t = t/z * z + fxch %st(1) + fistpl snext + fistpl tnext + + movl C(tadjust),%ebx + movl C(sadjust),%eax + + addl snext,%eax + addl tnext,%ebx + + movl C(bbextents),%ebp + movl C(bbextentt),%edx + + cmpl $2048,%eax + jl LClampLow4 + cmpl %ebp,%eax + ja LClampHigh4 +LClampReentry4: + movl %eax,snext + + cmpl $2048,%ebx + jl LClampLow5 + cmpl %edx,%ebx + ja LClampHigh5 +LClampReentry5: + + cmpl $1,%ecx // don't bother + je LOnlyOneStep // if two pixels in segment, there's only one step, + // of the segment length + subl s,%eax + subl t,%ebx + + addl %eax,%eax // convert to 15.17 format so multiply by 1.31 + addl %ebx,%ebx // reciprocal yields 16.48 + imull reciprocal_table-8(,%ecx,4) // sstep = (snext - s) / (spancount-1) + movl %edx,%ebp + + movl %ebx,%eax + imull reciprocal_table-8(,%ecx,4) // tstep = (tnext - t) / (spancount-1) + +LSetEntryvec: +// +// set up advancetable +// + movl spr8entryvec_table(,%ecx,4),%ebx + movl %edx,%eax + pushl %ebx // entry point into code for RET later + movl %ebp,%ecx + sarl $16,%ecx // sstep >>= 16; + movl C(cachewidth),%ebx + sarl $16,%edx // tstep >>= 16; + jz LIsZeroLast + imull %ebx,%edx // (tstep >> 16) * cachewidth; +LIsZeroLast: + addl %ecx,%edx // add in sstep + // (tstep >> 16) * cachewidth + (sstep >> 16); + movl tfracf,%ecx + movl %edx,advancetable+4 // advance base in t + addl %ebx,%edx // ((tstep >> 16) + 1) * cachewidth + + // (sstep >> 16); + shll $16,%ebp // left-justify sstep fractional part + movl sfracf,%ebx + shll $16,%eax // left-justify tstep fractional part + movl %edx,advancetable // advance extra in t + + movl %eax,tstep + movl %ebp,sstep + movl %ecx,%edx + + movl pz,%ecx + movl izi,%ebp + + ret // jump to the number-of-pixels handler + +//---------------------------------------- + +LNoSteps: + movl pz,%ecx + subl $7,%edi // adjust for hardwired offset + subl $14,%ecx + jmp LEndSpan + + +LOnlyOneStep: + subl s,%eax + subl t,%ebx + movl %eax,%ebp + movl %ebx,%edx + jmp LSetEntryvec + +//---------------------------------------- + +.globl Spr8Entry2_8 +Spr8Entry2_8: + subl $6,%edi // adjust for hardwired offsets + subl $12,%ecx + movb (%esi),%al + jmp LLEntry2_8 + +//---------------------------------------- + +.globl Spr8Entry3_8 +Spr8Entry3_8: + subl $5,%edi // adjust for hardwired offsets + subl $10,%ecx + jmp LLEntry3_8 + +//---------------------------------------- + +.globl Spr8Entry4_8 +Spr8Entry4_8: + subl $4,%edi // adjust for hardwired offsets + subl $8,%ecx + jmp LLEntry4_8 + +//---------------------------------------- + +.globl Spr8Entry5_8 +Spr8Entry5_8: + subl $3,%edi // adjust for hardwired offsets + subl $6,%ecx + jmp LLEntry5_8 + +//---------------------------------------- + +.globl Spr8Entry6_8 +Spr8Entry6_8: + subl $2,%edi // adjust for hardwired offsets + subl $4,%ecx + jmp LLEntry6_8 + +//---------------------------------------- + +.globl Spr8Entry7_8 +Spr8Entry7_8: + decl %edi // adjust for hardwired offsets + subl $2,%ecx + jmp LLEntry7_8 + +//---------------------------------------- + +.globl Spr8Entry8_8 +Spr8Entry8_8: + cmpw (%ecx),%bp + jl Lp9 + movb (%esi),%al + cmpb $(TRANSPARENT_COLOR),%al + jz Lp9 + movw %bp,(%ecx) + 
movb %al,(%edi) +Lp9: + addl izistep,%ebp + adcl $0,%ebp + addl tstep,%edx + sbbl %eax,%eax + addl sstep,%ebx + adcl advancetable+4(,%eax,4),%esi +LLEntry7_8: + cmpw 2(%ecx),%bp + jl Lp10 + movb (%esi),%al + cmpb $(TRANSPARENT_COLOR),%al + jz Lp10 + movw %bp,2(%ecx) + movb %al,1(%edi) +Lp10: + addl izistep,%ebp + adcl $0,%ebp + addl tstep,%edx + sbbl %eax,%eax + addl sstep,%ebx + adcl advancetable+4(,%eax,4),%esi +LLEntry6_8: + cmpw 4(%ecx),%bp + jl Lp11 + movb (%esi),%al + cmpb $(TRANSPARENT_COLOR),%al + jz Lp11 + movw %bp,4(%ecx) + movb %al,2(%edi) +Lp11: + addl izistep,%ebp + adcl $0,%ebp + addl tstep,%edx + sbbl %eax,%eax + addl sstep,%ebx + adcl advancetable+4(,%eax,4),%esi +LLEntry5_8: + cmpw 6(%ecx),%bp + jl Lp12 + movb (%esi),%al + cmpb $(TRANSPARENT_COLOR),%al + jz Lp12 + movw %bp,6(%ecx) + movb %al,3(%edi) +Lp12: + addl izistep,%ebp + adcl $0,%ebp + addl tstep,%edx + sbbl %eax,%eax + addl sstep,%ebx + adcl advancetable+4(,%eax,4),%esi +LLEntry4_8: + cmpw 8(%ecx),%bp + jl Lp13 + movb (%esi),%al + cmpb $(TRANSPARENT_COLOR),%al + jz Lp13 + movw %bp,8(%ecx) + movb %al,4(%edi) +Lp13: + addl izistep,%ebp + adcl $0,%ebp + addl tstep,%edx + sbbl %eax,%eax + addl sstep,%ebx + adcl advancetable+4(,%eax,4),%esi +LLEntry3_8: + cmpw 10(%ecx),%bp + jl Lp14 + movb (%esi),%al + cmpb $(TRANSPARENT_COLOR),%al + jz Lp14 + movw %bp,10(%ecx) + movb %al,5(%edi) +Lp14: + addl izistep,%ebp + adcl $0,%ebp + addl tstep,%edx + sbbl %eax,%eax + addl sstep,%ebx + adcl advancetable+4(,%eax,4),%esi +LLEntry2_8: + cmpw 12(%ecx),%bp + jl Lp15 + movb (%esi),%al + cmpb $(TRANSPARENT_COLOR),%al + jz Lp15 + movw %bp,12(%ecx) + movb %al,6(%edi) +Lp15: + addl izistep,%ebp + adcl $0,%ebp + addl tstep,%edx + sbbl %eax,%eax + addl sstep,%ebx + adcl advancetable+4(,%eax,4),%esi + +LEndSpan: + cmpw 14(%ecx),%bp + jl Lp16 + movb (%esi),%al // load first texel in segment + cmpb $(TRANSPARENT_COLOR),%al + jz Lp16 + movw %bp,14(%ecx) + movb %al,7(%edi) +Lp16: + +// +// clear s/z, t/z, 1/z from FP stack +// + fstp %st(0) + fstp %st(0) + fstp %st(0) + + popl %ebx // restore spans pointer +LNextSpan: + addl $(sspan_t_size),%ebx // point to next span + movl sspan_t_count(%ebx),%ecx + cmpl $0,%ecx // any more spans? + jg LSpanLoop // yes + jz LNextSpan // yes, but this one's empty + + popl %ebx // restore register variables + popl %esi + popl %edi + popl %ebp // restore the caller's stack frame + ret + +#endif // id386 diff --git a/source/d_varsa.S b/source/d_varsa.S new file mode 100644 index 0000000..76d67eb --- /dev/null +++ b/source/d_varsa.S @@ -0,0 +1,213 @@ +/* +Copyright (C) 1996-1997 Id Software, Inc. + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + +See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ +*/ +// +// d_varsa.s +// + +#include "asm_i386.h" +#include "quakeasm.h" +#include "asm_draw.h" +#include "d_ifacea.h" + +#if id386 + + .data + +//------------------------------------------------------- +// global refresh variables +//------------------------------------------------------- + +// FIXME: put all refresh variables into one contiguous block. Make into one +// big structure, like cl or sv? + + .align 4 +.globl C(d_sdivzstepu) +.globl C(d_tdivzstepu) +.globl C(d_zistepu) +.globl C(d_sdivzstepv) +.globl C(d_tdivzstepv) +.globl C(d_zistepv) +.globl C(d_sdivzorigin) +.globl C(d_tdivzorigin) +.globl C(d_ziorigin) +C(d_sdivzstepu): .single 0 +C(d_tdivzstepu): .single 0 +C(d_zistepu): .single 0 +C(d_sdivzstepv): .single 0 +C(d_tdivzstepv): .single 0 +C(d_zistepv): .single 0 +C(d_sdivzorigin): .single 0 +C(d_tdivzorigin): .single 0 +C(d_ziorigin): .single 0 + +.globl C(sadjust) +.globl C(tadjust) +.globl C(bbextents) +.globl C(bbextentt) +C(sadjust): .long 0 +C(tadjust): .long 0 +C(bbextents): .long 0 +C(bbextentt): .long 0 + +.globl C(cacheblock) +.globl C(d_viewbuffer) +.globl C(cachewidth) +.globl C(d_pzbuffer) +.globl C(d_zrowbytes) +.globl C(d_zwidth) +C(cacheblock): .long 0 +C(cachewidth): .long 0 +C(d_viewbuffer): .long 0 +C(d_pzbuffer): .long 0 +C(d_zrowbytes): .long 0 +C(d_zwidth): .long 0 + + +//------------------------------------------------------- +// ASM-only variables +//------------------------------------------------------- +.globl izi +izi: .long 0 + +.globl pbase, s, t, sfracf, tfracf, snext, tnext +.globl spancountminus1, zi16stepu, sdivz16stepu, tdivz16stepu +.globl zi8stepu, sdivz8stepu, tdivz8stepu, pz +s: .long 0 +t: .long 0 +snext: .long 0 +tnext: .long 0 +sfracf: .long 0 +tfracf: .long 0 +pbase: .long 0 +zi8stepu: .long 0 +sdivz8stepu: .long 0 +tdivz8stepu: .long 0 +zi16stepu: .long 0 +sdivz16stepu: .long 0 +tdivz16stepu: .long 0 +spancountminus1: .long 0 +pz: .long 0 + +.globl izistep +izistep: .long 0 + +//------------------------------------------------------- +// local variables for d_draw16.s +//------------------------------------------------------- + +.globl reciprocal_table_16, entryvec_table_16 +// 1/2, 1/3, 1/4, 1/5, 1/6, 1/7, 1/8, 1/9, 1/10, 1/11, 1/12, 1/13, +// 1/14, and 1/15 in 0.32 form +reciprocal_table_16: .long 0x40000000, 0x2aaaaaaa, 0x20000000 + .long 0x19999999, 0x15555555, 0x12492492 + .long 0x10000000, 0xe38e38e, 0xccccccc, 0xba2e8ba + .long 0xaaaaaaa, 0x9d89d89, 0x9249249, 0x8888888 + +#ifndef NeXT + .extern Entry2_16 + .extern Entry3_16 + .extern Entry4_16 + .extern Entry5_16 + .extern Entry6_16 + .extern Entry7_16 + .extern Entry8_16 + .extern Entry9_16 + .extern Entry10_16 + .extern Entry11_16 + .extern Entry12_16 + .extern Entry13_16 + .extern Entry14_16 + .extern Entry15_16 + .extern Entry16_16 +#endif + +entryvec_table_16: .long 0, Entry2_16, Entry3_16, Entry4_16 + .long Entry5_16, Entry6_16, Entry7_16, Entry8_16 + .long Entry9_16, Entry10_16, Entry11_16, Entry12_16 + .long Entry13_16, Entry14_16, Entry15_16, Entry16_16 + +//------------------------------------------------------- +// local variables for d_parta.s +//------------------------------------------------------- +.globl DP_Count, DP_u, DP_v, DP_32768, DP_Color, DP_Pix, DP_EntryTable +DP_Count: .long 0 +DP_u: .long 0 +DP_v: .long 0 +DP_32768: .single 32768.0 +DP_Color: .long 0 +DP_Pix: .long 0 + + +#ifndef NeXT + .extern DP_1x1 + .extern DP_2x2 + .extern DP_3x3 + .extern DP_4x4 +#endif + +DP_EntryTable: .long DP_1x1, DP_2x2, DP_3x3, DP_4x4 + +// +// advancetable is 8 
bytes, but points to the middle of that range so negative +// offsets will work +// +.globl advancetable, sstep, tstep, pspantemp, counttemp, jumptemp +advancetable: .long 0, 0 +sstep: .long 0 +tstep: .long 0 + +pspantemp: .long 0 +counttemp: .long 0 +jumptemp: .long 0 + +// 1/2, 1/3, 1/4, 1/5, 1/6, and 1/7 in 0.32 form +.globl reciprocal_table, entryvec_table +reciprocal_table: .long 0x40000000, 0x2aaaaaaa, 0x20000000 + .long 0x19999999, 0x15555555, 0x12492492 + +#ifndef NeXT + .extern Entry2_8 + .extern Entry3_8 + .extern Entry4_8 + .extern Entry5_8 + .extern Entry6_8 + .extern Entry7_8 + .extern Entry8_8 +#endif + +entryvec_table: .long 0, Entry2_8, Entry3_8, Entry4_8 + .long Entry5_8, Entry6_8, Entry7_8, Entry8_8 + +#ifndef NeXT + .extern Spr8Entry2_8 + .extern Spr8Entry3_8 + .extern Spr8Entry4_8 + .extern Spr8Entry5_8 + .extern Spr8Entry6_8 + .extern Spr8Entry7_8 + .extern Spr8Entry8_8 +#endif + +.globl spr8entryvec_table +spr8entryvec_table: .long 0, Spr8Entry2_8, Spr8Entry3_8, Spr8Entry4_8 + .long Spr8Entry5_8, Spr8Entry6_8, Spr8Entry7_8, Spr8Entry8_8 + +#endif // id386 + diff --git a/source/math.S b/source/math.S index 465f914..725216b 100644 --- a/source/math.S +++ b/source/math.S @@ -1,3 +1,22 @@ +/* +Copyright (C) 1996-1997 Id Software, Inc. + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + +See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +*/ // // math.s // x86 assembly-language math routines. @@ -6,7 +25,7 @@ #include "quakeasm.h" -#if id386 +#if id386 .data @@ -16,6 +35,73 @@ Ljmptab: .long Lcase0, Lcase1, Lcase2, Lcase3 .text +// TODO: rounding needed? +// stack parameter offset +#define val 4 + +.globl C(Invert24To16) +C(Invert24To16): + + movl val(%esp),%ecx + movl $0x100,%edx // 0x10000000000 as dividend + cmpl %edx,%ecx + jle LOutOfRange + + subl %eax,%eax + divl %ecx + + ret + +LOutOfRange: + movl $0xFFFFFFFF,%eax + ret + +#define in 4 +#define out 8 + + .align 2 +.globl C(TransformVector) +C(TransformVector): + movl in(%esp),%eax + movl out(%esp),%edx + + flds (%eax) // in[0] + fmuls C(vright) // in[0]*vright[0] + flds (%eax) // in[0] | in[0]*vright[0] + fmuls C(vup) // in[0]*vup[0] | in[0]*vright[0] + flds (%eax) // in[0] | in[0]*vup[0] | in[0]*vright[0] + fmuls C(vpn) // in[0]*vpn[0] | in[0]*vup[0] | in[0]*vright[0] + + flds 4(%eax) // in[1] | ... + fmuls C(vright)+4 // in[1]*vright[1] | ... + flds 4(%eax) // in[1] | in[1]*vright[1] | ... + fmuls C(vup)+4 // in[1]*vup[1] | in[1]*vright[1] | ... + flds 4(%eax) // in[1] | in[1]*vup[1] | in[1]*vright[1] | ... + fmuls C(vpn)+4 // in[1]*vpn[1] | in[1]*vup[1] | in[1]*vright[1] | ... + fxch %st(2) // in[1]*vright[1] | in[1]*vup[1] | in[1]*vpn[1] | ... + + faddp %st(0),%st(5) // in[1]*vup[1] | in[1]*vpn[1] | ... + faddp %st(0),%st(3) // in[1]*vpn[1] | ... + faddp %st(0),%st(1) // vpn_accum | vup_accum | vright_accum + + flds 8(%eax) // in[2] | ... + fmuls C(vright)+8 // in[2]*vright[2] | ... + flds 8(%eax) // in[2] | in[2]*vright[2] | ... 
+ fmuls C(vup)+8 // in[2]*vup[2] | in[2]*vright[2] | ...
+ flds 8(%eax) // in[2] | in[2]*vup[2] | in[2]*vright[2] | ...
+ fmuls C(vpn)+8 // in[2]*vpn[2] | in[2]*vup[2] | in[2]*vright[2] | ...
+ fxch %st(2) // in[2]*vright[2] | in[2]*vup[2] | in[2]*vpn[2] | ...
+
+ faddp %st(0),%st(5) // in[2]*vup[2] | in[2]*vpn[2] | ...
+ faddp %st(0),%st(3) // in[2]*vpn[2] | ...
+ faddp %st(0),%st(1) // vpn_accum | vup_accum | vright_accum
+
+ fstps 8(%edx) // out[2]
+ fstps 4(%edx) // out[1]
+ fstps (%edx) // out[0]
+
+ ret
+
#define EMINS 4+4
#define EMAXS 4+8
@@ -35,7 +121,7 @@ C(BoxOnPlaneSide):
 jge Lerror
 flds pl_normal(%edx) // p->normal[0]
 fld %st(0) // p->normal[0] | p->normal[0]
- jmp *Ljmptab(,%eax,4)
+ jmp Ljmptab(,%eax,4)
//dist1= p->normal[0]*emaxs[0] + p->normal[1]*emaxs[1] + p->normal[2]*emaxs[2];
diff --git a/source/r_aclipa.S b/source/r_aclipa.S
new file mode 100644
index 0000000..418306a
--- /dev/null
+++ b/source/r_aclipa.S
@@ -0,0 +1,216 @@
+/*
+Copyright (C) 1996-1997 Id Software, Inc.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+
+See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+*/
+//
+// r_aclipa.s
+// x86 assembly-language Alias model vertex clipping code.
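
Backing up to the math.S hunk for a moment: both routines added there are transcriptions of small C helpers, and having the C at hand makes the FPU-stack comments easier to follow. Roughly (a sketch using the engine's vec3_t/fixed16_t types and the vright/vup/vpn view axes; the asm's divl truncates, reproduced here with a 64-bit divide):

	void TransformVector (vec3_t in, vec3_t out)
	{
		out[0] = DotProduct (in, vright);	// screen-space right
		out[1] = DotProduct (in, vup);		// screen-space up
		out[2] = DotProduct (in, vpn);		// view-forward depth
	}

	fixed16_t Invert24To16 (fixed16_t val)
	{
		if (val <= 0x100)
			return 0xFFFFFFFF;	// quotient would overflow 32 bits

		// the asm loads edx:eax with 2^40 (0x100:00000000); dividing
		// an 8.24 value into 2^40 yields its reciprocal in 16.16
		return (fixed16_t)(0x10000000000ULL / (unsigned)val);
	}
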
+// + +#include "asm_i386.h" +#include "quakeasm.h" +#include "asm_draw.h" +#include "d_ifacea.h" + +#if id386 + + .data +Ltemp0: .long 0 +Ltemp1: .long 0 + + .text + +#define pfv0 8+4 +#define pfv1 8+8 +#define out 8+12 + +.globl C(R_Alias_clip_bottom) +C(R_Alias_clip_bottom): + pushl %esi + pushl %edi + + movl pfv0(%esp),%esi + movl pfv1(%esp),%edi + + movl C(r_refdef)+rd_aliasvrectbottom,%eax + +LDoForwardOrBackward: + + movl fv_v+4(%esi),%edx + movl fv_v+4(%edi),%ecx + + cmpl %ecx,%edx + jl LDoForward + + movl fv_v+4(%esi),%ecx + movl fv_v+4(%edi),%edx + movl pfv0(%esp),%edi + movl pfv1(%esp),%esi + +LDoForward: + + subl %edx,%ecx + subl %edx,%eax + movl %ecx,Ltemp1 + movl %eax,Ltemp0 + fildl Ltemp1 + fildl Ltemp0 + movl out(%esp),%edx + movl $2,%eax + + fdivp %st(0),%st(1) // scale + +LDo3Forward: + fildl fv_v+0(%esi) // fv0v0 | scale + fildl fv_v+0(%edi) // fv1v0 | fv0v0 | scale + fildl fv_v+4(%esi) // fv0v1 | fv1v0 | fv0v0 | scale + fildl fv_v+4(%edi) // fv1v1 | fv0v1 | fv1v0 | fv0v0 | scale + fildl fv_v+8(%esi) // fv0v2 | fv1v1 | fv0v1 | fv1v0 | fv0v0 | scale + fildl fv_v+8(%edi) // fv1v2 | fv0v2 | fv1v1 | fv0v1 | fv1v0 | fv0v0 | + // scale + fxch %st(5) // fv0v0 | fv0v2 | fv1v1 | fv0v1 | fv1v0 | fv1v2 | + // scale + fsubr %st(0),%st(4) // fv0v0 | fv0v2 | fv1v1 | fv0v1 | fv1v0-fv0v0 | + // fv1v2 | scale + fxch %st(3) // fv0v1 | fv0v2 | fv1v1 | fv0v0 | fv1v0-fv0v0 | + // fv1v2 | scale + fsubr %st(0),%st(2) // fv0v1 | fv0v2 | fv1v1-fv0v1 | fv0v0 | + // fv1v0-fv0v0 | fv1v2 | scale + fxch %st(1) // fv0v2 | fv0v1 | fv1v1-fv0v1 | fv0v0 | + // fv1v0-fv0v0 | fv1v2 | scale + fsubr %st(0),%st(5) // fv0v2 | fv0v1 | fv1v1-fv0v1 | fv0v0 | + // fv1v0-fv0v0 | fv1v2-fv0v2 | scale + fxch %st(6) // scale | fv0v1 | fv1v1-fv0v1 | fv0v0 | + // fv1v0-fv0v0 | fv1v2-fv0v2 | fv0v2 + fmul %st(0),%st(4) // scale | fv0v1 | fv1v1-fv0v1 | fv0v0 | + // (fv1v0-fv0v0)*scale | fv1v2-fv0v2 | fv0v2 + addl $12,%edi + fmul %st(0),%st(2) // scale | fv0v1 | (fv1v1-fv0v1)*scale | fv0v0 | + // (fv1v0-fv0v0)*scale | fv1v2-fv0v2 | fv0v2 + addl $12,%esi + addl $12,%edx + fmul %st(0),%st(5) // scale | fv0v1 | (fv1v1-fv0v1)*scale | fv0v0 | + // (fv1v0-fv0v0)*scale | (fv1v2-fv0v2)*scale | + // fv0v2 + fxch %st(3) // fv0v0 | fv0v1 | (fv1v1-fv0v1)*scale | scale | + // (fv1v0-fv0v0)*scale | (fv1v2-fv0v2)*scale | + // fv0v2 + faddp %st(0),%st(4) // fv0v1 | (fv1v1-fv0v1)*scale | scale | + // fv0v0+(fv1v0-fv0v0)*scale | + // (fv1v2-fv0v2)*scale | fv0v2 + faddp %st(0),%st(1) // fv0v1+(fv1v1-fv0v1)*scale | scale | + // fv0v0+(fv1v0-fv0v0)*scale | + // (fv1v2-fv0v2)*scale | fv0v2 + fxch %st(4) // fv0v2 | scale | fv0v0+(fv1v0-fv0v0)*scale | + // (fv1v2-fv0v2)*scale | fv0v1+(fv1v1-fv0v1)*scale + faddp %st(0),%st(3) // scale | fv0v0+(fv1v0-fv0v0)*scale | + // fv0v2+(fv1v2-fv0v2)*scale | + // fv0v1+(fv1v1-fv0v1)*scale + fxch %st(1) // fv0v0+(fv1v0-fv0v0)*scale | scale | + // fv0v2+(fv1v2-fv0v2)*scale | + // fv0v1+(fv1v1-fv0v1)*scale + fadds float_point5 + fxch %st(3) // fv0v1+(fv1v1-fv0v1)*scale | scale | + // fv0v2+(fv1v2-fv0v2)*scale | + // fv0v0+(fv1v0-fv0v0)*scale + fadds float_point5 + fxch %st(2) // fv0v2+(fv1v2-fv0v2)*scale | scale | + // fv0v1+(fv1v1-fv0v1)*scale | + // fv0v0+(fv1v0-fv0v0)*scale + fadds float_point5 + fxch %st(3) // fv0v0+(fv1v0-fv0v0)*scale | scale | + // fv0v1+(fv1v1-fv0v1)*scale | + // fv0v2+(fv1v2-fv0v2)*scale + fistpl fv_v+0-12(%edx) // scale | fv0v1+(fv1v1-fv0v1)*scale | + // fv0v2+(fv1v2-fv0v2)*scale + fxch %st(1) // fv0v1+(fv1v1-fv0v1)*scale | scale | + // fv0v2+(fv1v2-fv0v2)*scale | scale + fistpl 
fv_v+4-12(%edx) // scale | fv0v2+(fv1v2-fv0v2)*scale + fxch %st(1) // fv0v2+(fv1v2-fv0v2)*sc | scale + fistpl fv_v+8-12(%edx) // scale + + decl %eax + jnz LDo3Forward + + fstp %st(0) + + popl %edi + popl %esi + + ret + + +.globl C(R_Alias_clip_top) +C(R_Alias_clip_top): + pushl %esi + pushl %edi + + movl pfv0(%esp),%esi + movl pfv1(%esp),%edi + + movl C(r_refdef)+rd_aliasvrect+4,%eax + jmp LDoForwardOrBackward + + + +.globl C(R_Alias_clip_right) +C(R_Alias_clip_right): + pushl %esi + pushl %edi + + movl pfv0(%esp),%esi + movl pfv1(%esp),%edi + + movl C(r_refdef)+rd_aliasvrectright,%eax + +LRightLeftEntry: + + + movl fv_v+4(%esi),%edx + movl fv_v+4(%edi),%ecx + + cmpl %ecx,%edx + movl fv_v+0(%esi),%edx + + movl fv_v+0(%edi),%ecx + jl LDoForward2 + + movl fv_v+0(%esi),%ecx + movl fv_v+0(%edi),%edx + movl pfv0(%esp),%edi + movl pfv1(%esp),%esi + +LDoForward2: + + jmp LDoForward + + +.globl C(R_Alias_clip_left) +C(R_Alias_clip_left): + pushl %esi + pushl %edi + + movl pfv0(%esp),%esi + movl pfv1(%esp),%edi + + movl C(r_refdef)+rd_aliasvrect+0,%eax + jmp LRightLeftEntry + + +#endif // id386 + diff --git a/source/r_aliasa.S b/source/r_aliasa.S new file mode 100644 index 0000000..123cde2 --- /dev/null +++ b/source/r_aliasa.S @@ -0,0 +1,237 @@ +/* +Copyright (C) 1996-1997 Id Software, Inc. + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + +See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +*/ +// +// r_aliasa.s +// x86 assembly-language Alias model transform and project code. 
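
One note before the transform file below: the four R_Alias_clip_* entry points above all funnel into the same interpolation kernel (LDo3Forward, executed twice to cover the six finalvert_t components). For the top/bottom clips its C shape is roughly the sketch below (the left/right entries compare v[0] instead of v[1]); the swap at LDoForward guarantees pfv0 is the vertex on the in-bounds side, and the + 0.5 mirrors the fadds float_point5 rounding:

	static void alias_clip_interp (finalvert_t *pfv0, finalvert_t *pfv1,
		finalvert_t *out, int bound)
	{
		float	scale;
		int	i;

		// interpolation parameter along the edge: 0 at pfv0, 1 at pfv1
		scale = (float)(bound - pfv0->v[1]) /
				(float)(pfv1->v[1] - pfv0->v[1]);

		for (i=0 ; i<6 ; i++)	// x, y, z, s, t, light
		{
			out->v[i] = (int)(pfv0->v[i] +
					(pfv1->v[i] - pfv0->v[i]) * scale + 0.5);
		}
	}
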
+// + +#include "asm_i386.h" +#include "quakeasm.h" +#include "asm_draw.h" +#include "d_ifacea.h" + +#if id386 + + .data + +Lfloat_1: .single 1.0 +Ltemp: .long 0 +Lcoords: .long 0, 0, 0 + + .text + +#define fv 12+4 +#define pstverts 12+8 + +.globl C(R_AliasTransformAndProjectFinalVerts) +C(R_AliasTransformAndProjectFinalVerts): + pushl %ebp // preserve caller's stack frame + pushl %edi + pushl %esi // preserve register variables + +// int i, temp; +// float lightcos, *plightnormal, zi; +// trivertx_t *pverts; + +// pverts = r_apverts; + movl C(r_apverts),%esi + +// for (i=0 ; iv, aliastransform[2]) + +// aliastransform[2][3]); + movb (%esi),%dl + movb %dl,Lcoords + fildl Lcoords // v[0] + movb 1(%esi),%dl + movb %dl,Lcoords+4 + fildl Lcoords+4 // v[1] | v[0] + movb 2(%esi),%dl + movb %dl,Lcoords+8 + fildl Lcoords+8 // v[2] | v[1] | v[0] + + fld %st(2) // v[0] | v[2] | v[1] | v[0] + fmuls C(aliastransform)+32 // accum | v[2] | v[1] | v[0] + fld %st(2) // v[1] | accum | v[2] | v[1] | v[0] + fmuls C(aliastransform)+36 // accum2 | accum | v[2] | v[1] | v[0] + fxch %st(1) // accum | accum2 | v[2] | v[1] | v[0] + fadds C(aliastransform)+44 // accum | accum2 | v[2] | v[1] | v[0] + fld %st(2) // v[2] | accum | accum2 | v[2] | v[1] | v[0] + fmuls C(aliastransform)+40 // accum3 | accum | accum2 | v[2] | v[1] | + // v[0] + fxch %st(1) // accum | accum3 | accum2 | v[2] | v[1] | v[0] + faddp %st(0),%st(2) // accum3 | accum | v[2] | v[1] | v[0] + movb tv_lightnormalindex(%esi),%dl + movl stv_s(%ebp),%eax + movl %eax,fv_v+8(%edi) + faddp %st(0),%st(1) // z | v[2] | v[1] | v[0] + + movl stv_t(%ebp),%eax + movl %eax,fv_v+12(%edi) + +// // lighting +// plightnormal = r_avertexnormals[pverts->lightnormalindex]; + + fdivrs Lfloat_1 // zi | v[2] | v[1] | v[0] + +// fv->v[2] = pstverts->s; +// fv->v[3] = pstverts->t; +// fv->flags = pstverts->onseam; + movl stv_onseam(%ebp),%eax + movl %eax,fv_flags(%edi) + + movl fv_size(%edi),%eax + movl stv_size(%ebp),%eax + movl 4(%esi),%eax + + leal (%edx,%edx,2),%eax // index*3 + + fxch %st(3) // v[0] | v[2] | v[1] | zi + +// lightcos = DotProduct (plightnormal, r_plightvec); + flds C(r_avertexnormals)(,%eax,4) + fmuls C(r_plightvec) + flds C(r_avertexnormals)+4(,%eax,4) + fmuls C(r_plightvec)+4 + flds C(r_avertexnormals)+8(,%eax,4) + fmuls C(r_plightvec)+8 + fxch %st(1) + faddp %st(0),%st(2) + fld %st(2) // v[0] | laccum | laccum2 | v[0] | v[2] | + // v[1] | zi + fmuls C(aliastransform)+0 // xaccum | laccum | laccum2 | v[0] | v[2] | + // v[1] | zi + fxch %st(2) // laccum2 | laccum | xaccum | v[0] | v[2] | + // v[1] | zi + faddp %st(0),%st(1) // laccum | xaccum | v[0] | v[2] | v[1] | zi + +// temp = r_ambientlight; +// if (lightcos < 0) +// { + fsts Ltemp + movl C(r_ambientlight),%eax + movb Ltemp+3,%dl + testb $0x80,%dl + jz Lsavelight // no need to clamp if only ambient lit, because + // r_ambientlight is preclamped + +// temp += (int)(r_shadelight * lightcos); + fmuls C(r_shadelight) +// FIXME: fast float->int conversion? 
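
That FIXME is the classic x87 complaint: a C (int) cast must truncate toward zero, so compilers of this era bracket every cast with control-word saves and reloads, while the fistpl below just converts in whatever rounding mode is already loaded. A GCC-style sketch of the direct conversion (illustrative only, not code from this tree):

	static inline int fast_ftol (float f)
	{
		int	i;

		// one instruction, current rounding mode, no control-word traffic
		__asm__ ("fistpl %0" : "=m" (i) : "t" (f) : "st");

		return i;
	}
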
+ fistpl Ltemp + addl Ltemp,%eax + +// // clamp; because we limited the minimum ambient and shading light, we +// // don't have to clamp low light, just bright +// if (temp < 0) +// temp = 0; + jns Lp1 + subl %eax,%eax + +// } + +Lp1: + +// fv->v[4] = temp; +// +// // x, y, and z are scaled down by 1/2**31 in the transform, so 1/z is +// // scaled up by 1/2**31, and the scaling cancels out for x and y in the +// // projection +// fv->v[0] = ((DotProduct(pverts->v, aliastransform[0]) + +// aliastransform[0][3]) * zi) + aliasxcenter; +// fv->v[1] = ((DotProduct(pverts->v, aliastransform[1]) + +// aliastransform[1][3]) * zi) + aliasycenter; +// fv->v[5] = zi; + fxch %st(1) // v[0] | xaccum | v[2] | v[1] | zi + fmuls C(aliastransform)+16 // yaccum | xaccum | v[2] | v[1] | zi + fxch %st(3) // v[1] | xaccum | v[2] | yaccum | zi + fld %st(0) // v[1] | v[1] | xaccum | v[2] | yaccum | zi + fmuls C(aliastransform)+4 // xaccum2 | v[1] | xaccum | v[2] | yaccum |zi + fxch %st(1) // v[1] | xaccum2 | xaccum | v[2] | yaccum |zi + movl %eax,fv_v+16(%edi) + fmuls C(aliastransform)+20 // yaccum2 | xaccum2 | xaccum | v[2] | yaccum| + // zi + fxch %st(2) // xaccum | xaccum2 | yaccum2 | v[2] | yaccum| + // zi + fadds C(aliastransform)+12 // xaccum | xaccum2 | yaccum2 | v[2] | yaccum| + // zi + fxch %st(4) // yaccum | xaccum2 | yaccum2 | v[2] | xaccum| + // zi + fadds C(aliastransform)+28 // yaccum | xaccum2 | yaccum2 | v[2] | xaccum| + // zi + fxch %st(3) // v[2] | xaccum2 | yaccum2 | yaccum | xaccum| + // zi + fld %st(0) // v[2] | v[2] | xaccum2 | yaccum2 | yaccum | + // xaccum | zi + fmuls C(aliastransform)+8 // xaccum3 | v[2] | xaccum2 | yaccum2 |yaccum| + // xaccum | zi + fxch %st(1) // v[2] | xaccum3 | xaccum2 | yaccum2 |yaccum| + // xaccum | zi + fmuls C(aliastransform)+24 // yaccum3 | xaccum3 | xaccum2 | yaccum2 | + // yaccum | xaccum | zi + fxch %st(5) // xaccum | xaccum3 | xaccum2 | yaccum2 | + // yaccum | yaccum3 | zi + faddp %st(0),%st(2) // xaccum3 | xaccum | yaccum2 | yaccum | + // yaccum3 | zi + fxch %st(3) // yaccum | xaccum | yaccum2 | xaccum3 | + // yaccum3 | zi + faddp %st(0),%st(2) // xaccum | yaccum | xaccum3 | yaccum3 | zi + addl $(tv_size),%esi + faddp %st(0),%st(2) // yaccum | x | yaccum3 | zi + faddp %st(0),%st(2) // x | y | zi + addl $(stv_size),%ebp + fmul %st(2),%st(0) // x/z | y | zi + fxch %st(1) // y | x/z | zi + fmul %st(2),%st(0) // y/z | x/z | zi + fxch %st(1) // x/z | y/z | zi + fadds C(aliasxcenter) // u | y/z | zi + fxch %st(1) // y/z | u | zi + fadds C(aliasycenter) // v | u | zi + fxch %st(2) // zi | u | v +// FIXME: fast float->int conversion? + fistpl fv_v+20(%edi) // u | v + fistpl fv_v+0(%edi) // v + fistpl fv_v+4(%edi) + +// } + + addl $(fv_size),%edi + decl %ecx + jnz Lloop + + popl %esi // restore register variables + popl %edi + popl %ebp // restore the caller's stack frame + ret + +Lsavelight: + fstp %st(0) + jmp Lp1 + +#endif // id386 + diff --git a/source/r_drawa.S b/source/r_drawa.S new file mode 100644 index 0000000..60874e8 --- /dev/null +++ b/source/r_drawa.S @@ -0,0 +1,838 @@ +/* +Copyright (C) 1996-1997 Id Software, Inc. + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ +See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +*/ +// +// r_drawa.s +// x86 assembly-language edge clipping and emission code +// + +#include "asm_i386.h" +#include "quakeasm.h" +#include "asm_draw.h" +#include "d_ifacea.h" + +#if id386 + +// !!! if these are changed, they must be changed in r_draw.c too !!! +#define FULLY_CLIPPED_CACHED 0x80000000 +#define FRAMECOUNT_MASK 0x7FFFFFFF + + .data + +Ld0: .single 0.0 +Ld1: .single 0.0 +Lstack: .long 0 +Lfp_near_clip: .single NEAR_CLIP +Lceilv0: .long 0 +Lv: .long 0 +Lu0: .long 0 +Lv0: .long 0 +Lzi0: .long 0 + + .text + +//---------------------------------------------------------------------- +// edge clipping code +//---------------------------------------------------------------------- + +#define pv0 4+12 +#define pv1 8+12 +#define clip 12+12 + + .align 4 +.globl C(R_ClipEdge) +C(R_ClipEdge): + pushl %esi // preserve register variables + pushl %edi + pushl %ebx + movl %esp,Lstack // for clearing the stack later + +// float d0, d1, f; +// mvertex_t clipvert; + + movl clip(%esp),%ebx + movl pv0(%esp),%esi + movl pv1(%esp),%edx + +// if (clip) +// { + testl %ebx,%ebx + jz Lemit + +// do +// { + +Lcliploop: + +// d0 = DotProduct (pv0->position, clip->normal) - clip->dist; +// d1 = DotProduct (pv1->position, clip->normal) - clip->dist; + flds mv_position+0(%esi) + fmuls cp_normal+0(%ebx) + flds mv_position+4(%esi) + fmuls cp_normal+4(%ebx) + flds mv_position+8(%esi) + fmuls cp_normal+8(%ebx) + fxch %st(1) + faddp %st(0),%st(2) // d0mul2 | d0add0 + + flds mv_position+0(%edx) + fmuls cp_normal+0(%ebx) + flds mv_position+4(%edx) + fmuls cp_normal+4(%ebx) + flds mv_position+8(%edx) + fmuls cp_normal+8(%ebx) + fxch %st(1) + faddp %st(0),%st(2) // d1mul2 | d1add0 | d0mul2 | d0add0 + fxch %st(3) // d0add0 | d1add0 | d0mul2 | d1mul2 + + faddp %st(0),%st(2) // d1add0 | dot0 | d1mul2 + faddp %st(0),%st(2) // dot0 | dot1 + + fsubs cp_dist(%ebx) // d0 | dot1 + fxch %st(1) // dot1 | d0 + fsubs cp_dist(%ebx) // d1 | d0 + fxch %st(1) + fstps Ld0 + fstps Ld1 + +// if (d0 >= 0) +// { + movl Ld0,%eax + movl Ld1,%ecx + orl %eax,%ecx + js Lp2 + +// both points are unclipped + +Lcontinue: + +// +// R_ClipEdge (&clipvert, pv1, clip->next); +// return; +// } +// } while ((clip = clip->next) != NULL); + movl cp_next(%ebx),%ebx + testl %ebx,%ebx + jnz Lcliploop + +// } + +//// add the edge +// R_EmitEdge (pv0, pv1); +Lemit: + +// +// set integer rounding to ceil mode, set to single precision +// +// FIXME: do away with by manually extracting integers from floats? 
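
For reference, ceil_cw is a prebuilt x87 control word whose rounding-control field is forced to round-toward-+infinity, so the fistl/fistpl conversions that follow compute ceil() for free; single_cw, reloaded later in the sequence, goes back to round-to-nearest at single precision. In C terms the two values look like this sketch (assuming the standard x87 control-word layout; the engine builds them once at startup):

	static unsigned short	ceil_cw, single_cw;

	static void build_fp_control_words (void)
	{
		unsigned short	cw;

		__asm__ ("fnstcw %0" : "=m" (cw));	// current control word

		single_cw = cw & ~0x0300;		// PC=00: single precision
		ceil_cw = (single_cw & ~0x0C00) | 0x0800; // RC=10: round up

		// fldcw ceil_cw   -> every fist* now behaves as ceil()
		// fldcw single_cw -> back to round-to-nearest
	}

The sibling instruction fnstsw, used throughout the clamping code further down, stores the status word instead, putting the compare bits C0/C2/C3 into AH where testb can branch on them.
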
+// FIXME: set less often + fldcw ceil_cw + +// edge_t *edge, *pcheck; +// int u_check; +// float u, u_step; +// vec3_t local, transformed; +// float *world; +// int v, v2, ceilv0; +// float scale, lzi0, u0, v0; +// int side; + +// if (r_lastvertvalid) +// { + cmpl $0,C(r_lastvertvalid) + jz LCalcFirst + +// u0 = r_u1; +// v0 = r_v1; +// lzi0 = r_lzi1; +// ceilv0 = r_ceilv1; + movl C(r_lzi1),%eax + movl C(r_u1),%ecx + movl %eax,Lzi0 + movl %ecx,Lu0 + movl C(r_v1),%ecx + movl C(r_ceilv1),%eax + movl %ecx,Lv0 + movl %eax,Lceilv0 + jmp LCalcSecond + +// } + +LCalcFirst: + +// else +// { +// world = &pv0->position[0]; + + call LTransformAndProject // v0 | lzi0 | u0 + + fsts Lv0 + fxch %st(2) // u0 | lzi0 | v0 + fstps Lu0 // lzi0 | v0 + fstps Lzi0 // v0 + +// ceilv0 = (int)(v0 - 2000) + 2000; // ceil(v0); + fistpl Lceilv0 + +// } + +LCalcSecond: + +// world = &pv1->position[0]; + movl %edx,%esi + + call LTransformAndProject // v1 | lzi1 | u1 + + flds Lu0 // u0 | v1 | lzi1 | u1 + fxch %st(3) // u1 | v1 | lzi1 | u0 + flds Lzi0 // lzi0 | u1 | v1 | lzi1 | u0 + fxch %st(3) // lzi1 | u1 | v1 | lzi0 | u0 + flds Lv0 // v0 | lzi1 | u1 | v1 | lzi0 | u0 + fxch %st(3) // v1 | lzi1 | u1 | v0 | lzi0 | u0 + +// r_ceilv1 = (int)(r_v1 - 2000) + 2000; // ceil(r_v1); + fistl C(r_ceilv1) + + fldcw single_cw // put back normal floating-point state + + fsts C(r_v1) + fxch %st(4) // lzi0 | lzi1 | u1 | v0 | v1 | u0 + +// if (r_lzi1 > lzi0) +// lzi0 = r_lzi1; + fcom %st(1) + fnstsw %ax + testb $1,%ah + jz LP0 + fstp %st(0) + fld %st(0) +LP0: + + fxch %st(1) // lzi1 | lzi0 | u1 | v0 | v1 | u0 + fstps C(r_lzi1) // lzi0 | u1 | v0 | v1 | u0 + fxch %st(1) + fsts C(r_u1) + fxch %st(1) + +// if (lzi0 > r_nearzi) // for mipmap finding +// r_nearzi = lzi0; + fcoms C(r_nearzi) + fnstsw %ax + testb $0x45,%ah + jnz LP1 + fsts C(r_nearzi) +LP1: + +// // for right edges, all we want is the effect on 1/z +// if (r_nearzionly) +// return; + movl C(r_nearzionly),%eax + testl %eax,%eax + jz LP2 +LPop5AndDone: + movl C(cacheoffset),%eax + movl C(r_framecount),%edx + cmpl $0x7FFFFFFF,%eax + jz LDoPop + andl $(FRAMECOUNT_MASK),%edx + orl $(FULLY_CLIPPED_CACHED),%edx + movl %edx,C(cacheoffset) + +LDoPop: + fstp %st(0) // u1 | v0 | v1 | u0 + fstp %st(0) // v0 | v1 | u0 + fstp %st(0) // v1 | u0 + fstp %st(0) // u0 + fstp %st(0) + jmp Ldone + +LP2: + +// // create the edge +// if (ceilv0 == r_ceilv1) +// return; // horizontal edge + movl Lceilv0,%ebx + movl C(edge_p),%edi + movl C(r_ceilv1),%ecx + movl %edi,%edx + movl C(r_pedge),%esi + addl $(et_size),%edx + cmpl %ecx,%ebx + jz LPop5AndDone + + movl C(r_pedge),%eax + movl %eax,et_owner(%edi) + +// side = ceilv0 > r_ceilv1; +// +// edge->nearzi = lzi0; + fstps et_nearzi(%edi) // u1 | v0 | v1 | u0 + +// if (side == 1) +// { + jc LSide0 + +LSide1: + +// // leading edge (go from p2 to p1) + +// u_step = ((u0 - r_u1) / (v0 - r_v1)); + fsubrp %st(0),%st(3) // v0 | v1 | u0-u1 + fsub %st(1),%st(0) // v0-v1 | v1 | u0-u1 + fdivrp %st(0),%st(2) // v1 | ustep + +// r_emitted = 1; + movl $1,C(r_emitted) + +// edge = edge_p++; + movl %edx,C(edge_p) + +// pretouch next edge + movl (%edx),%eax + +// v2 = ceilv0 - 1; +// v = r_ceilv1; + movl %ecx,%eax + leal -1(%ebx),%ecx + movl %eax,%ebx + +// edge->surfs[0] = 0; +// edge->surfs[1] = surface_p - surfaces; + movl C(surface_p),%eax + movl C(surfaces),%esi + subl %edx,%edx + subl %esi,%eax + shrl $(SURF_T_SHIFT),%eax + movl %edx,et_surfs(%edi) + movl %eax,et_surfs+2(%edi) + + subl %esi,%esi + +// u = r_u1 + ((float)v - r_v1) * u_step; + movl %ebx,Lv + fildl Lv 
// v | v1 | ustep + fsubp %st(0),%st(1) // v-v1 | ustep + fmul %st(1),%st(0) // (v-v1)*ustep | ustep + fadds C(r_u1) // u | ustep + + jmp LSideDone + +// } + +LSide0: + +// else +// { +// // trailing edge (go from p1 to p2) + +// u_step = ((r_u1 - u0) / (r_v1 - v0)); + fsub %st(3),%st(0) // u1-u0 | v0 | v1 | u0 + fxch %st(2) // v1 | v0 | u1-u0 | u0 + fsub %st(1),%st(0) // v1-v0 | v0 | u1-u0 | u0 + fdivrp %st(0),%st(2) // v0 | ustep | u0 + +// r_emitted = 1; + movl $1,C(r_emitted) + +// edge = edge_p++; + movl %edx,C(edge_p) + +// pretouch next edge + movl (%edx),%eax + +// v = ceilv0; +// v2 = r_ceilv1 - 1; + decl %ecx + +// edge->surfs[0] = surface_p - surfaces; +// edge->surfs[1] = 0; + movl C(surface_p),%eax + movl C(surfaces),%esi + subl %edx,%edx + subl %esi,%eax + shrl $(SURF_T_SHIFT),%eax + movl %edx,et_surfs+2(%edi) + movl %eax,et_surfs(%edi) + + movl $1,%esi + +// u = u0 + ((float)v - v0) * u_step; + movl %ebx,Lv + fildl Lv // v | v0 | ustep | u0 + fsubp %st(0),%st(1) // v-v0 | ustep | u0 + fmul %st(1),%st(0) // (v-v0)*ustep | ustep | u0 + faddp %st(0),%st(2) // ustep | u + fxch %st(1) // u | ustep + +// } + +LSideDone: + +// edge->u_step = u_step*0x100000; +// edge->u = u*0x100000 + 0xFFFFF; + + fmuls fp_1m // u*0x100000 | ustep + fxch %st(1) // ustep | u*0x100000 + fmuls fp_1m // ustep*0x100000 | u*0x100000 + fxch %st(1) // u*0x100000 | ustep*0x100000 + fadds fp_1m_minus_1 // u*0x100000 + 0xFFFFF | ustep*0x100000 + fxch %st(1) // ustep*0x100000 | u*0x100000 + 0xFFFFF + fistpl et_u_step(%edi) // u*0x100000 + 0xFFFFF + fistpl et_u(%edi) + +// // we need to do this to avoid stepping off the edges if a very nearly +// // horizontal edge is less than epsilon above a scan, and numeric error +// // causes it to incorrectly extend to the scan, and the extension of the +// // line goes off the edge of the screen +// // FIXME: is this actually needed? 
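
A gloss on the fixed-point format here: edge->u and u_step are 12.20 (scaled by 0x100000), and 0xFFFFF is the largest pure fraction in that format. Baking it into edge->u once means the plain shift right by 20 done per scanline over in r_edgea.S lands on ceil(u), the leading-edge fill convention, with no branch. Folded into a single helper, the identity is (sketch, valid for the non-negative screen range):

	// u in 12.20 fixed point
	static inline int u_ceil_12_20 (int u)
	{
		return (u + 0xFFFFF) >> 20;	// truncation + bias == ceiling
	}
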
+// if (edge->u < r_refdef.vrect_x_adj_shift20) +// edge->u = r_refdef.vrect_x_adj_shift20; +// if (edge->u > r_refdef.vrectright_adj_shift20) +// edge->u = r_refdef.vrectright_adj_shift20; + movl et_u(%edi),%eax + movl C(r_refdef)+rd_vrect_x_adj_shift20,%edx + cmpl %edx,%eax + jl LP4 + movl C(r_refdef)+rd_vrectright_adj_shift20,%edx + cmpl %edx,%eax + jng LP5 +LP4: + movl %edx,et_u(%edi) + movl %edx,%eax +LP5: + +// // sort the edge in normally +// u_check = edge->u; +// +// if (edge->surfs[0]) +// u_check++; // sort trailers after leaders + addl %esi,%eax + +// if (!newedges[v] || newedges[v]->u >= u_check) +// { + movl C(newedges)(,%ebx,4),%esi + testl %esi,%esi + jz LDoFirst + cmpl %eax,et_u(%esi) + jl LNotFirst +LDoFirst: + +// edge->next = newedges[v]; +// newedges[v] = edge; + movl %esi,et_next(%edi) + movl %edi,C(newedges)(,%ebx,4) + + jmp LSetRemove + +// } + +LNotFirst: + +// else +// { +// pcheck = newedges[v]; +// +// while (pcheck->next && pcheck->next->u < u_check) +// pcheck = pcheck->next; +LFindInsertLoop: + movl %esi,%edx + movl et_next(%esi),%esi + testl %esi,%esi + jz LInsertFound + cmpl %eax,et_u(%esi) + jl LFindInsertLoop + +LInsertFound: + +// edge->next = pcheck->next; +// pcheck->next = edge; + movl %esi,et_next(%edi) + movl %edi,et_next(%edx) + +// } + +LSetRemove: + +// edge->nextremove = removeedges[v2]; +// removeedges[v2] = edge; + movl C(removeedges)(,%ecx,4),%eax + movl %edi,C(removeedges)(,%ecx,4) + movl %eax,et_nextremove(%edi) + +Ldone: + movl Lstack,%esp // clear temporary variables from stack + + popl %ebx // restore register variables + popl %edi + popl %esi + ret + +// at least one point is clipped + +Lp2: + testl %eax,%eax + jns Lp1 + +// else +// { +// // point 0 is clipped + +// if (d1 < 0) +// { + movl Ld1,%eax + testl %eax,%eax + jns Lp3 + +// // both points are clipped +// // we do cache fully clipped edges +// if (!leftclipped) + movl C(r_leftclipped),%eax + movl C(r_pedge),%ecx + testl %eax,%eax + jnz Ldone + +// r_pedge->framecount = r_framecount; + movl C(r_framecount),%eax + andl $(FRAMECOUNT_MASK),%eax + orl $(FULLY_CLIPPED_CACHED),%eax + movl %eax,C(cacheoffset) + +// return; + jmp Ldone + +// } + +Lp1: + +// // point 0 is unclipped +// if (d1 >= 0) +// { +// // both points are unclipped +// continue; + +// // only point 1 is clipped + +// f = d0 / (d0 - d1); + flds Ld0 + flds Ld1 + fsubr %st(1),%st(0) + +// // we don't cache partially clipped edges + movl $0x7FFFFFFF,C(cacheoffset) + + fdivrp %st(0),%st(1) + + subl $(mv_size),%esp // allocate space for clipvert + +// clipvert.position[0] = pv0->position[0] + +// f * (pv1->position[0] - pv0->position[0]); +// clipvert.position[1] = pv0->position[1] + +// f * (pv1->position[1] - pv0->position[1]); +// clipvert.position[2] = pv0->position[2] + +// f * (pv1->position[2] - pv0->position[2]); + flds mv_position+8(%edx) + fsubs mv_position+8(%esi) + flds mv_position+4(%edx) + fsubs mv_position+4(%esi) + flds mv_position+0(%edx) + fsubs mv_position+0(%esi) // 0 | 1 | 2 + +// replace pv1 with the clip point + movl %esp,%edx + movl cp_leftedge(%ebx),%eax + testb %al,%al + + fmul %st(3),%st(0) + fxch %st(1) // 1 | 0 | 2 + fmul %st(3),%st(0) + fxch %st(2) // 2 | 0 | 1 + fmulp %st(0),%st(3) // 0 | 1 | 2 + fadds mv_position+0(%esi) + fxch %st(1) // 1 | 0 | 2 + fadds mv_position+4(%esi) + fxch %st(2) // 2 | 0 | 1 + fadds mv_position+8(%esi) + fxch %st(1) // 0 | 2 | 1 + fstps mv_position+0(%esp) // 2 | 1 + fstps mv_position+8(%esp) // 1 + fstps mv_position+4(%esp) + +// if (clip->leftedge) +// { + jz 
Ltestright + +// r_leftclipped = true; +// r_leftexit = clipvert; + movl $1,C(r_leftclipped) + movl mv_position+0(%esp),%eax + movl %eax,C(r_leftexit)+mv_position+0 + movl mv_position+4(%esp),%eax + movl %eax,C(r_leftexit)+mv_position+4 + movl mv_position+8(%esp),%eax + movl %eax,C(r_leftexit)+mv_position+8 + + jmp Lcontinue + +// } + +Ltestright: +// else if (clip->rightedge) +// { + testb %ah,%ah + jz Lcontinue + +// r_rightclipped = true; +// r_rightexit = clipvert; + movl $1,C(r_rightclipped) + movl mv_position+0(%esp),%eax + movl %eax,C(r_rightexit)+mv_position+0 + movl mv_position+4(%esp),%eax + movl %eax,C(r_rightexit)+mv_position+4 + movl mv_position+8(%esp),%eax + movl %eax,C(r_rightexit)+mv_position+8 + +// } +// +// R_ClipEdge (pv0, &clipvert, clip->next); +// return; +// } + jmp Lcontinue + +// } + +Lp3: + +// // only point 0 is clipped +// r_lastvertvalid = false; + + movl $0,C(r_lastvertvalid) + +// f = d0 / (d0 - d1); + flds Ld0 + flds Ld1 + fsubr %st(1),%st(0) + +// // we don't cache partially clipped edges + movl $0x7FFFFFFF,C(cacheoffset) + + fdivrp %st(0),%st(1) + + subl $(mv_size),%esp // allocate space for clipvert + +// clipvert.position[0] = pv0->position[0] + +// f * (pv1->position[0] - pv0->position[0]); +// clipvert.position[1] = pv0->position[1] + +// f * (pv1->position[1] - pv0->position[1]); +// clipvert.position[2] = pv0->position[2] + +// f * (pv1->position[2] - pv0->position[2]); + flds mv_position+8(%edx) + fsubs mv_position+8(%esi) + flds mv_position+4(%edx) + fsubs mv_position+4(%esi) + flds mv_position+0(%edx) + fsubs mv_position+0(%esi) // 0 | 1 | 2 + + movl cp_leftedge(%ebx),%eax + testb %al,%al + + fmul %st(3),%st(0) + fxch %st(1) // 1 | 0 | 2 + fmul %st(3),%st(0) + fxch %st(2) // 2 | 0 | 1 + fmulp %st(0),%st(3) // 0 | 1 | 2 + fadds mv_position+0(%esi) + fxch %st(1) // 1 | 0 | 2 + fadds mv_position+4(%esi) + fxch %st(2) // 2 | 0 | 1 + fadds mv_position+8(%esi) + fxch %st(1) // 0 | 2 | 1 + fstps mv_position+0(%esp) // 2 | 1 + fstps mv_position+8(%esp) // 1 + fstps mv_position+4(%esp) + +// replace pv0 with the clip point + movl %esp,%esi + +// if (clip->leftedge) +// { + jz Ltestright2 + +// r_leftclipped = true; +// r_leftenter = clipvert; + movl $1,C(r_leftclipped) + movl mv_position+0(%esp),%eax + movl %eax,C(r_leftenter)+mv_position+0 + movl mv_position+4(%esp),%eax + movl %eax,C(r_leftenter)+mv_position+4 + movl mv_position+8(%esp),%eax + movl %eax,C(r_leftenter)+mv_position+8 + + jmp Lcontinue + +// } + +Ltestright2: +// else if (clip->rightedge) +// { + testb %ah,%ah + jz Lcontinue + +// r_rightclipped = true; +// r_rightenter = clipvert; + movl $1,C(r_rightclipped) + movl mv_position+0(%esp),%eax + movl %eax,C(r_rightenter)+mv_position+0 + movl mv_position+4(%esp),%eax + movl %eax,C(r_rightenter)+mv_position+4 + movl mv_position+8(%esp),%eax + movl %eax,C(r_rightenter)+mv_position+8 + +// } + jmp Lcontinue + +// %esi = vec3_t point to transform and project +// %edx preserved +LTransformAndProject: + +// // transform and project +// VectorSubtract (world, modelorg, local); + flds mv_position+0(%esi) + fsubs C(modelorg)+0 + flds mv_position+4(%esi) + fsubs C(modelorg)+4 + flds mv_position+8(%esi) + fsubs C(modelorg)+8 + fxch %st(2) // local[0] | local[1] | local[2] + +// TransformVector (local, transformed); +// +// if (transformed[2] < NEAR_CLIP) +// transformed[2] = NEAR_CLIP; +// +// lzi0 = 1.0 / transformed[2]; + fld %st(0) // local[0] | local[0] | local[1] | local[2] + fmuls C(vpn)+0 // zm0 | local[0] | local[1] | local[2] + fld %st(1) // 
local[0] | zm0 | local[0] | local[1] | + // local[2] + fmuls C(vright)+0 // xm0 | zm0 | local[0] | local[1] | local[2] + fxch %st(2) // local[0] | zm0 | xm0 | local[1] | local[2] + fmuls C(vup)+0 // ym0 | zm0 | xm0 | local[1] | local[2] + fld %st(3) // local[1] | ym0 | zm0 | xm0 | local[1] | + // local[2] + fmuls C(vpn)+4 // zm1 | ym0 | zm0 | xm0 | local[1] | + // local[2] + fld %st(4) // local[1] | zm1 | ym0 | zm0 | xm0 | + // local[1] | local[2] + fmuls C(vright)+4 // xm1 | zm1 | ym0 | zm0 | xm0 | + // local[1] | local[2] + fxch %st(5) // local[1] | zm1 | ym0 | zm0 | xm0 | + // xm1 | local[2] + fmuls C(vup)+4 // ym1 | zm1 | ym0 | zm0 | xm0 | + // xm1 | local[2] + fxch %st(1) // zm1 | ym1 | ym0 | zm0 | xm0 | + // xm1 | local[2] + faddp %st(0),%st(3) // ym1 | ym0 | zm2 | xm0 | xm1 | local[2] + fxch %st(3) // xm0 | ym0 | zm2 | ym1 | xm1 | local[2] + faddp %st(0),%st(4) // ym0 | zm2 | ym1 | xm2 | local[2] + faddp %st(0),%st(2) // zm2 | ym2 | xm2 | local[2] + fld %st(3) // local[2] | zm2 | ym2 | xm2 | local[2] + fmuls C(vpn)+8 // zm3 | zm2 | ym2 | xm2 | local[2] + fld %st(4) // local[2] | zm3 | zm2 | ym2 | xm2 | local[2] + fmuls C(vright)+8 // xm3 | zm3 | zm2 | ym2 | xm2 | local[2] + fxch %st(5) // local[2] | zm3 | zm2 | ym2 | xm2 | xm3 + fmuls C(vup)+8 // ym3 | zm3 | zm2 | ym2 | xm2 | xm3 + fxch %st(1) // zm3 | ym3 | zm2 | ym2 | xm2 | xm3 + faddp %st(0),%st(2) // ym3 | zm4 | ym2 | xm2 | xm3 + fxch %st(4) // xm3 | zm4 | ym2 | xm2 | ym3 + faddp %st(0),%st(3) // zm4 | ym2 | xm4 | ym3 + fxch %st(1) // ym2 | zm4 | xm4 | ym3 + faddp %st(0),%st(3) // zm4 | xm4 | ym4 + + fcoms Lfp_near_clip + fnstsw %ax + testb $1,%ah + jz LNoClip + fstp %st(0) + flds Lfp_near_clip + +LNoClip: + + fdivrs float_1 // lzi0 | x | y + fxch %st(1) // x | lzi0 | y + +// // FIXME: build x/yscale into transform? +// scale = xscale * lzi0; +// u0 = (xcenter + scale*transformed[0]); + flds C(xscale) // xscale | x | lzi0 | y + fmul %st(2),%st(0) // scale | x | lzi0 | y + fmulp %st(0),%st(1) // scale*x | lzi0 | y + fadds C(xcenter) // u0 | lzi0 | y + +// if (u0 < r_refdef.fvrectx_adj) +// u0 = r_refdef.fvrectx_adj; +// if (u0 > r_refdef.fvrectright_adj) +// u0 = r_refdef.fvrectright_adj; +// FIXME: use integer compares of floats? + fcoms C(r_refdef)+rd_fvrectx_adj + fnstsw %ax + testb $1,%ah + jz LClampP0 + fstp %st(0) + flds C(r_refdef)+rd_fvrectx_adj +LClampP0: + fcoms C(r_refdef)+rd_fvrectright_adj + fnstsw %ax + testb $0x45,%ah + jnz LClampP1 + fstp %st(0) + flds C(r_refdef)+rd_fvrectright_adj +LClampP1: + + fld %st(1) // lzi0 | u0 | lzi0 | y + +// scale = yscale * lzi0; +// v0 = (ycenter - scale*transformed[1]); + fmuls C(yscale) // scale | u0 | lzi0 | y + fmulp %st(0),%st(3) // u0 | lzi0 | scale*y + fxch %st(2) // scale*y | lzi0 | u0 + fsubrs C(ycenter) // v0 | lzi0 | u0 + +// if (v0 < r_refdef.fvrecty_adj) +// v0 = r_refdef.fvrecty_adj; +// if (v0 > r_refdef.fvrectbottom_adj) +// v0 = r_refdef.fvrectbottom_adj; +// FIXME: use integer compares of floats? + fcoms C(r_refdef)+rd_fvrecty_adj + fnstsw %ax + testb $1,%ah + jz LClampP2 + fstp %st(0) + flds C(r_refdef)+rd_fvrecty_adj +LClampP2: + fcoms C(r_refdef)+rd_fvrectbottom_adj + fnstsw %ax + testb $0x45,%ah + jnz LClampP3 + fstp %st(0) + flds C(r_refdef)+rd_fvrectbottom_adj +LClampP3: + ret + +#endif // id386 + diff --git a/source/r_edgea.S b/source/r_edgea.S new file mode 100644 index 0000000..8507a5b --- /dev/null +++ b/source/r_edgea.S @@ -0,0 +1,750 @@ +/* +Copyright (C) 1996-1997 Id Software, Inc. 
+ +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + +See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +*/ +// +// r_edgea.s +// x86 assembly-language edge-processing code. +// + +#include "asm_i386.h" +#include "quakeasm.h" +#include "asm_draw.h" + +#if id386 + + .data +Ltemp: .long 0 +float_1_div_0100000h: .long 0x35800000 // 1.0/(float)0x100000 +float_point_999: .single 0.999 +float_1_point_001: .single 1.001 + + .text + +//-------------------------------------------------------------------- + +#define edgestoadd 4+8 // note odd stack offsets because of interleaving +#define edgelist 8+12 // with pushes + +.globl C(R_EdgeCodeStart) +C(R_EdgeCodeStart): + +.globl C(R_InsertNewEdges) +C(R_InsertNewEdges): + pushl %edi + pushl %esi // preserve register variables + movl edgestoadd(%esp),%edx + pushl %ebx + movl edgelist(%esp),%ecx + +LDoNextEdge: + movl et_u(%edx),%eax + movl %edx,%edi + +LContinueSearch: + movl et_u(%ecx),%ebx + movl et_next(%ecx),%esi + cmpl %ebx,%eax + jle LAddedge + movl et_u(%esi),%ebx + movl et_next(%esi),%ecx + cmpl %ebx,%eax + jle LAddedge2 + movl et_u(%ecx),%ebx + movl et_next(%ecx),%esi + cmpl %ebx,%eax + jle LAddedge + movl et_u(%esi),%ebx + movl et_next(%esi),%ecx + cmpl %ebx,%eax + jg LContinueSearch + +LAddedge2: + movl et_next(%edx),%edx + movl et_prev(%esi),%ebx + movl %esi,et_next(%edi) + movl %ebx,et_prev(%edi) + movl %edi,et_next(%ebx) + movl %edi,et_prev(%esi) + movl %esi,%ecx + + cmpl $0,%edx + jnz LDoNextEdge + jmp LDone + + .align 4 +LAddedge: + movl et_next(%edx),%edx + movl et_prev(%ecx),%ebx + movl %ecx,et_next(%edi) + movl %ebx,et_prev(%edi) + movl %edi,et_next(%ebx) + movl %edi,et_prev(%ecx) + + cmpl $0,%edx + jnz LDoNextEdge + +LDone: + popl %ebx // restore register variables + popl %esi + popl %edi + + ret + +//-------------------------------------------------------------------- + +#define predge 4+4 + +.globl C(R_RemoveEdges) +C(R_RemoveEdges): + pushl %ebx + movl predge(%esp),%eax + +Lre_loop: + movl et_next(%eax),%ecx + movl et_nextremove(%eax),%ebx + movl et_prev(%eax),%edx + testl %ebx,%ebx + movl %edx,et_prev(%ecx) + jz Lre_done + movl %ecx,et_next(%edx) + + movl et_next(%ebx),%ecx + movl et_prev(%ebx),%edx + movl et_nextremove(%ebx),%eax + movl %edx,et_prev(%ecx) + testl %eax,%eax + movl %ecx,et_next(%edx) + jnz Lre_loop + + popl %ebx + ret + +Lre_done: + movl %ecx,et_next(%edx) + popl %ebx + + ret + +//-------------------------------------------------------------------- + +#define pedgelist 4+4 // note odd stack offset because of interleaving + // with pushes + +.globl C(R_StepActiveU) +C(R_StepActiveU): + pushl %edi + movl pedgelist(%esp),%edx + pushl %esi // preserve register variables + pushl %ebx + + movl et_prev(%edx),%esi + +LNewEdge: + movl et_u(%esi),%edi + +LNextEdge: + movl et_u(%edx),%eax + movl et_u_step(%edx),%ebx + addl %ebx,%eax + movl et_next(%edx),%esi + movl %eax,et_u(%edx) + cmpl %edi,%eax + jl LPushBack + + movl et_u(%esi),%edi + movl 
et_u_step(%esi),%ebx + addl %ebx,%edi + movl et_next(%esi),%edx + movl %edi,et_u(%esi) + cmpl %eax,%edi + jl LPushBack2 + + movl et_u(%edx),%eax + movl et_u_step(%edx),%ebx + addl %ebx,%eax + movl et_next(%edx),%esi + movl %eax,et_u(%edx) + cmpl %edi,%eax + jl LPushBack + + movl et_u(%esi),%edi + movl et_u_step(%esi),%ebx + addl %ebx,%edi + movl et_next(%esi),%edx + movl %edi,et_u(%esi) + cmpl %eax,%edi + jnl LNextEdge + +LPushBack2: + movl %edx,%ebx + movl %edi,%eax + movl %esi,%edx + movl %ebx,%esi + +LPushBack: +// push it back to keep it sorted + movl et_prev(%edx),%ecx + movl et_next(%edx),%ebx + +// done if the -1 in edge_aftertail triggered this + cmpl $(C(edge_aftertail)),%edx + jz LUDone + +// pull the edge out of the edge list + movl et_prev(%ecx),%edi + movl %ecx,et_prev(%esi) + movl %ebx,et_next(%ecx) + +// find out where the edge goes in the edge list +LPushBackLoop: + movl et_prev(%edi),%ecx + movl et_u(%edi),%ebx + cmpl %ebx,%eax + jnl LPushBackFound + + movl et_prev(%ecx),%edi + movl et_u(%ecx),%ebx + cmpl %ebx,%eax + jl LPushBackLoop + + movl %ecx,%edi + +// put the edge back into the edge list +LPushBackFound: + movl et_next(%edi),%ebx + movl %edi,et_prev(%edx) + movl %ebx,et_next(%edx) + movl %edx,et_next(%edi) + movl %edx,et_prev(%ebx) + + movl %esi,%edx + movl et_prev(%esi),%esi + + cmpl $(C(edge_tail)),%edx + jnz LNewEdge + +LUDone: + popl %ebx // restore register variables + popl %esi + popl %edi + + ret + +//-------------------------------------------------------------------- + +#define surf 4 // note this is loaded before any pushes + + .align 4 +TrailingEdge: + movl st_spanstate(%esi),%eax // check for edge inversion + decl %eax + jnz LInverted + + movl %eax,st_spanstate(%esi) + movl st_insubmodel(%esi),%ecx + movl 0x12345678,%edx // surfaces[1].st_next +LPatch0: + movl C(r_bmodelactive),%eax + subl %ecx,%eax + cmpl %esi,%edx + movl %eax,C(r_bmodelactive) + jnz LNoEmit // surface isn't on top, just remove + +// emit a span (current top going away) + movl et_u(%ebx),%eax + shrl $20,%eax // iu = integral pixel u + movl st_last_u(%esi),%edx + movl st_next(%esi),%ecx + cmpl %edx,%eax + jle LNoEmit2 // iu <= surf->last_u, so nothing to emit + + movl %eax,st_last_u(%ecx) // surf->next->last_u = iu; + subl %edx,%eax + movl %edx,espan_t_u(%ebp) // span->u = surf->last_u; + + movl %eax,espan_t_count(%ebp) // span->count = iu - span->u; + movl C(current_iv),%eax + movl %eax,espan_t_v(%ebp) // span->v = current_iv; + movl st_spans(%esi),%eax + movl %eax,espan_t_pnext(%ebp) // span->pnext = surf->spans; + movl %ebp,st_spans(%esi) // surf->spans = span; + addl $(espan_t_size),%ebp + + movl st_next(%esi),%edx // remove the surface from the surface + movl st_prev(%esi),%esi // stack + + movl %edx,st_next(%esi) + movl %esi,st_prev(%edx) + ret + +LNoEmit2: + movl %eax,st_last_u(%ecx) // surf->next->last_u = iu; + movl st_next(%esi),%edx // remove the surface from the surface + movl st_prev(%esi),%esi // stack + + movl %edx,st_next(%esi) + movl %esi,st_prev(%edx) + ret + +LNoEmit: + movl st_next(%esi),%edx // remove the surface from the surface + movl st_prev(%esi),%esi // stack + + movl %edx,st_next(%esi) + movl %esi,st_prev(%edx) + ret + +LInverted: + movl %eax,st_spanstate(%esi) + ret + +//-------------------------------------------------------------------- + +// trailing edge only +Lgs_trailing: + pushl $Lgs_nextedge + jmp TrailingEdge + + +.globl C(R_GenerateSpans) +C(R_GenerateSpans): + pushl %ebp // preserve caller's stack frame + pushl %edi + pushl %esi // preserve register 
variables + pushl %ebx + +// clear active surfaces to just the background surface + movl C(surfaces),%eax + movl C(edge_head_u_shift20),%edx + addl $(st_size),%eax +// %ebp = span_p throughout + movl C(span_p),%ebp + + movl $0,C(r_bmodelactive) + + movl %eax,st_next(%eax) + movl %eax,st_prev(%eax) + movl %edx,st_last_u(%eax) + movl C(edge_head)+et_next,%ebx // edge=edge_head.next + +// generate spans + cmpl $(C(edge_tail)),%ebx // done if empty list + jz Lgs_lastspan + +Lgs_edgeloop: + + movl et_surfs(%ebx),%edi + movl C(surfaces),%eax + movl %edi,%esi + andl $0xFFFF0000,%edi + andl $0xFFFF,%esi + jz Lgs_leading // not a trailing edge + +// it has a left surface, so a surface is going away for this span + shll $(SURF_T_SHIFT),%esi + addl %eax,%esi + testl %edi,%edi + jz Lgs_trailing + +// both leading and trailing + call TrailingEdge + movl C(surfaces),%eax + +// --------------------------------------------------------------- +// handle a leading edge +// --------------------------------------------------------------- + +Lgs_leading: + shrl $16-SURF_T_SHIFT,%edi + movl C(surfaces),%eax + addl %eax,%edi + movl 0x12345678,%esi // surf2 = surfaces[1].next; +LPatch2: + movl st_spanstate(%edi),%edx + movl st_insubmodel(%edi),%eax + testl %eax,%eax + jnz Lbmodel_leading + +// handle a leading non-bmodel edge + +// don't start a span if this is an inverted span, with the end edge preceding +// the start edge (that is, we've already seen the end edge) + testl %edx,%edx + jnz Lxl_done + + +// if (surf->key < surf2->key) +// goto newtop; + incl %edx + movl st_key(%edi),%eax + movl %edx,st_spanstate(%edi) + movl st_key(%esi),%ecx + cmpl %ecx,%eax + jl Lnewtop + +// main sorting loop to search through surface stack until insertion point +// found. Always terminates because background surface is sentinel +// do +// { +// surf2 = surf2->next; +// } while (surf->key >= surf2->key); +Lsortloopnb: + movl st_next(%esi),%esi + movl st_key(%esi),%ecx + cmpl %ecx,%eax + jge Lsortloopnb + + jmp LInsertAndExit + + +// handle a leading bmodel edge + .align 4 +Lbmodel_leading: + +// don't start a span if this is an inverted span, with the end edge preceding +// the start edge (that is, we've already seen the end edge) + testl %edx,%edx + jnz Lxl_done + + movl C(r_bmodelactive),%ecx + incl %edx + incl %ecx + movl %edx,st_spanstate(%edi) + movl %ecx,C(r_bmodelactive) + +// if (surf->key < surf2->key) +// goto newtop; + movl st_key(%edi),%eax + movl st_key(%esi),%ecx + cmpl %ecx,%eax + jl Lnewtop + +// if ((surf->key == surf2->key) && surf->insubmodel) +// { + jz Lzcheck_for_newtop + +// main sorting loop to search through surface stack until insertion point +// found. 
Always terminates because background surface is sentinel +// do +// { +// surf2 = surf2->next; +// } while (surf->key > surf2->key); +Lsortloop: + movl st_next(%esi),%esi + movl st_key(%esi),%ecx + cmpl %ecx,%eax + jg Lsortloop + + jne LInsertAndExit + +// Do 1/z sorting to see if we've arrived in the right position + movl et_u(%ebx),%eax + subl $0xFFFFF,%eax + movl %eax,Ltemp + fildl Ltemp + + fmuls float_1_div_0100000h // fu = (float)(edge->u - 0xFFFFF) * + // (1.0 / 0x100000); + + fld %st(0) // fu | fu + fmuls st_d_zistepu(%edi) // fu*surf->d_zistepu | fu + flds C(fv) // fv | fu*surf->d_zistepu | fu + fmuls st_d_zistepv(%edi) // fv*surf->d_zistepv | fu*surf->d_zistepu | fu + fxch %st(1) // fu*surf->d_zistepu | fv*surf->d_zistepv | fu + fadds st_d_ziorigin(%edi) // fu*surf->d_zistepu + surf->d_ziorigin | + // fv*surf->d_zistepv | fu + + flds st_d_zistepu(%esi) // surf2->d_zistepu | + // fu*surf->d_zistepu + surf->d_ziorigin | + // fv*surf->d_zistepv | fu + fmul %st(3),%st(0) // fu*surf2->d_zistepu | + // fu*surf->d_zistepu + surf->d_ziorigin | + // fv*surf->d_zistepv | fu + fxch %st(1) // fu*surf->d_zistepu + surf->d_ziorigin | + // fu*surf2->d_zistepu | + // fv*surf->d_zistepv | fu + faddp %st(0),%st(2) // fu*surf2->d_zistepu | newzi | fu + + flds C(fv) // fv | fu*surf2->d_zistepu | newzi | fu + fmuls st_d_zistepv(%esi) // fv*surf2->d_zistepv | + // fu*surf2->d_zistepu | newzi | fu + fld %st(2) // newzi | fv*surf2->d_zistepv | + // fu*surf2->d_zistepu | newzi | fu + fmuls float_point_999 // newzibottom | fv*surf2->d_zistepv | + // fu*surf2->d_zistepu | newzi | fu + + fxch %st(2) // fu*surf2->d_zistepu | fv*surf2->d_zistepv | + // newzibottom | newzi | fu + fadds st_d_ziorigin(%esi) // fu*surf2->d_zistepu + surf2->d_ziorigin | + // fv*surf2->d_zistepv | newzibottom | newzi | + // fu + faddp %st(0),%st(1) // testzi | newzibottom | newzi | fu + fxch %st(1) // newzibottom | testzi | newzi | fu + +// if (newzibottom >= testzi) +// goto Lgotposition; + + fcomp %st(1) // testzi | newzi | fu + + fxch %st(1) // newzi | testzi | fu + fmuls float_1_point_001 // newzitop | testzi | fu + fxch %st(1) // testzi | newzitop | fu + + fnstsw %ax + testb $0x01,%ah + jz Lgotposition_fpop3 + +// if (newzitop >= testzi) +// { + + fcomp %st(1) // newzitop | fu + fnstsw %ax + testb $0x45,%ah + jz Lsortloop_fpop2 + +// if (surf->d_zistepu >= surf2->d_zistepu) +// goto newtop; + + flds st_d_zistepu(%edi) // surf->d_zistepu | newzitop| fu + fcomps st_d_zistepu(%esi) // newzitop | fu + fnstsw %ax + testb $0x01,%ah + jz Lgotposition_fpop2 + + fstp %st(0) // clear the FPstack + fstp %st(0) + movl st_key(%edi),%eax + jmp Lsortloop + + +Lgotposition_fpop3: + fstp %st(0) +Lgotposition_fpop2: + fstp %st(0) + fstp %st(0) + jmp LInsertAndExit + + +// emit a span (obscures current top) + +Lnewtop_fpop3: + fstp %st(0) +Lnewtop_fpop2: + fstp %st(0) + fstp %st(0) + movl st_key(%edi),%eax // reload the sorting key + +Lnewtop: + movl et_u(%ebx),%eax + movl st_last_u(%esi),%edx + shrl $20,%eax // iu = integral pixel u + movl %eax,st_last_u(%edi) // surf->last_u = iu; + cmpl %edx,%eax + jle LInsertAndExit // iu <= surf->last_u, so nothing to emit + + subl %edx,%eax + movl %edx,espan_t_u(%ebp) // span->u = surf->last_u; + + movl %eax,espan_t_count(%ebp) // span->count = iu - span->u; + movl C(current_iv),%eax + movl %eax,espan_t_v(%ebp) // span->v = current_iv; + movl st_spans(%esi),%eax + movl %eax,espan_t_pnext(%ebp) // span->pnext = surf->spans; + movl %ebp,st_spans(%esi) // surf->spans = span; + addl $(espan_t_size),%ebp + 
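
The store sequence just above is the span emitter that everything in this file exists to feed; the same pattern appears in TrailingEdge. When the top of the surface stack changes at pixel iu, the run belonging to the surface being covered is closed out as an espan_t. Approximately, in C (cf. the scalar version in r_edge.c; span_p and current_iv are the globals the asm references):

	static void emit_span (surf_t *surf, int iu)
	{
		espan_t	*span;

		if (iu > surf->last_u)		// pixels exposed since last_u?
		{
			span = span_p++;
			span->u = surf->last_u;
			span->count = iu - span->u;
			span->v = current_iv;
			span->pnext = surf->spans;	// push onto surface's list
			surf->spans = span;
		}
	}
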
+LInsertAndExit: +// insert before surf2 + movl %esi,st_next(%edi) // surf->next = surf2; + movl st_prev(%esi),%eax + movl %eax,st_prev(%edi) // surf->prev = surf2->prev; + movl %edi,st_prev(%esi) // surf2->prev = surf; + movl %edi,st_next(%eax) // surf2->prev->next = surf; + +// --------------------------------------------------------------- +// leading edge done +// --------------------------------------------------------------- + +// --------------------------------------------------------------- +// see if there are any more edges +// --------------------------------------------------------------- + +Lgs_nextedge: + movl et_next(%ebx),%ebx + cmpl $(C(edge_tail)),%ebx + jnz Lgs_edgeloop + +// clean up at the right edge +Lgs_lastspan: + +// now that we've reached the right edge of the screen, we're done with any +// unfinished surfaces, so emit a span for whatever's on top + movl 0x12345678,%esi // surfaces[1].st_next +LPatch3: + movl C(edge_tail_u_shift20),%eax + xorl %ecx,%ecx + movl st_last_u(%esi),%edx + subl %edx,%eax + jle Lgs_resetspanstate + + movl %edx,espan_t_u(%ebp) + movl %eax,espan_t_count(%ebp) + movl C(current_iv),%eax + movl %eax,espan_t_v(%ebp) + movl st_spans(%esi),%eax + movl %eax,espan_t_pnext(%ebp) + movl %ebp,st_spans(%esi) + addl $(espan_t_size),%ebp + +// reset spanstate for all surfaces in the surface stack +Lgs_resetspanstate: + movl %ecx,st_spanstate(%esi) + movl st_next(%esi),%esi + cmpl $0x12345678,%esi // &surfaces[1] +LPatch4: + jnz Lgs_resetspanstate + +// store the final span_p + movl %ebp,C(span_p) + + popl %ebx // restore register variables + popl %esi + popl %edi + popl %ebp // restore the caller's stack frame + ret + + +// --------------------------------------------------------------- +// 1/z sorting for bmodels in the same leaf +// --------------------------------------------------------------- + .align 4 +Lxl_done: + incl %edx + movl %edx,st_spanstate(%edi) + + jmp Lgs_nextedge + + + .align 4 +Lzcheck_for_newtop: + movl et_u(%ebx),%eax + subl $0xFFFFF,%eax + movl %eax,Ltemp + fildl Ltemp + + fmuls float_1_div_0100000h // fu = (float)(edge->u - 0xFFFFF) * + // (1.0 / 0x100000); + + fld %st(0) // fu | fu + fmuls st_d_zistepu(%edi) // fu*surf->d_zistepu | fu + flds C(fv) // fv | fu*surf->d_zistepu | fu + fmuls st_d_zistepv(%edi) // fv*surf->d_zistepv | fu*surf->d_zistepu | fu + fxch %st(1) // fu*surf->d_zistepu | fv*surf->d_zistepv | fu + fadds st_d_ziorigin(%edi) // fu*surf->d_zistepu + surf->d_ziorigin | + // fv*surf->d_zistepv | fu + + flds st_d_zistepu(%esi) // surf2->d_zistepu | + // fu*surf->d_zistepu + surf->d_ziorigin | + // fv*surf->d_zistepv | fu + fmul %st(3),%st(0) // fu*surf2->d_zistepu | + // fu*surf->d_zistepu + surf->d_ziorigin | + // fv*surf->d_zistepv | fu + fxch %st(1) // fu*surf->d_zistepu + surf->d_ziorigin | + // fu*surf2->d_zistepu | + // fv*surf->d_zistepv | fu + faddp %st(0),%st(2) // fu*surf2->d_zistepu | newzi | fu + + flds C(fv) // fv | fu*surf2->d_zistepu | newzi | fu + fmuls st_d_zistepv(%esi) // fv*surf2->d_zistepv | + // fu*surf2->d_zistepu | newzi | fu + fld %st(2) // newzi | fv*surf2->d_zistepv | + // fu*surf2->d_zistepu | newzi | fu + fmuls float_point_999 // newzibottom | fv*surf2->d_zistepv | + // fu*surf2->d_zistepu | newzi | fu + + fxch %st(2) // fu*surf2->d_zistepu | fv*surf2->d_zistepv | + // newzibottom | newzi | fu + fadds st_d_ziorigin(%esi) // fu*surf2->d_zistepu + surf2->d_ziorigin | + // fv*surf2->d_zistepv | newzibottom | newzi | + // fu + faddp %st(0),%st(1) // testzi | newzibottom | newzi | fu + fxch 
%st(1) // newzibottom | testzi | newzi | fu + +// if (newzibottom >= testzi) +// goto newtop; + + fcomp %st(1) // testzi | newzi | fu + + fxch %st(1) // newzi | testzi | fu + fmuls float_1_point_001 // newzitop | testzi | fu + fxch %st(1) // testzi | newzitop | fu + + fnstsw %ax + testb $0x01,%ah + jz Lnewtop_fpop3 + +// if (newzitop >= testzi) +// { + + fcomp %st(1) // newzitop | fu + fnstsw %ax + testb $0x45,%ah + jz Lsortloop_fpop2 + +// if (surf->d_zistepu >= surf2->d_zistepu) +// goto newtop; + + flds st_d_zistepu(%edi) // surf->d_zistepu | newzitop | fu + fcomps st_d_zistepu(%esi) // newzitop | fu + fnstsw %ax + testb $0x01,%ah + jz Lnewtop_fpop2 + +Lsortloop_fpop2: + fstp %st(0) // clear the FP stack + fstp %st(0) + movl st_key(%edi),%eax + jmp Lsortloop + + +.globl C(R_EdgeCodeEnd) +C(R_EdgeCodeEnd): + + +//---------------------------------------------------------------------- +// Surface array address code patching routine +//---------------------------------------------------------------------- + + .align 4 +.globl C(R_SurfacePatch) +C(R_SurfacePatch): + + movl C(surfaces),%eax + addl $(st_size),%eax + movl %eax,LPatch4-4 + + addl $(st_next),%eax + movl %eax,LPatch0-4 + movl %eax,LPatch2-4 + movl %eax,LPatch3-4 + + ret + +#endif // id386 + diff --git a/source/r_varsa.S b/source/r_varsa.S new file mode 100644 index 0000000..0b1cda0 --- /dev/null +++ b/source/r_varsa.S @@ -0,0 +1,64 @@ +/* +Copyright (C) 1996-1997 Id Software, Inc. + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + +See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +*/ +// +// r_varsa.s +// + +#include "asm_i386.h" +#include "quakeasm.h" +#include "asm_draw.h" +#include "d_ifacea.h" + +#if id386 + + .data + +//------------------------------------------------------- +// ASM-only variables +//------------------------------------------------------- +.globl float_1, float_particle_z_clip, float_point5 +.globl float_minus_1, float_0 +float_0: .single 0.0 +float_1: .single 1.0 +float_minus_1: .single -1.0 +float_particle_z_clip: .single PARTICLE_Z_CLIP +float_point5: .single 0.5 + +.globl fp_16, fp_64k, fp_1m, fp_64kx64k +.globl fp_1m_minus_1 +.globl fp_8 +fp_1m: .single 1048576.0 +fp_1m_minus_1: .single 1048575.0 +fp_64k: .single 65536.0 +fp_8: .single 8.0 +fp_16: .single 16.0 +fp_64kx64k: .long 0x4f000000 // (float)0x8000*0x10000 + + +.globl FloatZero, Float2ToThe31nd, FloatMinus2ToThe31nd +FloatZero: .long 0 +Float2ToThe31nd: .long 0x4f000000 +FloatMinus2ToThe31nd: .long 0xcf000000 + +.globl C(r_bmodelactive) +C(r_bmodelactive): .long 0 + +#endif // id386 + diff --git a/source/snd_mixa.S b/source/snd_mixa.S new file mode 100644 index 0000000..6abb8a5 --- /dev/null +++ b/source/snd_mixa.S @@ -0,0 +1,218 @@ +/* +Copyright (C) 1996-1997 Id Software, Inc. 
+ +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + +See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +*/ +// +// snd_mixa.s +// x86 assembly-language sound code +// + +#include "asm_i386.h" +#include "quakeasm.h" + +#if id386 + + .text + +//---------------------------------------------------------------------- +// 8-bit sound-mixing code +//---------------------------------------------------------------------- + +#define ch 4+16 +#define sc 8+16 +#define count 12+16 + +.globl C(SND_PaintChannelFrom8) +C(SND_PaintChannelFrom8): + pushl %esi // preserve register variables + pushl %edi + pushl %ebx + pushl %ebp + +// int data; +// short *lscale, *rscale; +// unsigned char *sfx; +// int i; + + movl ch(%esp),%ebx + movl sc(%esp),%esi + +// if (ch->leftvol > 255) +// ch->leftvol = 255; +// if (ch->rightvol > 255) +// ch->rightvol = 255; + movl ch_leftvol(%ebx),%eax + movl ch_rightvol(%ebx),%edx + cmpl $255,%eax + jna LLeftSet + movl $255,%eax +LLeftSet: + cmpl $255,%edx + jna LRightSet + movl $255,%edx +LRightSet: + +// lscale = snd_scaletable[ch->leftvol >> 3]; +// rscale = snd_scaletable[ch->rightvol >> 3]; +// sfx = (signed char *)sc->data + ch->pos; +// ch->pos += count; + andl $0xF8,%eax + addl $(sfxc_data),%esi + andl $0xF8,%edx + movl ch_pos(%ebx),%edi + movl count(%esp),%ecx + addl %edi,%esi + shll $7,%eax + addl %ecx,%edi + shll $7,%edx + movl %edi,ch_pos(%ebx) + addl $(C(snd_scaletable)),%eax + addl $(C(snd_scaletable)),%edx + subl %ebx,%ebx + movb -1(%esi,%ecx,1),%bl + + testl $1,%ecx + jz LMix8Loop + + movl (%eax,%ebx,4),%edi + movl (%edx,%ebx,4),%ebp + addl C(paintbuffer)+psp_left-psp_size(,%ecx,psp_size),%edi + addl C(paintbuffer)+psp_right-psp_size(,%ecx,psp_size),%ebp + movl %edi,C(paintbuffer)+psp_left-psp_size(,%ecx,psp_size) + movl %ebp,C(paintbuffer)+psp_right-psp_size(,%ecx,psp_size) + movb -2(%esi,%ecx,1),%bl + + decl %ecx + jz LDone + +// for (i=0 ; i>8; +// if (val > 0x7fff) +// snd_out[i] = 0x7fff; +// else if (val < (short)0x8000) +// snd_out[i] = (short)0x8000; +// else +// snd_out[i] = val; + movl -8(%ebx,%ecx,4),%eax + imull %esi,%eax + sarl $8,%eax + cmpl $0x7FFF,%eax + jg LClampHigh + cmpl $0xFFFF8000,%eax + jnl LClampDone + movl $0xFFFF8000,%eax + jmp LClampDone +LClampHigh: + movl $0x7FFF,%eax +LClampDone: + +// val = (snd_p[i+1]*snd_vol)>>8; +// if (val > 0x7fff) +// snd_out[i+1] = 0x7fff; +// else if (val < (short)0x8000) +// snd_out[i+1] = (short)0x8000; +// else +// snd_out[i+1] = val; + movl -4(%ebx,%ecx,4),%edx + imull %esi,%edx + sarl $8,%edx + cmpl $0x7FFF,%edx + jg LClampHigh2 + cmpl $0xFFFF8000,%edx + jnl LClampDone2 + movl $0xFFFF8000,%edx + jmp LClampDone2 +LClampHigh2: + movl $0x7FFF,%edx +LClampDone2: + shll $16,%edx + andl $0xFFFF,%eax + orl %eax,%edx + movl %edx,-4(%edi,%ecx,2) + +// } + subl $2,%ecx + jnz LWLBLoopTop + +// snd_p += snd_linear_count; + + popl %ebx + popl %edi + popl %esi + + ret + + +#endif // id386 + diff --git a/source/surf16.S b/source/surf16.S 
diff --git a/source/surf16.S b/source/surf16.S
new file mode 100644
index 0000000..8fffc40
--- /dev/null
+++ b/source/surf16.S
@@ -0,0 +1,172 @@
+/*
+Copyright (C) 1996-1997 Id Software, Inc.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+
+See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+*/
+//
+// surf16.s
+// x86 assembly-language 16 bpp surface block drawing code.
+//
+
+#include "asm_i386.h"
+#include "quakeasm.h"
+#include "asm_draw.h"
+
+#if id386
+
+//----------------------------------------------------------------------
+// Surface block drawer
+//----------------------------------------------------------------------
+
+    .data
+
+k:          .long   0
+loopentry:  .long   0
+
+    .align  4
+blockjumptable16:
+    .long   LEnter2_16
+    .long   LEnter4_16
+    .long   0, LEnter8_16
+    .long   0, 0, 0, LEnter16_16
+
+
+    .text
+
+    .align  4
+.globl C(R_Surf16Start)
+C(R_Surf16Start):
+
+    .align  4
+.globl C(R_DrawSurfaceBlock16)
+C(R_DrawSurfaceBlock16):
+    pushl   %ebp                // preserve caller's stack frame
+    pushl   %edi
+    pushl   %esi                // preserve register variables
+    pushl   %ebx
+
+    movl    C(blocksize),%eax
+    movl    C(prowdestbase),%edi
+    movl    C(pbasesource),%esi
+    movl    C(sourcesstep),%ebx
+    movl    blockjumptable16-4(,%eax,2),%ecx
+    movl    %eax,k
+    movl    %ecx,loopentry
+    movl    C(lightleft),%edx
+    movl    C(lightright),%ebp
+
+Lblockloop16:
+
+    subl    %edx,%ebp
+    movb    C(blockdivshift),%cl
+    sarl    %cl,%ebp
+    jns     Lp1_16
+    testl   C(blockdivmask),%ebp
+    jz      Lp1_16
+    incl    %ebp
+Lp1_16:
+
+    subl    %eax,%eax
+    subl    %ecx,%ecx           // high words must be 0 in loop for addressing
+
+    jmp     *loopentry
+
+    .align  4
+
+#include "block16.h"
+
+    movl    C(pbasesource),%esi
+    movl    C(lightleft),%edx
+    movl    C(lightright),%ebp
+    movl    C(sourcetstep),%eax
+    movl    C(lightrightstep),%ecx
+    movl    C(prowdestbase),%edi
+
+    addl    %eax,%esi
+    addl    %ecx,%ebp
+
+    movl    C(lightleftstep),%eax
+    movl    C(surfrowbytes),%ecx
+
+    addl    %eax,%edx
+    addl    %ecx,%edi
+
+    movl    %esi,C(pbasesource)
+    movl    %ebp,C(lightright)
+    movl    k,%eax
+    movl    %edx,C(lightleft)
+    decl    %eax
+    movl    %edi,C(prowdestbase)
+    movl    %eax,k
+    jnz     Lblockloop16
+
+    popl    %ebx                // restore register variables
+    popl    %esi
+    popl    %edi
+    popl    %ebp                // restore the caller's stack frame
+    ret
+
+.globl C(R_Surf16End)
+C(R_Surf16End):
+
+//----------------------------------------------------------------------
+// Code patching routines
+//----------------------------------------------------------------------
+    .data
+
+    .align  4
+LPatchTable16:
+    .long   LBPatch0-4
+    .long   LBPatch1-4
+    .long   LBPatch2-4
+    .long   LBPatch3-4
+    .long   LBPatch4-4
+    .long   LBPatch5-4
+    .long   LBPatch6-4
+    .long   LBPatch7-4
+    .long   LBPatch8-4
+    .long   LBPatch9-4
+    .long   LBPatch10-4
+    .long   LBPatch11-4
+    .long   LBPatch12-4
+    .long   LBPatch13-4
+    .long   LBPatch14-4
+    .long   LBPatch15-4
+
+    .text
+
+    .align  4
+.globl C(R_Surf16Patch)
+C(R_Surf16Patch):
+    pushl   %ebx
+
+    movl    C(colormap),%eax
+    movl    $LPatchTable16,%ebx
+    movl    $16,%ecx
+LPatchLoop16:
+    movl    (%ebx),%edx
+    addl    $4,%ebx
+    movl    %eax,(%edx)
+    decl    %ecx
+    jnz     LPatchLoop16
+
+    popl    %ebx
+
+    ret
+
+
+#endif  // id386
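Two things in surf16.S deserve a note. First, the blockjumptable16 dispatch is a computed goto: blocksize is 2, 4, 8, or 16, and the addressing mode blockjumptable16-4(,%eax,2) maps those values to byte offsets 0, 4, 12, and 28, exactly the four live slots of the sparse table (the unrolled loop bodies behind LEnter2_16 through LEnter16_16 come from block16.h). A C sketch of the same lookup; blockentry_t and LookupBlockEntry are illustrative names, not engine symbols:

typedef void (*blockentry_t) (void);

extern blockentry_t blockjumptable16[8];    /* sparse: slots 0, 1, 3, 7 used */

static blockentry_t LookupBlockEntry (int blocksize)
{
    /* byte offset 2*blocksize - 4, divided by sizeof(.long) */
    return blockjumptable16[(2 * blocksize - 4) >> 2];
}

Second, R_Surf16Patch is self-modifying code; since surf8.S below uses the identical scheme on a larger scale, the patching itself is sketched after that file.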
diff --git a/source/surf8.S b/source/surf8.S
new file mode 100644
index 0000000..7229e15
--- /dev/null
+++ b/source/surf8.S
@@ -0,0 +1,783 @@
+/*
+Copyright (C) 1996-1997 Id Software, Inc.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+
+See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+*/
+//
+// surf8.s
+// x86 assembly-language 8 bpp surface block drawing code.
+//
+
+#include "asm_i386.h"
+#include "quakeasm.h"
+#include "asm_draw.h"
+
+#if id386
+
+    .data
+
+sb_v:   .long   0
+
+    .text
+
+    .align  4
+.globl C(R_Surf8Start)
+C(R_Surf8Start):
+
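For orientation before the four unrolled drawers that follow: each one walks r_numvblocks vertical blocks, interpolating light down the left and right edges and across each row, then looks up lit pixels through the colormap. Roughly the C shape, in the spirit of the C drawers in r_surf.c; DrawSurfaceBlock8_C and the extern declarations are an illustrative sketch, not the shipped code:

extern int            r_numvblocks, r_lightwidth;
extern int            sourcetstep, surfrowbytes, r_stepback;
extern int           *r_lightptr;
extern unsigned char *pbasesource, *prowdestbase, *r_sourcemax;
extern unsigned char *colormap;     /* 64 light levels x 256 colors */

static void DrawSurfaceBlock8_C (int mip)   /* mip 0..3 -> 16x16 .. 2x2 */
{
    int            v, i, b, size, shift;
    int            lightleft, lightright, lightleftstep, lightrightstep;
    int            light, lightstep;
    unsigned char *psource = pbasesource, *prowdest = prowdestbase;

    shift = 4 - mip;                /* blockdivshift */
    size  = 1 << shift;             /* pixels per block side */

    for (v = 0; v < r_numvblocks; v++)
    {
        lightleft  = r_lightptr[0];
        lightright = r_lightptr[1];
        r_lightptr += r_lightwidth;
        lightleftstep  = (r_lightptr[0] - lightleft)  >> shift;
        lightrightstep = (r_lightptr[1] - lightright) >> shift;

        for (i = 0; i < size; i++)  /* one output row per iteration */
        {
            lightstep = (lightleft - lightright) >> shift;
            light = lightright;

            for (b = size - 1; b >= 0; b--)   /* right to left, as in the asm */
            {
                prowdest[b] = colormap[(light & 0xFF00) + psource[b]];
                light += lightstep;
            }

            psource    += sourcetstep;
            lightright += lightrightstep;
            lightleft  += lightleftstep;
            prowdest   += surfrowbytes;
        }

        if (psource >= r_sourcemax) /* wrap when the source texture ends */
            psource -= r_stepback;
    }
}

The assembly versions fold the two inner loops into straight-line code per mip level, which is why the file carries four separate drawers.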
+//----------------------------------------------------------------------
+// Surface block drawer for mip level 0
+//----------------------------------------------------------------------
+
+    .align  4
+.globl C(R_DrawSurfaceBlock8_mip0)
+C(R_DrawSurfaceBlock8_mip0):
+    pushl   %ebp                // preserve caller's stack frame
+    pushl   %edi
+    pushl   %esi                // preserve register variables
+    pushl   %ebx
+
+//  for (v=0 ; v<numvblocks ; v++)
+//  {
+//      lightleft = lightptr[0];
+//      lightright = lightptr[1];
+//      lightdelta = (lightleft - lightright) & 0xFFFFF;
+    movl    C(r_lightptr),%ebx
+    movl    C(r_numvblocks),%eax
+
+    movl    %eax,sb_v
+    movl    C(prowdestbase),%edi
+
+    movl    C(pbasesource),%esi
+
+Lv_loop_mip0:
+    movl    (%ebx),%eax         // lightptr[0]
+    movl    4(%ebx),%edx        // lightptr[1]
+
+    movl    %eax,%ebp
+    movl    C(r_lightwidth),%ecx
+
+    movl    %edx,C(lightright)
+    subl    %edx,%ebp
+
+    andl    $0xFFFFF,%ebp
+    leal    (%ebx,%ecx,4),%ebx
+
+//      lightptr += lightwidth;
+    movl    %ebx,C(r_lightptr)
+
+//      lightleftstep = (lightptr[0] - lightleft) >> blockdivshift;
+//      lightrightstep = (lightptr[1] - lightright) >> blockdivshift;
+//      lightdeltastep = ((lightleftstep - lightrightstep) & 0xFFFFF) |
+//              0xF0000000;
+    movl    4(%ebx),%ecx        // lightptr[1]
+    movl    (%ebx),%ebx         // lightptr[0]
+
+    subl    %eax,%ebx
+    subl    %edx,%ecx
+
+    sarl    $4,%ecx
+    orl     $0xF0000000,%ebp
+
+    sarl    $4,%ebx
+    movl    %ecx,C(lightrightstep)
+
+    subl    %ecx,%ebx
+    andl    $0xFFFFF,%ebx
+
+    orl     $0xF0000000,%ebx
+    subl    %ecx,%ecx           // high word must be 0 in loop for addressing
+
+    movl    %ebx,C(lightdeltastep)
+    subl    %ebx,%ebx           // high word must be 0 in loop for addressing
+
+Lblockloop8_mip0:
+    movl    %ebp,C(lightdelta)
+    movb    14(%esi),%cl
+
+    sarl    $4,%ebp
+    movb    %dh,%bh
+
+    movb    15(%esi),%bl
+    addl    %ebp,%edx
+
+    movb    %dh,%ch
+    addl    %ebp,%edx
+
+    movb    0x12345678(%ebx),%ah
+LBPatch0:
+    movb    13(%esi),%bl
+
+    movb    0x12345678(%ecx),%al
+LBPatch1:
+    movb    12(%esi),%cl
+
+    movb    %dh,%bh
+    addl    %ebp,%edx
+
+    rorl    $16,%eax
+    movb    %dh,%ch
+
+    addl    %ebp,%edx
+    movb    0x12345678(%ebx),%ah
+LBPatch2:
+
+    movb    11(%esi),%bl
+    movb    0x12345678(%ecx),%al
+LBPatch3:
+
+    movb    10(%esi),%cl
+    movl    %eax,12(%edi)
+
+    movb    %dh,%bh
+    addl    %ebp,%edx
+
+    movb    %dh,%ch
+    addl    %ebp,%edx
+
+    movb    0x12345678(%ebx),%ah
+LBPatch4:
+    movb    9(%esi),%bl
+
+    movb    0x12345678(%ecx),%al
+LBPatch5:
+    movb    8(%esi),%cl
+
+    movb    %dh,%bh
+    addl    %ebp,%edx
+
+    rorl    $16,%eax
+    movb    %dh,%ch
+
+    addl    %ebp,%edx
+    movb    0x12345678(%ebx),%ah
+LBPatch6:
+
+    movb    7(%esi),%bl
+    movb    0x12345678(%ecx),%al
+LBPatch7:
+
+    movb    6(%esi),%cl
+    movl    %eax,8(%edi)
+
+    movb    %dh,%bh
+    addl    %ebp,%edx
+
+    movb    %dh,%ch
+    addl    %ebp,%edx
+
+    movb    0x12345678(%ebx),%ah
+LBPatch8:
+    movb    5(%esi),%bl
+
+    movb    0x12345678(%ecx),%al
+LBPatch9:
+    movb    4(%esi),%cl
+
+    movb    %dh,%bh
+    addl    %ebp,%edx
+
+    rorl    $16,%eax
+    movb    %dh,%ch
+
+    addl    %ebp,%edx
+    movb    0x12345678(%ebx),%ah
+LBPatch10:
+
+    movb    3(%esi),%bl
+    movb    0x12345678(%ecx),%al
+LBPatch11:
+
+    movb    2(%esi),%cl
+    movl    %eax,4(%edi)
+
+    movb    %dh,%bh
+    addl    %ebp,%edx
+
+    movb    %dh,%ch
+    addl    %ebp,%edx
+
+    movb    0x12345678(%ebx),%ah
+LBPatch12:
+    movb    1(%esi),%bl
+
+    movb    0x12345678(%ecx),%al
+LBPatch13:
+    movb    (%esi),%cl
+
+    movb    %dh,%bh
+    addl    %ebp,%edx
+
+    rorl    $16,%eax
+    movb    %dh,%ch
+
+    movb    0x12345678(%ebx),%ah
+LBPatch14:
+    movl    C(lightright),%edx
+
+    movb    0x12345678(%ecx),%al
+LBPatch15:
+    movl    C(lightdelta),%ebp
+
+    movl    %eax,(%edi)
+
+    addl    C(sourcetstep),%esi
+    addl    C(surfrowbytes),%edi
+
+    addl    C(lightrightstep),%edx
+    addl    C(lightdeltastep),%ebp
+
+    movl    %edx,C(lightright)
+    jc      Lblockloop8_mip0
+
+//      if (pbasesource >= r_sourcemax)
+//          pbasesource -= stepback;
+
+    cmpl    C(r_sourcemax),%esi
+    jb      LSkip_mip0
+    subl    C(r_stepback),%esi
+LSkip_mip0:
+
+    movl    C(r_lightptr),%ebx
+    decl    sb_v
+
+    jnz     Lv_loop_mip0
+
+    popl    %ebx                // restore register variables
+    popl    %esi
+    popl    %edi
+    popl    %ebp                // restore the caller's stack frame
+    ret
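Worth calling out in the drawer above: the orl $0xF0000000 / jc pair is the row counter. The low 20 bits of %ebp carry the fixed-point light delta, while the top nibble counts down one carry per row, so no separate counter register is needed. A self-contained sketch of the trick; CountRows is an illustrative name, and unsigned wraparound stands in for the x86 carry flag:

static int CountRows (unsigned lightdelta, unsigned lightdeltastep,
                      unsigned topnibble)
{
    unsigned delta = (lightdelta & 0xFFFFF) | topnibble;       /* orl ...,%ebp */
    unsigned step  = (lightdeltastep & 0xFFFFF) | 0xF0000000;  /* orl ...,%ebx */
    unsigned prev;
    int      rows = 0;

    do
    {
        rows++;              /* draw one row of the block */
        prev = delta;
        delta += step;       /* addl C(lightdeltastep),%ebp */
    } while (delta < prev);  /* unsigned wrap == carry set -> jc */

    return rows;
}

Seeding the nibble with 0xF0000000 yields 16 iterations for mip 0; the mip 1 and mip 2 drawers below seed it with 0x70000000 and 0x30000000 for 8 and 4 rows. The low 20 bits can never carry into the counter nibble, since at most 16 additions of a 20-bit value stay below bit 24.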
+
+
+//----------------------------------------------------------------------
+// Surface block drawer for mip level 1
+//----------------------------------------------------------------------
+
+    .align  4
+.globl C(R_DrawSurfaceBlock8_mip1)
+C(R_DrawSurfaceBlock8_mip1):
+    pushl   %ebp                // preserve caller's stack frame
+    pushl   %edi
+    pushl   %esi                // preserve register variables
+    pushl   %ebx
+
+//  for (v=0 ; v<numvblocks ; v++)
+//  {
+//      lightleft = lightptr[0];
+//      lightright = lightptr[1];
+//      lightdelta = (lightleft - lightright) & 0xFFFFF;
+    movl    C(r_lightptr),%ebx
+    movl    C(r_numvblocks),%eax
+
+    movl    %eax,sb_v
+    movl    C(prowdestbase),%edi
+
+    movl    C(pbasesource),%esi
+
+Lv_loop_mip1:
+    movl    (%ebx),%eax         // lightptr[0]
+    movl    4(%ebx),%edx        // lightptr[1]
+
+    movl    %eax,%ebp
+    movl    C(r_lightwidth),%ecx
+
+    movl    %edx,C(lightright)
+    subl    %edx,%ebp
+
+    andl    $0xFFFFF,%ebp
+    leal    (%ebx,%ecx,4),%ebx
+
+//      lightptr += lightwidth;
+    movl    %ebx,C(r_lightptr)
+
+//      lightleftstep = (lightptr[0] - lightleft) >> blockdivshift;
+//      lightrightstep = (lightptr[1] - lightright) >> blockdivshift;
+//      lightdeltastep = ((lightleftstep - lightrightstep) & 0xFFFFF) |
+//              0xF0000000;
+    movl    4(%ebx),%ecx        // lightptr[1]
+    movl    (%ebx),%ebx         // lightptr[0]
+
+    subl    %eax,%ebx
+    subl    %edx,%ecx
+
+    sarl    $3,%ecx
+    orl     $0x70000000,%ebp
+
+    sarl    $3,%ebx
+    movl    %ecx,C(lightrightstep)
+
+    subl    %ecx,%ebx
+    andl    $0xFFFFF,%ebx
+
+    orl     $0xF0000000,%ebx
+    subl    %ecx,%ecx           // high word must be 0 in loop for addressing
+
+    movl    %ebx,C(lightdeltastep)
+    subl    %ebx,%ebx           // high word must be 0 in loop for addressing
+
+Lblockloop8_mip1:
+    movl    %ebp,C(lightdelta)
+    movb    6(%esi),%cl
+
+    sarl    $3,%ebp
+    movb    %dh,%bh
+
+    movb    7(%esi),%bl
+    addl    %ebp,%edx
+
+    movb    %dh,%ch
+    addl    %ebp,%edx
+
+    movb    0x12345678(%ebx),%ah
+LBPatch22:
+    movb    5(%esi),%bl
+
+    movb    0x12345678(%ecx),%al
+LBPatch23:
+    movb    4(%esi),%cl
+
+    movb    %dh,%bh
+    addl    %ebp,%edx
+
+    rorl    $16,%eax
+    movb    %dh,%ch
+
+    addl    %ebp,%edx
+    movb    0x12345678(%ebx),%ah
+LBPatch24:
+
+    movb    3(%esi),%bl
+    movb    0x12345678(%ecx),%al
+LBPatch25:
+
+    movb    2(%esi),%cl
+    movl    %eax,4(%edi)
+
+    movb    %dh,%bh
+    addl    %ebp,%edx
+
+    movb    %dh,%ch
+    addl    %ebp,%edx
+
+    movb    0x12345678(%ebx),%ah
+LBPatch26:
+    movb    1(%esi),%bl
+
+    movb    0x12345678(%ecx),%al
+LBPatch27:
+    movb    (%esi),%cl
+
+    movb    %dh,%bh
+    addl    %ebp,%edx
+
+    rorl    $16,%eax
+    movb    %dh,%ch
+
+    movb    0x12345678(%ebx),%ah
+LBPatch28:
+    movl    C(lightright),%edx
+
+    movb    0x12345678(%ecx),%al
+LBPatch29:
+    movl    C(lightdelta),%ebp
+
+    movl    %eax,(%edi)
+    movl    C(sourcetstep),%eax
+
+    addl    %eax,%esi
+    movl    C(surfrowbytes),%eax
+
+    addl    %eax,%edi
+    movl    C(lightrightstep),%eax
+
+    addl    %eax,%edx
+    movl    C(lightdeltastep),%eax
+
+    addl    %eax,%ebp
+    movl    %edx,C(lightright)
+
+    jc      Lblockloop8_mip1
+
+//      if (pbasesource >= r_sourcemax)
+//          pbasesource -= stepback;
+
+    cmpl    C(r_sourcemax),%esi
+    jb      LSkip_mip1
+    subl    C(r_stepback),%esi
+LSkip_mip1:
+
+    movl    C(r_lightptr),%ebx
+    decl    sb_v
+
+    jnz     Lv_loop_mip1
+
+    popl    %ebx                // restore register variables
+    popl    %esi
+    popl    %edi
+    popl    %ebp                // restore the caller's stack frame
+    ret
+
+
+//----------------------------------------------------------------------
+// Surface block drawer for mip level 2
+//----------------------------------------------------------------------
+
+    .align  4
+.globl C(R_DrawSurfaceBlock8_mip2)
+C(R_DrawSurfaceBlock8_mip2):
+    pushl   %ebp                // preserve caller's stack frame
+    pushl   %edi
+    pushl   %esi                // preserve register variables
+    pushl   %ebx
+
+//  for (v=0 ; v<numvblocks ; v++)
+//  {
+//      lightleft = lightptr[0];
+//      lightright = lightptr[1];
+//      lightdelta = (lightleft - lightright) & 0xFFFFF;
+    movl    C(r_lightptr),%ebx
+    movl    C(r_numvblocks),%eax
+
+    movl    %eax,sb_v
+    movl    C(prowdestbase),%edi
+
+    movl    C(pbasesource),%esi
+
+Lv_loop_mip2:
+    movl    (%ebx),%eax         // lightptr[0]
+    movl    4(%ebx),%edx        // lightptr[1]
+
+    movl    %eax,%ebp
+    movl    C(r_lightwidth),%ecx
+
+    movl    %edx,C(lightright)
+    subl    %edx,%ebp
+
+    andl    $0xFFFFF,%ebp
+    leal    (%ebx,%ecx,4),%ebx
+
+//      lightptr += lightwidth;
+    movl    %ebx,C(r_lightptr)
+
+//      lightleftstep = (lightptr[0] - lightleft) >> blockdivshift;
+//      lightrightstep = (lightptr[1] - lightright) >> blockdivshift;
+//      lightdeltastep = ((lightleftstep - lightrightstep) & 0xFFFFF) |
+//              0xF0000000;
+    movl    4(%ebx),%ecx        // lightptr[1]
+    movl    (%ebx),%ebx         // lightptr[0]
+
+    subl    %eax,%ebx
+    subl    %edx,%ecx
+
+    sarl    $2,%ecx
+    orl     $0x30000000,%ebp
+
+    sarl    $2,%ebx
+    movl    %ecx,C(lightrightstep)
+
+    subl    %ecx,%ebx
+
+    andl    $0xFFFFF,%ebx
+
+    orl     $0xF0000000,%ebx
+    subl    %ecx,%ecx           // high word must be 0 in loop for addressing
+
+    movl    %ebx,C(lightdeltastep)
+    subl    %ebx,%ebx           // high word must be 0 in loop for addressing
+
+Lblockloop8_mip2:
+    movl    %ebp,C(lightdelta)
+    movb    2(%esi),%cl
+
+    sarl    $2,%ebp
+    movb    %dh,%bh
+
+    movb    3(%esi),%bl
+    addl    %ebp,%edx
+
+    movb    %dh,%ch
+    addl    %ebp,%edx
+
+    movb    0x12345678(%ebx),%ah
+LBPatch18:
+    movb    1(%esi),%bl
+
+    movb    0x12345678(%ecx),%al
+LBPatch19:
+    movb    (%esi),%cl
+
+    movb    %dh,%bh
+    addl    %ebp,%edx
+
+    rorl    $16,%eax
+    movb    %dh,%ch
+
+    movb    0x12345678(%ebx),%ah
+LBPatch20:
+    movl    C(lightright),%edx
+
+    movb    0x12345678(%ecx),%al
+LBPatch21:
+    movl    C(lightdelta),%ebp
+
+    movl    %eax,(%edi)
+    movl    C(sourcetstep),%eax
+
+    addl    %eax,%esi
+    movl    C(surfrowbytes),%eax
+
+    addl    %eax,%edi
+    movl    C(lightrightstep),%eax
+
+    addl    %eax,%edx
+    movl    C(lightdeltastep),%eax
+
+    addl    %eax,%ebp
+    movl    %edx,C(lightright)
+
+    jc      Lblockloop8_mip2
+
+//      if (pbasesource >= r_sourcemax)
+//          pbasesource -= stepback;
+
+    cmpl    C(r_sourcemax),%esi
+    jb      LSkip_mip2
+    subl    C(r_stepback),%esi
+LSkip_mip2:
+
+    movl    C(r_lightptr),%ebx
+    decl    sb_v
+
+    jnz     Lv_loop_mip2
+
+    popl    %ebx                // restore register variables
+    popl    %esi
+    popl    %edi
+    popl    %ebp                // restore the caller's stack frame
+    ret
+
+
+//----------------------------------------------------------------------
+// Surface block drawer for mip level 3
+//----------------------------------------------------------------------
+
+    .align  4
+.globl C(R_DrawSurfaceBlock8_mip3)
+C(R_DrawSurfaceBlock8_mip3):
+    pushl   %ebp                // preserve caller's stack frame
+    pushl   %edi
+    pushl   %esi                // preserve register variables
+    pushl   %ebx
+
+//  for (v=0 ; v<numvblocks ; v++)
+//  {
+//      lightleft = lightptr[0];
+//      lightright = lightptr[1];
+//      lightdelta = (lightleft - lightright) & 0xFFFFF;
+    movl    C(r_lightptr),%ebx
+    movl    C(r_numvblocks),%eax
+
+    movl    %eax,sb_v
+    movl    C(prowdestbase),%edi
+
+    movl    C(pbasesource),%esi
+
+Lv_loop_mip3:
+    movl    (%ebx),%eax         // lightptr[0]
+    movl    4(%ebx),%edx        // lightptr[1]
+
+    movl    %eax,%ebp
+    movl    C(r_lightwidth),%ecx
+
+    movl    %edx,C(lightright)
+    subl    %edx,%ebp
+
+    andl    $0xFFFFF,%ebp
+    leal    (%ebx,%ecx,4),%ebx
+
+    movl    %ebp,C(lightdelta)
+
+//      lightptr += lightwidth;
+    movl    %ebx,C(r_lightptr)
+
+//      lightleftstep = (lightptr[0] - lightleft) >> blockdivshift;
+//      lightrightstep = (lightptr[1] - lightright) >> blockdivshift;
+//      lightdeltastep = ((lightleftstep - lightrightstep) & 0xFFFFF) |
+//              0xF0000000;
+    movl    4(%ebx),%ecx        // lightptr[1]
+    movl    (%ebx),%ebx         // lightptr[0]
+
+    subl    %eax,%ebx
+    subl    %edx,%ecx
+
+    sarl    $1,%ecx
+
+    sarl    $1,%ebx
+    movl    %ecx,C(lightrightstep)
+
+    subl    %ecx,%ebx
+    andl    $0xFFFFF,%ebx
+
+    sarl    $1,%ebp
+    orl     $0xF0000000,%ebx
+
+    movl    %ebx,C(lightdeltastep)
+    subl    %ebx,%ebx           // high word must be 0 in loop for addressing
+
+    movb    1(%esi),%bl
+    subl    %ecx,%ecx           // high word must be 0 in loop for addressing
+
+    movb    %dh,%bh
+    movb    (%esi),%cl
+
+    addl    %ebp,%edx
+    movb    %dh,%ch
+
+    movb    0x12345678(%ebx),%al
+LBPatch16:
+    movl    C(lightright),%edx
+
+    movb    %al,1(%edi)
+    movb    0x12345678(%ecx),%al
+LBPatch17:
+
+    movb    %al,(%edi)
+    movl    C(sourcetstep),%eax
+
+    addl    %eax,%esi
+    movl    C(surfrowbytes),%eax
+
+    addl    %eax,%edi
+    movl    C(lightdeltastep),%eax
+
+    movl    C(lightdelta),%ebp
+    movb    (%esi),%cl
+
+    addl    %eax,%ebp
+    movl    C(lightrightstep),%eax
+
+    sarl    $1,%ebp
+    addl    %eax,%edx
+
+    movb    %dh,%bh
+    movb    1(%esi),%bl
+
+    addl    %ebp,%edx
+    movb    %dh,%ch
+
+    movb    0x12345678(%ebx),%al
+LBPatch30:
+    movl    C(sourcetstep),%edx
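+
+    movb    %al,1(%edi)
+    movb    0x12345678(%ecx),%al
+LBPatch31:
+
+    movb    %al,(%edi)
+    movl    C(surfrowbytes),%ebp
+
+    addl    %edx,%esi
+    addl    %ebp,%edi
+
+//      if (pbasesource >= r_sourcemax)
+//          pbasesource -= stepback;
+
+    cmpl    C(r_sourcemax),%esi
+    jb      LSkip_mip3
+    subl    C(r_stepback),%esi
+LSkip_mip3:
+
+    movl    C(r_lightptr),%ebx
+    decl    sb_v
+
+    jnz     Lv_loop_mip3
+
+    popl    %ebx                // restore register variables
+    popl    %esi
+    popl    %edi
+    popl    %ebp                // restore the caller's stack frame
+    ret
+
+
+.globl C(R_Surf8End)
+C(R_Surf8End):
+
+//----------------------------------------------------------------------
+// Code patching routines
+//----------------------------------------------------------------------
+    .data
+
+    .align  4
+LPatchTable8:
+    .long   LBPatch0-4
+    .long   LBPatch1-4
+    .long   LBPatch2-4
+    .long   LBPatch3-4
+    .long   LBPatch4-4
+    .long   LBPatch5-4
+    .long   LBPatch6-4
+    .long   LBPatch7-4
+    .long   LBPatch8-4
+    .long   LBPatch9-4
+    .long   LBPatch10-4
+    .long   LBPatch11-4
+    .long   LBPatch12-4
+    .long   LBPatch13-4
+    .long   LBPatch14-4
+    .long   LBPatch15-4
+    .long   LBPatch16-4
+    .long   LBPatch17-4
+    .long   LBPatch18-4
+    .long   LBPatch19-4
+    .long   LBPatch20-4
+    .long   LBPatch21-4
+    .long   LBPatch22-4
+    .long   LBPatch23-4
+    .long   LBPatch24-4
+    .long   LBPatch25-4
+    .long   LBPatch26-4
+    .long   LBPatch27-4
+    .long   LBPatch28-4
+    .long   LBPatch29-4
+    .long   LBPatch30-4
+    .long   LBPatch31-4
+
+    .text
+
+    .align  4
+.globl C(R_Surf8Patch)
+C(R_Surf8Patch):
+    pushl   %ebx
+
+    movl    C(colormap),%eax
+    movl    $LPatchTable8,%ebx
+    movl    $32,%ecx
+LPatchLoop8:
+    movl    (%ebx),%edx
+    addl    $4,%ebx
+    movl    %eax,(%edx)
+    decl    %ecx
+    jnz     LPatchLoop8
+
+    popl    %ebx
+
+    ret
+
+#endif  // id386

R_Surf8Patch above (and R_Surf16Patch earlier) is self-modifying code. Every movb 0x12345678(%reg) in the drawers is immediately followed by an LBPatchN label, so LBPatchN-4 is the address of that instruction's 32-bit displacement field; 0x12345678 is only a placeholder. Before the drawers run, the patch routine stamps the live colormap address into all the sites, which is legal here because the DOS-era targets keep the text segment writable. The same idea in C; PatchTable is an illustrative name, and 32-bit pointers are assumed as on the original x86 targets:

static void PatchTable (void **patchtable, int numpatches, void *colormap)
{
    int i;

    for (i = 0; i < numpatches; i++)
        *(unsigned *)patchtable[i] = (unsigned)colormap;
}

This is why the colormap lookups in the unrolled loops cost no extra register: the pointer lives inside the instruction stream itself.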
diff --git a/source/sv_misc.c b/source/sv_misc.c
index 1f01f95..2186573 100644
--- a/source/sv_misc.c
+++ b/source/sv_misc.c
@@ -7,3 +7,8 @@
 void Draw_EndDisc(void)
 {
 }
+
+void
+Cmd_ForwardToServer (void)
+{
+}
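The empty Cmd_ForwardToServer satisfies the linker for the dedicated server build: cmd.c calls it for any console command it does not recognize, but on the server there is no remote host to forward to. For contrast, a sketch of what the client-side version does, send the current command over the reliable channel; the disconnected check and exact argument handling here are assumptions, only the clc_stringcmd / SZ_Print plumbing is the engine's own:

#include "quakedef.h"

void Cmd_ForwardToServer (void)
{
    if (cls.state == ca_disconnected)
        return;                          /* no server to forward to */

    MSG_WriteByte (&cls.netchan.message, clc_stringcmd);
    SZ_Print (&cls.netchan.message, Cmd_Argv(0));
    if (Cmd_Argc() > 1)
    {
        SZ_Print (&cls.netchan.message, " ");
        SZ_Print (&cls.netchan.message, Cmd_Args());
    }
}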
diff --git a/source/sys_dosa.S b/source/sys_dosa.S
new file mode 100644
index 0000000..7b0ea61
--- /dev/null
+++ b/source/sys_dosa.S
@@ -0,0 +1,95 @@
+//
+// sys_dosa.s
+// x86 assembly-language DOS-dependent routines.
+
+#include "asm_i386.h"
+#include "quakeasm.h"
+
+
+    .data
+
+    .align  4
+fpenv:
+    .long   0, 0, 0, 0, 0, 0, 0, 0
+
+    .text
+
+.globl C(MaskExceptions)
+C(MaskExceptions):
+    fnstenv fpenv
+    orl     $0x3F,fpenv
+    fldenv  fpenv
+
+    ret
+
+#if 0
+.globl C(unmaskexceptions)
+C(unmaskexceptions):
+    fnstenv fpenv
+    andl    $0xFFFFFFE0,fpenv
+    fldenv  fpenv
+
+    ret
+#endif
+
+    .data
+
+    .align  4
+.globl ceil_cw, single_cw, full_cw, cw, pushed_cw
+ceil_cw:    .long   0
+single_cw:  .long   0
+full_cw:    .long   0
+cw:         .long   0
+pushed_cw:  .long   0
+
+    .text
+
+.globl C(Sys_LowFPPrecision)
+C(Sys_LowFPPrecision):
+    fldcw   single_cw
+
+    ret
+
+.globl C(Sys_HighFPPrecision)
+C(Sys_HighFPPrecision):
+    fldcw   full_cw
+
+    ret
+
+.globl C(Sys_PushFPCW_SetHigh)
+C(Sys_PushFPCW_SetHigh):
+    fnstcw  pushed_cw
+    fldcw   full_cw
+
+    ret
+
+.globl C(Sys_PopFPCW)
+C(Sys_PopFPCW):
+    fldcw   pushed_cw
+
+    ret
+
+.globl C(Sys_SetFPCW)
+C(Sys_SetFPCW):
+    fnstcw  cw
+    movl    cw,%eax
+#if id386
+    andb    $0xF0,%ah
+    orb     $0x03,%ah           // round mode, 64-bit precision
+#endif
+    movl    %eax,full_cw
+
+#if id386
+    andb    $0xF0,%ah
+    orb     $0x0C,%ah           // chop mode, single precision
+#endif
+    movl    %eax,single_cw
+
+#if id386
+    andb    $0xF0,%ah
+    orb     $0x08,%ah           // ceil mode, single precision
+#endif
+    movl    %eax,ceil_cw
+
+    ret
+
diff --git a/source/sys_wina.S b/source/sys_wina.S
new file mode 100644
index 0000000..6de31c2
--- /dev/null
+++ b/source/sys_wina.S
@@ -0,0 +1,115 @@
+/*
+Copyright (C) 1996-1997 Id Software, Inc.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+
+See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+*/
+//
+// sys_wina.s
+// x86 assembly-language Win-dependent routines.
+
+#include "asm_i386.h"
+#include "quakeasm.h"
+
+//@@@ should be id386-dependent, and have an equivalent C path
+
+    .data
+
+    .align  4
+fpenv:
+    .long   0, 0, 0, 0, 0, 0, 0, 0
+
+    .text
+
+.globl C(MaskExceptions)
+C(MaskExceptions):
+    fnstenv fpenv
+    orl     $0x3F,fpenv
+    fldenv  fpenv
+
+    ret
+
+#if 0
+.globl C(unmaskexceptions)
+C(unmaskexceptions):
+    fnstenv fpenv
+    andl    $0xFFFFFFE0,fpenv
+    fldenv  fpenv
+
+    ret
+#endif
+
+    .data
+
+    .align  4
+.globl ceil_cw, single_cw, full_cw, cw, pushed_cw
+ceil_cw:    .long   0
+single_cw:  .long   0
+full_cw:    .long   0
+cw:         .long   0
+pushed_cw:  .long   0
+
+    .text
+
+.globl C(Sys_LowFPPrecision)
+C(Sys_LowFPPrecision):
+    fldcw   single_cw
+
+    ret
+
+.globl C(Sys_HighFPPrecision)
+C(Sys_HighFPPrecision):
+    fldcw   full_cw
+
+    ret
+
+.globl C(Sys_PushFPCW_SetHigh)
+C(Sys_PushFPCW_SetHigh):
+    fnstcw  pushed_cw
+    fldcw   full_cw
+
+    ret
+
+.globl C(Sys_PopFPCW)
+C(Sys_PopFPCW):
+    fldcw   pushed_cw
+
+    ret
+
+.globl C(Sys_SetFPCW)
+C(Sys_SetFPCW):
+    fnstcw  cw
+    movl    cw,%eax
+#if id386
+    andb    $0xF0,%ah
+    orb     $0x03,%ah           // round mode, 64-bit precision
+#endif
+    movl    %eax,full_cw
+
+#if id386
+    andb    $0xF0,%ah
+    orb     $0x0C,%ah           // chop mode, single precision
+#endif
+    movl    %eax,single_cw
+
+#if id386
+    andb    $0xF0,%ah
+    orb     $0x08,%ah           // ceil mode, single precision
+#endif
+    movl    %eax,ceil_cw
+
+    ret
+
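sys_wina.S carries the same FPU routines as sys_dosa.S, so one reading covers both. Sys_SetFPCW derives three control words from the current one: in the x87 control word, bits 8-9 are precision control and bits 10-11 rounding control, and %ah holds bits 8-15, so andb $0xF0,%ah clears both fields before each orb selects a mode (0x03 for round-to-nearest at 64-bit precision, 0x0C for truncation at 24-bit precision, 0x08 for round-toward-plus-infinity at 24-bit precision). A runnable sketch of the three values; the starting value 0x037F, the x87 power-on default, is just an example input:

#include <stdio.h>

int main (void)
{
    unsigned short cw = 0x037F;   /* example: power-on default control word */

    /* andb $0xF0,%ah == cw & 0xF0FF; each orb sets bits 8-11 */
    unsigned short full_cw   = (cw & 0xF0FF) | 0x0300; /* nearest, 64-bit   */
    unsigned short single_cw = (cw & 0xF0FF) | 0x0C00; /* chop, 24-bit      */
    unsigned short ceil_cw   = (cw & 0xF0FF) | 0x0800; /* round up, 24-bit  */

    printf ("full=%04X single=%04X ceil=%04X\n", full_cw, single_cw, ceil_cw);
    return 0;
}

The chop (truncate) mode matches C's float-to-int cast semantics, which is why the renderer runs in single_cw and only switches to full_cw around code that needs the extra precision.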