diff --git a/source/Makefile b/source/Makefile index dc3bd5c..67eeecd 100644 --- a/source/Makefile +++ b/source/Makefile @@ -4,10 +4,10 @@ SV_name=qw-server SV_libs= CL_SVGA_name=qw-client-svga -SV_libs= +CL_SVGA_libs=-lvga CL_X11_name=qw-client-x11 -SV_libs= +CL_X11_libs= DIRECTORIES= vpath %.a $(patsubst @%,%,$(DIRECTORIES)) /usr/lib @@ -65,6 +65,8 @@ SV_dependencies = $(patsubst %,%.d,$(basename $(SV_sources))) SV_objects = $(patsubst %.d,%.o,$(SV_dependencies)) CL_sources=\ + cl_cmd.c \ + cl_cvar.c \ cl_demo.c \ cl_ents.c \ cl_input.c \ @@ -127,7 +129,25 @@ CL_sources=\ zone.c \ cd_linux.c \ sys_linux.c \ - snd_linux.c + snd_linux.c \ + d_copy.S \ + d_draw.S \ + d_draw16.S \ + d_parta.S \ + d_polysa.S \ + d_scana.S \ + d_spr8.S \ + d_varsa.S \ + math.S \ + r_aclipa.S \ + r_aliasa.S \ + r_drawa.S \ + r_edgea.S \ + r_varsa.S \ + snd_mixa.S \ + surf16.S \ + surf8.S \ + sys_dosa.S CL_dependencies = $(patsubst %,%.d,$(basename $(CL_sources))) CL_objects = $(patsubst %.d,%.o,$(CL_dependencies)) diff --git a/source/cl_cvar.c b/source/cl_cvar.c index 3d26acf..14e17d9 100644 --- a/source/cl_cvar.c +++ b/source/cl_cvar.c @@ -29,7 +29,7 @@ Cvar_Info(cvar_t *var) if (cls.state >= ca_connected) { MSG_WriteByte (&cls.netchan.message, clc_stringcmd); - SZ_Print (&cls.netchan.message, va("setinfo \"%s\" \"%s\"\n", var->name, string)); + SZ_Print (&cls.netchan.message, va("setinfo \"%s\" \"%s\"\n", var->name, var->string)); } } } diff --git a/source/cmd.c b/source/cmd.c index 811d1d2..fbdd47a 100644 --- a/source/cmd.c +++ b/source/cmd.c @@ -595,10 +595,6 @@ char *Cmd_CompleteCommand (char *partial) return NULL; } -void Cmd_ForwardToServer (void) -{ -} - /* ============ Cmd_ExecuteString diff --git a/source/d_copy.S b/source/d_copy.S new file mode 100644 index 0000000..92e414a --- /dev/null +++ b/source/d_copy.S @@ -0,0 +1,149 @@ +// +// d_copy.s +// x86 assembly-language screen copying code. 
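+// In outline (a pseudo-C sketch; outportb() stands in for whatever port
+// I/O primitive the platform provides -- it is not defined in this file):
+// Mode X spreads the screen across four planes, pixel x living in plane
+// x&3, and the Sequence Controller's Map Mask register (index 2 at port
+// 0x3C4) selects which plane(s) a write lands in.  So the planar update
+// below amounts to:
+//
+//	outportb (0x3C4, 2);			// SC index: Map Mask
+//	for (plane = 0 ; plane < 4 ; plane++)
+//	{
+//		outportb (0x3C5, 1 << plane);	// write-enable one plane
+//		for (y = 0 ; y < height ; y++)
+//			for (x = plane ; x < width ; x += 4)
+//				vga[y*rowbytes + (x >> 2)] =
+//						buf[y*bufrowbytes + x];
+//	}
+//
+// done in small row blocks per plane so each block's source bytes stay
+// in cache across the four plane passes.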
+// + +#include "asm_i386.h" +#include "quakeasm.h" +#include "asm_draw.h" + + .data + +LCopyWidth: .long 0 +LBlockSrcStep: .long 0 +LBlockDestStep: .long 0 +LSrcDelta: .long 0 +LDestDelta: .long 0 + +#define bufptr 4+16 + +// copies 16 rows per plane at a pop; idea is that 16*512 = 8k, and since +// no Mode X mode is wider than 360, all the data should fit in the cache for +// the passes for the next 3 planes + + .text + +.globl C(VGA_UpdatePlanarScreen) +C(VGA_UpdatePlanarScreen): + pushl %ebp // preserve caller's stack frame + pushl %edi + pushl %esi // preserve register variables + pushl %ebx + + movl C(VGA_bufferrowbytes),%eax + shll $1,%eax + movl %eax,LBlockSrcStep + movl C(VGA_rowbytes),%eax + shll $1,%eax + movl %eax,LBlockDestStep + + movl $0x3C4,%edx + movb $2,%al + outb %al,%dx // point the SC to the Map Mask + incl %edx + + movl bufptr(%esp),%esi + movl C(VGA_pagebase),%edi + movl C(VGA_height),%ebp + shrl $1,%ebp + + movl C(VGA_width),%ecx + movl C(VGA_bufferrowbytes),%eax + subl %ecx,%eax + movl %eax,LSrcDelta + movl C(VGA_rowbytes),%eax + shll $2,%eax + subl %ecx,%eax + movl %eax,LDestDelta + shrl $4,%ecx + movl %ecx,LCopyWidth + +LRowLoop: + movb $1,%al + +LPlaneLoop: + outb %al,%dx + movb $2,%ah + + pushl %esi + pushl %edi +LRowSetLoop: + movl LCopyWidth,%ecx +LColumnLoop: + movb 12(%esi),%bh + movb 8(%esi),%bl + shll $16,%ebx + movb 4(%esi),%bh + movb (%esi),%bl + movl %ebx,(%edi) + addl $16,%esi + addl $4,%edi + decl %ecx + jnz LColumnLoop + + addl LDestDelta,%edi + addl LSrcDelta,%esi + decb %ah + jnz LRowSetLoop + + popl %edi + popl %esi + incl %esi + + shlb $1,%al + cmpb $16,%al + jnz LPlaneLoop + + subl $4,%esi + addl LBlockSrcStep,%esi + addl LBlockDestStep,%edi + decl %ebp + jnz LRowLoop + + popl %ebx // restore register variables + popl %esi + popl %edi + popl %ebp // restore the caller's stack frame + + ret + + +#define srcptr 4+16 +#define destptr 8+16 +#define width 12+16 +#define height 16+16 +#define srcrowbytes 20+16 +#define destrowbytes 24+16 + +.globl C(VGA_UpdateLinearScreen) +C(VGA_UpdateLinearScreen): + pushl %ebp // preserve caller's stack frame + pushl %edi + pushl %esi // preserve register variables + pushl %ebx + + cld + movl srcptr(%esp),%esi + movl destptr(%esp),%edi + movl width(%esp),%ebx + movl srcrowbytes(%esp),%eax + subl %ebx,%eax + movl destrowbytes(%esp),%edx + subl %ebx,%edx + shrl $2,%ebx + movl height(%esp),%ebp +LLRowLoop: + movl %ebx,%ecx + rep/movsl (%esi),(%edi) + addl %eax,%esi + addl %edx,%edi + decl %ebp + jnz LLRowLoop + + popl %ebx // restore register variables + popl %esi + popl %edi + popl %ebp // restore the caller's stack frame + + ret + diff --git a/source/d_draw.S b/source/d_draw.S new file mode 100644 index 0000000..d50b982 --- /dev/null +++ b/source/d_draw.S @@ -0,0 +1,1037 @@ +/* +Copyright (C) 1996-1997 Id Software, Inc. + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + +See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ +*/ +// +// d_draw.s +// x86 assembly-language horizontal 8-bpp span-drawing code. +// + +#include "asm_i386.h" +#include "quakeasm.h" +#include "asm_draw.h" +#include "d_ifacea.h" + +#if id386 + +//---------------------------------------------------------------------- +// 8-bpp horizontal span drawing code for polygons, with no transparency. +// +// Assumes there is at least one span in pspans, and that every span +// contains at least one pixel +//---------------------------------------------------------------------- + + .text + +// out-of-line, rarely-needed clamping code + +LClampHigh0: + movl C(bbextents),%esi + jmp LClampReentry0 +LClampHighOrLow0: + jg LClampHigh0 + xorl %esi,%esi + jmp LClampReentry0 + +LClampHigh1: + movl C(bbextentt),%edx + jmp LClampReentry1 +LClampHighOrLow1: + jg LClampHigh1 + xorl %edx,%edx + jmp LClampReentry1 + +LClampLow2: + movl $2048,%ebp + jmp LClampReentry2 +LClampHigh2: + movl C(bbextents),%ebp + jmp LClampReentry2 + +LClampLow3: + movl $2048,%ecx + jmp LClampReentry3 +LClampHigh3: + movl C(bbextentt),%ecx + jmp LClampReentry3 + +LClampLow4: + movl $2048,%eax + jmp LClampReentry4 +LClampHigh4: + movl C(bbextents),%eax + jmp LClampReentry4 + +LClampLow5: + movl $2048,%ebx + jmp LClampReentry5 +LClampHigh5: + movl C(bbextentt),%ebx + jmp LClampReentry5 + + +#define pspans 4+16 + + .align 4 +.globl C(D_DrawSpans8) +C(D_DrawSpans8): + pushl %ebp // preserve caller's stack frame + pushl %edi + pushl %esi // preserve register variables + pushl %ebx + +// +// set up scaled-by-8 steps, for 8-long segments; also set up cacheblock +// and span list pointers +// +// TODO: any overlap from rearranging? + flds C(d_sdivzstepu) + fmuls fp_8 + movl C(cacheblock),%edx + flds C(d_tdivzstepu) + fmuls fp_8 + movl pspans(%esp),%ebx // point to the first span descriptor + flds C(d_zistepu) + fmuls fp_8 + movl %edx,pbase // pbase = cacheblock + fstps zi8stepu + fstps tdivz8stepu + fstps sdivz8stepu + +LSpanLoop: +// +// set up the initial s/z, t/z, and 1/z on the FP stack, and generate the +// initial s and t values +// +// FIXME: pipeline FILD? 
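+// In C terms (compare the !id386 fallback in d_scan.c), this block
+// computes, for the span's left edge at screen coordinates (u,v):
+//
+//	sdivz = d_sdivzorigin + dv*d_sdivzstepv + du*d_sdivzstepu;
+//	tdivz = d_tdivzorigin + dv*d_tdivzstepv + du*d_tdivzstepu;
+//	zi    = d_ziorigin    + dv*d_zistepv    + du*d_zistepu;
+//	z = (float)0x10000 / zi;	// prescale to 16.16 fixed-point
+//	s = (int)(sdivz * z) + sadjust;	// then clamped to [0, bbextents]
+//	t = (int)(tdivz * z) + tadjust;	// then clamped to [0, bbextentt]
+//
+// with the operations interleaved so the long-latency FDIV can overlap
+// the integer setup that follows.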
+ fildl espan_t_v(%ebx) + fildl espan_t_u(%ebx) + + fld %st(1) // dv | du | dv + fmuls C(d_sdivzstepv) // dv*d_sdivzstepv | du | dv + fld %st(1) // du | dv*d_sdivzstepv | du | dv + fmuls C(d_sdivzstepu) // du*d_sdivzstepu | dv*d_sdivzstepv | du | dv + fld %st(2) // du | du*d_sdivzstepu | dv*d_sdivzstepv | du | dv + fmuls C(d_tdivzstepu) // du*d_tdivzstepu | du*d_sdivzstepu | + // dv*d_sdivzstepv | du | dv + fxch %st(1) // du*d_sdivzstepu | du*d_tdivzstepu | + // dv*d_sdivzstepv | du | dv + faddp %st(0),%st(2) // du*d_tdivzstepu | + // du*d_sdivzstepu + dv*d_sdivzstepv | du | dv + fxch %st(1) // du*d_sdivzstepu + dv*d_sdivzstepv | + // du*d_tdivzstepu | du | dv + fld %st(3) // dv | du*d_sdivzstepu + dv*d_sdivzstepv | + // du*d_tdivzstepu | du | dv + fmuls C(d_tdivzstepv) // dv*d_tdivzstepv | + // du*d_sdivzstepu + dv*d_sdivzstepv | + // du*d_tdivzstepu | du | dv + fxch %st(1) // du*d_sdivzstepu + dv*d_sdivzstepv | + // dv*d_tdivzstepv | du*d_tdivzstepu | du | dv + fadds C(d_sdivzorigin) // sdivz = d_sdivzorigin + dv*d_sdivzstepv + + // du*d_sdivzstepu; stays in %st(2) at end + fxch %st(4) // dv | dv*d_tdivzstepv | du*d_tdivzstepu | du | + // s/z + fmuls C(d_zistepv) // dv*d_zistepv | dv*d_tdivzstepv | + // du*d_tdivzstepu | du | s/z + fxch %st(1) // dv*d_tdivzstepv | dv*d_zistepv | + // du*d_tdivzstepu | du | s/z + faddp %st(0),%st(2) // dv*d_zistepv | + // dv*d_tdivzstepv + du*d_tdivzstepu | du | s/z + fxch %st(2) // du | dv*d_tdivzstepv + du*d_tdivzstepu | + // dv*d_zistepv | s/z + fmuls C(d_zistepu) // du*d_zistepu | + // dv*d_tdivzstepv + du*d_tdivzstepu | + // dv*d_zistepv | s/z + fxch %st(1) // dv*d_tdivzstepv + du*d_tdivzstepu | + // du*d_zistepu | dv*d_zistepv | s/z + fadds C(d_tdivzorigin) // tdivz = d_tdivzorigin + dv*d_tdivzstepv + + // du*d_tdivzstepu; stays in %st(1) at end + fxch %st(2) // dv*d_zistepv | du*d_zistepu | t/z | s/z + faddp %st(0),%st(1) // dv*d_zistepv + du*d_zistepu | t/z | s/z + + flds fp_64k // fp_64k | dv*d_zistepv + du*d_zistepu | t/z | s/z + fxch %st(1) // dv*d_zistepv + du*d_zistepu | fp_64k | t/z | s/z + fadds C(d_ziorigin) // zi = d_ziorigin + dv*d_zistepv + + // du*d_zistepu; stays in %st(0) at end + // 1/z | fp_64k | t/z | s/z +// +// calculate and clamp s & t +// + fdivr %st(0),%st(1) // 1/z | z*64k | t/z | s/z + +// +// point %edi to the first pixel in the span +// + movl C(d_viewbuffer),%ecx + movl espan_t_v(%ebx),%eax + movl %ebx,pspantemp // preserve spans pointer + + movl C(tadjust),%edx + movl C(sadjust),%esi + movl C(d_scantable)(,%eax,4),%edi // v * screenwidth + addl %ecx,%edi + movl espan_t_u(%ebx),%ecx + addl %ecx,%edi // pdest = &pdestspan[scans->u]; + movl espan_t_count(%ebx),%ecx + +// +// now start the FDIV for the end of the span +// + cmpl $8,%ecx + ja LSetupNotLast1 + + decl %ecx + jz LCleanup1 // if only one pixel, no need to start an FDIV + movl %ecx,spancountminus1 + +// finish up the s and t calcs + fxch %st(1) // z*64k | 1/z | t/z | s/z + + fld %st(0) // z*64k | z*64k | 1/z | t/z | s/z + fmul %st(4),%st(0) // s | z*64k | 1/z | t/z | s/z + fxch %st(1) // z*64k | s | 1/z | t/z | s/z + fmul %st(3),%st(0) // t | s | 1/z | t/z | s/z + fxch %st(1) // s | t | 1/z | t/z | s/z + fistpl s // 1/z | t | t/z | s/z + fistpl t // 1/z | t/z | s/z + + fildl spancountminus1 + + flds C(d_tdivzstepu) // C(d_tdivzstepu) | spancountminus1 + flds C(d_zistepu) // C(d_zistepu) | C(d_tdivzstepu) | spancountminus1 + fmul %st(2),%st(0) // C(d_zistepu)*scm1 | C(d_tdivzstepu) | scm1 + fxch %st(1) // C(d_tdivzstepu) | C(d_zistepu)*scm1 | scm1 + fmul 
%st(2),%st(0) // C(d_tdivzstepu)*scm1 | C(d_zistepu)*scm1 | scm1 + fxch %st(2) // scm1 | C(d_zistepu)*scm1 | C(d_tdivzstepu)*scm1 + fmuls C(d_sdivzstepu) // C(d_sdivzstepu)*scm1 | C(d_zistepu)*scm1 | + // C(d_tdivzstepu)*scm1 + fxch %st(1) // C(d_zistepu)*scm1 | C(d_sdivzstepu)*scm1 | + // C(d_tdivzstepu)*scm1 + faddp %st(0),%st(3) // C(d_sdivzstepu)*scm1 | C(d_tdivzstepu)*scm1 + fxch %st(1) // C(d_tdivzstepu)*scm1 | C(d_sdivzstepu)*scm1 + faddp %st(0),%st(3) // C(d_sdivzstepu)*scm1 + faddp %st(0),%st(3) + + flds fp_64k + fdiv %st(1),%st(0) // this is what we've gone to all this trouble to + // overlap + jmp LFDIVInFlight1 + +LCleanup1: +// finish up the s and t calcs + fxch %st(1) // z*64k | 1/z | t/z | s/z + + fld %st(0) // z*64k | z*64k | 1/z | t/z | s/z + fmul %st(4),%st(0) // s | z*64k | 1/z | t/z | s/z + fxch %st(1) // z*64k | s | 1/z | t/z | s/z + fmul %st(3),%st(0) // t | s | 1/z | t/z | s/z + fxch %st(1) // s | t | 1/z | t/z | s/z + fistpl s // 1/z | t | t/z | s/z + fistpl t // 1/z | t/z | s/z + jmp LFDIVInFlight1 + + .align 4 +LSetupNotLast1: +// finish up the s and t calcs + fxch %st(1) // z*64k | 1/z | t/z | s/z + + fld %st(0) // z*64k | z*64k | 1/z | t/z | s/z + fmul %st(4),%st(0) // s | z*64k | 1/z | t/z | s/z + fxch %st(1) // z*64k | s | 1/z | t/z | s/z + fmul %st(3),%st(0) // t | s | 1/z | t/z | s/z + fxch %st(1) // s | t | 1/z | t/z | s/z + fistpl s // 1/z | t | t/z | s/z + fistpl t // 1/z | t/z | s/z + + fadds zi8stepu + fxch %st(2) + fadds sdivz8stepu + fxch %st(2) + flds tdivz8stepu + faddp %st(0),%st(2) + flds fp_64k + fdiv %st(1),%st(0) // z = 1/1/z + // this is what we've gone to all this trouble to + // overlap +LFDIVInFlight1: + + addl s,%esi + addl t,%edx + movl C(bbextents),%ebx + movl C(bbextentt),%ebp + cmpl %ebx,%esi + ja LClampHighOrLow0 +LClampReentry0: + movl %esi,s + movl pbase,%ebx + shll $16,%esi + cmpl %ebp,%edx + movl %esi,sfracf + ja LClampHighOrLow1 +LClampReentry1: + movl %edx,t + movl s,%esi // sfrac = scans->sfrac; + shll $16,%edx + movl t,%eax // tfrac = scans->tfrac; + sarl $16,%esi + movl %edx,tfracf + +// +// calculate the texture starting address +// + sarl $16,%eax + movl C(cachewidth),%edx + imull %edx,%eax // (tfrac >> 16) * cachewidth + addl %ebx,%esi + addl %eax,%esi // psource = pbase + (sfrac >> 16) + + // ((tfrac >> 16) * cachewidth); + +// +// determine whether last span or not +// + cmpl $8,%ecx + jna LLastSegment + +// +// not the last segment; do full 8-wide segment +// +LNotLastSegment: + +// +// advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to +// get there +// + +// pick up after the FDIV that was left in flight previously + + fld %st(0) // duplicate it + fmul %st(4),%st(0) // s = s/z * z + fxch %st(1) + fmul %st(3),%st(0) // t = t/z * z + fxch %st(1) + fistpl snext + fistpl tnext + movl snext,%eax + movl tnext,%edx + + movb (%esi),%bl // get first source texel + subl $8,%ecx // count off this segments' pixels + movl C(sadjust),%ebp + movl %ecx,counttemp // remember count of remaining pixels + + movl C(tadjust),%ecx + movb %bl,(%edi) // store first dest pixel + + addl %eax,%ebp + addl %edx,%ecx + + movl C(bbextents),%eax + movl C(bbextentt),%edx + + cmpl $2048,%ebp + jl LClampLow2 + cmpl %eax,%ebp + ja LClampHigh2 +LClampReentry2: + + cmpl $2048,%ecx + jl LClampLow3 + cmpl %edx,%ecx + ja LClampHigh3 +LClampReentry3: + + movl %ebp,snext + movl %ecx,tnext + + subl s,%ebp + subl t,%ecx + +// +// set up advancetable +// + movl %ecx,%eax + movl %ebp,%edx + sarl $19,%eax // tstep >>= 16; + jz LZero + sarl 
$19,%edx // sstep >>= 16; + movl C(cachewidth),%ebx + imull %ebx,%eax + jmp LSetUp1 + +LZero: + sarl $19,%edx // sstep >>= 16; + movl C(cachewidth),%ebx + +LSetUp1: + + addl %edx,%eax // add in sstep + // (tstep >> 16) * cachewidth + (sstep >> 16); + movl tfracf,%edx + movl %eax,advancetable+4 // advance base in t + addl %ebx,%eax // ((tstep >> 16) + 1) * cachewidth + + // (sstep >> 16); + shll $13,%ebp // left-justify sstep fractional part + movl sfracf,%ebx + shll $13,%ecx // left-justify tstep fractional part + movl %eax,advancetable // advance extra in t + + movl %ecx,tstep + addl %ecx,%edx // advance tfrac fractional part by tstep frac + + sbbl %ecx,%ecx // turn tstep carry into -1 (0 if none) + addl %ebp,%ebx // advance sfrac fractional part by sstep frac + adcl advancetable+4(,%ecx,4),%esi // point to next source texel + + addl tstep,%edx + sbbl %ecx,%ecx + movb (%esi),%al + addl %ebp,%ebx + movb %al,1(%edi) + adcl advancetable+4(,%ecx,4),%esi + + addl tstep,%edx + sbbl %ecx,%ecx + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + + addl tstep,%edx + sbbl %ecx,%ecx + movb %al,2(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + + addl tstep,%edx + sbbl %ecx,%ecx + movb %al,3(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + + +// +// start FDIV for end of next segment in flight, so it can overlap +// + movl counttemp,%ecx + cmpl $8,%ecx // more than one segment after this? + ja LSetupNotLast2 // yes + + decl %ecx + jz LFDIVInFlight2 // if only one pixel, no need to start an FDIV + movl %ecx,spancountminus1 + fildl spancountminus1 + + flds C(d_zistepu) // C(d_zistepu) | spancountminus1 + fmul %st(1),%st(0) // C(d_zistepu)*scm1 | scm1 + flds C(d_tdivzstepu) // C(d_tdivzstepu) | C(d_zistepu)*scm1 | scm1 + fmul %st(2),%st(0) // C(d_tdivzstepu)*scm1 | C(d_zistepu)*scm1 | scm1 + fxch %st(1) // C(d_zistepu)*scm1 | C(d_tdivzstepu)*scm1 | scm1 + faddp %st(0),%st(3) // C(d_tdivzstepu)*scm1 | scm1 + fxch %st(1) // scm1 | C(d_tdivzstepu)*scm1 + fmuls C(d_sdivzstepu) // C(d_sdivzstepu)*scm1 | C(d_tdivzstepu)*scm1 + fxch %st(1) // C(d_tdivzstepu)*scm1 | C(d_sdivzstepu)*scm1 + faddp %st(0),%st(3) // C(d_sdivzstepu)*scm1 + flds fp_64k // 64k | C(d_sdivzstepu)*scm1 + fxch %st(1) // C(d_sdivzstepu)*scm1 | 64k + faddp %st(0),%st(4) // 64k + + fdiv %st(1),%st(0) // this is what we've gone to all this trouble to + // overlap + jmp LFDIVInFlight2 + + .align 4 +LSetupNotLast2: + fadds zi8stepu + fxch %st(2) + fadds sdivz8stepu + fxch %st(2) + flds tdivz8stepu + faddp %st(0),%st(2) + flds fp_64k + fdiv %st(1),%st(0) // z = 1/1/z + // this is what we've gone to all this trouble to + // overlap +LFDIVInFlight2: + movl %ecx,counttemp + + addl tstep,%edx + sbbl %ecx,%ecx + movb %al,4(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + + addl tstep,%edx + sbbl %ecx,%ecx + movb %al,5(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + + addl tstep,%edx + sbbl %ecx,%ecx + movb %al,6(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + + addl $8,%edi + movl %edx,tfracf + movl snext,%edx + movl %ebx,sfracf + movl tnext,%ebx + movl %edx,s + movl %ebx,t + + movl counttemp,%ecx // retrieve count + +// +// determine whether last span or not +// + cmpl $8,%ecx // are there multiple segments remaining? 
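+// (Each texel store in the unrolled segment above uses the same
+// carry-driven stepping idiom; one pixel of it reads roughly like this
+// in C, with the exact fixed-point widths elided and tstepfrac/sstepfrac
+// holding the left-justified fractional parts of the steps:
+//
+//	pix = *psource;				// texel for this pixel
+//	newtf = tfracf + tstepfrac;		// step the t fraction
+//	carried = (newtf < tfracf);		// 1 if it wrapped
+//	newsf = sfracf + sstepfrac;		// step the s fraction
+//	psource += advancetable[1 - carried] + (newsf < sfracf);
+//	tfracf = newtf;
+//	sfracf = newsf;
+//	*pdest++ = pix;
+//
+// advancetable[1] is the whole-pixel source stride for one step (the
+// whole part of tstep times cachewidth, plus the whole part of sstep),
+// and advancetable[0] is that plus one extra cachewidth row for when
+// the t fraction carries; sbbl materializes the carry as a 0/-1 index
+// and adcl folds the s-fraction carry in for free.)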
+ movb %al,-1(%edi) + ja LNotLastSegment // yes + +// +// last segment of scan +// +LLastSegment: + +// +// advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to +// get there. The number of pixels left is variable, and we want to land on the +// last pixel, not step one past it, so we can't run into arithmetic problems +// + testl %ecx,%ecx + jz LNoSteps // just draw the last pixel and we're done + +// pick up after the FDIV that was left in flight previously + + + fld %st(0) // duplicate it + fmul %st(4),%st(0) // s = s/z * z + fxch %st(1) + fmul %st(3),%st(0) // t = t/z * z + fxch %st(1) + fistpl snext + fistpl tnext + + movb (%esi),%al // load first texel in segment + movl C(tadjust),%ebx + movb %al,(%edi) // store first pixel in segment + movl C(sadjust),%eax + + addl snext,%eax + addl tnext,%ebx + + movl C(bbextents),%ebp + movl C(bbextentt),%edx + + cmpl $2048,%eax + jl LClampLow4 + cmpl %ebp,%eax + ja LClampHigh4 +LClampReentry4: + movl %eax,snext + + cmpl $2048,%ebx + jl LClampLow5 + cmpl %edx,%ebx + ja LClampHigh5 +LClampReentry5: + + cmpl $1,%ecx // don't bother + je LOnlyOneStep // if two pixels in segment, there's only one step, + // of the segment length + subl s,%eax + subl t,%ebx + + addl %eax,%eax // convert to 15.17 format so multiply by 1.31 + addl %ebx,%ebx // reciprocal yields 16.48 + + imull reciprocal_table-8(,%ecx,4) // sstep = (snext - s) / (spancount-1) + movl %edx,%ebp + + movl %ebx,%eax + imull reciprocal_table-8(,%ecx,4) // tstep = (tnext - t) / (spancount-1) + +LSetEntryvec: +// +// set up advancetable +// + movl entryvec_table(,%ecx,4),%ebx + movl %edx,%eax + movl %ebx,jumptemp // entry point into code for RET later + movl %ebp,%ecx + sarl $16,%edx // tstep >>= 16; + movl C(cachewidth),%ebx + sarl $16,%ecx // sstep >>= 16; + imull %ebx,%edx + + addl %ecx,%edx // add in sstep + // (tstep >> 16) * cachewidth + (sstep >> 16); + movl tfracf,%ecx + movl %edx,advancetable+4 // advance base in t + addl %ebx,%edx // ((tstep >> 16) + 1) * cachewidth + + // (sstep >> 16); + shll $16,%ebp // left-justify sstep fractional part + movl sfracf,%ebx + shll $16,%eax // left-justify tstep fractional part + movl %edx,advancetable // advance extra in t + + movl %eax,tstep + movl %ecx,%edx + addl %eax,%edx + sbbl %ecx,%ecx + addl %ebp,%ebx + adcl advancetable+4(,%ecx,4),%esi + + jmp *jumptemp // jump to the number-of-pixels handler + +//---------------------------------------- + +LNoSteps: + movb (%esi),%al // load first texel in segment + subl $7,%edi // adjust for hardwired offset + jmp LEndSpan + + +LOnlyOneStep: + subl s,%eax + subl t,%ebx + movl %eax,%ebp + movl %ebx,%edx + jmp LSetEntryvec + +//---------------------------------------- + +.globl Entry2_8 +Entry2_8: + subl $6,%edi // adjust for hardwired offsets + movb (%esi),%al + jmp LLEntry2_8 + +//---------------------------------------- + +.globl Entry3_8 +Entry3_8: + subl $5,%edi // adjust for hardwired offsets + addl %eax,%edx + movb (%esi),%al + sbbl %ecx,%ecx + addl %ebp,%ebx + adcl advancetable+4(,%ecx,4),%esi + jmp LLEntry3_8 + +//---------------------------------------- + +.globl Entry4_8 +Entry4_8: + subl $4,%edi // adjust for hardwired offsets + addl %eax,%edx + movb (%esi),%al + sbbl %ecx,%ecx + addl %ebp,%ebx + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx + jmp LLEntry4_8 + +//---------------------------------------- + +.globl Entry5_8 +Entry5_8: + subl $3,%edi // adjust for hardwired offsets + addl %eax,%edx + movb (%esi),%al + sbbl %ecx,%ecx + addl %ebp,%ebx + adcl 
advancetable+4(,%ecx,4),%esi + addl tstep,%edx + jmp LLEntry5_8 + +//---------------------------------------- + +.globl Entry6_8 +Entry6_8: + subl $2,%edi // adjust for hardwired offsets + addl %eax,%edx + movb (%esi),%al + sbbl %ecx,%ecx + addl %ebp,%ebx + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx + jmp LLEntry6_8 + +//---------------------------------------- + +.globl Entry7_8 +Entry7_8: + decl %edi // adjust for hardwired offsets + addl %eax,%edx + movb (%esi),%al + sbbl %ecx,%ecx + addl %ebp,%ebx + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx + jmp LLEntry7_8 + +//---------------------------------------- + +.globl Entry8_8 +Entry8_8: + addl %eax,%edx + movb (%esi),%al + sbbl %ecx,%ecx + addl %ebp,%ebx + adcl advancetable+4(,%ecx,4),%esi + + addl tstep,%edx + sbbl %ecx,%ecx + movb %al,1(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx +LLEntry7_8: + sbbl %ecx,%ecx + movb %al,2(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx +LLEntry6_8: + sbbl %ecx,%ecx + movb %al,3(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx +LLEntry5_8: + sbbl %ecx,%ecx + movb %al,4(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx +LLEntry4_8: + sbbl %ecx,%ecx + movb %al,5(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi +LLEntry3_8: + movb %al,6(%edi) + movb (%esi),%al +LLEntry2_8: + +LEndSpan: + +// +// clear s/z, t/z, 1/z from FP stack +// + fstp %st(0) + fstp %st(0) + fstp %st(0) + + movl pspantemp,%ebx // restore spans pointer + movl espan_t_pnext(%ebx),%ebx // point to next span + testl %ebx,%ebx // any more spans? + movb %al,7(%edi) + jnz LSpanLoop // more spans + + popl %ebx // restore register variables + popl %esi + popl %edi + popl %ebp // restore the caller's stack frame + ret + +//---------------------------------------------------------------------- +// 8-bpp horizontal span z drawing codefor polygons, with no transparency. 
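+// (In C terms -- compare the !id386 fallback in d_scan.c -- 1/z is
+// carried as a fixed-point value scaled by 2^31 so each 16-bit z-buffer
+// entry is simply its top half:
+//
+//	izistep = (int)(d_zistepu * 0x8000 * 0x10000);
+//	izi = (int)(zi * 0x8000 * 0x10000);
+//	while (count--)
+//	{
+//		*pdest++ = (short)(izi >> 16);
+//		izi += izistep;
+//	}
+//
+// The assembly below additionally pairs the 16-bit stores into aligned
+// dword writes, and keeps a separate copy of the loop that subtracts a
+// positive step when the 1/z gradient is negative.)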
+// +// Assumes there is at least one span in pzspans, and that every span +// contains at least one pixel +//---------------------------------------------------------------------- + + .text + +// z-clamp on a non-negative gradient span +LClamp: + movl $0x40000000,%edx + xorl %ebx,%ebx + fstp %st(0) + jmp LZDraw + +// z-clamp on a negative gradient span +LClampNeg: + movl $0x40000000,%edx + xorl %ebx,%ebx + fstp %st(0) + jmp LZDrawNeg + + +#define pzspans 4+16 + +.globl C(D_DrawZSpans) +C(D_DrawZSpans): + pushl %ebp // preserve caller's stack frame + pushl %edi + pushl %esi // preserve register variables + pushl %ebx + + flds C(d_zistepu) + movl C(d_zistepu),%eax + movl pzspans(%esp),%esi + testl %eax,%eax + jz LFNegSpan + + fmuls Float2ToThe31nd + fistpl izistep // note: we are relying on FP exceptions being turned + // off here to avoid range problems + movl izistep,%ebx // remains loaded for all spans + +LFSpanLoop: +// set up the initial 1/z value + fildl espan_t_v(%esi) + fildl espan_t_u(%esi) + movl espan_t_v(%esi),%ecx + movl C(d_pzbuffer),%edi + fmuls C(d_zistepu) + fxch %st(1) + fmuls C(d_zistepv) + fxch %st(1) + fadds C(d_ziorigin) + imull C(d_zrowbytes),%ecx + faddp %st(0),%st(1) + +// clamp if z is nearer than 2 (1/z > 0.5) + fcoms float_point5 + addl %ecx,%edi + movl espan_t_u(%esi),%edx + addl %edx,%edx // word count + movl espan_t_count(%esi),%ecx + addl %edx,%edi // pdest = &pdestspan[scans->u]; + pushl %esi // preserve spans pointer + fnstsw %ax + testb $0x45,%ah + jz LClamp + + fmuls Float2ToThe31nd + fistpl izi // note: we are relying on FP exceptions being turned + // off here to avoid problems when the span is closer + // than 1/(2**31) + movl izi,%edx + +// at this point: +// %ebx = izistep +// %ecx = count +// %edx = izi +// %edi = pdest + +LZDraw: + +// do a single pixel up front, if necessary to dword align the destination + testl $2,%edi + jz LFMiddle + movl %edx,%eax + addl %ebx,%edx + shrl $16,%eax + decl %ecx + movw %ax,(%edi) + addl $2,%edi + +// do middle a pair of aligned dwords at a time +LFMiddle: + pushl %ecx + shrl $1,%ecx // count / 2 + jz LFLast // no aligned dwords to do + shrl $1,%ecx // (count / 2) / 2 + jnc LFMiddleLoop // even number of aligned dwords to do + + movl %edx,%eax + addl %ebx,%edx + shrl $16,%eax + movl %edx,%esi + addl %ebx,%edx + andl $0xFFFF0000,%esi + orl %esi,%eax + movl %eax,(%edi) + addl $4,%edi + andl %ecx,%ecx + jz LFLast + +LFMiddleLoop: + movl %edx,%eax + addl %ebx,%edx + shrl $16,%eax + movl %edx,%esi + addl %ebx,%edx + andl $0xFFFF0000,%esi + orl %esi,%eax + movl %edx,%ebp + movl %eax,(%edi) + addl %ebx,%edx + shrl $16,%ebp + movl %edx,%esi + addl %ebx,%edx + andl $0xFFFF0000,%esi + orl %esi,%ebp + movl %ebp,4(%edi) // FIXME: eliminate register contention + addl $8,%edi + + decl %ecx + jnz LFMiddleLoop + +LFLast: + popl %ecx // retrieve count + popl %esi // retrieve span pointer + +// do the last, unaligned pixel, if there is one + andl $1,%ecx // is there an odd pixel left to do? 
+ jz LFSpanDone // no + shrl $16,%edx + movw %dx,(%edi) // do the final pixel's z + +LFSpanDone: + movl espan_t_pnext(%esi),%esi + testl %esi,%esi + jnz LFSpanLoop + + jmp LFDone + +LFNegSpan: + fmuls FloatMinus2ToThe31nd + fistpl izistep // note: we are relying on FP exceptions being turned + // off here to avoid range problems + movl izistep,%ebx // remains loaded for all spans + +LFNegSpanLoop: +// set up the initial 1/z value + fildl espan_t_v(%esi) + fildl espan_t_u(%esi) + movl espan_t_v(%esi),%ecx + movl C(d_pzbuffer),%edi + fmuls C(d_zistepu) + fxch %st(1) + fmuls C(d_zistepv) + fxch %st(1) + fadds C(d_ziorigin) + imull C(d_zrowbytes),%ecx + faddp %st(0),%st(1) + +// clamp if z is nearer than 2 (1/z > 0.5) + fcoms float_point5 + addl %ecx,%edi + movl espan_t_u(%esi),%edx + addl %edx,%edx // word count + movl espan_t_count(%esi),%ecx + addl %edx,%edi // pdest = &pdestspan[scans->u]; + pushl %esi // preserve spans pointer + fnstsw %ax + testb $0x45,%ah + jz LClampNeg + + fmuls Float2ToThe31nd + fistpl izi // note: we are relying on FP exceptions being turned + // off here to avoid problems when the span is closer + // than 1/(2**31) + movl izi,%edx + +// at this point: +// %ebx = izistep +// %ecx = count +// %edx = izi +// %edi = pdest + +LZDrawNeg: + +// do a single pixel up front, if necessary to dword align the destination + testl $2,%edi + jz LFNegMiddle + movl %edx,%eax + subl %ebx,%edx + shrl $16,%eax + decl %ecx + movw %ax,(%edi) + addl $2,%edi + +// do middle a pair of aligned dwords at a time +LFNegMiddle: + pushl %ecx + shrl $1,%ecx // count / 2 + jz LFNegLast // no aligned dwords to do + shrl $1,%ecx // (count / 2) / 2 + jnc LFNegMiddleLoop // even number of aligned dwords to do + + movl %edx,%eax + subl %ebx,%edx + shrl $16,%eax + movl %edx,%esi + subl %ebx,%edx + andl $0xFFFF0000,%esi + orl %esi,%eax + movl %eax,(%edi) + addl $4,%edi + andl %ecx,%ecx + jz LFNegLast + +LFNegMiddleLoop: + movl %edx,%eax + subl %ebx,%edx + shrl $16,%eax + movl %edx,%esi + subl %ebx,%edx + andl $0xFFFF0000,%esi + orl %esi,%eax + movl %edx,%ebp + movl %eax,(%edi) + subl %ebx,%edx + shrl $16,%ebp + movl %edx,%esi + subl %ebx,%edx + andl $0xFFFF0000,%esi + orl %esi,%ebp + movl %ebp,4(%edi) // FIXME: eliminate register contention + addl $8,%edi + + decl %ecx + jnz LFNegMiddleLoop + +LFNegLast: + popl %ecx // retrieve count + popl %esi // retrieve span pointer + +// do the last, unaligned pixel, if there is one + andl $1,%ecx // is there an odd pixel left to do? + jz LFNegSpanDone // no + shrl $16,%edx + movw %dx,(%edi) // do the final pixel's z + +LFNegSpanDone: + movl espan_t_pnext(%esi),%esi + testl %esi,%esi + jnz LFNegSpanLoop + +LFDone: + popl %ebx // restore register variables + popl %esi + popl %edi + popl %ebp // restore the caller's stack frame + ret + +#endif // id386 diff --git a/source/d_draw16.S b/source/d_draw16.S new file mode 100644 index 0000000..4cd0a17 --- /dev/null +++ b/source/d_draw16.S @@ -0,0 +1,974 @@ +/* +Copyright (C) 1996-1997 Id Software, Inc. + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + +See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +*/ +// +// d_draw16.s +// x86 assembly-language horizontal 8-bpp span-drawing code, with 16-pixel +// subdivision. +// + +#include "asm_i386.h" +#include "quakeasm.h" +#include "asm_draw.h" +#include "d_ifacea.h" + +#if id386 + +//---------------------------------------------------------------------- +// 8-bpp horizontal span drawing code for polygons, with no transparency and +// 16-pixel subdivision. +// +// Assumes there is at least one span in pspans, and that every span +// contains at least one pixel +//---------------------------------------------------------------------- + + .data + + .text + +// out-of-line, rarely-needed clamping code + +LClampHigh0: + movl C(bbextents),%esi + jmp LClampReentry0 +LClampHighOrLow0: + jg LClampHigh0 + xorl %esi,%esi + jmp LClampReentry0 + +LClampHigh1: + movl C(bbextentt),%edx + jmp LClampReentry1 +LClampHighOrLow1: + jg LClampHigh1 + xorl %edx,%edx + jmp LClampReentry1 + +LClampLow2: + movl $4096,%ebp + jmp LClampReentry2 +LClampHigh2: + movl C(bbextents),%ebp + jmp LClampReentry2 + +LClampLow3: + movl $4096,%ecx + jmp LClampReentry3 +LClampHigh3: + movl C(bbextentt),%ecx + jmp LClampReentry3 + +LClampLow4: + movl $4096,%eax + jmp LClampReentry4 +LClampHigh4: + movl C(bbextents),%eax + jmp LClampReentry4 + +LClampLow5: + movl $4096,%ebx + jmp LClampReentry5 +LClampHigh5: + movl C(bbextentt),%ebx + jmp LClampReentry5 + + +#define pspans 4+16 + + .align 4 +.globl C(D_DrawSpans16) +C(D_DrawSpans16): + pushl %ebp // preserve caller's stack frame + pushl %edi + pushl %esi // preserve register variables + pushl %ebx + +// +// set up scaled-by-16 steps, for 16-long segments; also set up cacheblock +// and span list pointers +// +// TODO: any overlap from rearranging? + flds C(d_sdivzstepu) + fmuls fp_16 + movl C(cacheblock),%edx + flds C(d_tdivzstepu) + fmuls fp_16 + movl pspans(%esp),%ebx // point to the first span descriptor + flds C(d_zistepu) + fmuls fp_16 + movl %edx,pbase // pbase = cacheblock + fstps zi16stepu + fstps tdivz16stepu + fstps sdivz16stepu + +LSpanLoop: +// +// set up the initial s/z, t/z, and 1/z on the FP stack, and generate the +// initial s and t values +// +// FIXME: pipeline FILD? 
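+// The structure here is identical to D_DrawSpans8 in d_draw.s; only the
+// subdivision length changes.  The perspective divide (the FDIV below)
+// runs once per 16-pixel segment, with s and t stepped linearly in
+// between, so this version issues half as many divides per span as the
+// 8-pixel one, at the cost of more affine texture warp across steep 1/z
+// gradients.  In outline (compare the 8-pixel C fallback in d_scan.c):
+//
+//	while (count > 0)
+//	{
+//		spancount = (count > 16) ? 16 : count;
+//		count -= spancount;
+//		// step s/z, t/z, 1/z to the segment end and start the
+//		// FDIV for the endpoint z so it overlaps the loop below
+//		do
+//		{
+//			*pdest++ = *(pbase + (s >> 16) +
+//					(t >> 16) * cachewidth);
+//			s += sstep;
+//			t += tstep;
+//		} while (--spancount > 0);
+//	}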
+ fildl espan_t_v(%ebx) + fildl espan_t_u(%ebx) + + fld %st(1) // dv | du | dv + fmuls C(d_sdivzstepv) // dv*d_sdivzstepv | du | dv + fld %st(1) // du | dv*d_sdivzstepv | du | dv + fmuls C(d_sdivzstepu) // du*d_sdivzstepu | dv*d_sdivzstepv | du | dv + fld %st(2) // du | du*d_sdivzstepu | dv*d_sdivzstepv | du | dv + fmuls C(d_tdivzstepu) // du*d_tdivzstepu | du*d_sdivzstepu | + // dv*d_sdivzstepv | du | dv + fxch %st(1) // du*d_sdivzstepu | du*d_tdivzstepu | + // dv*d_sdivzstepv | du | dv + faddp %st(0),%st(2) // du*d_tdivzstepu | + // du*d_sdivzstepu + dv*d_sdivzstepv | du | dv + fxch %st(1) // du*d_sdivzstepu + dv*d_sdivzstepv | + // du*d_tdivzstepu | du | dv + fld %st(3) // dv | du*d_sdivzstepu + dv*d_sdivzstepv | + // du*d_tdivzstepu | du | dv + fmuls C(d_tdivzstepv) // dv*d_tdivzstepv | + // du*d_sdivzstepu + dv*d_sdivzstepv | + // du*d_tdivzstepu | du | dv + fxch %st(1) // du*d_sdivzstepu + dv*d_sdivzstepv | + // dv*d_tdivzstepv | du*d_tdivzstepu | du | dv + fadds C(d_sdivzorigin) // sdivz = d_sdivzorigin + dv*d_sdivzstepv + + // du*d_sdivzstepu; stays in %st(2) at end + fxch %st(4) // dv | dv*d_tdivzstepv | du*d_tdivzstepu | du | + // s/z + fmuls C(d_zistepv) // dv*d_zistepv | dv*d_tdivzstepv | + // du*d_tdivzstepu | du | s/z + fxch %st(1) // dv*d_tdivzstepv | dv*d_zistepv | + // du*d_tdivzstepu | du | s/z + faddp %st(0),%st(2) // dv*d_zistepv | + // dv*d_tdivzstepv + du*d_tdivzstepu | du | s/z + fxch %st(2) // du | dv*d_tdivzstepv + du*d_tdivzstepu | + // dv*d_zistepv | s/z + fmuls C(d_zistepu) // du*d_zistepu | + // dv*d_tdivzstepv + du*d_tdivzstepu | + // dv*d_zistepv | s/z + fxch %st(1) // dv*d_tdivzstepv + du*d_tdivzstepu | + // du*d_zistepu | dv*d_zistepv | s/z + fadds C(d_tdivzorigin) // tdivz = d_tdivzorigin + dv*d_tdivzstepv + + // du*d_tdivzstepu; stays in %st(1) at end + fxch %st(2) // dv*d_zistepv | du*d_zistepu | t/z | s/z + faddp %st(0),%st(1) // dv*d_zistepv + du*d_zistepu | t/z | s/z + + flds fp_64k // fp_64k | dv*d_zistepv + du*d_zistepu | t/z | s/z + fxch %st(1) // dv*d_zistepv + du*d_zistepu | fp_64k | t/z | s/z + fadds C(d_ziorigin) // zi = d_ziorigin + dv*d_zistepv + + // du*d_zistepu; stays in %st(0) at end + // 1/z | fp_64k | t/z | s/z +// +// calculate and clamp s & t +// + fdivr %st(0),%st(1) // 1/z | z*64k | t/z | s/z + +// +// point %edi to the first pixel in the span +// + movl C(d_viewbuffer),%ecx + movl espan_t_v(%ebx),%eax + movl %ebx,pspantemp // preserve spans pointer + + movl C(tadjust),%edx + movl C(sadjust),%esi + movl C(d_scantable)(,%eax,4),%edi // v * screenwidth + addl %ecx,%edi + movl espan_t_u(%ebx),%ecx + addl %ecx,%edi // pdest = &pdestspan[scans->u]; + movl espan_t_count(%ebx),%ecx + +// +// now start the FDIV for the end of the span +// + cmpl $16,%ecx + ja LSetupNotLast1 + + decl %ecx + jz LCleanup1 // if only one pixel, no need to start an FDIV + movl %ecx,spancountminus1 + +// finish up the s and t calcs + fxch %st(1) // z*64k | 1/z | t/z | s/z + + fld %st(0) // z*64k | z*64k | 1/z | t/z | s/z + fmul %st(4),%st(0) // s | z*64k | 1/z | t/z | s/z + fxch %st(1) // z*64k | s | 1/z | t/z | s/z + fmul %st(3),%st(0) // t | s | 1/z | t/z | s/z + fxch %st(1) // s | t | 1/z | t/z | s/z + fistpl s // 1/z | t | t/z | s/z + fistpl t // 1/z | t/z | s/z + + fildl spancountminus1 + + flds C(d_tdivzstepu) // C(d_tdivzstepu) | spancountminus1 + flds C(d_zistepu) // C(d_zistepu) | C(d_tdivzstepu) | spancountminus1 + fmul %st(2),%st(0) // C(d_zistepu)*scm1 | C(d_tdivzstepu) | scm1 + fxch %st(1) // C(d_tdivzstepu) | C(d_zistepu)*scm1 | scm1 + fmul 
%st(2),%st(0) // C(d_tdivzstepu)*scm1 | C(d_zistepu)*scm1 | scm1 + fxch %st(2) // scm1 | C(d_zistepu)*scm1 | C(d_tdivzstepu)*scm1 + fmuls C(d_sdivzstepu) // C(d_sdivzstepu)*scm1 | C(d_zistepu)*scm1 | + // C(d_tdivzstepu)*scm1 + fxch %st(1) // C(d_zistepu)*scm1 | C(d_sdivzstepu)*scm1 | + // C(d_tdivzstepu)*scm1 + faddp %st(0),%st(3) // C(d_sdivzstepu)*scm1 | C(d_tdivzstepu)*scm1 + fxch %st(1) // C(d_tdivzstepu)*scm1 | C(d_sdivzstepu)*scm1 + faddp %st(0),%st(3) // C(d_sdivzstepu)*scm1 + faddp %st(0),%st(3) + + flds fp_64k + fdiv %st(1),%st(0) // this is what we've gone to all this trouble to + // overlap + jmp LFDIVInFlight1 + +LCleanup1: +// finish up the s and t calcs + fxch %st(1) // z*64k | 1/z | t/z | s/z + + fld %st(0) // z*64k | z*64k | 1/z | t/z | s/z + fmul %st(4),%st(0) // s | z*64k | 1/z | t/z | s/z + fxch %st(1) // z*64k | s | 1/z | t/z | s/z + fmul %st(3),%st(0) // t | s | 1/z | t/z | s/z + fxch %st(1) // s | t | 1/z | t/z | s/z + fistpl s // 1/z | t | t/z | s/z + fistpl t // 1/z | t/z | s/z + jmp LFDIVInFlight1 + + .align 4 +LSetupNotLast1: +// finish up the s and t calcs + fxch %st(1) // z*64k | 1/z | t/z | s/z + + fld %st(0) // z*64k | z*64k | 1/z | t/z | s/z + fmul %st(4),%st(0) // s | z*64k | 1/z | t/z | s/z + fxch %st(1) // z*64k | s | 1/z | t/z | s/z + fmul %st(3),%st(0) // t | s | 1/z | t/z | s/z + fxch %st(1) // s | t | 1/z | t/z | s/z + fistpl s // 1/z | t | t/z | s/z + fistpl t // 1/z | t/z | s/z + + fadds zi16stepu + fxch %st(2) + fadds sdivz16stepu + fxch %st(2) + flds tdivz16stepu + faddp %st(0),%st(2) + flds fp_64k + fdiv %st(1),%st(0) // z = 1/1/z + // this is what we've gone to all this trouble to + // overlap +LFDIVInFlight1: + + addl s,%esi + addl t,%edx + movl C(bbextents),%ebx + movl C(bbextentt),%ebp + cmpl %ebx,%esi + ja LClampHighOrLow0 +LClampReentry0: + movl %esi,s + movl pbase,%ebx + shll $16,%esi + cmpl %ebp,%edx + movl %esi,sfracf + ja LClampHighOrLow1 +LClampReentry1: + movl %edx,t + movl s,%esi // sfrac = scans->sfrac; + shll $16,%edx + movl t,%eax // tfrac = scans->tfrac; + sarl $16,%esi + movl %edx,tfracf + +// +// calculate the texture starting address +// + sarl $16,%eax + movl C(cachewidth),%edx + imull %edx,%eax // (tfrac >> 16) * cachewidth + addl %ebx,%esi + addl %eax,%esi // psource = pbase + (sfrac >> 16) + + // ((tfrac >> 16) * cachewidth); +// +// determine whether last span or not +// + cmpl $16,%ecx + jna LLastSegment + +// +// not the last segment; do full 16-wide segment +// +LNotLastSegment: + +// +// advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to +// get there +// + +// pick up after the FDIV that was left in flight previously + + fld %st(0) // duplicate it + fmul %st(4),%st(0) // s = s/z * z + fxch %st(1) + fmul %st(3),%st(0) // t = t/z * z + fxch %st(1) + fistpl snext + fistpl tnext + movl snext,%eax + movl tnext,%edx + + movb (%esi),%bl // get first source texel + subl $16,%ecx // count off this segments' pixels + movl C(sadjust),%ebp + movl %ecx,counttemp // remember count of remaining pixels + + movl C(tadjust),%ecx + movb %bl,(%edi) // store first dest pixel + + addl %eax,%ebp + addl %edx,%ecx + + movl C(bbextents),%eax + movl C(bbextentt),%edx + + cmpl $4096,%ebp + jl LClampLow2 + cmpl %eax,%ebp + ja LClampHigh2 +LClampReentry2: + + cmpl $4096,%ecx + jl LClampLow3 + cmpl %edx,%ecx + ja LClampHigh3 +LClampReentry3: + + movl %ebp,snext + movl %ecx,tnext + + subl s,%ebp + subl t,%ecx + +// +// set up advancetable +// + movl %ecx,%eax + movl %ebp,%edx + sarl $20,%eax // tstep >>= 16; + jz LZero + 
sarl $20,%edx // sstep >>= 16; + movl C(cachewidth),%ebx + imull %ebx,%eax + jmp LSetUp1 + +LZero: + sarl $20,%edx // sstep >>= 16; + movl C(cachewidth),%ebx + +LSetUp1: + + addl %edx,%eax // add in sstep + // (tstep >> 16) * cachewidth + (sstep >> 16); + movl tfracf,%edx + movl %eax,advancetable+4 // advance base in t + addl %ebx,%eax // ((tstep >> 16) + 1) * cachewidth + + // (sstep >> 16); + shll $12,%ebp // left-justify sstep fractional part + movl sfracf,%ebx + shll $12,%ecx // left-justify tstep fractional part + movl %eax,advancetable // advance extra in t + + movl %ecx,tstep + addl %ecx,%edx // advance tfrac fractional part by tstep frac + + sbbl %ecx,%ecx // turn tstep carry into -1 (0 if none) + addl %ebp,%ebx // advance sfrac fractional part by sstep frac + adcl advancetable+4(,%ecx,4),%esi // point to next source texel + + addl tstep,%edx + sbbl %ecx,%ecx + movb (%esi),%al + addl %ebp,%ebx + movb %al,1(%edi) + adcl advancetable+4(,%ecx,4),%esi + + addl tstep,%edx + sbbl %ecx,%ecx + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + + addl tstep,%edx + sbbl %ecx,%ecx + movb %al,2(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + + addl tstep,%edx + sbbl %ecx,%ecx + movb %al,3(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + + addl tstep,%edx + sbbl %ecx,%ecx + movb %al,4(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + + addl tstep,%edx + sbbl %ecx,%ecx + movb %al,5(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + + addl tstep,%edx + sbbl %ecx,%ecx + movb %al,6(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + + addl tstep,%edx + sbbl %ecx,%ecx + movb %al,7(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + + +// +// start FDIV for end of next segment in flight, so it can overlap +// + movl counttemp,%ecx + cmpl $16,%ecx // more than one segment after this? 
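+// (The payoff of starting the divide this early: on the Pentium an FDIV
+// is a long-latency operation -- tens of cycles -- but it executes in
+// the FP unit without stalling the integer pipes, so the reciprocal for
+// the next segment's endpoint is computed essentially for free while
+// the integer code above and below stores this segment's 16 texels.
+// The dependent multiplies that turn 64k/zi back into s and t are only
+// issued once the next segment begins.)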
+ ja LSetupNotLast2 // yes + + decl %ecx + jz LFDIVInFlight2 // if only one pixel, no need to start an FDIV + movl %ecx,spancountminus1 + fildl spancountminus1 + + flds C(d_zistepu) // C(d_zistepu) | spancountminus1 + fmul %st(1),%st(0) // C(d_zistepu)*scm1 | scm1 + flds C(d_tdivzstepu) // C(d_tdivzstepu) | C(d_zistepu)*scm1 | scm1 + fmul %st(2),%st(0) // C(d_tdivzstepu)*scm1 | C(d_zistepu)*scm1 | scm1 + fxch %st(1) // C(d_zistepu)*scm1 | C(d_tdivzstepu)*scm1 | scm1 + faddp %st(0),%st(3) // C(d_tdivzstepu)*scm1 | scm1 + fxch %st(1) // scm1 | C(d_tdivzstepu)*scm1 + fmuls C(d_sdivzstepu) // C(d_sdivzstepu)*scm1 | C(d_tdivzstepu)*scm1 + fxch %st(1) // C(d_tdivzstepu)*scm1 | C(d_sdivzstepu)*scm1 + faddp %st(0),%st(3) // C(d_sdivzstepu)*scm1 + flds fp_64k // 64k | C(d_sdivzstepu)*scm1 + fxch %st(1) // C(d_sdivzstepu)*scm1 | 64k + faddp %st(0),%st(4) // 64k + + fdiv %st(1),%st(0) // this is what we've gone to all this trouble to + // overlap + jmp LFDIVInFlight2 + + .align 4 +LSetupNotLast2: + fadds zi16stepu + fxch %st(2) + fadds sdivz16stepu + fxch %st(2) + flds tdivz16stepu + faddp %st(0),%st(2) + flds fp_64k + fdiv %st(1),%st(0) // z = 1/1/z + // this is what we've gone to all this trouble to + // overlap +LFDIVInFlight2: + movl %ecx,counttemp + + addl tstep,%edx + sbbl %ecx,%ecx + movb %al,8(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + + addl tstep,%edx + sbbl %ecx,%ecx + movb %al,9(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + + addl tstep,%edx + sbbl %ecx,%ecx + movb %al,10(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + + addl tstep,%edx + sbbl %ecx,%ecx + movb %al,11(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + + addl tstep,%edx + sbbl %ecx,%ecx + movb %al,12(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + + addl tstep,%edx + sbbl %ecx,%ecx + movb %al,13(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + + addl tstep,%edx + sbbl %ecx,%ecx + movb %al,14(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + + addl $16,%edi + movl %edx,tfracf + movl snext,%edx + movl %ebx,sfracf + movl tnext,%ebx + movl %edx,s + movl %ebx,t + + movl counttemp,%ecx // retrieve count + +// +// determine whether last span or not +// + cmpl $16,%ecx // are there multiple segments remaining? + movb %al,-1(%edi) + ja LNotLastSegment // yes + +// +// last segment of scan +// +LLastSegment: + +// +// advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to +// get there. 
The number of pixels left is variable, and we want to land on the +// last pixel, not step one past it, so we can't run into arithmetic problems +// + testl %ecx,%ecx + jz LNoSteps // just draw the last pixel and we're done + +// pick up after the FDIV that was left in flight previously + + + fld %st(0) // duplicate it + fmul %st(4),%st(0) // s = s/z * z + fxch %st(1) + fmul %st(3),%st(0) // t = t/z * z + fxch %st(1) + fistpl snext + fistpl tnext + + movb (%esi),%al // load first texel in segment + movl C(tadjust),%ebx + movb %al,(%edi) // store first pixel in segment + movl C(sadjust),%eax + + addl snext,%eax + addl tnext,%ebx + + movl C(bbextents),%ebp + movl C(bbextentt),%edx + + cmpl $4096,%eax + jl LClampLow4 + cmpl %ebp,%eax + ja LClampHigh4 +LClampReentry4: + movl %eax,snext + + cmpl $4096,%ebx + jl LClampLow5 + cmpl %edx,%ebx + ja LClampHigh5 +LClampReentry5: + + cmpl $1,%ecx // don't bother + je LOnlyOneStep // if two pixels in segment, there's only one step, + // of the segment length + subl s,%eax + subl t,%ebx + + addl %eax,%eax // convert to 15.17 format so multiply by 1.31 + addl %ebx,%ebx // reciprocal yields 16.48 + + imull reciprocal_table_16-8(,%ecx,4) // sstep = (snext - s) / + // (spancount-1) + movl %edx,%ebp + + movl %ebx,%eax + imull reciprocal_table_16-8(,%ecx,4) // tstep = (tnext - t) / + // (spancount-1) +LSetEntryvec: +// +// set up advancetable +// + movl entryvec_table_16(,%ecx,4),%ebx + movl %edx,%eax + movl %ebx,jumptemp // entry point into code for RET later + movl %ebp,%ecx + sarl $16,%edx // tstep >>= 16; + movl C(cachewidth),%ebx + sarl $16,%ecx // sstep >>= 16; + imull %ebx,%edx + + addl %ecx,%edx // add in sstep + // (tstep >> 16) * cachewidth + (sstep >> 16); + movl tfracf,%ecx + movl %edx,advancetable+4 // advance base in t + addl %ebx,%edx // ((tstep >> 16) + 1) * cachewidth + + // (sstep >> 16); + shll $16,%ebp // left-justify sstep fractional part + movl sfracf,%ebx + shll $16,%eax // left-justify tstep fractional part + movl %edx,advancetable // advance extra in t + + movl %eax,tstep + movl %ecx,%edx + addl %eax,%edx + sbbl %ecx,%ecx + addl %ebp,%ebx + adcl advancetable+4(,%ecx,4),%esi + + jmp *jumptemp // jump to the number-of-pixels handler + +//---------------------------------------- + +LNoSteps: + movb (%esi),%al // load first texel in segment + subl $15,%edi // adjust for hardwired offset + jmp LEndSpan + + +LOnlyOneStep: + subl s,%eax + subl t,%ebx + movl %eax,%ebp + movl %ebx,%edx + jmp LSetEntryvec + +//---------------------------------------- + +.globl Entry2_16, Entry3_16, Entry4_16, Entry5_16 +.globl Entry6_16, Entry7_16, Entry8_16, Entry9_16 +.globl Entry10_16, Entry11_16, Entry12_16, Entry13_16 +.globl Entry14_16, Entry15_16, Entry16_16 + +Entry2_16: + subl $14,%edi // adjust for hardwired offsets + movb (%esi),%al + jmp LEntry2_16 + +//---------------------------------------- + +Entry3_16: + subl $13,%edi // adjust for hardwired offsets + addl %eax,%edx + movb (%esi),%al + sbbl %ecx,%ecx + addl %ebp,%ebx + adcl advancetable+4(,%ecx,4),%esi + jmp LEntry3_16 + +//---------------------------------------- + +Entry4_16: + subl $12,%edi // adjust for hardwired offsets + addl %eax,%edx + movb (%esi),%al + sbbl %ecx,%ecx + addl %ebp,%ebx + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx + jmp LEntry4_16 + +//---------------------------------------- + +Entry5_16: + subl $11,%edi // adjust for hardwired offsets + addl %eax,%edx + movb (%esi),%al + sbbl %ecx,%ecx + addl %ebp,%ebx + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx + jmp 
LEntry5_16 + +//---------------------------------------- + +Entry6_16: + subl $10,%edi // adjust for hardwired offsets + addl %eax,%edx + movb (%esi),%al + sbbl %ecx,%ecx + addl %ebp,%ebx + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx + jmp LEntry6_16 + +//---------------------------------------- + +Entry7_16: + subl $9,%edi // adjust for hardwired offsets + addl %eax,%edx + movb (%esi),%al + sbbl %ecx,%ecx + addl %ebp,%ebx + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx + jmp LEntry7_16 + +//---------------------------------------- + +Entry8_16: + subl $8,%edi // adjust for hardwired offsets + addl %eax,%edx + movb (%esi),%al + sbbl %ecx,%ecx + addl %ebp,%ebx + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx + jmp LEntry8_16 + +//---------------------------------------- + +Entry9_16: + subl $7,%edi // adjust for hardwired offsets + addl %eax,%edx + movb (%esi),%al + sbbl %ecx,%ecx + addl %ebp,%ebx + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx + jmp LEntry9_16 + +//---------------------------------------- + +Entry10_16: + subl $6,%edi // adjust for hardwired offsets + addl %eax,%edx + movb (%esi),%al + sbbl %ecx,%ecx + addl %ebp,%ebx + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx + jmp LEntry10_16 + +//---------------------------------------- + +Entry11_16: + subl $5,%edi // adjust for hardwired offsets + addl %eax,%edx + movb (%esi),%al + sbbl %ecx,%ecx + addl %ebp,%ebx + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx + jmp LEntry11_16 + +//---------------------------------------- + +Entry12_16: + subl $4,%edi // adjust for hardwired offsets + addl %eax,%edx + movb (%esi),%al + sbbl %ecx,%ecx + addl %ebp,%ebx + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx + jmp LEntry12_16 + +//---------------------------------------- + +Entry13_16: + subl $3,%edi // adjust for hardwired offsets + addl %eax,%edx + movb (%esi),%al + sbbl %ecx,%ecx + addl %ebp,%ebx + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx + jmp LEntry13_16 + +//---------------------------------------- + +Entry14_16: + subl $2,%edi // adjust for hardwired offsets + addl %eax,%edx + movb (%esi),%al + sbbl %ecx,%ecx + addl %ebp,%ebx + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx + jmp LEntry14_16 + +//---------------------------------------- + +Entry15_16: + decl %edi // adjust for hardwired offsets + addl %eax,%edx + movb (%esi),%al + sbbl %ecx,%ecx + addl %ebp,%ebx + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx + jmp LEntry15_16 + +//---------------------------------------- + +Entry16_16: + addl %eax,%edx + movb (%esi),%al + sbbl %ecx,%ecx + addl %ebp,%ebx + adcl advancetable+4(,%ecx,4),%esi + + addl tstep,%edx + sbbl %ecx,%ecx + movb %al,1(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx +LEntry15_16: + sbbl %ecx,%ecx + movb %al,2(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx +LEntry14_16: + sbbl %ecx,%ecx + movb %al,3(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx +LEntry13_16: + sbbl %ecx,%ecx + movb %al,4(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx +LEntry12_16: + sbbl %ecx,%ecx + movb %al,5(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx +LEntry11_16: + sbbl %ecx,%ecx + movb %al,6(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx +LEntry10_16: + sbbl %ecx,%ecx + movb %al,7(%edi) + addl 
%ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx +LEntry9_16: + sbbl %ecx,%ecx + movb %al,8(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx +LEntry8_16: + sbbl %ecx,%ecx + movb %al,9(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx +LEntry7_16: + sbbl %ecx,%ecx + movb %al,10(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx +LEntry6_16: + sbbl %ecx,%ecx + movb %al,11(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx +LEntry5_16: + sbbl %ecx,%ecx + movb %al,12(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi + addl tstep,%edx +LEntry4_16: + sbbl %ecx,%ecx + movb %al,13(%edi) + addl %ebp,%ebx + movb (%esi),%al + adcl advancetable+4(,%ecx,4),%esi +LEntry3_16: + movb %al,14(%edi) + movb (%esi),%al +LEntry2_16: + +LEndSpan: + +// +// clear s/z, t/z, 1/z from FP stack +// + fstp %st(0) + fstp %st(0) + fstp %st(0) + + movl pspantemp,%ebx // restore spans pointer + movl espan_t_pnext(%ebx),%ebx // point to next span + testl %ebx,%ebx // any more spans? + movb %al,15(%edi) + jnz LSpanLoop // more spans + + popl %ebx // restore register variables + popl %esi + popl %edi + popl %ebp // restore the caller's stack frame + ret + +#endif // id386 diff --git a/source/d_parta.S b/source/d_parta.S new file mode 100644 index 0000000..560925d --- /dev/null +++ b/source/d_parta.S @@ -0,0 +1,477 @@ +/* +Copyright (C) 1996-1997 Id Software, Inc. + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + +See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +*/ +// +// d_parta.s +// x86 assembly-language 8-bpp particle-drawing code. +// + +#include "asm_i386.h" +#include "quakeasm.h" +#include "d_ifacea.h" +#include "asm_draw.h" + +#if id386 + +//---------------------------------------------------------------------- +// 8-bpp particle drawing code. +//---------------------------------------------------------------------- + +//FIXME: comments, full optimization + +//---------------------------------------------------------------------- +// 8-bpp particle queueing code. 
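+// In outline (compare the !id386 C version in d_part.c): each particle
+// is transformed into view space, z-clipped, projected, and drawn as a
+// small screen-aligned square whose side scales with 1/z:
+//
+//	VectorSubtract (p->org, r_origin, local);
+//	transformed[0] = DotProduct (local, r_pright);
+//	transformed[1] = DotProduct (local, r_pup);
+//	transformed[2] = DotProduct (local, r_ppn);
+//	if (transformed[2] < PARTICLE_Z_CLIP)
+//		return;
+//	zi = 1.0 / transformed[2];
+//	u = (int)(xcenter + zi * transformed[0] + 0.5);
+//	v = (int)(ycenter - zi * transformed[1] + 0.5);
+//	izi = (int)(zi * 0x8000);	// matches the z-buffer's scale
+//	pix = izi >> d_pix_shift;	// clamped to [d_pix_min, d_pix_max]
+//
+// Every covered pixel then does
+//
+//	if (pz[i] <= izi)
+//	{
+//		pz[i] = izi;
+//		pdest[i] = p->color;
+//	}
+//
+// with the 1x1 through 4x4 cases unrolled below and anything larger
+// handled by the generic loop at LDefault.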
+//---------------------------------------------------------------------- + + .text + +#define P 12+4 + + .align 4 +.globl C(D_DrawParticle) +C(D_DrawParticle): + pushl %ebp // preserve caller's stack frame + pushl %edi // preserve register variables + pushl %ebx + + movl P(%esp),%edi + +// FIXME: better FP overlap in general here + +// transform point +// VectorSubtract (p->org, r_origin, local); + flds C(r_origin) + fsubrs pt_org(%edi) + flds pt_org+4(%edi) + fsubs C(r_origin)+4 + flds pt_org+8(%edi) + fsubs C(r_origin)+8 + fxch %st(2) // local[0] | local[1] | local[2] + +// transformed[2] = DotProduct(local, r_ppn); + flds C(r_ppn) // r_ppn[0] | local[0] | local[1] | local[2] + fmul %st(1),%st(0) // dot0 | local[0] | local[1] | local[2] + flds C(r_ppn)+4 // r_ppn[1] | dot0 | local[0] | local[1] | local[2] + fmul %st(3),%st(0) // dot1 | dot0 | local[0] | local[1] | local[2] + flds C(r_ppn)+8 // r_ppn[2] | dot1 | dot0 | local[0] | + // local[1] | local[2] + fmul %st(5),%st(0) // dot2 | dot1 | dot0 | local[0] | local[1] | local[2] + fxch %st(2) // dot0 | dot1 | dot2 | local[0] | local[1] | local[2] + faddp %st(0),%st(1) // dot0 + dot1 | dot2 | local[0] | local[1] | + // local[2] + faddp %st(0),%st(1) // z | local[0] | local[1] | local[2] + fld %st(0) // z | z | local[0] | local[1] | + // local[2] + fdivrs float_1 // 1/z | z | local[0] | local[1] | local[2] + fxch %st(1) // z | 1/z | local[0] | local[1] | local[2] + +// if (transformed[2] < PARTICLE_Z_CLIP) +// return; + fcomps float_particle_z_clip // 1/z | local[0] | local[1] | local[2] + fxch %st(3) // local[2] | local[0] | local[1] | 1/z + + flds C(r_pup) // r_pup[0] | local[2] | local[0] | local[1] | 1/z + fmul %st(2),%st(0) // dot0 | local[2] | local[0] | local[1] | 1/z + flds C(r_pup)+4 // r_pup[1] | dot0 | local[2] | local[0] | + // local[1] | 1/z + + fnstsw %ax + testb $1,%ah + jnz LPop6AndDone + +// transformed[1] = DotProduct(local, r_pup); + fmul %st(4),%st(0) // dot1 | dot0 | local[2] | local[0] | local[1] | 1/z + flds C(r_pup)+8 // r_pup[2] | dot1 | dot0 | local[2] | + // local[0] | local[1] | 1/z + fmul %st(3),%st(0) // dot2 | dot1 | dot0 | local[2] | local[0] | + // local[1] | 1/z + fxch %st(2) // dot0 | dot1 | dot2 | local[2] | local[0] | + // local[1] | 1/z + faddp %st(0),%st(1) // dot0 + dot1 | dot2 | local[2] | local[0] | + // local[1] | 1/z + faddp %st(0),%st(1) // y | local[2] | local[0] | local[1] | 1/z + fxch %st(3) // local[1] | local[2] | local[0] | y | 1/z + +// transformed[0] = DotProduct(local, r_pright); + fmuls C(r_pright)+4 // dot1 | local[2] | local[0] | y | 1/z + fxch %st(2) // local[0] | local[2] | dot1 | y | 1/z + fmuls C(r_pright) // dot0 | local[2] | dot1 | y | 1/z + fxch %st(1) // local[2] | dot0 | dot1 | y | 1/z + fmuls C(r_pright)+8 // dot2 | dot0 | dot1 | y | 1/z + fxch %st(2) // dot1 | dot0 | dot2 | y | 1/z + faddp %st(0),%st(1) // dot1 + dot0 | dot2 | y | 1/z + + faddp %st(0),%st(1) // x | y | 1/z + fxch %st(1) // y | x | 1/z + +// project the point + fmul %st(2),%st(0) // y/z | x | 1/z + fxch %st(1) // x | y/z | 1/z + fmul %st(2),%st(0) // x/z | y/z | 1/z + fxch %st(1) // y/z | x/z | 1/z + fsubrs C(ycenter) // v | x/z | 1/z + fxch %st(1) // x/z | v | 1/z + fadds C(xcenter) // u | v | 1/z +// FIXME: preadjust xcenter and ycenter + fxch %st(1) // v | u | 1/z + fadds float_point5 // v | u | 1/z + fxch %st(1) // u | v | 1/z + fadds float_point5 // u | v | 1/z + fxch %st(2) // 1/z | v | u + fmuls DP_32768 // 1/z * 0x8000 | v | u + fxch %st(2) // u | v | 1/z * 0x8000 + +// FIXME: use Terje's fp->int 
trick here? +// FIXME: check we're getting proper rounding here + fistpl DP_u // v | 1/z * 0x8000 + fistpl DP_v // 1/z * 0x8000 + + movl DP_u,%eax + movl DP_v,%edx + +// if ((v > d_vrectbottom_particle) || +// (u > d_vrectright_particle) || +// (v < d_vrecty) || +// (u < d_vrectx)) +// { +// continue; +// } + + movl C(d_vrectbottom_particle),%ebx + movl C(d_vrectright_particle),%ecx + cmpl %ebx,%edx + jg LPop1AndDone + cmpl %ecx,%eax + jg LPop1AndDone + movl C(d_vrecty),%ebx + movl C(d_vrectx),%ecx + cmpl %ebx,%edx + jl LPop1AndDone + + cmpl %ecx,%eax + jl LPop1AndDone + + flds pt_color(%edi) // color | 1/z * 0x8000 +// FIXME: use Terje's fast fp->int trick? + fistpl DP_Color // 1/z * 0x8000 + + movl C(d_viewbuffer),%ebx + + addl %eax,%ebx + movl C(d_scantable)(,%edx,4),%edi // point to the pixel + + imull C(d_zrowbytes),%edx // point to the z pixel + + leal (%edx,%eax,2),%edx + movl C(d_pzbuffer),%eax + + fistpl izi + + addl %ebx,%edi + addl %eax,%edx + +// pix = izi >> d_pix_shift; + + movl izi,%eax + movl C(d_pix_shift),%ecx + shrl %cl,%eax + movl izi,%ebp + +// if (pix < d_pix_min) +// pix = d_pix_min; +// else if (pix > d_pix_max) +// pix = d_pix_max; + + movl C(d_pix_min),%ebx + movl C(d_pix_max),%ecx + cmpl %ebx,%eax + jnl LTestPixMax + movl %ebx,%eax + jmp LTestDone + +LTestPixMax: + cmpl %ecx,%eax + jng LTestDone + movl %ecx,%eax +LTestDone: + + movb DP_Color,%ch + + movl C(d_y_aspect_shift),%ebx + testl %ebx,%ebx + jnz LDefault + + cmpl $4,%eax + ja LDefault + + jmp DP_EntryTable-4(,%eax,4) + +// 1x1 +.globl DP_1x1 +DP_1x1: + cmpw %bp,(%edx) // just one pixel to do + jg LDone + movw %bp,(%edx) + movb %ch,(%edi) + jmp LDone + +// 2x2 +.globl DP_2x2 +DP_2x2: + pushl %esi + movl C(screenwidth),%ebx + movl C(d_zrowbytes),%esi + + cmpw %bp,(%edx) + jg L2x2_1 + movw %bp,(%edx) + movb %ch,(%edi) +L2x2_1: + cmpw %bp,2(%edx) + jg L2x2_2 + movw %bp,2(%edx) + movb %ch,1(%edi) +L2x2_2: + cmpw %bp,(%edx,%esi,1) + jg L2x2_3 + movw %bp,(%edx,%esi,1) + movb %ch,(%edi,%ebx,1) +L2x2_3: + cmpw %bp,2(%edx,%esi,1) + jg L2x2_4 + movw %bp,2(%edx,%esi,1) + movb %ch,1(%edi,%ebx,1) +L2x2_4: + + popl %esi + jmp LDone + +// 3x3 +.globl DP_3x3 +DP_3x3: + pushl %esi + movl C(screenwidth),%ebx + movl C(d_zrowbytes),%esi + + cmpw %bp,(%edx) + jg L3x3_1 + movw %bp,(%edx) + movb %ch,(%edi) +L3x3_1: + cmpw %bp,2(%edx) + jg L3x3_2 + movw %bp,2(%edx) + movb %ch,1(%edi) +L3x3_2: + cmpw %bp,4(%edx) + jg L3x3_3 + movw %bp,4(%edx) + movb %ch,2(%edi) +L3x3_3: + + cmpw %bp,(%edx,%esi,1) + jg L3x3_4 + movw %bp,(%edx,%esi,1) + movb %ch,(%edi,%ebx,1) +L3x3_4: + cmpw %bp,2(%edx,%esi,1) + jg L3x3_5 + movw %bp,2(%edx,%esi,1) + movb %ch,1(%edi,%ebx,1) +L3x3_5: + cmpw %bp,4(%edx,%esi,1) + jg L3x3_6 + movw %bp,4(%edx,%esi,1) + movb %ch,2(%edi,%ebx,1) +L3x3_6: + + cmpw %bp,(%edx,%esi,2) + jg L3x3_7 + movw %bp,(%edx,%esi,2) + movb %ch,(%edi,%ebx,2) +L3x3_7: + cmpw %bp,2(%edx,%esi,2) + jg L3x3_8 + movw %bp,2(%edx,%esi,2) + movb %ch,1(%edi,%ebx,2) +L3x3_8: + cmpw %bp,4(%edx,%esi,2) + jg L3x3_9 + movw %bp,4(%edx,%esi,2) + movb %ch,2(%edi,%ebx,2) +L3x3_9: + + popl %esi + jmp LDone + + +// 4x4 +.globl DP_4x4 +DP_4x4: + pushl %esi + movl C(screenwidth),%ebx + movl C(d_zrowbytes),%esi + + cmpw %bp,(%edx) + jg L4x4_1 + movw %bp,(%edx) + movb %ch,(%edi) +L4x4_1: + cmpw %bp,2(%edx) + jg L4x4_2 + movw %bp,2(%edx) + movb %ch,1(%edi) +L4x4_2: + cmpw %bp,4(%edx) + jg L4x4_3 + movw %bp,4(%edx) + movb %ch,2(%edi) +L4x4_3: + cmpw %bp,6(%edx) + jg L4x4_4 + movw %bp,6(%edx) + movb %ch,3(%edi) +L4x4_4: + + cmpw %bp,(%edx,%esi,1) + jg L4x4_5 + movw 
%bp,(%edx,%esi,1) + movb %ch,(%edi,%ebx,1) +L4x4_5: + cmpw %bp,2(%edx,%esi,1) + jg L4x4_6 + movw %bp,2(%edx,%esi,1) + movb %ch,1(%edi,%ebx,1) +L4x4_6: + cmpw %bp,4(%edx,%esi,1) + jg L4x4_7 + movw %bp,4(%edx,%esi,1) + movb %ch,2(%edi,%ebx,1) +L4x4_7: + cmpw %bp,6(%edx,%esi,1) + jg L4x4_8 + movw %bp,6(%edx,%esi,1) + movb %ch,3(%edi,%ebx,1) +L4x4_8: + + leal (%edx,%esi,2),%edx + leal (%edi,%ebx,2),%edi + + cmpw %bp,(%edx) + jg L4x4_9 + movw %bp,(%edx) + movb %ch,(%edi) +L4x4_9: + cmpw %bp,2(%edx) + jg L4x4_10 + movw %bp,2(%edx) + movb %ch,1(%edi) +L4x4_10: + cmpw %bp,4(%edx) + jg L4x4_11 + movw %bp,4(%edx) + movb %ch,2(%edi) +L4x4_11: + cmpw %bp,6(%edx) + jg L4x4_12 + movw %bp,6(%edx) + movb %ch,3(%edi) +L4x4_12: + + cmpw %bp,(%edx,%esi,1) + jg L4x4_13 + movw %bp,(%edx,%esi,1) + movb %ch,(%edi,%ebx,1) +L4x4_13: + cmpw %bp,2(%edx,%esi,1) + jg L4x4_14 + movw %bp,2(%edx,%esi,1) + movb %ch,1(%edi,%ebx,1) +L4x4_14: + cmpw %bp,4(%edx,%esi,1) + jg L4x4_15 + movw %bp,4(%edx,%esi,1) + movb %ch,2(%edi,%ebx,1) +L4x4_15: + cmpw %bp,6(%edx,%esi,1) + jg L4x4_16 + movw %bp,6(%edx,%esi,1) + movb %ch,3(%edi,%ebx,1) +L4x4_16: + + popl %esi + jmp LDone + +// default case, handling any size particle +LDefault: + +// count = pix << d_y_aspect_shift; + + movl %eax,%ebx + movl %eax,DP_Pix + movb C(d_y_aspect_shift),%cl + shll %cl,%ebx + +// for ( ; count ; count--, pz += d_zwidth, pdest += screenwidth) +// { +// for (i=0 ; i> 16) + +// (r_sstepx >> 16); + + movl C(r_sstepx),%eax + movl C(r_tstepx),%edx + shll $16,%eax + shll $16,%edx + movl %eax,C(a_sstepxfrac) + movl %edx,C(a_tstepxfrac) + + movl C(r_sstepx),%ecx + movl C(r_tstepx),%eax + sarl $16,%ecx + sarl $16,%eax + imull skinwidth(%esp) + addl %ecx,%eax + movl %eax,C(a_ststepxwhole) + + ret + + +//---------------------------------------------------------------------- +// recursive subdivision affine triangle drawing code +// +// not C-callable because of stdcall return +//---------------------------------------------------------------------- + +#define lp1 4+16 +#define lp2 8+16 +#define lp3 12+16 + +.globl C(D_PolysetRecursiveTriangle) +C(D_PolysetRecursiveTriangle): + pushl %ebp // preserve caller stack frame pointer + pushl %esi // preserve register variables + pushl %edi + pushl %ebx + +// int *temp; +// int d; +// int new[6]; +// int i; +// int z; +// short *zbuf; + movl lp2(%esp),%esi + movl lp1(%esp),%ebx + movl lp3(%esp),%edi + +// d = lp2[0] - lp1[0]; +// if (d < -1 || d > 1) +// goto split; + movl 0(%esi),%eax + + movl 0(%ebx),%edx + movl 4(%esi),%ebp + + subl %edx,%eax + movl 4(%ebx),%ecx + + subl %ecx,%ebp + incl %eax + + cmpl $2,%eax + ja LSplit + +// d = lp2[1] - lp1[1]; +// if (d < -1 || d > 1) +// goto split; + movl 0(%edi),%eax + incl %ebp + + cmpl $2,%ebp + ja LSplit + +// d = lp3[0] - lp2[0]; +// if (d < -1 || d > 1) +// goto split2; + movl 0(%esi),%edx + movl 4(%edi),%ebp + + subl %edx,%eax + movl 4(%esi),%ecx + + subl %ecx,%ebp + incl %eax + + cmpl $2,%eax + ja LSplit2 + +// d = lp3[1] - lp2[1]; +// if (d < -1 || d > 1) +// goto split2; + movl 0(%ebx),%eax + incl %ebp + + cmpl $2,%ebp + ja LSplit2 + +// d = lp1[0] - lp3[0]; +// if (d < -1 || d > 1) +// goto split3; + movl 0(%edi),%edx + movl 4(%ebx),%ebp + + subl %edx,%eax + movl 4(%edi),%ecx + + subl %ecx,%ebp + incl %eax + + incl %ebp + movl %ebx,%edx + + cmpl $2,%eax + ja LSplit3 + +// d = lp1[1] - lp3[1]; +// if (d < -1 || d > 1) +// { +//split3: +// temp = lp1; +// lp3 = lp2; +// lp1 = lp3; +// lp2 = temp; +// goto split; +// } +// +// return; // entire tri is filled +// + cmpl 
$2,%ebp + jna LDone + +LSplit3: + movl %edi,%ebx + movl %esi,%edi + movl %edx,%esi + jmp LSplit + +//split2: +LSplit2: + +// temp = lp1; +// lp1 = lp2; +// lp2 = lp3; +// lp3 = temp; + movl %ebx,%eax + movl %esi,%ebx + movl %edi,%esi + movl %eax,%edi + +//split: +LSplit: + + subl $24,%esp // allocate space for a new vertex + +//// split this edge +// new[0] = (lp1[0] + lp2[0]) >> 1; +// new[1] = (lp1[1] + lp2[1]) >> 1; +// new[2] = (lp1[2] + lp2[2]) >> 1; +// new[3] = (lp1[3] + lp2[3]) >> 1; +// new[5] = (lp1[5] + lp2[5]) >> 1; + movl 8(%ebx),%eax + + movl 8(%esi),%edx + movl 12(%ebx),%ecx + + addl %edx,%eax + movl 12(%esi),%edx + + sarl $1,%eax + addl %edx,%ecx + + movl %eax,8(%esp) + movl 20(%ebx),%eax + + sarl $1,%ecx + movl 20(%esi),%edx + + movl %ecx,12(%esp) + addl %edx,%eax + + movl 0(%ebx),%ecx + movl 0(%esi),%edx + + sarl $1,%eax + addl %ecx,%edx + + movl %eax,20(%esp) + movl 4(%ebx),%eax + + sarl $1,%edx + movl 4(%esi),%ebp + + movl %edx,0(%esp) + addl %eax,%ebp + + sarl $1,%ebp + movl %ebp,4(%esp) + +//// draw the point if splitting a leading edge +// if (lp2[1] > lp1[1]) +// goto nodraw; + cmpl %eax,4(%esi) + jg LNoDraw + +// if ((lp2[1] == lp1[1]) && (lp2[0] < lp1[0])) +// goto nodraw; + movl 0(%esi),%edx + jnz LDraw + + cmpl %ecx,%edx + jl LNoDraw + +LDraw: + +// z = new[5] >> 16; + movl 20(%esp),%edx + movl 4(%esp),%ecx + + sarl $16,%edx + movl 0(%esp),%ebp + +// zbuf = zspantable[new[1]] + new[0]; + movl C(zspantable)(,%ecx,4),%eax + +// if (z >= *zbuf) +// { + cmpw (%eax,%ebp,2),%dx + jnge LNoDraw + +// int pix; +// +// *zbuf = z; + movw %dx,(%eax,%ebp,2) + +// pix = d_pcolormap[skintable[new[3]>>16][new[2]>>16]]; + movl 12(%esp),%eax + + sarl $16,%eax + movl 8(%esp),%edx + + sarl $16,%edx + subl %ecx,%ecx + + movl C(skintable)(,%eax,4),%eax + movl 4(%esp),%ebp + + movb (%eax,%edx,),%cl + movl C(d_pcolormap),%edx + + movb (%edx,%ecx,),%dl + movl 0(%esp),%ecx + +// d_viewbuffer[d_scantable[new[1]] + new[0]] = pix; + movl C(d_scantable)(,%ebp,4),%eax + addl %eax,%ecx + movl C(d_viewbuffer),%eax + movb %dl,(%eax,%ecx,1) + +// } +// +//nodraw: +LNoDraw: + +//// recursively continue +// D_PolysetRecursiveTriangle (lp3, lp1, new); + pushl %esp + pushl %ebx + pushl %edi + call C(D_PolysetRecursiveTriangle) + +// D_PolysetRecursiveTriangle (lp3, new, lp2); + movl %esp,%ebx + pushl %esi + pushl %ebx + pushl %edi + call C(D_PolysetRecursiveTriangle) + addl $24,%esp + +LDone: + popl %ebx // restore register variables + popl %edi + popl %esi + popl %ebp // restore caller stack frame pointer + ret $12 + + +//---------------------------------------------------------------------- +// 8-bpp horizontal span drawing code for affine polygons, with smooth +// shading and no transparency +//---------------------------------------------------------------------- + +#define pspans 4+8 + +.globl C(D_PolysetAff8Start) +C(D_PolysetAff8Start): + +.globl C(D_PolysetDrawSpans8) +C(D_PolysetDrawSpans8): + pushl %esi // preserve register variables + pushl %ebx + + movl pspans(%esp),%esi // point to the first span descriptor + movl C(r_zistepx),%ecx + + pushl %ebp // preserve caller's stack frame + pushl %edi + + rorl $16,%ecx // put high 16 bits of 1/z step in low word + movl spanpackage_t_count(%esi),%edx + + movl %ecx,lzistepx + +LSpanLoop: + +// lcount = d_aspancount - pspanpackage->count; +// +// errorterm += erroradjustup; +// if (errorterm >= 0) +// { +// d_aspancount += d_countextrastep; +// errorterm -= erroradjustdown; +// } +// else +// { +// d_aspancount += ubasestep; +// } + movl 
C(d_aspancount),%eax + subl %edx,%eax + + movl C(erroradjustup),%edx + movl C(errorterm),%ebx + addl %edx,%ebx + js LNoTurnover + + movl C(erroradjustdown),%edx + movl C(d_countextrastep),%edi + subl %edx,%ebx + movl C(d_aspancount),%ebp + movl %ebx,C(errorterm) + addl %edi,%ebp + movl %ebp,C(d_aspancount) + jmp LRightEdgeStepped + +LNoTurnover: + movl C(d_aspancount),%edi + movl C(ubasestep),%edx + movl %ebx,C(errorterm) + addl %edx,%edi + movl %edi,C(d_aspancount) + +LRightEdgeStepped: + cmpl $1,%eax + + jl LNextSpan + jz LExactlyOneLong + +// +// set up advancetable +// + movl C(a_ststepxwhole),%ecx + movl C(r_affinetridesc)+atd_skinwidth,%edx + + movl %ecx,advancetable+4 // advance base in t + addl %edx,%ecx + + movl %ecx,advancetable // advance extra in t + movl C(a_tstepxfrac),%ecx + + movw C(r_lstepx),%cx + movl %eax,%edx // count + + movl %ecx,tstep + addl $7,%edx + + shrl $3,%edx // count of full and partial loops + movl spanpackage_t_sfrac(%esi),%ebx + + movw %dx,%bx + movl spanpackage_t_pz(%esi),%ecx + + negl %eax + + movl spanpackage_t_pdest(%esi),%edi + andl $7,%eax // 0->0, 1->7, 2->6, ... , 7->1 + + subl %eax,%edi // compensate for hardwired offsets + subl %eax,%ecx + + subl %eax,%ecx + movl spanpackage_t_tfrac(%esi),%edx + + movw spanpackage_t_light(%esi),%dx + movl spanpackage_t_zi(%esi),%ebp + + rorl $16,%ebp // put high 16 bits of 1/z in low word + pushl %esi + + movl spanpackage_t_ptex(%esi),%esi + jmp aff8entryvec_table(,%eax,4) + +// %bx = count of full and partial loops +// %ebx high word = sfrac +// %ecx = pz +// %dx = light +// %edx high word = tfrac +// %esi = ptex +// %edi = pdest +// %ebp = 1/z +// tstep low word = C(r_lstepx) +// tstep high word = C(a_tstepxfrac) +// C(a_sstepxfrac) low word = 0 +// C(a_sstepxfrac) high word = C(a_sstepxfrac) + +LDrawLoop: + +// FIXME: do we need to clamp light? 
We may need at least a buffer bit to +// keep it from poking into tfrac and causing problems + +LDraw8: + cmpw (%ecx),%bp + jl Lp1 + xorl %eax,%eax + movb %dh,%ah + movb (%esi),%al + movw %bp,(%ecx) + movb 0x12345678(%eax),%al +LPatch8: + movb %al,(%edi) +Lp1: + addl tstep,%edx + sbbl %eax,%eax + addl lzistepx,%ebp + adcl $0,%ebp + addl C(a_sstepxfrac),%ebx + adcl advancetable+4(,%eax,4),%esi + +LDraw7: + cmpw 2(%ecx),%bp + jl Lp2 + xorl %eax,%eax + movb %dh,%ah + movb (%esi),%al + movw %bp,2(%ecx) + movb 0x12345678(%eax),%al +LPatch7: + movb %al,1(%edi) +Lp2: + addl tstep,%edx + sbbl %eax,%eax + addl lzistepx,%ebp + adcl $0,%ebp + addl C(a_sstepxfrac),%ebx + adcl advancetable+4(,%eax,4),%esi + +LDraw6: + cmpw 4(%ecx),%bp + jl Lp3 + xorl %eax,%eax + movb %dh,%ah + movb (%esi),%al + movw %bp,4(%ecx) + movb 0x12345678(%eax),%al +LPatch6: + movb %al,2(%edi) +Lp3: + addl tstep,%edx + sbbl %eax,%eax + addl lzistepx,%ebp + adcl $0,%ebp + addl C(a_sstepxfrac),%ebx + adcl advancetable+4(,%eax,4),%esi + +LDraw5: + cmpw 6(%ecx),%bp + jl Lp4 + xorl %eax,%eax + movb %dh,%ah + movb (%esi),%al + movw %bp,6(%ecx) + movb 0x12345678(%eax),%al +LPatch5: + movb %al,3(%edi) +Lp4: + addl tstep,%edx + sbbl %eax,%eax + addl lzistepx,%ebp + adcl $0,%ebp + addl C(a_sstepxfrac),%ebx + adcl advancetable+4(,%eax,4),%esi + +LDraw4: + cmpw 8(%ecx),%bp + jl Lp5 + xorl %eax,%eax + movb %dh,%ah + movb (%esi),%al + movw %bp,8(%ecx) + movb 0x12345678(%eax),%al +LPatch4: + movb %al,4(%edi) +Lp5: + addl tstep,%edx + sbbl %eax,%eax + addl lzistepx,%ebp + adcl $0,%ebp + addl C(a_sstepxfrac),%ebx + adcl advancetable+4(,%eax,4),%esi + +LDraw3: + cmpw 10(%ecx),%bp + jl Lp6 + xorl %eax,%eax + movb %dh,%ah + movb (%esi),%al + movw %bp,10(%ecx) + movb 0x12345678(%eax),%al +LPatch3: + movb %al,5(%edi) +Lp6: + addl tstep,%edx + sbbl %eax,%eax + addl lzistepx,%ebp + adcl $0,%ebp + addl C(a_sstepxfrac),%ebx + adcl advancetable+4(,%eax,4),%esi + +LDraw2: + cmpw 12(%ecx),%bp + jl Lp7 + xorl %eax,%eax + movb %dh,%ah + movb (%esi),%al + movw %bp,12(%ecx) + movb 0x12345678(%eax),%al +LPatch2: + movb %al,6(%edi) +Lp7: + addl tstep,%edx + sbbl %eax,%eax + addl lzistepx,%ebp + adcl $0,%ebp + addl C(a_sstepxfrac),%ebx + adcl advancetable+4(,%eax,4),%esi + +LDraw1: + cmpw 14(%ecx),%bp + jl Lp8 + xorl %eax,%eax + movb %dh,%ah + movb (%esi),%al + movw %bp,14(%ecx) + movb 0x12345678(%eax),%al +LPatch1: + movb %al,7(%edi) +Lp8: + addl tstep,%edx + sbbl %eax,%eax + addl lzistepx,%ebp + adcl $0,%ebp + addl C(a_sstepxfrac),%ebx + adcl advancetable+4(,%eax,4),%esi + + addl $8,%edi + addl $16,%ecx + + decw %bx + jnz LDrawLoop + + popl %esi // restore spans pointer +LNextSpan: + addl $(spanpackage_t_size),%esi // point to next span +LNextSpanESISet: + movl spanpackage_t_count(%esi),%edx + cmpl $-999999,%edx // any more spans? 
+ jnz LSpanLoop // yes + + popl %edi + popl %ebp // restore the caller's stack frame + popl %ebx // restore register variables + popl %esi + ret + + +// draw a one-long span + +LExactlyOneLong: + + movl spanpackage_t_pz(%esi),%ecx + movl spanpackage_t_zi(%esi),%ebp + + rorl $16,%ebp // put high 16 bits of 1/z in low word + movl spanpackage_t_ptex(%esi),%ebx + + cmpw (%ecx),%bp + jl LNextSpan + xorl %eax,%eax + movl spanpackage_t_pdest(%esi),%edi + movb spanpackage_t_light+1(%esi),%ah + addl $(spanpackage_t_size),%esi // point to next span + movb (%ebx),%al + movw %bp,(%ecx) + movb 0x12345678(%eax),%al +LPatch9: + movb %al,(%edi) + + jmp LNextSpanESISet + +.globl C(D_PolysetAff8End) +C(D_PolysetAff8End): + + +#define pcolormap 4 + +.globl C(D_Aff8Patch) +C(D_Aff8Patch): + movl pcolormap(%esp),%eax + movl %eax,LPatch1-4 + movl %eax,LPatch2-4 + movl %eax,LPatch3-4 + movl %eax,LPatch4-4 + movl %eax,LPatch5-4 + movl %eax,LPatch6-4 + movl %eax,LPatch7-4 + movl %eax,LPatch8-4 + movl %eax,LPatch9-4 + + ret + + +//---------------------------------------------------------------------- +// Alias model polygon dispatching code, combined with subdivided affine +// triangle drawing code +//---------------------------------------------------------------------- + +.globl C(D_PolysetDraw) +C(D_PolysetDraw): + +// spanpackage_t spans[DPS_MAXSPANS + 1 + +// ((CACHE_SIZE - 1) / sizeof(spanpackage_t)) + 1]; +// // one extra because of cache line pretouching +// +// a_spans = (spanpackage_t *) +// (((long)&spans[0] + CACHE_SIZE - 1) & ~(CACHE_SIZE - 1)); + subl $(SPAN_SIZE),%esp + movl %esp,%eax + addl $(CACHE_SIZE - 1),%eax + andl $(~(CACHE_SIZE - 1)),%eax + movl %eax,C(a_spans) + +// if (r_affinetridesc.drawtype) +// D_DrawSubdiv (); +// else +// D_DrawNonSubdiv (); + movl C(r_affinetridesc)+atd_drawtype,%eax + testl %eax,%eax + jz C(D_DrawNonSubdiv) + + pushl %ebp // preserve caller stack frame pointer + +// lnumtriangles = r_affinetridesc.numtriangles; + movl C(r_affinetridesc)+atd_numtriangles,%ebp + + pushl %esi // preserve register variables + shll $4,%ebp + + pushl %ebx +// ptri = r_affinetridesc.ptriangles; + movl C(r_affinetridesc)+atd_ptriangles,%ebx + + pushl %edi + +// mtriangle_t *ptri; +// finalvert_t *pfv, *index0, *index1, *index2; +// int i; +// int lnumtriangles; +// int s0, s1, s2; + +// pfv = r_affinetridesc.pfinalverts; + movl C(r_affinetridesc)+atd_pfinalverts,%edi + +// for (i=0 ; iv[1]-index1->v[1]) * +// (index0->v[0]-index2->v[0]) - +// (index0->v[0]-index1->v[0])*(index0->v[1]-index2->v[1])) >= 0) +// { +// continue; +// } +// +// d_pcolormap = &((byte *)acolormap)[index0->v[4] & 0xFF00]; + fildl fv_v+4(%ecx) // i0v1 + fildl fv_v+4(%esi) // i1v1 | i0v1 + fildl fv_v+0(%ecx) // i0v0 | i1v1 | i0v1 + fildl fv_v+0(%edx) // i2v0 | i0v0 | i1v1 | i0v1 + fxch %st(2) // i1v1 | i0v0 | i2v0 | i0v1 + fsubr %st(3),%st(0) // i0v1-i1v1 | i0v0 | i2v0 | i0v1 + fildl fv_v+0(%esi) // i1v0 | i0v1-i1v1 | i0v0 | i2v0 | i0v1 + fxch %st(2) // i0v0 | i0v1-i1v1 | i1v0 | i2v0 | i0v1 + fsub %st(0),%st(3) // i0v0 | i0v1-i1v1 | i1v0 | i0v0-i2v0 | i0v1 + fildl fv_v+4(%edx) // i2v1 | i0v0 | i0v1-i1v1 | i1v0 | i0v0-i2v0| i0v1 + fxch %st(1) // i0v0 | i2v1 | i0v1-i1v1 | i1v0 | i0v0-i2v0| i0v1 + fsubp %st(0),%st(3) // i2v1 | i0v1-i1v1 | i0v0-i1v0 | i0v0-i2v0 | i0v1 + fxch %st(1) // i0v1-i1v1 | i2v1 | i0v0-i1v0 | i0v0-i2v0 | i0v1 + fmulp %st(0),%st(3) // i2v1 | i0v0-i1v0 | i0v1-i1v1*i0v0-i2v0 | i0v1 + fsubrp %st(0),%st(3) // i0v0-i1v0 | i0v1-i1v1*i0v0-i2v0 | i0v1-i2v1 + movl fv_v+16(%ecx),%eax + andl $0xFF00,%eax + fmulp 
%st(0),%st(2) // i0v1-i1v1*i0v0-i2v0 | i0v0-i1v0*i0v1-i2v1 + addl C(acolormap),%eax + fsubp %st(0),%st(1) // (i0v1-i1v1)*(i0v0-i2v0)-(i0v0-i1v0)*(i0v1-i2v1) + movl %eax,C(d_pcolormap) + fstps Ltemp + movl Ltemp,%eax + subl $0x80000001,%eax + jc Lskip + +// if (ptri[i].facesfront) +// { +// D_PolysetRecursiveTriangle(index0->v, index1->v, index2->v); + movl mtri_facesfront-16(%ebx,%ebp,),%eax + testl %eax,%eax + jz Lfacesback + + pushl %edx + pushl %esi + pushl %ecx + call C(D_PolysetRecursiveTriangle) + + subl $16,%ebp + jnz Llooptop + jmp Ldone2 + +// } +// else +// { +Lfacesback: + +// s0 = index0->v[2]; +// s1 = index1->v[2]; +// s2 = index2->v[2]; + movl fv_v+8(%ecx),%eax + pushl %eax + movl fv_v+8(%esi),%eax + pushl %eax + movl fv_v+8(%edx),%eax + pushl %eax + pushl %ecx + pushl %edx + +// if (index0->flags & ALIAS_ONSEAM) +// index0->v[2] += r_affinetridesc.seamfixupX16; + movl C(r_affinetridesc)+atd_seamfixupX16,%eax + testl $(ALIAS_ONSEAM),fv_flags(%ecx) + jz Lp11 + addl %eax,fv_v+8(%ecx) +Lp11: + +// if (index1->flags & ALIAS_ONSEAM) +// index1->v[2] += r_affinetridesc.seamfixupX16; + testl $(ALIAS_ONSEAM),fv_flags(%esi) + jz Lp12 + addl %eax,fv_v+8(%esi) +Lp12: + +// if (index2->flags & ALIAS_ONSEAM) +// index2->v[2] += r_affinetridesc.seamfixupX16; + testl $(ALIAS_ONSEAM),fv_flags(%edx) + jz Lp13 + addl %eax,fv_v+8(%edx) +Lp13: + +// D_PolysetRecursiveTriangle(index0->v, index1->v, index2->v); + pushl %edx + pushl %esi + pushl %ecx + call C(D_PolysetRecursiveTriangle) + +// index0->v[2] = s0; +// index1->v[2] = s1; +// index2->v[2] = s2; + popl %edx + popl %ecx + popl %eax + movl %eax,fv_v+8(%edx) + popl %eax + movl %eax,fv_v+8(%esi) + popl %eax + movl %eax,fv_v+8(%ecx) + +// } +// } +Lskip: + subl $16,%ebp + jnz Llooptop + +Ldone2: + popl %edi // restore the caller's stack frame + popl %ebx + popl %esi // restore register variables + popl %ebp + + addl $(SPAN_SIZE),%esp + + ret + + +//---------------------------------------------------------------------- +// Alias model triangle left-edge scanning code +//---------------------------------------------------------------------- + +#define height 4+16 + +.globl C(D_PolysetScanLeftEdge) +C(D_PolysetScanLeftEdge): + pushl %ebp // preserve caller stack frame pointer + pushl %esi // preserve register variables + pushl %edi + pushl %ebx + + movl height(%esp),%eax + movl C(d_sfrac),%ecx + andl $0xFFFF,%eax + movl C(d_ptex),%ebx + orl %eax,%ecx + movl C(d_pedgespanpackage),%esi + movl C(d_tfrac),%edx + movl C(d_light),%edi + movl C(d_zi),%ebp + +// %eax: scratch +// %ebx: d_ptex +// %ecx: d_sfrac in high word, count in low word +// %edx: d_tfrac +// %esi: d_pedgespanpackage, errorterm, scratch alternately +// %edi: d_light +// %ebp: d_zi + +// do +// { + +LScanLoop: + +// d_pedgespanpackage->ptex = ptex; +// d_pedgespanpackage->pdest = d_pdest; +// d_pedgespanpackage->pz = d_pz; +// d_pedgespanpackage->count = d_aspancount; +// d_pedgespanpackage->light = d_light; +// d_pedgespanpackage->zi = d_zi; +// d_pedgespanpackage->sfrac = d_sfrac << 16; +// d_pedgespanpackage->tfrac = d_tfrac << 16; + movl %ebx,spanpackage_t_ptex(%esi) + movl C(d_pdest),%eax + movl %eax,spanpackage_t_pdest(%esi) + movl C(d_pz),%eax + movl %eax,spanpackage_t_pz(%esi) + movl C(d_aspancount),%eax + movl %eax,spanpackage_t_count(%esi) + movl %edi,spanpackage_t_light(%esi) + movl %ebp,spanpackage_t_zi(%esi) + movl %ecx,spanpackage_t_sfrac(%esi) + movl %edx,spanpackage_t_tfrac(%esi) + +// pretouch the next cache line + movb spanpackage_t_size(%esi),%al + +// 
d_pedgespanpackage++; + addl $(spanpackage_t_size),%esi + movl C(erroradjustup),%eax + movl %esi,C(d_pedgespanpackage) + +// errorterm += erroradjustup; + movl C(errorterm),%esi + addl %eax,%esi + movl C(d_pdest),%eax + +// if (errorterm >= 0) +// { + js LNoLeftEdgeTurnover + +// errorterm -= erroradjustdown; +// d_pdest += d_pdestextrastep; + subl C(erroradjustdown),%esi + addl C(d_pdestextrastep),%eax + movl %esi,C(errorterm) + movl %eax,C(d_pdest) + +// d_pz += d_pzextrastep; +// d_aspancount += d_countextrastep; +// d_ptex += d_ptexextrastep; +// d_sfrac += d_sfracextrastep; +// d_ptex += d_sfrac >> 16; +// d_sfrac &= 0xFFFF; +// d_tfrac += d_tfracextrastep; + movl C(d_pz),%eax + movl C(d_aspancount),%esi + addl C(d_pzextrastep),%eax + addl C(d_sfracextrastep),%ecx + adcl C(d_ptexextrastep),%ebx + addl C(d_countextrastep),%esi + movl %eax,C(d_pz) + movl C(d_tfracextrastep),%eax + movl %esi,C(d_aspancount) + addl %eax,%edx + +// if (d_tfrac & 0x10000) +// { + jnc LSkip1 + +// d_ptex += r_affinetridesc.skinwidth; +// d_tfrac &= 0xFFFF; + addl C(r_affinetridesc)+atd_skinwidth,%ebx + +// } + +LSkip1: + +// d_light += d_lightextrastep; +// d_zi += d_ziextrastep; + addl C(d_lightextrastep),%edi + addl C(d_ziextrastep),%ebp + +// } + movl C(d_pedgespanpackage),%esi + decl %ecx + testl $0xFFFF,%ecx + jnz LScanLoop + + popl %ebx + popl %edi + popl %esi + popl %ebp + ret + +// else +// { + +LNoLeftEdgeTurnover: + movl %esi,C(errorterm) + +// d_pdest += d_pdestbasestep; + addl C(d_pdestbasestep),%eax + movl %eax,C(d_pdest) + +// d_pz += d_pzbasestep; +// d_aspancount += ubasestep; +// d_ptex += d_ptexbasestep; +// d_sfrac += d_sfracbasestep; +// d_ptex += d_sfrac >> 16; +// d_sfrac &= 0xFFFF; + movl C(d_pz),%eax + movl C(d_aspancount),%esi + addl C(d_pzbasestep),%eax + addl C(d_sfracbasestep),%ecx + adcl C(d_ptexbasestep),%ebx + addl C(ubasestep),%esi + movl %eax,C(d_pz) + movl %esi,C(d_aspancount) + +// d_tfrac += d_tfracbasestep; + movl C(d_tfracbasestep),%esi + addl %esi,%edx + +// if (d_tfrac & 0x10000) +// { + jnc LSkip2 + +// d_ptex += r_affinetridesc.skinwidth; +// d_tfrac &= 0xFFFF; + addl C(r_affinetridesc)+atd_skinwidth,%ebx + +// } + +LSkip2: + +// d_light += d_lightbasestep; +// d_zi += d_zibasestep; + addl C(d_lightbasestep),%edi + addl C(d_zibasestep),%ebp + +// } +// } while (--height); + movl C(d_pedgespanpackage),%esi + decl %ecx + testl $0xFFFF,%ecx + jnz LScanLoop + + popl %ebx + popl %edi + popl %esi + popl %ebp + ret + + +//---------------------------------------------------------------------- +// Alias model vertex drawing code +//---------------------------------------------------------------------- + +#define fv 4+8 +#define numverts 8+8 + +.globl C(D_PolysetDrawFinalVerts) +C(D_PolysetDrawFinalVerts): + pushl %ebp // preserve caller stack frame pointer + pushl %ebx + +// int i, z; +// short *zbuf; + + movl numverts(%esp),%ecx + movl fv(%esp),%ebx + + pushl %esi // preserve register variables + pushl %edi + +LFVLoop: + +// for (i=0 ; iv[0] < r_refdef.vrectright) && +// (fv->v[1] < r_refdef.vrectbottom)) +// { + movl fv_v+0(%ebx),%eax + movl C(r_refdef)+rd_vrectright,%edx + cmpl %edx,%eax + jge LNextVert + movl fv_v+4(%ebx),%esi + movl C(r_refdef)+rd_vrectbottom,%edx + cmpl %edx,%esi + jge LNextVert + +// zbuf = zspantable[fv->v[1]] + fv->v[0]; + movl C(zspantable)(,%esi,4),%edi + +// z = fv->v[5]>>16; + movl fv_v+20(%ebx),%edx + shrl $16,%edx + +// if (z >= *zbuf) +// { +// int pix; + cmpw (%edi,%eax,2),%dx + jl LNextVert + +// *zbuf = z; + movw %dx,(%edi,%eax,2) + +// pix 
= skintable[fv->v[3]>>16][fv->v[2]>>16]; + movl fv_v+12(%ebx),%edi + shrl $16,%edi + movl C(skintable)(,%edi,4),%edi + movl fv_v+8(%ebx),%edx + shrl $16,%edx + movb (%edi,%edx),%dl + +// pix = ((byte *)acolormap)[pix + (fv->v[4] & 0xFF00)]; + movl fv_v+16(%ebx),%edi + andl $0xFF00,%edi + andl $0x00FF,%edx + addl %edx,%edi + movl C(acolormap),%edx + movb (%edx,%edi,1),%dl + +// d_viewbuffer[d_scantable[fv->v[1]] + fv->v[0]] = pix; + movl C(d_scantable)(,%esi,4),%edi + movl C(d_viewbuffer),%esi + addl %eax,%edi + movb %dl,(%esi,%edi) + +// } +// } +// } +LNextVert: + addl $(fv_size),%ebx + decl %ecx + jnz LFVLoop + + popl %edi + popl %esi + popl %ebx + popl %ebp + ret + + +//---------------------------------------------------------------------- +// Alias model non-subdivided polygon dispatching code +// +// not C-callable because of stack buffer cleanup +//---------------------------------------------------------------------- + +.globl C(D_DrawNonSubdiv) +C(D_DrawNonSubdiv): + pushl %ebp // preserve caller stack frame pointer + movl C(r_affinetridesc)+atd_numtriangles,%ebp + pushl %ebx + shll $(mtri_shift),%ebp + pushl %esi // preserve register variables + movl C(r_affinetridesc)+atd_ptriangles,%esi + pushl %edi + +// mtriangle_t *ptri; +// finalvert_t *pfv, *index0, *index1, *index2; +// int i; +// int lnumtriangles; + +// pfv = r_affinetridesc.pfinalverts; +// ptri = r_affinetridesc.ptriangles; +// lnumtriangles = r_affinetridesc.numtriangles; + +LNDLoop: + +// for (i=0 ; ivertindex[0]; +// index1 = pfv + ptri->vertindex[1]; +// index2 = pfv + ptri->vertindex[2]; + movl C(r_affinetridesc)+atd_pfinalverts,%edi + movl mtri_vertindex+0-mtri_size(%esi,%ebp,1),%ecx + shll $(fv_shift),%ecx + movl mtri_vertindex+4-mtri_size(%esi,%ebp,1),%edx + shll $(fv_shift),%edx + movl mtri_vertindex+8-mtri_size(%esi,%ebp,1),%ebx + shll $(fv_shift),%ebx + addl %edi,%ecx + addl %edi,%edx + addl %edi,%ebx + +// d_xdenom = (index0->v[1]-index1->v[1]) * +// (index0->v[0]-index2->v[0]) - +// (index0->v[0]-index1->v[0])*(index0->v[1]-index2->v[1]); + movl fv_v+4(%ecx),%eax + movl fv_v+0(%ecx),%esi + subl fv_v+4(%edx),%eax + subl fv_v+0(%ebx),%esi + imull %esi,%eax + movl fv_v+0(%ecx),%esi + movl fv_v+4(%ecx),%edi + subl fv_v+0(%edx),%esi + subl fv_v+4(%ebx),%edi + imull %esi,%edi + subl %edi,%eax + +// if (d_xdenom >= 0) +// { +// continue; + jns LNextTri + +// } + + movl %eax,C(d_xdenom) + fildl C(d_xdenom) + +// r_p0[0] = index0->v[0]; // u +// r_p0[1] = index0->v[1]; // v +// r_p0[2] = index0->v[2]; // s +// r_p0[3] = index0->v[3]; // t +// r_p0[4] = index0->v[4]; // light +// r_p0[5] = index0->v[5]; // iz + movl fv_v+0(%ecx),%eax + movl fv_v+4(%ecx),%esi + movl %eax,C(r_p0)+0 + movl %esi,C(r_p0)+4 + movl fv_v+8(%ecx),%eax + movl fv_v+12(%ecx),%esi + movl %eax,C(r_p0)+8 + movl %esi,C(r_p0)+12 + movl fv_v+16(%ecx),%eax + movl fv_v+20(%ecx),%esi + movl %eax,C(r_p0)+16 + movl %esi,C(r_p0)+20 + + fdivrs float_1 + +// r_p1[0] = index1->v[0]; +// r_p1[1] = index1->v[1]; +// r_p1[2] = index1->v[2]; +// r_p1[3] = index1->v[3]; +// r_p1[4] = index1->v[4]; +// r_p1[5] = index1->v[5]; + movl fv_v+0(%edx),%eax + movl fv_v+4(%edx),%esi + movl %eax,C(r_p1)+0 + movl %esi,C(r_p1)+4 + movl fv_v+8(%edx),%eax + movl fv_v+12(%edx),%esi + movl %eax,C(r_p1)+8 + movl %esi,C(r_p1)+12 + movl fv_v+16(%edx),%eax + movl fv_v+20(%edx),%esi + movl %eax,C(r_p1)+16 + movl %esi,C(r_p1)+20 + +// r_p2[0] = index2->v[0]; +// r_p2[1] = index2->v[1]; +// r_p2[2] = index2->v[2]; +// r_p2[3] = index2->v[3]; +// r_p2[4] = index2->v[4]; +// r_p2[5] = 
index2->v[5]; + movl fv_v+0(%ebx),%eax + movl fv_v+4(%ebx),%esi + movl %eax,C(r_p2)+0 + movl %esi,C(r_p2)+4 + movl fv_v+8(%ebx),%eax + movl fv_v+12(%ebx),%esi + movl %eax,C(r_p2)+8 + movl %esi,C(r_p2)+12 + movl fv_v+16(%ebx),%eax + movl fv_v+20(%ebx),%esi + movl %eax,C(r_p2)+16 + movl C(r_affinetridesc)+atd_ptriangles,%edi + movl %esi,C(r_p2)+20 + movl mtri_facesfront-mtri_size(%edi,%ebp,1),%eax + +// if (!ptri->facesfront) +// { + testl %eax,%eax + jnz LFacesFront + +// if (index0->flags & ALIAS_ONSEAM) +// r_p0[2] += r_affinetridesc.seamfixupX16; + movl fv_flags(%ecx),%eax + movl fv_flags(%edx),%esi + movl fv_flags(%ebx),%edi + testl $(ALIAS_ONSEAM),%eax + movl C(r_affinetridesc)+atd_seamfixupX16,%eax + jz LOnseamDone0 + addl %eax,C(r_p0)+8 +LOnseamDone0: + +// if (index1->flags & ALIAS_ONSEAM) +// r_p1[2] += r_affinetridesc.seamfixupX16; + testl $(ALIAS_ONSEAM),%esi + jz LOnseamDone1 + addl %eax,C(r_p1)+8 +LOnseamDone1: + +// if (index2->flags & ALIAS_ONSEAM) +// r_p2[2] += r_affinetridesc.seamfixupX16; + testl $(ALIAS_ONSEAM),%edi + jz LOnseamDone2 + addl %eax,C(r_p2)+8 +LOnseamDone2: + +// } + +LFacesFront: + + fstps C(d_xdenom) + +// D_PolysetSetEdgeTable (); +// D_RasterizeAliasPolySmooth (); + call C(D_PolysetSetEdgeTable) + call C(D_RasterizeAliasPolySmooth) + +LNextTri: + movl C(r_affinetridesc)+atd_ptriangles,%esi + subl $16,%ebp + jnz LNDLoop +// } + + popl %edi + popl %esi + popl %ebx + popl %ebp + + addl $(SPAN_SIZE),%esp + + ret + + +#endif // id386 + diff --git a/source/d_scana.S b/source/d_scana.S new file mode 100644 index 0000000..3f4b91d --- /dev/null +++ b/source/d_scana.S @@ -0,0 +1,89 @@ +/* +Copyright (C) 1996-1997 Id Software, Inc. + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + +See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ +*/ +// +// d_scana.s +// x86 assembly-language turbulent texture mapping code +// + +#include "asm_i386.h" +#include "quakeasm.h" +#include "asm_draw.h" +#include "d_ifacea.h" + +#if id386 + + .data + + .text + +//---------------------------------------------------------------------- +// turbulent texture mapping code +//---------------------------------------------------------------------- + + .align 4 +.globl C(D_DrawTurbulent8Span) +C(D_DrawTurbulent8Span): + pushl %ebp // preserve caller's stack frame pointer + pushl %esi // preserve register variables + pushl %edi + pushl %ebx + + movl C(r_turb_s),%esi + movl C(r_turb_t),%ecx + movl C(r_turb_pdest),%edi + movl C(r_turb_spancount),%ebx + +Llp: + movl %ecx,%eax + movl %esi,%edx + sarl $16,%eax + movl C(r_turb_turb),%ebp + sarl $16,%edx + andl $(CYCLE-1),%eax + andl $(CYCLE-1),%edx + movl (%ebp,%eax,4),%eax + movl (%ebp,%edx,4),%edx + addl %esi,%eax + sarl $16,%eax + addl %ecx,%edx + sarl $16,%edx + andl $(TURB_TEX_SIZE-1),%eax + andl $(TURB_TEX_SIZE-1),%edx + shll $6,%edx + movl C(r_turb_pbase),%ebp + addl %eax,%edx + incl %edi + addl C(r_turb_sstep),%esi + addl C(r_turb_tstep),%ecx + movb (%ebp,%edx,1),%dl + decl %ebx + movb %dl,-1(%edi) + jnz Llp + + movl %edi,C(r_turb_pdest) + + popl %ebx // restore register variables + popl %edi + popl %esi + popl %ebp // restore caller's stack frame pointer + ret + +#endif // id386 + diff --git a/source/d_spr8.S b/source/d_spr8.S new file mode 100644 index 0000000..42ccb62 --- /dev/null +++ b/source/d_spr8.S @@ -0,0 +1,900 @@ +/* +Copyright (C) 1996-1997 Id Software, Inc. + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + +See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +*/ +// +// d_spr8.s +// x86 assembly-language horizontal 8-bpp transparent span-drawing code. +// + +#include "asm_i386.h" +#include "quakeasm.h" +#include "asm_draw.h" + +#if id386 + +//---------------------------------------------------------------------- +// 8-bpp horizontal span drawing code for polygons, with transparency. 
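The transparent spans below apply two per-pixel tests — a signed 16-bit compare of the z-buffer entry against the top half of the 0.32 fixed-point 1/z, and a skip when the texel is the reserved transparent palette index — then step s and t with the carry-selected `advancetable` idiom (the `sbbl`/`adcl` pairs). A C sketch of one span, under stated assumptions: the function name and parameter list are hypothetical, the assembly keeps all of this state in registers and unrolls the loop eightfold, and the whole-pixel parts of the s/t steps are pre-folded into the two table entries, with `advance_extra = advance_base + cachewidth`.

```c
#define TRANSPARENT_COLOR 0xFF      // reserved palette index

// Inner loop of the transparent sprite span, one pixel per pass.
static void sprite_span (unsigned char *src, unsigned char *dest,
                         short *pz, int count,
                         unsigned izi, unsigned izistep,
                         unsigned sfrac, unsigned sstep,
                         unsigned tfrac, unsigned tstep,
                         int advance_base, int advance_extra)
{
    while (count--)
    {
        unsigned char pix = *src;

        if ((short)(izi >> 16) >= *pz && pix != TRANSPARENT_COLOR)
        {
            *pz   = (short)(izi >> 16);   // 1/z passes: write depth...
            *dest = pix;                  // ...and the texel
        }
        izi += izistep;
        dest++;
        pz++;

        unsigned oldt = tfrac, olds = sfrac;
        tfrac += tstep;                   // carry out of t selects the
        sfrac += sstep;                   // row-advancing table entry
        src   += (tfrac < oldt) ? advance_extra : advance_base;
        src   += (sfrac < olds);          // carry out of s: one texel
    }
}
```

The rotated-by-16 storage of `izi` in the assembly exists so that one register can serve both roles: the `cmpw` depth test reads the integer half in the low word, while the `addl`/`adcl $0` pair lets fraction carries wrap around into that same integer half.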
+//---------------------------------------------------------------------- + + .text + +// out-of-line, rarely-needed clamping code + +LClampHigh0: + movl C(bbextents),%esi + jmp LClampReentry0 +LClampHighOrLow0: + jg LClampHigh0 + xorl %esi,%esi + jmp LClampReentry0 + +LClampHigh1: + movl C(bbextentt),%edx + jmp LClampReentry1 +LClampHighOrLow1: + jg LClampHigh1 + xorl %edx,%edx + jmp LClampReentry1 + +LClampLow2: + movl $2048,%ebp + jmp LClampReentry2 +LClampHigh2: + movl C(bbextents),%ebp + jmp LClampReentry2 + +LClampLow3: + movl $2048,%ecx + jmp LClampReentry3 +LClampHigh3: + movl C(bbextentt),%ecx + jmp LClampReentry3 + +LClampLow4: + movl $2048,%eax + jmp LClampReentry4 +LClampHigh4: + movl C(bbextents),%eax + jmp LClampReentry4 + +LClampLow5: + movl $2048,%ebx + jmp LClampReentry5 +LClampHigh5: + movl C(bbextentt),%ebx + jmp LClampReentry5 + + +#define pspans 4+16 + + .align 4 +.globl C(D_SpriteDrawSpans) +C(D_SpriteDrawSpans): + pushl %ebp // preserve caller's stack frame + pushl %edi + pushl %esi // preserve register variables + pushl %ebx + +// +// set up scaled-by-8 steps, for 8-long segments; also set up cacheblock +// and span list pointers, and 1/z step in 0.32 fixed-point +// +// FIXME: any overlap from rearranging? + flds C(d_sdivzstepu) + fmuls fp_8 + movl C(cacheblock),%edx + flds C(d_tdivzstepu) + fmuls fp_8 + movl pspans(%esp),%ebx // point to the first span descriptor + flds C(d_zistepu) + fmuls fp_8 + movl %edx,pbase // pbase = cacheblock + flds C(d_zistepu) + fmuls fp_64kx64k + fxch %st(3) + fstps sdivz8stepu + fstps zi8stepu + fstps tdivz8stepu + fistpl izistep + movl izistep,%eax + rorl $16,%eax // put upper 16 bits in low word + movl sspan_t_count(%ebx),%ecx + movl %eax,izistep + + cmpl $0,%ecx + jle LNextSpan + +LSpanLoop: + +// +// set up the initial s/z, t/z, and 1/z on the FP stack, and generate the +// initial s and t values +// +// FIXME: pipeline FILD? 
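The FP-stack sequence that follows computes the standard perspective-correct span setup; the interleaved stack-state comments track it term by term. Written out in C, using the step/origin globals defined in d_varsa.S — the function wrapper is hypothetical, since the assembly's real contribution is scheduling the slow FDIV so that it overlaps the integer pixel work:

```c
// Per-span setup evaluated by the FILD/FMUL/FADD chain below.
// du/dv are the span's screen coordinates (sspan_t_u / sspan_t_v);
// 65536.0 is fp_64k, and 2^32 is the fp_64kx64k scale.
static void span_setup (int du, int dv, int *s, int *t, unsigned *izi)
{
    float sdivz = d_sdivzorigin + dv*d_sdivzstepv + du*d_sdivzstepu;
    float tdivz = d_tdivzorigin + dv*d_tdivzstepv + du*d_tdivzstepu;
    float zi    = d_ziorigin    + dv*d_zistepv    + du*d_zistepu;

    float z = 65536.0f / zi;               // the FDIV, started early

    *izi = (unsigned)(zi * 4294967296.0);  // 1/z in 0.32 fixed point
    *s   = (int)(sdivz * z) + sadjust;     // then clamped to bbextents
    *t   = (int)(tdivz * z) + tadjust;     // then clamped to bbextentt
}
```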
+ fildl sspan_t_v(%ebx) + fildl sspan_t_u(%ebx) + + fld %st(1) // dv | du | dv + fmuls C(d_sdivzstepv) // dv*d_sdivzstepv | du | dv + fld %st(1) // du | dv*d_sdivzstepv | du | dv + fmuls C(d_sdivzstepu) // du*d_sdivzstepu | dv*d_sdivzstepv | du | dv + fld %st(2) // du | du*d_sdivzstepu | dv*d_sdivzstepv | du | dv + fmuls C(d_tdivzstepu) // du*d_tdivzstepu | du*d_sdivzstepu | + // dv*d_sdivzstepv | du | dv + fxch %st(1) // du*d_sdivzstepu | du*d_tdivzstepu | + // dv*d_sdivzstepv | du | dv + faddp %st(0),%st(2) // du*d_tdivzstepu | + // du*d_sdivzstepu + dv*d_sdivzstepv | du | dv + fxch %st(1) // du*d_sdivzstepu + dv*d_sdivzstepv | + // du*d_tdivzstepu | du | dv + fld %st(3) // dv | du*d_sdivzstepu + dv*d_sdivzstepv | + // du*d_tdivzstepu | du | dv + fmuls C(d_tdivzstepv) // dv*d_tdivzstepv | + // du*d_sdivzstepu + dv*d_sdivzstepv | + // du*d_tdivzstepu | du | dv + fxch %st(1) // du*d_sdivzstepu + dv*d_sdivzstepv | + // dv*d_tdivzstepv | du*d_tdivzstepu | du | dv + fadds C(d_sdivzorigin) // sdivz = d_sdivzorigin + dv*d_sdivzstepv + + // du*d_sdivzstepu; stays in %st(2) at end + fxch %st(4) // dv | dv*d_tdivzstepv | du*d_tdivzstepu | du | + // s/z + fmuls C(d_zistepv) // dv*d_zistepv | dv*d_tdivzstepv | + // du*d_tdivzstepu | du | s/z + fxch %st(1) // dv*d_tdivzstepv | dv*d_zistepv | + // du*d_tdivzstepu | du | s/z + faddp %st(0),%st(2) // dv*d_zistepv | + // dv*d_tdivzstepv + du*d_tdivzstepu | du | s/z + fxch %st(2) // du | dv*d_tdivzstepv + du*d_tdivzstepu | + // dv*d_zistepv | s/z + fmuls C(d_zistepu) // du*d_zistepu | + // dv*d_tdivzstepv + du*d_tdivzstepu | + // dv*d_zistepv | s/z + fxch %st(1) // dv*d_tdivzstepv + du*d_tdivzstepu | + // du*d_zistepu | dv*d_zistepv | s/z + fadds C(d_tdivzorigin) // tdivz = d_tdivzorigin + dv*d_tdivzstepv + + // du*d_tdivzstepu; stays in %st(1) at end + fxch %st(2) // dv*d_zistepv | du*d_zistepu | t/z | s/z + faddp %st(0),%st(1) // dv*d_zistepv + du*d_zistepu | t/z | s/z + + flds fp_64k // fp_64k | dv*d_zistepv + du*d_zistepu | t/z | s/z + fxch %st(1) // dv*d_zistepv + du*d_zistepu | fp_64k | t/z | s/z + fadds C(d_ziorigin) // zi = d_ziorigin + dv*d_zistepv + + // du*d_zistepu; stays in %st(0) at end + // 1/z | fp_64k | t/z | s/z + + fld %st(0) // FIXME: get rid of stall on FMUL? 
+ fmuls fp_64kx64k + fxch %st(1) + +// +// calculate and clamp s & t +// + fdivr %st(0),%st(2) // 1/z | z*64k | t/z | s/z + fxch %st(1) + + fistpl izi // 0.32 fixed-point 1/z + movl izi,%ebp + +// +// set pz to point to the first z-buffer pixel in the span +// + rorl $16,%ebp // put upper 16 bits in low word + movl sspan_t_v(%ebx),%eax + movl %ebp,izi + movl sspan_t_u(%ebx),%ebp + imull C(d_zrowbytes) + shll $1,%ebp // a word per pixel + addl C(d_pzbuffer),%eax + addl %ebp,%eax + movl %eax,pz + +// +// point %edi to the first pixel in the span +// + movl C(d_viewbuffer),%ebp + movl sspan_t_v(%ebx),%eax + pushl %ebx // preserve spans pointer + movl C(tadjust),%edx + movl C(sadjust),%esi + movl C(d_scantable)(,%eax,4),%edi // v * screenwidth + addl %ebp,%edi + movl sspan_t_u(%ebx),%ebp + addl %ebp,%edi // pdest = &pdestspan[scans->u]; + +// +// now start the FDIV for the end of the span +// + cmpl $8,%ecx + ja LSetupNotLast1 + + decl %ecx + jz LCleanup1 // if only one pixel, no need to start an FDIV + movl %ecx,spancountminus1 + +// finish up the s and t calcs + fxch %st(1) // z*64k | 1/z | t/z | s/z + + fld %st(0) // z*64k | z*64k | 1/z | t/z | s/z + fmul %st(4),%st(0) // s | z*64k | 1/z | t/z | s/z + fxch %st(1) // z*64k | s | 1/z | t/z | s/z + fmul %st(3),%st(0) // t | s | 1/z | t/z | s/z + fxch %st(1) // s | t | 1/z | t/z | s/z + fistpl s // 1/z | t | t/z | s/z + fistpl t // 1/z | t/z | s/z + + fildl spancountminus1 + + flds C(d_tdivzstepu) // _d_tdivzstepu | spancountminus1 + flds C(d_zistepu) // _d_zistepu | _d_tdivzstepu | spancountminus1 + fmul %st(2),%st(0) // _d_zistepu*scm1 | _d_tdivzstepu | scm1 + fxch %st(1) // _d_tdivzstepu | _d_zistepu*scm1 | scm1 + fmul %st(2),%st(0) // _d_tdivzstepu*scm1 | _d_zistepu*scm1 | scm1 + fxch %st(2) // scm1 | _d_zistepu*scm1 | _d_tdivzstepu*scm1 + fmuls C(d_sdivzstepu) // _d_sdivzstepu*scm1 | _d_zistepu*scm1 | + // _d_tdivzstepu*scm1 + fxch %st(1) // _d_zistepu*scm1 | _d_sdivzstepu*scm1 | + // _d_tdivzstepu*scm1 + faddp %st(0),%st(3) // _d_sdivzstepu*scm1 | _d_tdivzstepu*scm1 + fxch %st(1) // _d_tdivzstepu*scm1 | _d_sdivzstepu*scm1 + faddp %st(0),%st(3) // _d_sdivzstepu*scm1 + faddp %st(0),%st(3) + + flds fp_64k + fdiv %st(1),%st(0) // this is what we've gone to all this trouble to + // overlap + jmp LFDIVInFlight1 + +LCleanup1: +// finish up the s and t calcs + fxch %st(1) // z*64k | 1/z | t/z | s/z + + fld %st(0) // z*64k | z*64k | 1/z | t/z | s/z + fmul %st(4),%st(0) // s | z*64k | 1/z | t/z | s/z + fxch %st(1) // z*64k | s | 1/z | t/z | s/z + fmul %st(3),%st(0) // t | s | 1/z | t/z | s/z + fxch %st(1) // s | t | 1/z | t/z | s/z + fistpl s // 1/z | t | t/z | s/z + fistpl t // 1/z | t/z | s/z + jmp LFDIVInFlight1 + + .align 4 +LSetupNotLast1: +// finish up the s and t calcs + fxch %st(1) // z*64k | 1/z | t/z | s/z + + fld %st(0) // z*64k | z*64k | 1/z | t/z | s/z + fmul %st(4),%st(0) // s | z*64k | 1/z | t/z | s/z + fxch %st(1) // z*64k | s | 1/z | t/z | s/z + fmul %st(3),%st(0) // t | s | 1/z | t/z | s/z + fxch %st(1) // s | t | 1/z | t/z | s/z + fistpl s // 1/z | t | t/z | s/z + fistpl t // 1/z | t/z | s/z + + fadds zi8stepu + fxch %st(2) + fadds sdivz8stepu + fxch %st(2) + flds tdivz8stepu + faddp %st(0),%st(2) + flds fp_64k + fdiv %st(1),%st(0) // z = 1/1/z + // this is what we've gone to all this trouble to + // overlap +LFDIVInFlight1: + + addl s,%esi + addl t,%edx + movl C(bbextents),%ebx + movl C(bbextentt),%ebp + cmpl %ebx,%esi + ja LClampHighOrLow0 +LClampReentry0: + movl %esi,s + movl pbase,%ebx + shll $16,%esi + cmpl %ebp,%edx + movl 
%esi,sfracf + ja LClampHighOrLow1 +LClampReentry1: + movl %edx,t + movl s,%esi // sfrac = scans->sfrac; + shll $16,%edx + movl t,%eax // tfrac = scans->tfrac; + sarl $16,%esi + movl %edx,tfracf + +// +// calculate the texture starting address +// + sarl $16,%eax + addl %ebx,%esi + imull C(cachewidth),%eax // (tfrac >> 16) * cachewidth + addl %eax,%esi // psource = pbase + (sfrac >> 16) + + // ((tfrac >> 16) * cachewidth); + +// +// determine whether last span or not +// + cmpl $8,%ecx + jna LLastSegment + +// +// not the last segment; do full 8-wide segment +// +LNotLastSegment: + +// +// advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to +// get there +// + +// pick up after the FDIV that was left in flight previously + + fld %st(0) // duplicate it + fmul %st(4),%st(0) // s = s/z * z + fxch %st(1) + fmul %st(3),%st(0) // t = t/z * z + fxch %st(1) + fistpl snext + fistpl tnext + movl snext,%eax + movl tnext,%edx + + subl $8,%ecx // count off this segments' pixels + movl C(sadjust),%ebp + pushl %ecx // remember count of remaining pixels + movl C(tadjust),%ecx + + addl %eax,%ebp + addl %edx,%ecx + + movl C(bbextents),%eax + movl C(bbextentt),%edx + + cmpl $2048,%ebp + jl LClampLow2 + cmpl %eax,%ebp + ja LClampHigh2 +LClampReentry2: + + cmpl $2048,%ecx + jl LClampLow3 + cmpl %edx,%ecx + ja LClampHigh3 +LClampReentry3: + + movl %ebp,snext + movl %ecx,tnext + + subl s,%ebp + subl t,%ecx + +// +// set up advancetable +// + movl %ecx,%eax + movl %ebp,%edx + sarl $19,%edx // sstep >>= 16; + movl C(cachewidth),%ebx + sarl $19,%eax // tstep >>= 16; + jz LIsZero + imull %ebx,%eax // (tstep >> 16) * cachewidth; +LIsZero: + addl %edx,%eax // add in sstep + // (tstep >> 16) * cachewidth + (sstep >> 16); + movl tfracf,%edx + movl %eax,advancetable+4 // advance base in t + addl %ebx,%eax // ((tstep >> 16) + 1) * cachewidth + + // (sstep >> 16); + shll $13,%ebp // left-justify sstep fractional part + movl %ebp,sstep + movl sfracf,%ebx + shll $13,%ecx // left-justify tstep fractional part + movl %eax,advancetable // advance extra in t + movl %ecx,tstep + + movl pz,%ecx + movl izi,%ebp + + cmpw (%ecx),%bp + jl Lp1 + movb (%esi),%al // get first source texel + cmpb $(TRANSPARENT_COLOR),%al + jz Lp1 + movw %bp,(%ecx) + movb %al,(%edi) // store first dest pixel +Lp1: + addl izistep,%ebp + adcl $0,%ebp + addl tstep,%edx // advance tfrac fractional part by tstep frac + + sbbl %eax,%eax // turn tstep carry into -1 (0 if none) + addl sstep,%ebx // advance sfrac fractional part by sstep frac + adcl advancetable+4(,%eax,4),%esi // point to next source texel + + cmpw 2(%ecx),%bp + jl Lp2 + movb (%esi),%al + cmpb $(TRANSPARENT_COLOR),%al + jz Lp2 + movw %bp,2(%ecx) + movb %al,1(%edi) +Lp2: + addl izistep,%ebp + adcl $0,%ebp + addl tstep,%edx + sbbl %eax,%eax + addl sstep,%ebx + adcl advancetable+4(,%eax,4),%esi + + cmpw 4(%ecx),%bp + jl Lp3 + movb (%esi),%al + cmpb $(TRANSPARENT_COLOR),%al + jz Lp3 + movw %bp,4(%ecx) + movb %al,2(%edi) +Lp3: + addl izistep,%ebp + adcl $0,%ebp + addl tstep,%edx + sbbl %eax,%eax + addl sstep,%ebx + adcl advancetable+4(,%eax,4),%esi + + cmpw 6(%ecx),%bp + jl Lp4 + movb (%esi),%al + cmpb $(TRANSPARENT_COLOR),%al + jz Lp4 + movw %bp,6(%ecx) + movb %al,3(%edi) +Lp4: + addl izistep,%ebp + adcl $0,%ebp + addl tstep,%edx + sbbl %eax,%eax + addl sstep,%ebx + adcl advancetable+4(,%eax,4),%esi + + cmpw 8(%ecx),%bp + jl Lp5 + movb (%esi),%al + cmpb $(TRANSPARENT_COLOR),%al + jz Lp5 + movw %bp,8(%ecx) + movb %al,4(%edi) +Lp5: + addl izistep,%ebp + adcl $0,%ebp + addl 
tstep,%edx + sbbl %eax,%eax + addl sstep,%ebx + adcl advancetable+4(,%eax,4),%esi + +// +// start FDIV for end of next segment in flight, so it can overlap +// + popl %eax + cmpl $8,%eax // more than one segment after this? + ja LSetupNotLast2 // yes + + decl %eax + jz LFDIVInFlight2 // if only one pixel, no need to start an FDIV + movl %eax,spancountminus1 + fildl spancountminus1 + + flds C(d_zistepu) // _d_zistepu | spancountminus1 + fmul %st(1),%st(0) // _d_zistepu*scm1 | scm1 + flds C(d_tdivzstepu) // _d_tdivzstepu | _d_zistepu*scm1 | scm1 + fmul %st(2),%st(0) // _d_tdivzstepu*scm1 | _d_zistepu*scm1 | scm1 + fxch %st(1) // _d_zistepu*scm1 | _d_tdivzstepu*scm1 | scm1 + faddp %st(0),%st(3) // _d_tdivzstepu*scm1 | scm1 + fxch %st(1) // scm1 | _d_tdivzstepu*scm1 + fmuls C(d_sdivzstepu) // _d_sdivzstepu*scm1 | _d_tdivzstepu*scm1 + fxch %st(1) // _d_tdivzstepu*scm1 | _d_sdivzstepu*scm1 + faddp %st(0),%st(3) // _d_sdivzstepu*scm1 + flds fp_64k // 64k | _d_sdivzstepu*scm1 + fxch %st(1) // _d_sdivzstepu*scm1 | 64k + faddp %st(0),%st(4) // 64k + + fdiv %st(1),%st(0) // this is what we've gone to all this trouble to + // overlap + jmp LFDIVInFlight2 + + .align 4 +LSetupNotLast2: + fadds zi8stepu + fxch %st(2) + fadds sdivz8stepu + fxch %st(2) + flds tdivz8stepu + faddp %st(0),%st(2) + flds fp_64k + fdiv %st(1),%st(0) // z = 1/1/z + // this is what we've gone to all this trouble to + // overlap +LFDIVInFlight2: + pushl %eax + + cmpw 10(%ecx),%bp + jl Lp6 + movb (%esi),%al + cmpb $(TRANSPARENT_COLOR),%al + jz Lp6 + movw %bp,10(%ecx) + movb %al,5(%edi) +Lp6: + addl izistep,%ebp + adcl $0,%ebp + addl tstep,%edx + sbbl %eax,%eax + addl sstep,%ebx + adcl advancetable+4(,%eax,4),%esi + + cmpw 12(%ecx),%bp + jl Lp7 + movb (%esi),%al + cmpb $(TRANSPARENT_COLOR),%al + jz Lp7 + movw %bp,12(%ecx) + movb %al,6(%edi) +Lp7: + addl izistep,%ebp + adcl $0,%ebp + addl tstep,%edx + sbbl %eax,%eax + addl sstep,%ebx + adcl advancetable+4(,%eax,4),%esi + + cmpw 14(%ecx),%bp + jl Lp8 + movb (%esi),%al + cmpb $(TRANSPARENT_COLOR),%al + jz Lp8 + movw %bp,14(%ecx) + movb %al,7(%edi) +Lp8: + addl izistep,%ebp + adcl $0,%ebp + addl tstep,%edx + sbbl %eax,%eax + addl sstep,%ebx + adcl advancetable+4(,%eax,4),%esi + + addl $8,%edi + addl $16,%ecx + movl %edx,tfracf + movl snext,%edx + movl %ebx,sfracf + movl tnext,%ebx + movl %edx,s + movl %ebx,t + + movl %ecx,pz + movl %ebp,izi + + popl %ecx // retrieve count + +// +// determine whether last span or not +// + cmpl $8,%ecx // are there multiple segments remaining? + ja LNotLastSegment // yes + +// +// last segment of scan +// +LLastSegment: + +// +// advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to +// get there. 
The number of pixels left is variable, and we want to land on the +// last pixel, not step one past it, so we can't run into arithmetic problems +// + testl %ecx,%ecx + jz LNoSteps // just draw the last pixel and we're done + +// pick up after the FDIV that was left in flight previously + + + fld %st(0) // duplicate it + fmul %st(4),%st(0) // s = s/z * z + fxch %st(1) + fmul %st(3),%st(0) // t = t/z * z + fxch %st(1) + fistpl snext + fistpl tnext + + movl C(tadjust),%ebx + movl C(sadjust),%eax + + addl snext,%eax + addl tnext,%ebx + + movl C(bbextents),%ebp + movl C(bbextentt),%edx + + cmpl $2048,%eax + jl LClampLow4 + cmpl %ebp,%eax + ja LClampHigh4 +LClampReentry4: + movl %eax,snext + + cmpl $2048,%ebx + jl LClampLow5 + cmpl %edx,%ebx + ja LClampHigh5 +LClampReentry5: + + cmpl $1,%ecx // don't bother + je LOnlyOneStep // if two pixels in segment, there's only one step, + // of the segment length + subl s,%eax + subl t,%ebx + + addl %eax,%eax // convert to 15.17 format so multiply by 1.31 + addl %ebx,%ebx // reciprocal yields 16.48 + imull reciprocal_table-8(,%ecx,4) // sstep = (snext - s) / (spancount-1) + movl %edx,%ebp + + movl %ebx,%eax + imull reciprocal_table-8(,%ecx,4) // tstep = (tnext - t) / (spancount-1) + +LSetEntryvec: +// +// set up advancetable +// + movl spr8entryvec_table(,%ecx,4),%ebx + movl %edx,%eax + pushl %ebx // entry point into code for RET later + movl %ebp,%ecx + sarl $16,%ecx // sstep >>= 16; + movl C(cachewidth),%ebx + sarl $16,%edx // tstep >>= 16; + jz LIsZeroLast + imull %ebx,%edx // (tstep >> 16) * cachewidth; +LIsZeroLast: + addl %ecx,%edx // add in sstep + // (tstep >> 16) * cachewidth + (sstep >> 16); + movl tfracf,%ecx + movl %edx,advancetable+4 // advance base in t + addl %ebx,%edx // ((tstep >> 16) + 1) * cachewidth + + // (sstep >> 16); + shll $16,%ebp // left-justify sstep fractional part + movl sfracf,%ebx + shll $16,%eax // left-justify tstep fractional part + movl %edx,advancetable // advance extra in t + + movl %eax,tstep + movl %ebp,sstep + movl %ecx,%edx + + movl pz,%ecx + movl izi,%ebp + + ret // jump to the number-of-pixels handler + +//---------------------------------------- + +LNoSteps: + movl pz,%ecx + subl $7,%edi // adjust for hardwired offset + subl $14,%ecx + jmp LEndSpan + + +LOnlyOneStep: + subl s,%eax + subl t,%ebx + movl %eax,%ebp + movl %ebx,%edx + jmp LSetEntryvec + +//---------------------------------------- + +.globl Spr8Entry2_8 +Spr8Entry2_8: + subl $6,%edi // adjust for hardwired offsets + subl $12,%ecx + movb (%esi),%al + jmp LLEntry2_8 + +//---------------------------------------- + +.globl Spr8Entry3_8 +Spr8Entry3_8: + subl $5,%edi // adjust for hardwired offsets + subl $10,%ecx + jmp LLEntry3_8 + +//---------------------------------------- + +.globl Spr8Entry4_8 +Spr8Entry4_8: + subl $4,%edi // adjust for hardwired offsets + subl $8,%ecx + jmp LLEntry4_8 + +//---------------------------------------- + +.globl Spr8Entry5_8 +Spr8Entry5_8: + subl $3,%edi // adjust for hardwired offsets + subl $6,%ecx + jmp LLEntry5_8 + +//---------------------------------------- + +.globl Spr8Entry6_8 +Spr8Entry6_8: + subl $2,%edi // adjust for hardwired offsets + subl $4,%ecx + jmp LLEntry6_8 + +//---------------------------------------- + +.globl Spr8Entry7_8 +Spr8Entry7_8: + decl %edi // adjust for hardwired offsets + subl $2,%ecx + jmp LLEntry7_8 + +//---------------------------------------- + +.globl Spr8Entry8_8 +Spr8Entry8_8: + cmpw (%ecx),%bp + jl Lp9 + movb (%esi),%al + cmpb $(TRANSPARENT_COLOR),%al + jz Lp9 + movw %bp,(%ecx) + 
movb %al,(%edi) +Lp9: + addl izistep,%ebp + adcl $0,%ebp + addl tstep,%edx + sbbl %eax,%eax + addl sstep,%ebx + adcl advancetable+4(,%eax,4),%esi +LLEntry7_8: + cmpw 2(%ecx),%bp + jl Lp10 + movb (%esi),%al + cmpb $(TRANSPARENT_COLOR),%al + jz Lp10 + movw %bp,2(%ecx) + movb %al,1(%edi) +Lp10: + addl izistep,%ebp + adcl $0,%ebp + addl tstep,%edx + sbbl %eax,%eax + addl sstep,%ebx + adcl advancetable+4(,%eax,4),%esi +LLEntry6_8: + cmpw 4(%ecx),%bp + jl Lp11 + movb (%esi),%al + cmpb $(TRANSPARENT_COLOR),%al + jz Lp11 + movw %bp,4(%ecx) + movb %al,2(%edi) +Lp11: + addl izistep,%ebp + adcl $0,%ebp + addl tstep,%edx + sbbl %eax,%eax + addl sstep,%ebx + adcl advancetable+4(,%eax,4),%esi +LLEntry5_8: + cmpw 6(%ecx),%bp + jl Lp12 + movb (%esi),%al + cmpb $(TRANSPARENT_COLOR),%al + jz Lp12 + movw %bp,6(%ecx) + movb %al,3(%edi) +Lp12: + addl izistep,%ebp + adcl $0,%ebp + addl tstep,%edx + sbbl %eax,%eax + addl sstep,%ebx + adcl advancetable+4(,%eax,4),%esi +LLEntry4_8: + cmpw 8(%ecx),%bp + jl Lp13 + movb (%esi),%al + cmpb $(TRANSPARENT_COLOR),%al + jz Lp13 + movw %bp,8(%ecx) + movb %al,4(%edi) +Lp13: + addl izistep,%ebp + adcl $0,%ebp + addl tstep,%edx + sbbl %eax,%eax + addl sstep,%ebx + adcl advancetable+4(,%eax,4),%esi +LLEntry3_8: + cmpw 10(%ecx),%bp + jl Lp14 + movb (%esi),%al + cmpb $(TRANSPARENT_COLOR),%al + jz Lp14 + movw %bp,10(%ecx) + movb %al,5(%edi) +Lp14: + addl izistep,%ebp + adcl $0,%ebp + addl tstep,%edx + sbbl %eax,%eax + addl sstep,%ebx + adcl advancetable+4(,%eax,4),%esi +LLEntry2_8: + cmpw 12(%ecx),%bp + jl Lp15 + movb (%esi),%al + cmpb $(TRANSPARENT_COLOR),%al + jz Lp15 + movw %bp,12(%ecx) + movb %al,6(%edi) +Lp15: + addl izistep,%ebp + adcl $0,%ebp + addl tstep,%edx + sbbl %eax,%eax + addl sstep,%ebx + adcl advancetable+4(,%eax,4),%esi + +LEndSpan: + cmpw 14(%ecx),%bp + jl Lp16 + movb (%esi),%al // load first texel in segment + cmpb $(TRANSPARENT_COLOR),%al + jz Lp16 + movw %bp,14(%ecx) + movb %al,7(%edi) +Lp16: + +// +// clear s/z, t/z, 1/z from FP stack +// + fstp %st(0) + fstp %st(0) + fstp %st(0) + + popl %ebx // restore spans pointer +LNextSpan: + addl $(sspan_t_size),%ebx // point to next span + movl sspan_t_count(%ebx),%ecx + cmpl $0,%ecx // any more spans? + jg LSpanLoop // yes + jz LNextSpan // yes, but this one's empty + + popl %ebx // restore register variables + popl %esi + popl %edi + popl %ebp // restore the caller's stack frame + ret + +#endif // id386 diff --git a/source/d_varsa.S b/source/d_varsa.S new file mode 100644 index 0000000..76d67eb --- /dev/null +++ b/source/d_varsa.S @@ -0,0 +1,213 @@ +/* +Copyright (C) 1996-1997 Id Software, Inc. + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + +See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ +*/ +// +// d_varsa.s +// + +#include "asm_i386.h" +#include "quakeasm.h" +#include "asm_draw.h" +#include "d_ifacea.h" + +#if id386 + + .data + +//------------------------------------------------------- +// global refresh variables +//------------------------------------------------------- + +// FIXME: put all refresh variables into one contiguous block. Make into one +// big structure, like cl or sv? + + .align 4 +.globl C(d_sdivzstepu) +.globl C(d_tdivzstepu) +.globl C(d_zistepu) +.globl C(d_sdivzstepv) +.globl C(d_tdivzstepv) +.globl C(d_zistepv) +.globl C(d_sdivzorigin) +.globl C(d_tdivzorigin) +.globl C(d_ziorigin) +C(d_sdivzstepu): .single 0 +C(d_tdivzstepu): .single 0 +C(d_zistepu): .single 0 +C(d_sdivzstepv): .single 0 +C(d_tdivzstepv): .single 0 +C(d_zistepv): .single 0 +C(d_sdivzorigin): .single 0 +C(d_tdivzorigin): .single 0 +C(d_ziorigin): .single 0 + +.globl C(sadjust) +.globl C(tadjust) +.globl C(bbextents) +.globl C(bbextentt) +C(sadjust): .long 0 +C(tadjust): .long 0 +C(bbextents): .long 0 +C(bbextentt): .long 0 + +.globl C(cacheblock) +.globl C(d_viewbuffer) +.globl C(cachewidth) +.globl C(d_pzbuffer) +.globl C(d_zrowbytes) +.globl C(d_zwidth) +C(cacheblock): .long 0 +C(cachewidth): .long 0 +C(d_viewbuffer): .long 0 +C(d_pzbuffer): .long 0 +C(d_zrowbytes): .long 0 +C(d_zwidth): .long 0 + + +//------------------------------------------------------- +// ASM-only variables +//------------------------------------------------------- +.globl izi +izi: .long 0 + +.globl pbase, s, t, sfracf, tfracf, snext, tnext +.globl spancountminus1, zi16stepu, sdivz16stepu, tdivz16stepu +.globl zi8stepu, sdivz8stepu, tdivz8stepu, pz +s: .long 0 +t: .long 0 +snext: .long 0 +tnext: .long 0 +sfracf: .long 0 +tfracf: .long 0 +pbase: .long 0 +zi8stepu: .long 0 +sdivz8stepu: .long 0 +tdivz8stepu: .long 0 +zi16stepu: .long 0 +sdivz16stepu: .long 0 +tdivz16stepu: .long 0 +spancountminus1: .long 0 +pz: .long 0 + +.globl izistep +izistep: .long 0 + +//------------------------------------------------------- +// local variables for d_draw16.s +//------------------------------------------------------- + +.globl reciprocal_table_16, entryvec_table_16 +// 1/2, 1/3, 1/4, 1/5, 1/6, 1/7, 1/8, 1/9, 1/10, 1/11, 1/12, 1/13, +// 1/14, and 1/15 in 0.32 form +reciprocal_table_16: .long 0x40000000, 0x2aaaaaaa, 0x20000000 + .long 0x19999999, 0x15555555, 0x12492492 + .long 0x10000000, 0xe38e38e, 0xccccccc, 0xba2e8ba + .long 0xaaaaaaa, 0x9d89d89, 0x9249249, 0x8888888 + +#ifndef NeXT + .extern Entry2_16 + .extern Entry3_16 + .extern Entry4_16 + .extern Entry5_16 + .extern Entry6_16 + .extern Entry7_16 + .extern Entry8_16 + .extern Entry9_16 + .extern Entry10_16 + .extern Entry11_16 + .extern Entry12_16 + .extern Entry13_16 + .extern Entry14_16 + .extern Entry15_16 + .extern Entry16_16 +#endif + +entryvec_table_16: .long 0, Entry2_16, Entry3_16, Entry4_16 + .long Entry5_16, Entry6_16, Entry7_16, Entry8_16 + .long Entry9_16, Entry10_16, Entry11_16, Entry12_16 + .long Entry13_16, Entry14_16, Entry15_16, Entry16_16 + +//------------------------------------------------------- +// local variables for d_parta.s +//------------------------------------------------------- +.globl DP_Count, DP_u, DP_v, DP_32768, DP_Color, DP_Pix, DP_EntryTable +DP_Count: .long 0 +DP_u: .long 0 +DP_v: .long 0 +DP_32768: .single 32768.0 +DP_Color: .long 0 +DP_Pix: .long 0 + + +#ifndef NeXT + .extern DP_1x1 + .extern DP_2x2 + .extern DP_3x3 + .extern DP_4x4 +#endif + +DP_EntryTable: .long DP_1x1, DP_2x2, DP_3x3, DP_4x4 + +// +// advancetable is 8 
bytes, but points to the middle of that range so negative +// offsets will work +// +.globl advancetable, sstep, tstep, pspantemp, counttemp, jumptemp +advancetable: .long 0, 0 +sstep: .long 0 +tstep: .long 0 + +pspantemp: .long 0 +counttemp: .long 0 +jumptemp: .long 0 + +// 1/2, 1/3, 1/4, 1/5, 1/6, and 1/7 in 0.32 form +.globl reciprocal_table, entryvec_table +reciprocal_table: .long 0x40000000, 0x2aaaaaaa, 0x20000000 + .long 0x19999999, 0x15555555, 0x12492492 + +#ifndef NeXT + .extern Entry2_8 + .extern Entry3_8 + .extern Entry4_8 + .extern Entry5_8 + .extern Entry6_8 + .extern Entry7_8 + .extern Entry8_8 +#endif + +entryvec_table: .long 0, Entry2_8, Entry3_8, Entry4_8 + .long Entry5_8, Entry6_8, Entry7_8, Entry8_8 + +#ifndef NeXT + .extern Spr8Entry2_8 + .extern Spr8Entry3_8 + .extern Spr8Entry4_8 + .extern Spr8Entry5_8 + .extern Spr8Entry6_8 + .extern Spr8Entry7_8 + .extern Spr8Entry8_8 +#endif + +.globl spr8entryvec_table +spr8entryvec_table: .long 0, Spr8Entry2_8, Spr8Entry3_8, Spr8Entry4_8 + .long Spr8Entry5_8, Spr8Entry6_8, Spr8Entry7_8, Spr8Entry8_8 + +#endif // id386 + diff --git a/source/math.S b/source/math.S index 465f914..725216b 100644 --- a/source/math.S +++ b/source/math.S @@ -1,3 +1,22 @@ +/* +Copyright (C) 1996-1997 Id Software, Inc. + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + +See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +*/ // // math.s // x86 assembly-language math routines. @@ -6,7 +25,7 @@ #include "quakeasm.h" -#if id386 +#if id386 .data @@ -16,6 +35,73 @@ Ljmptab: .long Lcase0, Lcase1, Lcase2, Lcase3 .text +// TODO: rounding needed? +// stack parameter offset +#define val 4 + +.globl C(Invert24To16) +C(Invert24To16): + + movl val(%esp),%ecx + movl $0x100,%edx // 0x10000000000 as dividend + cmpl %edx,%ecx + jle LOutOfRange + + subl %eax,%eax + divl %ecx + + ret + +LOutOfRange: + movl $0xFFFFFFFF,%eax + ret + +#define in 4 +#define out 8 + + .align 2 +.globl C(TransformVector) +C(TransformVector): + movl in(%esp),%eax + movl out(%esp),%edx + + flds (%eax) // in[0] + fmuls C(vright) // in[0]*vright[0] + flds (%eax) // in[0] | in[0]*vright[0] + fmuls C(vup) // in[0]*vup[0] | in[0]*vright[0] + flds (%eax) // in[0] | in[0]*vup[0] | in[0]*vright[0] + fmuls C(vpn) // in[0]*vpn[0] | in[0]*vup[0] | in[0]*vright[0] + + flds 4(%eax) // in[1] | ... + fmuls C(vright)+4 // in[1]*vright[1] | ... + flds 4(%eax) // in[1] | in[1]*vright[1] | ... + fmuls C(vup)+4 // in[1]*vup[1] | in[1]*vright[1] | ... + flds 4(%eax) // in[1] | in[1]*vup[1] | in[1]*vright[1] | ... + fmuls C(vpn)+4 // in[1]*vpn[1] | in[1]*vup[1] | in[1]*vright[1] | ... + fxch %st(2) // in[1]*vright[1] | in[1]*vup[1] | in[1]*vpn[1] | ... + + faddp %st(0),%st(5) // in[1]*vup[1] | in[1]*vpn[1] | ... + faddp %st(0),%st(3) // in[1]*vpn[1] | ... + faddp %st(0),%st(1) // vpn_accum | vup_accum | vright_accum + + flds 8(%eax) // in[2] | ... + fmuls C(vright)+8 // in[2]*vright[2] | ... + flds 8(%eax) // in[2] | in[2]*vright[2] | ... 
+ fmuls C(vup)+8 // in[2]*vup[2] | in[2]*vright[2] | ...
+ flds 8(%eax) // in[2] | in[2]*vup[2] | in[2]*vright[2] | ...
+ fmuls C(vpn)+8 // in[2]*vpn[2] | in[2]*vup[2] | in[2]*vright[2] | ...
+ fxch %st(2) // in[2]*vright[2] | in[2]*vup[2] | in[2]*vpn[2] | ...
+
+ faddp %st(0),%st(5) // in[2]*vup[2] | in[2]*vpn[2] | ...
+ faddp %st(0),%st(3) // in[2]*vpn[2] | ...
+ faddp %st(0),%st(1) // vpn_accum | vup_accum | vright_accum
+
+ fstps 8(%edx) // out[2]
+ fstps 4(%edx) // out[1]
+ fstps (%edx) // out[0]
+
+ ret
+
#define EMINS 4+4
#define EMAXS 4+8
@@ -35,7 +121,7 @@ C(BoxOnPlaneSide):
 jge Lerror
 flds pl_normal(%edx) // p->normal[0]
 fld %st(0) // p->normal[0] | p->normal[0]
- jmp *Ljmptab(,%eax,4)
+ jmp Ljmptab(,%eax,4)
//dist1= p->normal[0]*emaxs[0] + p->normal[1]*emaxs[1] + p->normal[2]*emaxs[2];
diff --git a/source/r_aclipa.S b/source/r_aclipa.S
new file mode 100644
index 0000000..418306a
--- /dev/null
+++ b/source/r_aclipa.S
@@ -0,0 +1,216 @@
+/*
+Copyright (C) 1996-1997 Id Software, Inc.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+
+See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+*/
+//
+// r_aclipa.s
+// x86 assembly-language Alias model vertex clipping code.
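
Backing up to the math.S hunk for a moment: both routines added there are transcriptions of small C helpers, and having the C at hand makes the FPU-stack comments easier to follow. Roughly (a sketch using the engine's vec3_t/fixed16_t types and the vright/vup/vpn view axes; the asm's divl truncates, reproduced here with a 64-bit divide):

	void TransformVector (vec3_t in, vec3_t out)
	{
		out[0] = DotProduct (in, vright);	// screen-space right
		out[1] = DotProduct (in, vup);		// screen-space up
		out[2] = DotProduct (in, vpn);		// view-forward depth
	}

	fixed16_t Invert24To16 (fixed16_t val)
	{
		if (val <= 0x100)
			return 0xFFFFFFFF;	// quotient would overflow 32 bits

		// the asm loads edx:eax with 2^40 (0x100:00000000); dividing
		// an 8.24 value into 2^40 yields its reciprocal in 16.16
		return (fixed16_t)(0x10000000000ULL / (unsigned)val);
	}
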
+// + +#include "asm_i386.h" +#include "quakeasm.h" +#include "asm_draw.h" +#include "d_ifacea.h" + +#if id386 + + .data +Ltemp0: .long 0 +Ltemp1: .long 0 + + .text + +#define pfv0 8+4 +#define pfv1 8+8 +#define out 8+12 + +.globl C(R_Alias_clip_bottom) +C(R_Alias_clip_bottom): + pushl %esi + pushl %edi + + movl pfv0(%esp),%esi + movl pfv1(%esp),%edi + + movl C(r_refdef)+rd_aliasvrectbottom,%eax + +LDoForwardOrBackward: + + movl fv_v+4(%esi),%edx + movl fv_v+4(%edi),%ecx + + cmpl %ecx,%edx + jl LDoForward + + movl fv_v+4(%esi),%ecx + movl fv_v+4(%edi),%edx + movl pfv0(%esp),%edi + movl pfv1(%esp),%esi + +LDoForward: + + subl %edx,%ecx + subl %edx,%eax + movl %ecx,Ltemp1 + movl %eax,Ltemp0 + fildl Ltemp1 + fildl Ltemp0 + movl out(%esp),%edx + movl $2,%eax + + fdivp %st(0),%st(1) // scale + +LDo3Forward: + fildl fv_v+0(%esi) // fv0v0 | scale + fildl fv_v+0(%edi) // fv1v0 | fv0v0 | scale + fildl fv_v+4(%esi) // fv0v1 | fv1v0 | fv0v0 | scale + fildl fv_v+4(%edi) // fv1v1 | fv0v1 | fv1v0 | fv0v0 | scale + fildl fv_v+8(%esi) // fv0v2 | fv1v1 | fv0v1 | fv1v0 | fv0v0 | scale + fildl fv_v+8(%edi) // fv1v2 | fv0v2 | fv1v1 | fv0v1 | fv1v0 | fv0v0 | + // scale + fxch %st(5) // fv0v0 | fv0v2 | fv1v1 | fv0v1 | fv1v0 | fv1v2 | + // scale + fsubr %st(0),%st(4) // fv0v0 | fv0v2 | fv1v1 | fv0v1 | fv1v0-fv0v0 | + // fv1v2 | scale + fxch %st(3) // fv0v1 | fv0v2 | fv1v1 | fv0v0 | fv1v0-fv0v0 | + // fv1v2 | scale + fsubr %st(0),%st(2) // fv0v1 | fv0v2 | fv1v1-fv0v1 | fv0v0 | + // fv1v0-fv0v0 | fv1v2 | scale + fxch %st(1) // fv0v2 | fv0v1 | fv1v1-fv0v1 | fv0v0 | + // fv1v0-fv0v0 | fv1v2 | scale + fsubr %st(0),%st(5) // fv0v2 | fv0v1 | fv1v1-fv0v1 | fv0v0 | + // fv1v0-fv0v0 | fv1v2-fv0v2 | scale + fxch %st(6) // scale | fv0v1 | fv1v1-fv0v1 | fv0v0 | + // fv1v0-fv0v0 | fv1v2-fv0v2 | fv0v2 + fmul %st(0),%st(4) // scale | fv0v1 | fv1v1-fv0v1 | fv0v0 | + // (fv1v0-fv0v0)*scale | fv1v2-fv0v2 | fv0v2 + addl $12,%edi + fmul %st(0),%st(2) // scale | fv0v1 | (fv1v1-fv0v1)*scale | fv0v0 | + // (fv1v0-fv0v0)*scale | fv1v2-fv0v2 | fv0v2 + addl $12,%esi + addl $12,%edx + fmul %st(0),%st(5) // scale | fv0v1 | (fv1v1-fv0v1)*scale | fv0v0 | + // (fv1v0-fv0v0)*scale | (fv1v2-fv0v2)*scale | + // fv0v2 + fxch %st(3) // fv0v0 | fv0v1 | (fv1v1-fv0v1)*scale | scale | + // (fv1v0-fv0v0)*scale | (fv1v2-fv0v2)*scale | + // fv0v2 + faddp %st(0),%st(4) // fv0v1 | (fv1v1-fv0v1)*scale | scale | + // fv0v0+(fv1v0-fv0v0)*scale | + // (fv1v2-fv0v2)*scale | fv0v2 + faddp %st(0),%st(1) // fv0v1+(fv1v1-fv0v1)*scale | scale | + // fv0v0+(fv1v0-fv0v0)*scale | + // (fv1v2-fv0v2)*scale | fv0v2 + fxch %st(4) // fv0v2 | scale | fv0v0+(fv1v0-fv0v0)*scale | + // (fv1v2-fv0v2)*scale | fv0v1+(fv1v1-fv0v1)*scale + faddp %st(0),%st(3) // scale | fv0v0+(fv1v0-fv0v0)*scale | + // fv0v2+(fv1v2-fv0v2)*scale | + // fv0v1+(fv1v1-fv0v1)*scale + fxch %st(1) // fv0v0+(fv1v0-fv0v0)*scale | scale | + // fv0v2+(fv1v2-fv0v2)*scale | + // fv0v1+(fv1v1-fv0v1)*scale + fadds float_point5 + fxch %st(3) // fv0v1+(fv1v1-fv0v1)*scale | scale | + // fv0v2+(fv1v2-fv0v2)*scale | + // fv0v0+(fv1v0-fv0v0)*scale + fadds float_point5 + fxch %st(2) // fv0v2+(fv1v2-fv0v2)*scale | scale | + // fv0v1+(fv1v1-fv0v1)*scale | + // fv0v0+(fv1v0-fv0v0)*scale + fadds float_point5 + fxch %st(3) // fv0v0+(fv1v0-fv0v0)*scale | scale | + // fv0v1+(fv1v1-fv0v1)*scale | + // fv0v2+(fv1v2-fv0v2)*scale + fistpl fv_v+0-12(%edx) // scale | fv0v1+(fv1v1-fv0v1)*scale | + // fv0v2+(fv1v2-fv0v2)*scale + fxch %st(1) // fv0v1+(fv1v1-fv0v1)*scale | scale | + // fv0v2+(fv1v2-fv0v2)*scale | scale + fistpl 
fv_v+4-12(%edx) // scale | fv0v2+(fv1v2-fv0v2)*scale + fxch %st(1) // fv0v2+(fv1v2-fv0v2)*sc | scale + fistpl fv_v+8-12(%edx) // scale + + decl %eax + jnz LDo3Forward + + fstp %st(0) + + popl %edi + popl %esi + + ret + + +.globl C(R_Alias_clip_top) +C(R_Alias_clip_top): + pushl %esi + pushl %edi + + movl pfv0(%esp),%esi + movl pfv1(%esp),%edi + + movl C(r_refdef)+rd_aliasvrect+4,%eax + jmp LDoForwardOrBackward + + + +.globl C(R_Alias_clip_right) +C(R_Alias_clip_right): + pushl %esi + pushl %edi + + movl pfv0(%esp),%esi + movl pfv1(%esp),%edi + + movl C(r_refdef)+rd_aliasvrectright,%eax + +LRightLeftEntry: + + + movl fv_v+4(%esi),%edx + movl fv_v+4(%edi),%ecx + + cmpl %ecx,%edx + movl fv_v+0(%esi),%edx + + movl fv_v+0(%edi),%ecx + jl LDoForward2 + + movl fv_v+0(%esi),%ecx + movl fv_v+0(%edi),%edx + movl pfv0(%esp),%edi + movl pfv1(%esp),%esi + +LDoForward2: + + jmp LDoForward + + +.globl C(R_Alias_clip_left) +C(R_Alias_clip_left): + pushl %esi + pushl %edi + + movl pfv0(%esp),%esi + movl pfv1(%esp),%edi + + movl C(r_refdef)+rd_aliasvrect+0,%eax + jmp LRightLeftEntry + + +#endif // id386 + diff --git a/source/r_aliasa.S b/source/r_aliasa.S new file mode 100644 index 0000000..123cde2 --- /dev/null +++ b/source/r_aliasa.S @@ -0,0 +1,237 @@ +/* +Copyright (C) 1996-1997 Id Software, Inc. + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + +See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +*/ +// +// r_aliasa.s +// x86 assembly-language Alias model transform and project code. 
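
One note before the transform file below: the four R_Alias_clip_* entry points above all funnel into the same interpolation kernel (LDo3Forward, executed twice to cover the six finalvert_t components). For the top/bottom clips its C shape is roughly the sketch below (the left/right entries compare v[0] instead of v[1]); the swap at LDoForward guarantees pfv0 is the vertex on the in-bounds side, and the + 0.5 mirrors the fadds float_point5 rounding:

	static void alias_clip_interp (finalvert_t *pfv0, finalvert_t *pfv1,
		finalvert_t *out, int bound)
	{
		float	scale;
		int	i;

		// interpolation parameter along the edge: 0 at pfv0, 1 at pfv1
		scale = (float)(bound - pfv0->v[1]) /
				(float)(pfv1->v[1] - pfv0->v[1]);

		for (i=0 ; i<6 ; i++)	// x, y, z, s, t, light
		{
			out->v[i] = (int)(pfv0->v[i] +
					(pfv1->v[i] - pfv0->v[i]) * scale + 0.5);
		}
	}
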
+// + +#include "asm_i386.h" +#include "quakeasm.h" +#include "asm_draw.h" +#include "d_ifacea.h" + +#if id386 + + .data + +Lfloat_1: .single 1.0 +Ltemp: .long 0 +Lcoords: .long 0, 0, 0 + + .text + +#define fv 12+4 +#define pstverts 12+8 + +.globl C(R_AliasTransformAndProjectFinalVerts) +C(R_AliasTransformAndProjectFinalVerts): + pushl %ebp // preserve caller's stack frame + pushl %edi + pushl %esi // preserve register variables + +// int i, temp; +// float lightcos, *plightnormal, zi; +// trivertx_t *pverts; + +// pverts = r_apverts; + movl C(r_apverts),%esi + +// for (i=0 ; iv, aliastransform[2]) + +// aliastransform[2][3]); + movb (%esi),%dl + movb %dl,Lcoords + fildl Lcoords // v[0] + movb 1(%esi),%dl + movb %dl,Lcoords+4 + fildl Lcoords+4 // v[1] | v[0] + movb 2(%esi),%dl + movb %dl,Lcoords+8 + fildl Lcoords+8 // v[2] | v[1] | v[0] + + fld %st(2) // v[0] | v[2] | v[1] | v[0] + fmuls C(aliastransform)+32 // accum | v[2] | v[1] | v[0] + fld %st(2) // v[1] | accum | v[2] | v[1] | v[0] + fmuls C(aliastransform)+36 // accum2 | accum | v[2] | v[1] | v[0] + fxch %st(1) // accum | accum2 | v[2] | v[1] | v[0] + fadds C(aliastransform)+44 // accum | accum2 | v[2] | v[1] | v[0] + fld %st(2) // v[2] | accum | accum2 | v[2] | v[1] | v[0] + fmuls C(aliastransform)+40 // accum3 | accum | accum2 | v[2] | v[1] | + // v[0] + fxch %st(1) // accum | accum3 | accum2 | v[2] | v[1] | v[0] + faddp %st(0),%st(2) // accum3 | accum | v[2] | v[1] | v[0] + movb tv_lightnormalindex(%esi),%dl + movl stv_s(%ebp),%eax + movl %eax,fv_v+8(%edi) + faddp %st(0),%st(1) // z | v[2] | v[1] | v[0] + + movl stv_t(%ebp),%eax + movl %eax,fv_v+12(%edi) + +// // lighting +// plightnormal = r_avertexnormals[pverts->lightnormalindex]; + + fdivrs Lfloat_1 // zi | v[2] | v[1] | v[0] + +// fv->v[2] = pstverts->s; +// fv->v[3] = pstverts->t; +// fv->flags = pstverts->onseam; + movl stv_onseam(%ebp),%eax + movl %eax,fv_flags(%edi) + + movl fv_size(%edi),%eax + movl stv_size(%ebp),%eax + movl 4(%esi),%eax + + leal (%edx,%edx,2),%eax // index*3 + + fxch %st(3) // v[0] | v[2] | v[1] | zi + +// lightcos = DotProduct (plightnormal, r_plightvec); + flds C(r_avertexnormals)(,%eax,4) + fmuls C(r_plightvec) + flds C(r_avertexnormals)+4(,%eax,4) + fmuls C(r_plightvec)+4 + flds C(r_avertexnormals)+8(,%eax,4) + fmuls C(r_plightvec)+8 + fxch %st(1) + faddp %st(0),%st(2) + fld %st(2) // v[0] | laccum | laccum2 | v[0] | v[2] | + // v[1] | zi + fmuls C(aliastransform)+0 // xaccum | laccum | laccum2 | v[0] | v[2] | + // v[1] | zi + fxch %st(2) // laccum2 | laccum | xaccum | v[0] | v[2] | + // v[1] | zi + faddp %st(0),%st(1) // laccum | xaccum | v[0] | v[2] | v[1] | zi + +// temp = r_ambientlight; +// if (lightcos < 0) +// { + fsts Ltemp + movl C(r_ambientlight),%eax + movb Ltemp+3,%dl + testb $0x80,%dl + jz Lsavelight // no need to clamp if only ambient lit, because + // r_ambientlight is preclamped + +// temp += (int)(r_shadelight * lightcos); + fmuls C(r_shadelight) +// FIXME: fast float->int conversion? 
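
That FIXME is the classic x87 complaint: a C (int) cast must truncate toward zero, so compilers of this era bracket every cast with control-word saves and reloads, while the fistpl below just converts in whatever rounding mode is already loaded. A GCC-style sketch of the direct conversion (illustrative only, not code from this tree):

	static inline int fast_ftol (float f)
	{
		int	i;

		// one instruction, current rounding mode, no control-word traffic
		__asm__ ("fistpl %0" : "=m" (i) : "t" (f) : "st");

		return i;
	}
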
+ fistpl Ltemp + addl Ltemp,%eax + +// // clamp; because we limited the minimum ambient and shading light, we +// // don't have to clamp low light, just bright +// if (temp < 0) +// temp = 0; + jns Lp1 + subl %eax,%eax + +// } + +Lp1: + +// fv->v[4] = temp; +// +// // x, y, and z are scaled down by 1/2**31 in the transform, so 1/z is +// // scaled up by 1/2**31, and the scaling cancels out for x and y in the +// // projection +// fv->v[0] = ((DotProduct(pverts->v, aliastransform[0]) + +// aliastransform[0][3]) * zi) + aliasxcenter; +// fv->v[1] = ((DotProduct(pverts->v, aliastransform[1]) + +// aliastransform[1][3]) * zi) + aliasycenter; +// fv->v[5] = zi; + fxch %st(1) // v[0] | xaccum | v[2] | v[1] | zi + fmuls C(aliastransform)+16 // yaccum | xaccum | v[2] | v[1] | zi + fxch %st(3) // v[1] | xaccum | v[2] | yaccum | zi + fld %st(0) // v[1] | v[1] | xaccum | v[2] | yaccum | zi + fmuls C(aliastransform)+4 // xaccum2 | v[1] | xaccum | v[2] | yaccum |zi + fxch %st(1) // v[1] | xaccum2 | xaccum | v[2] | yaccum |zi + movl %eax,fv_v+16(%edi) + fmuls C(aliastransform)+20 // yaccum2 | xaccum2 | xaccum | v[2] | yaccum| + // zi + fxch %st(2) // xaccum | xaccum2 | yaccum2 | v[2] | yaccum| + // zi + fadds C(aliastransform)+12 // xaccum | xaccum2 | yaccum2 | v[2] | yaccum| + // zi + fxch %st(4) // yaccum | xaccum2 | yaccum2 | v[2] | xaccum| + // zi + fadds C(aliastransform)+28 // yaccum | xaccum2 | yaccum2 | v[2] | xaccum| + // zi + fxch %st(3) // v[2] | xaccum2 | yaccum2 | yaccum | xaccum| + // zi + fld %st(0) // v[2] | v[2] | xaccum2 | yaccum2 | yaccum | + // xaccum | zi + fmuls C(aliastransform)+8 // xaccum3 | v[2] | xaccum2 | yaccum2 |yaccum| + // xaccum | zi + fxch %st(1) // v[2] | xaccum3 | xaccum2 | yaccum2 |yaccum| + // xaccum | zi + fmuls C(aliastransform)+24 // yaccum3 | xaccum3 | xaccum2 | yaccum2 | + // yaccum | xaccum | zi + fxch %st(5) // xaccum | xaccum3 | xaccum2 | yaccum2 | + // yaccum | yaccum3 | zi + faddp %st(0),%st(2) // xaccum3 | xaccum | yaccum2 | yaccum | + // yaccum3 | zi + fxch %st(3) // yaccum | xaccum | yaccum2 | xaccum3 | + // yaccum3 | zi + faddp %st(0),%st(2) // xaccum | yaccum | xaccum3 | yaccum3 | zi + addl $(tv_size),%esi + faddp %st(0),%st(2) // yaccum | x | yaccum3 | zi + faddp %st(0),%st(2) // x | y | zi + addl $(stv_size),%ebp + fmul %st(2),%st(0) // x/z | y | zi + fxch %st(1) // y | x/z | zi + fmul %st(2),%st(0) // y/z | x/z | zi + fxch %st(1) // x/z | y/z | zi + fadds C(aliasxcenter) // u | y/z | zi + fxch %st(1) // y/z | u | zi + fadds C(aliasycenter) // v | u | zi + fxch %st(2) // zi | u | v +// FIXME: fast float->int conversion? + fistpl fv_v+20(%edi) // u | v + fistpl fv_v+0(%edi) // v + fistpl fv_v+4(%edi) + +// } + + addl $(fv_size),%edi + decl %ecx + jnz Lloop + + popl %esi // restore register variables + popl %edi + popl %ebp // restore the caller's stack frame + ret + +Lsavelight: + fstp %st(0) + jmp Lp1 + +#endif // id386 + diff --git a/source/r_drawa.S b/source/r_drawa.S new file mode 100644 index 0000000..60874e8 --- /dev/null +++ b/source/r_drawa.S @@ -0,0 +1,838 @@ +/* +Copyright (C) 1996-1997 Id Software, Inc. + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ +See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +*/ +// +// r_drawa.s +// x86 assembly-language edge clipping and emission code +// + +#include "asm_i386.h" +#include "quakeasm.h" +#include "asm_draw.h" +#include "d_ifacea.h" + +#if id386 + +// !!! if these are changed, they must be changed in r_draw.c too !!! +#define FULLY_CLIPPED_CACHED 0x80000000 +#define FRAMECOUNT_MASK 0x7FFFFFFF + + .data + +Ld0: .single 0.0 +Ld1: .single 0.0 +Lstack: .long 0 +Lfp_near_clip: .single NEAR_CLIP +Lceilv0: .long 0 +Lv: .long 0 +Lu0: .long 0 +Lv0: .long 0 +Lzi0: .long 0 + + .text + +//---------------------------------------------------------------------- +// edge clipping code +//---------------------------------------------------------------------- + +#define pv0 4+12 +#define pv1 8+12 +#define clip 12+12 + + .align 4 +.globl C(R_ClipEdge) +C(R_ClipEdge): + pushl %esi // preserve register variables + pushl %edi + pushl %ebx + movl %esp,Lstack // for clearing the stack later + +// float d0, d1, f; +// mvertex_t clipvert; + + movl clip(%esp),%ebx + movl pv0(%esp),%esi + movl pv1(%esp),%edx + +// if (clip) +// { + testl %ebx,%ebx + jz Lemit + +// do +// { + +Lcliploop: + +// d0 = DotProduct (pv0->position, clip->normal) - clip->dist; +// d1 = DotProduct (pv1->position, clip->normal) - clip->dist; + flds mv_position+0(%esi) + fmuls cp_normal+0(%ebx) + flds mv_position+4(%esi) + fmuls cp_normal+4(%ebx) + flds mv_position+8(%esi) + fmuls cp_normal+8(%ebx) + fxch %st(1) + faddp %st(0),%st(2) // d0mul2 | d0add0 + + flds mv_position+0(%edx) + fmuls cp_normal+0(%ebx) + flds mv_position+4(%edx) + fmuls cp_normal+4(%ebx) + flds mv_position+8(%edx) + fmuls cp_normal+8(%ebx) + fxch %st(1) + faddp %st(0),%st(2) // d1mul2 | d1add0 | d0mul2 | d0add0 + fxch %st(3) // d0add0 | d1add0 | d0mul2 | d1mul2 + + faddp %st(0),%st(2) // d1add0 | dot0 | d1mul2 + faddp %st(0),%st(2) // dot0 | dot1 + + fsubs cp_dist(%ebx) // d0 | dot1 + fxch %st(1) // dot1 | d0 + fsubs cp_dist(%ebx) // d1 | d0 + fxch %st(1) + fstps Ld0 + fstps Ld1 + +// if (d0 >= 0) +// { + movl Ld0,%eax + movl Ld1,%ecx + orl %eax,%ecx + js Lp2 + +// both points are unclipped + +Lcontinue: + +// +// R_ClipEdge (&clipvert, pv1, clip->next); +// return; +// } +// } while ((clip = clip->next) != NULL); + movl cp_next(%ebx),%ebx + testl %ebx,%ebx + jnz Lcliploop + +// } + +//// add the edge +// R_EmitEdge (pv0, pv1); +Lemit: + +// +// set integer rounding to ceil mode, set to single precision +// +// FIXME: do away with by manually extracting integers from floats? 
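
For reference, ceil_cw is a prebuilt x87 control word whose rounding-control field is forced to round-toward-+infinity, so the fistl/fistpl conversions that follow compute ceil() for free; single_cw, reloaded later in the sequence, goes back to round-to-nearest at single precision. In C terms the two values look like this sketch (assuming the standard x87 control-word layout; the engine builds them once at startup):

	static unsigned short	ceil_cw, single_cw;

	static void build_fp_control_words (void)
	{
		unsigned short	cw;

		__asm__ ("fnstcw %0" : "=m" (cw));	// current control word

		single_cw = cw & ~0x0300;		// PC=00: single precision
		ceil_cw = (single_cw & ~0x0C00) | 0x0800; // RC=10: round up

		// fldcw ceil_cw   -> every fist* now behaves as ceil()
		// fldcw single_cw -> back to round-to-nearest
	}

The sibling instruction fnstsw, used throughout the clamping code further down, stores the status word instead, putting the compare bits C0/C2/C3 into AH where testb can branch on them.
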
+// FIXME: set less often + fldcw ceil_cw + +// edge_t *edge, *pcheck; +// int u_check; +// float u, u_step; +// vec3_t local, transformed; +// float *world; +// int v, v2, ceilv0; +// float scale, lzi0, u0, v0; +// int side; + +// if (r_lastvertvalid) +// { + cmpl $0,C(r_lastvertvalid) + jz LCalcFirst + +// u0 = r_u1; +// v0 = r_v1; +// lzi0 = r_lzi1; +// ceilv0 = r_ceilv1; + movl C(r_lzi1),%eax + movl C(r_u1),%ecx + movl %eax,Lzi0 + movl %ecx,Lu0 + movl C(r_v1),%ecx + movl C(r_ceilv1),%eax + movl %ecx,Lv0 + movl %eax,Lceilv0 + jmp LCalcSecond + +// } + +LCalcFirst: + +// else +// { +// world = &pv0->position[0]; + + call LTransformAndProject // v0 | lzi0 | u0 + + fsts Lv0 + fxch %st(2) // u0 | lzi0 | v0 + fstps Lu0 // lzi0 | v0 + fstps Lzi0 // v0 + +// ceilv0 = (int)(v0 - 2000) + 2000; // ceil(v0); + fistpl Lceilv0 + +// } + +LCalcSecond: + +// world = &pv1->position[0]; + movl %edx,%esi + + call LTransformAndProject // v1 | lzi1 | u1 + + flds Lu0 // u0 | v1 | lzi1 | u1 + fxch %st(3) // u1 | v1 | lzi1 | u0 + flds Lzi0 // lzi0 | u1 | v1 | lzi1 | u0 + fxch %st(3) // lzi1 | u1 | v1 | lzi0 | u0 + flds Lv0 // v0 | lzi1 | u1 | v1 | lzi0 | u0 + fxch %st(3) // v1 | lzi1 | u1 | v0 | lzi0 | u0 + +// r_ceilv1 = (int)(r_v1 - 2000) + 2000; // ceil(r_v1); + fistl C(r_ceilv1) + + fldcw single_cw // put back normal floating-point state + + fsts C(r_v1) + fxch %st(4) // lzi0 | lzi1 | u1 | v0 | v1 | u0 + +// if (r_lzi1 > lzi0) +// lzi0 = r_lzi1; + fcom %st(1) + fnstsw %ax + testb $1,%ah + jz LP0 + fstp %st(0) + fld %st(0) +LP0: + + fxch %st(1) // lzi1 | lzi0 | u1 | v0 | v1 | u0 + fstps C(r_lzi1) // lzi0 | u1 | v0 | v1 | u0 + fxch %st(1) + fsts C(r_u1) + fxch %st(1) + +// if (lzi0 > r_nearzi) // for mipmap finding +// r_nearzi = lzi0; + fcoms C(r_nearzi) + fnstsw %ax + testb $0x45,%ah + jnz LP1 + fsts C(r_nearzi) +LP1: + +// // for right edges, all we want is the effect on 1/z +// if (r_nearzionly) +// return; + movl C(r_nearzionly),%eax + testl %eax,%eax + jz LP2 +LPop5AndDone: + movl C(cacheoffset),%eax + movl C(r_framecount),%edx + cmpl $0x7FFFFFFF,%eax + jz LDoPop + andl $(FRAMECOUNT_MASK),%edx + orl $(FULLY_CLIPPED_CACHED),%edx + movl %edx,C(cacheoffset) + +LDoPop: + fstp %st(0) // u1 | v0 | v1 | u0 + fstp %st(0) // v0 | v1 | u0 + fstp %st(0) // v1 | u0 + fstp %st(0) // u0 + fstp %st(0) + jmp Ldone + +LP2: + +// // create the edge +// if (ceilv0 == r_ceilv1) +// return; // horizontal edge + movl Lceilv0,%ebx + movl C(edge_p),%edi + movl C(r_ceilv1),%ecx + movl %edi,%edx + movl C(r_pedge),%esi + addl $(et_size),%edx + cmpl %ecx,%ebx + jz LPop5AndDone + + movl C(r_pedge),%eax + movl %eax,et_owner(%edi) + +// side = ceilv0 > r_ceilv1; +// +// edge->nearzi = lzi0; + fstps et_nearzi(%edi) // u1 | v0 | v1 | u0 + +// if (side == 1) +// { + jc LSide0 + +LSide1: + +// // leading edge (go from p2 to p1) + +// u_step = ((u0 - r_u1) / (v0 - r_v1)); + fsubrp %st(0),%st(3) // v0 | v1 | u0-u1 + fsub %st(1),%st(0) // v0-v1 | v1 | u0-u1 + fdivrp %st(0),%st(2) // v1 | ustep + +// r_emitted = 1; + movl $1,C(r_emitted) + +// edge = edge_p++; + movl %edx,C(edge_p) + +// pretouch next edge + movl (%edx),%eax + +// v2 = ceilv0 - 1; +// v = r_ceilv1; + movl %ecx,%eax + leal -1(%ebx),%ecx + movl %eax,%ebx + +// edge->surfs[0] = 0; +// edge->surfs[1] = surface_p - surfaces; + movl C(surface_p),%eax + movl C(surfaces),%esi + subl %edx,%edx + subl %esi,%eax + shrl $(SURF_T_SHIFT),%eax + movl %edx,et_surfs(%edi) + movl %eax,et_surfs+2(%edi) + + subl %esi,%esi + +// u = r_u1 + ((float)v - r_v1) * u_step; + movl %ebx,Lv + fildl Lv 
// v | v1 | ustep + fsubp %st(0),%st(1) // v-v1 | ustep + fmul %st(1),%st(0) // (v-v1)*ustep | ustep + fadds C(r_u1) // u | ustep + + jmp LSideDone + +// } + +LSide0: + +// else +// { +// // trailing edge (go from p1 to p2) + +// u_step = ((r_u1 - u0) / (r_v1 - v0)); + fsub %st(3),%st(0) // u1-u0 | v0 | v1 | u0 + fxch %st(2) // v1 | v0 | u1-u0 | u0 + fsub %st(1),%st(0) // v1-v0 | v0 | u1-u0 | u0 + fdivrp %st(0),%st(2) // v0 | ustep | u0 + +// r_emitted = 1; + movl $1,C(r_emitted) + +// edge = edge_p++; + movl %edx,C(edge_p) + +// pretouch next edge + movl (%edx),%eax + +// v = ceilv0; +// v2 = r_ceilv1 - 1; + decl %ecx + +// edge->surfs[0] = surface_p - surfaces; +// edge->surfs[1] = 0; + movl C(surface_p),%eax + movl C(surfaces),%esi + subl %edx,%edx + subl %esi,%eax + shrl $(SURF_T_SHIFT),%eax + movl %edx,et_surfs+2(%edi) + movl %eax,et_surfs(%edi) + + movl $1,%esi + +// u = u0 + ((float)v - v0) * u_step; + movl %ebx,Lv + fildl Lv // v | v0 | ustep | u0 + fsubp %st(0),%st(1) // v-v0 | ustep | u0 + fmul %st(1),%st(0) // (v-v0)*ustep | ustep | u0 + faddp %st(0),%st(2) // ustep | u + fxch %st(1) // u | ustep + +// } + +LSideDone: + +// edge->u_step = u_step*0x100000; +// edge->u = u*0x100000 + 0xFFFFF; + + fmuls fp_1m // u*0x100000 | ustep + fxch %st(1) // ustep | u*0x100000 + fmuls fp_1m // ustep*0x100000 | u*0x100000 + fxch %st(1) // u*0x100000 | ustep*0x100000 + fadds fp_1m_minus_1 // u*0x100000 + 0xFFFFF | ustep*0x100000 + fxch %st(1) // ustep*0x100000 | u*0x100000 + 0xFFFFF + fistpl et_u_step(%edi) // u*0x100000 + 0xFFFFF + fistpl et_u(%edi) + +// // we need to do this to avoid stepping off the edges if a very nearly +// // horizontal edge is less than epsilon above a scan, and numeric error +// // causes it to incorrectly extend to the scan, and the extension of the +// // line goes off the edge of the screen +// // FIXME: is this actually needed? 
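
A gloss on the fixed-point format here: edge->u and u_step are 12.20 (scaled by 0x100000), and 0xFFFFF is the largest pure fraction in that format. Baking it into edge->u once means the plain shift right by 20 done per scanline over in r_edgea.S lands on ceil(u), the leading-edge fill convention, with no branch. Folded into a single helper, the identity is (sketch, valid for the non-negative screen range):

	// u in 12.20 fixed point
	static inline int u_ceil_12_20 (int u)
	{
		return (u + 0xFFFFF) >> 20;	// truncation + bias == ceiling
	}
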
+// if (edge->u < r_refdef.vrect_x_adj_shift20) +// edge->u = r_refdef.vrect_x_adj_shift20; +// if (edge->u > r_refdef.vrectright_adj_shift20) +// edge->u = r_refdef.vrectright_adj_shift20; + movl et_u(%edi),%eax + movl C(r_refdef)+rd_vrect_x_adj_shift20,%edx + cmpl %edx,%eax + jl LP4 + movl C(r_refdef)+rd_vrectright_adj_shift20,%edx + cmpl %edx,%eax + jng LP5 +LP4: + movl %edx,et_u(%edi) + movl %edx,%eax +LP5: + +// // sort the edge in normally +// u_check = edge->u; +// +// if (edge->surfs[0]) +// u_check++; // sort trailers after leaders + addl %esi,%eax + +// if (!newedges[v] || newedges[v]->u >= u_check) +// { + movl C(newedges)(,%ebx,4),%esi + testl %esi,%esi + jz LDoFirst + cmpl %eax,et_u(%esi) + jl LNotFirst +LDoFirst: + +// edge->next = newedges[v]; +// newedges[v] = edge; + movl %esi,et_next(%edi) + movl %edi,C(newedges)(,%ebx,4) + + jmp LSetRemove + +// } + +LNotFirst: + +// else +// { +// pcheck = newedges[v]; +// +// while (pcheck->next && pcheck->next->u < u_check) +// pcheck = pcheck->next; +LFindInsertLoop: + movl %esi,%edx + movl et_next(%esi),%esi + testl %esi,%esi + jz LInsertFound + cmpl %eax,et_u(%esi) + jl LFindInsertLoop + +LInsertFound: + +// edge->next = pcheck->next; +// pcheck->next = edge; + movl %esi,et_next(%edi) + movl %edi,et_next(%edx) + +// } + +LSetRemove: + +// edge->nextremove = removeedges[v2]; +// removeedges[v2] = edge; + movl C(removeedges)(,%ecx,4),%eax + movl %edi,C(removeedges)(,%ecx,4) + movl %eax,et_nextremove(%edi) + +Ldone: + movl Lstack,%esp // clear temporary variables from stack + + popl %ebx // restore register variables + popl %edi + popl %esi + ret + +// at least one point is clipped + +Lp2: + testl %eax,%eax + jns Lp1 + +// else +// { +// // point 0 is clipped + +// if (d1 < 0) +// { + movl Ld1,%eax + testl %eax,%eax + jns Lp3 + +// // both points are clipped +// // we do cache fully clipped edges +// if (!leftclipped) + movl C(r_leftclipped),%eax + movl C(r_pedge),%ecx + testl %eax,%eax + jnz Ldone + +// r_pedge->framecount = r_framecount; + movl C(r_framecount),%eax + andl $(FRAMECOUNT_MASK),%eax + orl $(FULLY_CLIPPED_CACHED),%eax + movl %eax,C(cacheoffset) + +// return; + jmp Ldone + +// } + +Lp1: + +// // point 0 is unclipped +// if (d1 >= 0) +// { +// // both points are unclipped +// continue; + +// // only point 1 is clipped + +// f = d0 / (d0 - d1); + flds Ld0 + flds Ld1 + fsubr %st(1),%st(0) + +// // we don't cache partially clipped edges + movl $0x7FFFFFFF,C(cacheoffset) + + fdivrp %st(0),%st(1) + + subl $(mv_size),%esp // allocate space for clipvert + +// clipvert.position[0] = pv0->position[0] + +// f * (pv1->position[0] - pv0->position[0]); +// clipvert.position[1] = pv0->position[1] + +// f * (pv1->position[1] - pv0->position[1]); +// clipvert.position[2] = pv0->position[2] + +// f * (pv1->position[2] - pv0->position[2]); + flds mv_position+8(%edx) + fsubs mv_position+8(%esi) + flds mv_position+4(%edx) + fsubs mv_position+4(%esi) + flds mv_position+0(%edx) + fsubs mv_position+0(%esi) // 0 | 1 | 2 + +// replace pv1 with the clip point + movl %esp,%edx + movl cp_leftedge(%ebx),%eax + testb %al,%al + + fmul %st(3),%st(0) + fxch %st(1) // 1 | 0 | 2 + fmul %st(3),%st(0) + fxch %st(2) // 2 | 0 | 1 + fmulp %st(0),%st(3) // 0 | 1 | 2 + fadds mv_position+0(%esi) + fxch %st(1) // 1 | 0 | 2 + fadds mv_position+4(%esi) + fxch %st(2) // 2 | 0 | 1 + fadds mv_position+8(%esi) + fxch %st(1) // 0 | 2 | 1 + fstps mv_position+0(%esp) // 2 | 1 + fstps mv_position+8(%esp) // 1 + fstps mv_position+4(%esp) + +// if (clip->leftedge) +// { + jz 
Ltestright + +// r_leftclipped = true; +// r_leftexit = clipvert; + movl $1,C(r_leftclipped) + movl mv_position+0(%esp),%eax + movl %eax,C(r_leftexit)+mv_position+0 + movl mv_position+4(%esp),%eax + movl %eax,C(r_leftexit)+mv_position+4 + movl mv_position+8(%esp),%eax + movl %eax,C(r_leftexit)+mv_position+8 + + jmp Lcontinue + +// } + +Ltestright: +// else if (clip->rightedge) +// { + testb %ah,%ah + jz Lcontinue + +// r_rightclipped = true; +// r_rightexit = clipvert; + movl $1,C(r_rightclipped) + movl mv_position+0(%esp),%eax + movl %eax,C(r_rightexit)+mv_position+0 + movl mv_position+4(%esp),%eax + movl %eax,C(r_rightexit)+mv_position+4 + movl mv_position+8(%esp),%eax + movl %eax,C(r_rightexit)+mv_position+8 + +// } +// +// R_ClipEdge (pv0, &clipvert, clip->next); +// return; +// } + jmp Lcontinue + +// } + +Lp3: + +// // only point 0 is clipped +// r_lastvertvalid = false; + + movl $0,C(r_lastvertvalid) + +// f = d0 / (d0 - d1); + flds Ld0 + flds Ld1 + fsubr %st(1),%st(0) + +// // we don't cache partially clipped edges + movl $0x7FFFFFFF,C(cacheoffset) + + fdivrp %st(0),%st(1) + + subl $(mv_size),%esp // allocate space for clipvert + +// clipvert.position[0] = pv0->position[0] + +// f * (pv1->position[0] - pv0->position[0]); +// clipvert.position[1] = pv0->position[1] + +// f * (pv1->position[1] - pv0->position[1]); +// clipvert.position[2] = pv0->position[2] + +// f * (pv1->position[2] - pv0->position[2]); + flds mv_position+8(%edx) + fsubs mv_position+8(%esi) + flds mv_position+4(%edx) + fsubs mv_position+4(%esi) + flds mv_position+0(%edx) + fsubs mv_position+0(%esi) // 0 | 1 | 2 + + movl cp_leftedge(%ebx),%eax + testb %al,%al + + fmul %st(3),%st(0) + fxch %st(1) // 1 | 0 | 2 + fmul %st(3),%st(0) + fxch %st(2) // 2 | 0 | 1 + fmulp %st(0),%st(3) // 0 | 1 | 2 + fadds mv_position+0(%esi) + fxch %st(1) // 1 | 0 | 2 + fadds mv_position+4(%esi) + fxch %st(2) // 2 | 0 | 1 + fadds mv_position+8(%esi) + fxch %st(1) // 0 | 2 | 1 + fstps mv_position+0(%esp) // 2 | 1 + fstps mv_position+8(%esp) // 1 + fstps mv_position+4(%esp) + +// replace pv0 with the clip point + movl %esp,%esi + +// if (clip->leftedge) +// { + jz Ltestright2 + +// r_leftclipped = true; +// r_leftenter = clipvert; + movl $1,C(r_leftclipped) + movl mv_position+0(%esp),%eax + movl %eax,C(r_leftenter)+mv_position+0 + movl mv_position+4(%esp),%eax + movl %eax,C(r_leftenter)+mv_position+4 + movl mv_position+8(%esp),%eax + movl %eax,C(r_leftenter)+mv_position+8 + + jmp Lcontinue + +// } + +Ltestright2: +// else if (clip->rightedge) +// { + testb %ah,%ah + jz Lcontinue + +// r_rightclipped = true; +// r_rightenter = clipvert; + movl $1,C(r_rightclipped) + movl mv_position+0(%esp),%eax + movl %eax,C(r_rightenter)+mv_position+0 + movl mv_position+4(%esp),%eax + movl %eax,C(r_rightenter)+mv_position+4 + movl mv_position+8(%esp),%eax + movl %eax,C(r_rightenter)+mv_position+8 + +// } + jmp Lcontinue + +// %esi = vec3_t point to transform and project +// %edx preserved +LTransformAndProject: + +// // transform and project +// VectorSubtract (world, modelorg, local); + flds mv_position+0(%esi) + fsubs C(modelorg)+0 + flds mv_position+4(%esi) + fsubs C(modelorg)+4 + flds mv_position+8(%esi) + fsubs C(modelorg)+8 + fxch %st(2) // local[0] | local[1] | local[2] + +// TransformVector (local, transformed); +// +// if (transformed[2] < NEAR_CLIP) +// transformed[2] = NEAR_CLIP; +// +// lzi0 = 1.0 / transformed[2]; + fld %st(0) // local[0] | local[0] | local[1] | local[2] + fmuls C(vpn)+0 // zm0 | local[0] | local[1] | local[2] + fld %st(1) // 
local[0] | zm0 | local[0] | local[1] | + // local[2] + fmuls C(vright)+0 // xm0 | zm0 | local[0] | local[1] | local[2] + fxch %st(2) // local[0] | zm0 | xm0 | local[1] | local[2] + fmuls C(vup)+0 // ym0 | zm0 | xm0 | local[1] | local[2] + fld %st(3) // local[1] | ym0 | zm0 | xm0 | local[1] | + // local[2] + fmuls C(vpn)+4 // zm1 | ym0 | zm0 | xm0 | local[1] | + // local[2] + fld %st(4) // local[1] | zm1 | ym0 | zm0 | xm0 | + // local[1] | local[2] + fmuls C(vright)+4 // xm1 | zm1 | ym0 | zm0 | xm0 | + // local[1] | local[2] + fxch %st(5) // local[1] | zm1 | ym0 | zm0 | xm0 | + // xm1 | local[2] + fmuls C(vup)+4 // ym1 | zm1 | ym0 | zm0 | xm0 | + // xm1 | local[2] + fxch %st(1) // zm1 | ym1 | ym0 | zm0 | xm0 | + // xm1 | local[2] + faddp %st(0),%st(3) // ym1 | ym0 | zm2 | xm0 | xm1 | local[2] + fxch %st(3) // xm0 | ym0 | zm2 | ym1 | xm1 | local[2] + faddp %st(0),%st(4) // ym0 | zm2 | ym1 | xm2 | local[2] + faddp %st(0),%st(2) // zm2 | ym2 | xm2 | local[2] + fld %st(3) // local[2] | zm2 | ym2 | xm2 | local[2] + fmuls C(vpn)+8 // zm3 | zm2 | ym2 | xm2 | local[2] + fld %st(4) // local[2] | zm3 | zm2 | ym2 | xm2 | local[2] + fmuls C(vright)+8 // xm3 | zm3 | zm2 | ym2 | xm2 | local[2] + fxch %st(5) // local[2] | zm3 | zm2 | ym2 | xm2 | xm3 + fmuls C(vup)+8 // ym3 | zm3 | zm2 | ym2 | xm2 | xm3 + fxch %st(1) // zm3 | ym3 | zm2 | ym2 | xm2 | xm3 + faddp %st(0),%st(2) // ym3 | zm4 | ym2 | xm2 | xm3 + fxch %st(4) // xm3 | zm4 | ym2 | xm2 | ym3 + faddp %st(0),%st(3) // zm4 | ym2 | xm4 | ym3 + fxch %st(1) // ym2 | zm4 | xm4 | ym3 + faddp %st(0),%st(3) // zm4 | xm4 | ym4 + + fcoms Lfp_near_clip + fnstsw %ax + testb $1,%ah + jz LNoClip + fstp %st(0) + flds Lfp_near_clip + +LNoClip: + + fdivrs float_1 // lzi0 | x | y + fxch %st(1) // x | lzi0 | y + +// // FIXME: build x/yscale into transform? +// scale = xscale * lzi0; +// u0 = (xcenter + scale*transformed[0]); + flds C(xscale) // xscale | x | lzi0 | y + fmul %st(2),%st(0) // scale | x | lzi0 | y + fmulp %st(0),%st(1) // scale*x | lzi0 | y + fadds C(xcenter) // u0 | lzi0 | y + +// if (u0 < r_refdef.fvrectx_adj) +// u0 = r_refdef.fvrectx_adj; +// if (u0 > r_refdef.fvrectright_adj) +// u0 = r_refdef.fvrectright_adj; +// FIXME: use integer compares of floats? + fcoms C(r_refdef)+rd_fvrectx_adj + fnstsw %ax + testb $1,%ah + jz LClampP0 + fstp %st(0) + flds C(r_refdef)+rd_fvrectx_adj +LClampP0: + fcoms C(r_refdef)+rd_fvrectright_adj + fnstsw %ax + testb $0x45,%ah + jnz LClampP1 + fstp %st(0) + flds C(r_refdef)+rd_fvrectright_adj +LClampP1: + + fld %st(1) // lzi0 | u0 | lzi0 | y + +// scale = yscale * lzi0; +// v0 = (ycenter - scale*transformed[1]); + fmuls C(yscale) // scale | u0 | lzi0 | y + fmulp %st(0),%st(3) // u0 | lzi0 | scale*y + fxch %st(2) // scale*y | lzi0 | u0 + fsubrs C(ycenter) // v0 | lzi0 | u0 + +// if (v0 < r_refdef.fvrecty_adj) +// v0 = r_refdef.fvrecty_adj; +// if (v0 > r_refdef.fvrectbottom_adj) +// v0 = r_refdef.fvrectbottom_adj; +// FIXME: use integer compares of floats? + fcoms C(r_refdef)+rd_fvrecty_adj + fnstsw %ax + testb $1,%ah + jz LClampP2 + fstp %st(0) + flds C(r_refdef)+rd_fvrecty_adj +LClampP2: + fcoms C(r_refdef)+rd_fvrectbottom_adj + fnstsw %ax + testb $0x45,%ah + jnz LClampP3 + fstp %st(0) + flds C(r_refdef)+rd_fvrectbottom_adj +LClampP3: + ret + +#endif // id386 + diff --git a/source/r_edgea.S b/source/r_edgea.S new file mode 100644 index 0000000..8507a5b --- /dev/null +++ b/source/r_edgea.S @@ -0,0 +1,750 @@ +/* +Copyright (C) 1996-1997 Id Software, Inc. 
+ +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + +See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +*/ +// +// r_edgea.s +// x86 assembly-language edge-processing code. +// + +#include "asm_i386.h" +#include "quakeasm.h" +#include "asm_draw.h" + +#if id386 + + .data +Ltemp: .long 0 +float_1_div_0100000h: .long 0x35800000 // 1.0/(float)0x100000 +float_point_999: .single 0.999 +float_1_point_001: .single 1.001 + + .text + +//-------------------------------------------------------------------- + +#define edgestoadd 4+8 // note odd stack offsets because of interleaving +#define edgelist 8+12 // with pushes + +.globl C(R_EdgeCodeStart) +C(R_EdgeCodeStart): + +.globl C(R_InsertNewEdges) +C(R_InsertNewEdges): + pushl %edi + pushl %esi // preserve register variables + movl edgestoadd(%esp),%edx + pushl %ebx + movl edgelist(%esp),%ecx + +LDoNextEdge: + movl et_u(%edx),%eax + movl %edx,%edi + +LContinueSearch: + movl et_u(%ecx),%ebx + movl et_next(%ecx),%esi + cmpl %ebx,%eax + jle LAddedge + movl et_u(%esi),%ebx + movl et_next(%esi),%ecx + cmpl %ebx,%eax + jle LAddedge2 + movl et_u(%ecx),%ebx + movl et_next(%ecx),%esi + cmpl %ebx,%eax + jle LAddedge + movl et_u(%esi),%ebx + movl et_next(%esi),%ecx + cmpl %ebx,%eax + jg LContinueSearch + +LAddedge2: + movl et_next(%edx),%edx + movl et_prev(%esi),%ebx + movl %esi,et_next(%edi) + movl %ebx,et_prev(%edi) + movl %edi,et_next(%ebx) + movl %edi,et_prev(%esi) + movl %esi,%ecx + + cmpl $0,%edx + jnz LDoNextEdge + jmp LDone + + .align 4 +LAddedge: + movl et_next(%edx),%edx + movl et_prev(%ecx),%ebx + movl %ecx,et_next(%edi) + movl %ebx,et_prev(%edi) + movl %edi,et_next(%ebx) + movl %edi,et_prev(%ecx) + + cmpl $0,%edx + jnz LDoNextEdge + +LDone: + popl %ebx // restore register variables + popl %esi + popl %edi + + ret + +//-------------------------------------------------------------------- + +#define predge 4+4 + +.globl C(R_RemoveEdges) +C(R_RemoveEdges): + pushl %ebx + movl predge(%esp),%eax + +Lre_loop: + movl et_next(%eax),%ecx + movl et_nextremove(%eax),%ebx + movl et_prev(%eax),%edx + testl %ebx,%ebx + movl %edx,et_prev(%ecx) + jz Lre_done + movl %ecx,et_next(%edx) + + movl et_next(%ebx),%ecx + movl et_prev(%ebx),%edx + movl et_nextremove(%ebx),%eax + movl %edx,et_prev(%ecx) + testl %eax,%eax + movl %ecx,et_next(%edx) + jnz Lre_loop + + popl %ebx + ret + +Lre_done: + movl %ecx,et_next(%edx) + popl %ebx + + ret + +//-------------------------------------------------------------------- + +#define pedgelist 4+4 // note odd stack offset because of interleaving + // with pushes + +.globl C(R_StepActiveU) +C(R_StepActiveU): + pushl %edi + movl pedgelist(%esp),%edx + pushl %esi // preserve register variables + pushl %ebx + + movl et_prev(%edx),%esi + +LNewEdge: + movl et_u(%esi),%edi + +LNextEdge: + movl et_u(%edx),%eax + movl et_u_step(%edx),%ebx + addl %ebx,%eax + movl et_next(%edx),%esi + movl %eax,et_u(%edx) + cmpl %edi,%eax + jl LPushBack + + movl et_u(%esi),%edi + movl 
et_u_step(%esi),%ebx + addl %ebx,%edi + movl et_next(%esi),%edx + movl %edi,et_u(%esi) + cmpl %eax,%edi + jl LPushBack2 + + movl et_u(%edx),%eax + movl et_u_step(%edx),%ebx + addl %ebx,%eax + movl et_next(%edx),%esi + movl %eax,et_u(%edx) + cmpl %edi,%eax + jl LPushBack + + movl et_u(%esi),%edi + movl et_u_step(%esi),%ebx + addl %ebx,%edi + movl et_next(%esi),%edx + movl %edi,et_u(%esi) + cmpl %eax,%edi + jnl LNextEdge + +LPushBack2: + movl %edx,%ebx + movl %edi,%eax + movl %esi,%edx + movl %ebx,%esi + +LPushBack: +// push it back to keep it sorted + movl et_prev(%edx),%ecx + movl et_next(%edx),%ebx + +// done if the -1 in edge_aftertail triggered this + cmpl $(C(edge_aftertail)),%edx + jz LUDone + +// pull the edge out of the edge list + movl et_prev(%ecx),%edi + movl %ecx,et_prev(%esi) + movl %ebx,et_next(%ecx) + +// find out where the edge goes in the edge list +LPushBackLoop: + movl et_prev(%edi),%ecx + movl et_u(%edi),%ebx + cmpl %ebx,%eax + jnl LPushBackFound + + movl et_prev(%ecx),%edi + movl et_u(%ecx),%ebx + cmpl %ebx,%eax + jl LPushBackLoop + + movl %ecx,%edi + +// put the edge back into the edge list +LPushBackFound: + movl et_next(%edi),%ebx + movl %edi,et_prev(%edx) + movl %ebx,et_next(%edx) + movl %edx,et_next(%edi) + movl %edx,et_prev(%ebx) + + movl %esi,%edx + movl et_prev(%esi),%esi + + cmpl $(C(edge_tail)),%edx + jnz LNewEdge + +LUDone: + popl %ebx // restore register variables + popl %esi + popl %edi + + ret + +//-------------------------------------------------------------------- + +#define surf 4 // note this is loaded before any pushes + + .align 4 +TrailingEdge: + movl st_spanstate(%esi),%eax // check for edge inversion + decl %eax + jnz LInverted + + movl %eax,st_spanstate(%esi) + movl st_insubmodel(%esi),%ecx + movl 0x12345678,%edx // surfaces[1].st_next +LPatch0: + movl C(r_bmodelactive),%eax + subl %ecx,%eax + cmpl %esi,%edx + movl %eax,C(r_bmodelactive) + jnz LNoEmit // surface isn't on top, just remove + +// emit a span (current top going away) + movl et_u(%ebx),%eax + shrl $20,%eax // iu = integral pixel u + movl st_last_u(%esi),%edx + movl st_next(%esi),%ecx + cmpl %edx,%eax + jle LNoEmit2 // iu <= surf->last_u, so nothing to emit + + movl %eax,st_last_u(%ecx) // surf->next->last_u = iu; + subl %edx,%eax + movl %edx,espan_t_u(%ebp) // span->u = surf->last_u; + + movl %eax,espan_t_count(%ebp) // span->count = iu - span->u; + movl C(current_iv),%eax + movl %eax,espan_t_v(%ebp) // span->v = current_iv; + movl st_spans(%esi),%eax + movl %eax,espan_t_pnext(%ebp) // span->pnext = surf->spans; + movl %ebp,st_spans(%esi) // surf->spans = span; + addl $(espan_t_size),%ebp + + movl st_next(%esi),%edx // remove the surface from the surface + movl st_prev(%esi),%esi // stack + + movl %edx,st_next(%esi) + movl %esi,st_prev(%edx) + ret + +LNoEmit2: + movl %eax,st_last_u(%ecx) // surf->next->last_u = iu; + movl st_next(%esi),%edx // remove the surface from the surface + movl st_prev(%esi),%esi // stack + + movl %edx,st_next(%esi) + movl %esi,st_prev(%edx) + ret + +LNoEmit: + movl st_next(%esi),%edx // remove the surface from the surface + movl st_prev(%esi),%esi // stack + + movl %edx,st_next(%esi) + movl %esi,st_prev(%edx) + ret + +LInverted: + movl %eax,st_spanstate(%esi) + ret + +//-------------------------------------------------------------------- + +// trailing edge only +Lgs_trailing: + pushl $Lgs_nextedge + jmp TrailingEdge + + +.globl C(R_GenerateSpans) +C(R_GenerateSpans): + pushl %ebp // preserve caller's stack frame + pushl %edi + pushl %esi // preserve register 
variables + pushl %ebx + +// clear active surfaces to just the background surface + movl C(surfaces),%eax + movl C(edge_head_u_shift20),%edx + addl $(st_size),%eax +// %ebp = span_p throughout + movl C(span_p),%ebp + + movl $0,C(r_bmodelactive) + + movl %eax,st_next(%eax) + movl %eax,st_prev(%eax) + movl %edx,st_last_u(%eax) + movl C(edge_head)+et_next,%ebx // edge=edge_head.next + +// generate spans + cmpl $(C(edge_tail)),%ebx // done if empty list + jz Lgs_lastspan + +Lgs_edgeloop: + + movl et_surfs(%ebx),%edi + movl C(surfaces),%eax + movl %edi,%esi + andl $0xFFFF0000,%edi + andl $0xFFFF,%esi + jz Lgs_leading // not a trailing edge + +// it has a left surface, so a surface is going away for this span + shll $(SURF_T_SHIFT),%esi + addl %eax,%esi + testl %edi,%edi + jz Lgs_trailing + +// both leading and trailing + call TrailingEdge + movl C(surfaces),%eax + +// --------------------------------------------------------------- +// handle a leading edge +// --------------------------------------------------------------- + +Lgs_leading: + shrl $16-SURF_T_SHIFT,%edi + movl C(surfaces),%eax + addl %eax,%edi + movl 0x12345678,%esi // surf2 = surfaces[1].next; +LPatch2: + movl st_spanstate(%edi),%edx + movl st_insubmodel(%edi),%eax + testl %eax,%eax + jnz Lbmodel_leading + +// handle a leading non-bmodel edge + +// don't start a span if this is an inverted span, with the end edge preceding +// the start edge (that is, we've already seen the end edge) + testl %edx,%edx + jnz Lxl_done + + +// if (surf->key < surf2->key) +// goto newtop; + incl %edx + movl st_key(%edi),%eax + movl %edx,st_spanstate(%edi) + movl st_key(%esi),%ecx + cmpl %ecx,%eax + jl Lnewtop + +// main sorting loop to search through surface stack until insertion point +// found. Always terminates because background surface is sentinel +// do +// { +// surf2 = surf2->next; +// } while (surf->key >= surf2->key); +Lsortloopnb: + movl st_next(%esi),%esi + movl st_key(%esi),%ecx + cmpl %ecx,%eax + jge Lsortloopnb + + jmp LInsertAndExit + + +// handle a leading bmodel edge + .align 4 +Lbmodel_leading: + +// don't start a span if this is an inverted span, with the end edge preceding +// the start edge (that is, we've already seen the end edge) + testl %edx,%edx + jnz Lxl_done + + movl C(r_bmodelactive),%ecx + incl %edx + incl %ecx + movl %edx,st_spanstate(%edi) + movl %ecx,C(r_bmodelactive) + +// if (surf->key < surf2->key) +// goto newtop; + movl st_key(%edi),%eax + movl st_key(%esi),%ecx + cmpl %ecx,%eax + jl Lnewtop + +// if ((surf->key == surf2->key) && surf->insubmodel) +// { + jz Lzcheck_for_newtop + +// main sorting loop to search through surface stack until insertion point +// found. 
Always terminates because background surface is sentinel +// do +// { +// surf2 = surf2->next; +// } while (surf->key > surf2->key); +Lsortloop: + movl st_next(%esi),%esi + movl st_key(%esi),%ecx + cmpl %ecx,%eax + jg Lsortloop + + jne LInsertAndExit + +// Do 1/z sorting to see if we've arrived in the right position + movl et_u(%ebx),%eax + subl $0xFFFFF,%eax + movl %eax,Ltemp + fildl Ltemp + + fmuls float_1_div_0100000h // fu = (float)(edge->u - 0xFFFFF) * + // (1.0 / 0x100000); + + fld %st(0) // fu | fu + fmuls st_d_zistepu(%edi) // fu*surf->d_zistepu | fu + flds C(fv) // fv | fu*surf->d_zistepu | fu + fmuls st_d_zistepv(%edi) // fv*surf->d_zistepv | fu*surf->d_zistepu | fu + fxch %st(1) // fu*surf->d_zistepu | fv*surf->d_zistepv | fu + fadds st_d_ziorigin(%edi) // fu*surf->d_zistepu + surf->d_ziorigin | + // fv*surf->d_zistepv | fu + + flds st_d_zistepu(%esi) // surf2->d_zistepu | + // fu*surf->d_zistepu + surf->d_ziorigin | + // fv*surf->d_zistepv | fu + fmul %st(3),%st(0) // fu*surf2->d_zistepu | + // fu*surf->d_zistepu + surf->d_ziorigin | + // fv*surf->d_zistepv | fu + fxch %st(1) // fu*surf->d_zistepu + surf->d_ziorigin | + // fu*surf2->d_zistepu | + // fv*surf->d_zistepv | fu + faddp %st(0),%st(2) // fu*surf2->d_zistepu | newzi | fu + + flds C(fv) // fv | fu*surf2->d_zistepu | newzi | fu + fmuls st_d_zistepv(%esi) // fv*surf2->d_zistepv | + // fu*surf2->d_zistepu | newzi | fu + fld %st(2) // newzi | fv*surf2->d_zistepv | + // fu*surf2->d_zistepu | newzi | fu + fmuls float_point_999 // newzibottom | fv*surf2->d_zistepv | + // fu*surf2->d_zistepu | newzi | fu + + fxch %st(2) // fu*surf2->d_zistepu | fv*surf2->d_zistepv | + // newzibottom | newzi | fu + fadds st_d_ziorigin(%esi) // fu*surf2->d_zistepu + surf2->d_ziorigin | + // fv*surf2->d_zistepv | newzibottom | newzi | + // fu + faddp %st(0),%st(1) // testzi | newzibottom | newzi | fu + fxch %st(1) // newzibottom | testzi | newzi | fu + +// if (newzibottom >= testzi) +// goto Lgotposition; + + fcomp %st(1) // testzi | newzi | fu + + fxch %st(1) // newzi | testzi | fu + fmuls float_1_point_001 // newzitop | testzi | fu + fxch %st(1) // testzi | newzitop | fu + + fnstsw %ax + testb $0x01,%ah + jz Lgotposition_fpop3 + +// if (newzitop >= testzi) +// { + + fcomp %st(1) // newzitop | fu + fnstsw %ax + testb $0x45,%ah + jz Lsortloop_fpop2 + +// if (surf->d_zistepu >= surf2->d_zistepu) +// goto newtop; + + flds st_d_zistepu(%edi) // surf->d_zistepu | newzitop| fu + fcomps st_d_zistepu(%esi) // newzitop | fu + fnstsw %ax + testb $0x01,%ah + jz Lgotposition_fpop2 + + fstp %st(0) // clear the FPstack + fstp %st(0) + movl st_key(%edi),%eax + jmp Lsortloop + + +Lgotposition_fpop3: + fstp %st(0) +Lgotposition_fpop2: + fstp %st(0) + fstp %st(0) + jmp LInsertAndExit + + +// emit a span (obscures current top) + +Lnewtop_fpop3: + fstp %st(0) +Lnewtop_fpop2: + fstp %st(0) + fstp %st(0) + movl st_key(%edi),%eax // reload the sorting key + +Lnewtop: + movl et_u(%ebx),%eax + movl st_last_u(%esi),%edx + shrl $20,%eax // iu = integral pixel u + movl %eax,st_last_u(%edi) // surf->last_u = iu; + cmpl %edx,%eax + jle LInsertAndExit // iu <= surf->last_u, so nothing to emit + + subl %edx,%eax + movl %edx,espan_t_u(%ebp) // span->u = surf->last_u; + + movl %eax,espan_t_count(%ebp) // span->count = iu - span->u; + movl C(current_iv),%eax + movl %eax,espan_t_v(%ebp) // span->v = current_iv; + movl st_spans(%esi),%eax + movl %eax,espan_t_pnext(%ebp) // span->pnext = surf->spans; + movl %ebp,st_spans(%esi) // surf->spans = span; + addl $(espan_t_size),%ebp + 
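
The store sequence just above is the span emitter that everything in this file exists to feed; the same pattern appears in TrailingEdge. When the top of the surface stack changes at pixel iu, the run belonging to the surface being covered is closed out as an espan_t. Approximately, in C (cf. the scalar version in r_edge.c; span_p and current_iv are the globals the asm references):

	static void emit_span (surf_t *surf, int iu)
	{
		espan_t	*span;

		if (iu > surf->last_u)		// pixels exposed since last_u?
		{
			span = span_p++;
			span->u = surf->last_u;
			span->count = iu - span->u;
			span->v = current_iv;
			span->pnext = surf->spans;	// push onto surface's list
			surf->spans = span;
		}
	}
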
+LInsertAndExit: +// insert before surf2 + movl %esi,st_next(%edi) // surf->next = surf2; + movl st_prev(%esi),%eax + movl %eax,st_prev(%edi) // surf->prev = surf2->prev; + movl %edi,st_prev(%esi) // surf2->prev = surf; + movl %edi,st_next(%eax) // surf2->prev->next = surf; + +// --------------------------------------------------------------- +// leading edge done +// --------------------------------------------------------------- + +// --------------------------------------------------------------- +// see if there are any more edges +// --------------------------------------------------------------- + +Lgs_nextedge: + movl et_next(%ebx),%ebx + cmpl $(C(edge_tail)),%ebx + jnz Lgs_edgeloop + +// clean up at the right edge +Lgs_lastspan: + +// now that we've reached the right edge of the screen, we're done with any +// unfinished surfaces, so emit a span for whatever's on top + movl 0x12345678,%esi // surfaces[1].st_next +LPatch3: + movl C(edge_tail_u_shift20),%eax + xorl %ecx,%ecx + movl st_last_u(%esi),%edx + subl %edx,%eax + jle Lgs_resetspanstate + + movl %edx,espan_t_u(%ebp) + movl %eax,espan_t_count(%ebp) + movl C(current_iv),%eax + movl %eax,espan_t_v(%ebp) + movl st_spans(%esi),%eax + movl %eax,espan_t_pnext(%ebp) + movl %ebp,st_spans(%esi) + addl $(espan_t_size),%ebp + +// reset spanstate for all surfaces in the surface stack +Lgs_resetspanstate: + movl %ecx,st_spanstate(%esi) + movl st_next(%esi),%esi + cmpl $0x12345678,%esi // &surfaces[1] +LPatch4: + jnz Lgs_resetspanstate + +// store the final span_p + movl %ebp,C(span_p) + + popl %ebx // restore register variables + popl %esi + popl %edi + popl %ebp // restore the caller's stack frame + ret + + +// --------------------------------------------------------------- +// 1/z sorting for bmodels in the same leaf +// --------------------------------------------------------------- + .align 4 +Lxl_done: + incl %edx + movl %edx,st_spanstate(%edi) + + jmp Lgs_nextedge + + + .align 4 +Lzcheck_for_newtop: + movl et_u(%ebx),%eax + subl $0xFFFFF,%eax + movl %eax,Ltemp + fildl Ltemp + + fmuls float_1_div_0100000h // fu = (float)(edge->u - 0xFFFFF) * + // (1.0 / 0x100000); + + fld %st(0) // fu | fu + fmuls st_d_zistepu(%edi) // fu*surf->d_zistepu | fu + flds C(fv) // fv | fu*surf->d_zistepu | fu + fmuls st_d_zistepv(%edi) // fv*surf->d_zistepv | fu*surf->d_zistepu | fu + fxch %st(1) // fu*surf->d_zistepu | fv*surf->d_zistepv | fu + fadds st_d_ziorigin(%edi) // fu*surf->d_zistepu + surf->d_ziorigin | + // fv*surf->d_zistepv | fu + + flds st_d_zistepu(%esi) // surf2->d_zistepu | + // fu*surf->d_zistepu + surf->d_ziorigin | + // fv*surf->d_zistepv | fu + fmul %st(3),%st(0) // fu*surf2->d_zistepu | + // fu*surf->d_zistepu + surf->d_ziorigin | + // fv*surf->d_zistepv | fu + fxch %st(1) // fu*surf->d_zistepu + surf->d_ziorigin | + // fu*surf2->d_zistepu | + // fv*surf->d_zistepv | fu + faddp %st(0),%st(2) // fu*surf2->d_zistepu | newzi | fu + + flds C(fv) // fv | fu*surf2->d_zistepu | newzi | fu + fmuls st_d_zistepv(%esi) // fv*surf2->d_zistepv | + // fu*surf2->d_zistepu | newzi | fu + fld %st(2) // newzi | fv*surf2->d_zistepv | + // fu*surf2->d_zistepu | newzi | fu + fmuls float_point_999 // newzibottom | fv*surf2->d_zistepv | + // fu*surf2->d_zistepu | newzi | fu + + fxch %st(2) // fu*surf2->d_zistepu | fv*surf2->d_zistepv | + // newzibottom | newzi | fu + fadds st_d_ziorigin(%esi) // fu*surf2->d_zistepu + surf2->d_ziorigin | + // fv*surf2->d_zistepv | newzibottom | newzi | + // fu + faddp %st(0),%st(1) // testzi | newzibottom | newzi | fu + fxch 
%st(1) // newzibottom | testzi | newzi | fu + +// if (newzibottom >= testzi) +// goto newtop; + + fcomp %st(1) // testzi | newzi | fu + + fxch %st(1) // newzi | testzi | fu + fmuls float_1_point_001 // newzitop | testzi | fu + fxch %st(1) // testzi | newzitop | fu + + fnstsw %ax + testb $0x01,%ah + jz Lnewtop_fpop3 + +// if (newzitop >= testzi) +// { + + fcomp %st(1) // newzitop | fu + fnstsw %ax + testb $0x45,%ah + jz Lsortloop_fpop2 + +// if (surf->d_zistepu >= surf2->d_zistepu) +// goto newtop; + + flds st_d_zistepu(%edi) // surf->d_zistepu | newzitop | fu + fcomps st_d_zistepu(%esi) // newzitop | fu + fnstsw %ax + testb $0x01,%ah + jz Lnewtop_fpop2 + +Lsortloop_fpop2: + fstp %st(0) // clear the FP stack + fstp %st(0) + movl st_key(%edi),%eax + jmp Lsortloop + + +.globl C(R_EdgeCodeEnd) +C(R_EdgeCodeEnd): + + +//---------------------------------------------------------------------- +// Surface array address code patching routine +//---------------------------------------------------------------------- + + .align 4 +.globl C(R_SurfacePatch) +C(R_SurfacePatch): + + movl C(surfaces),%eax + addl $(st_size),%eax + movl %eax,LPatch4-4 + + addl $(st_next),%eax + movl %eax,LPatch0-4 + movl %eax,LPatch2-4 + movl %eax,LPatch3-4 + + ret + +#endif // id386 + diff --git a/source/r_varsa.S b/source/r_varsa.S new file mode 100644 index 0000000..0b1cda0 --- /dev/null +++ b/source/r_varsa.S @@ -0,0 +1,64 @@ +/* +Copyright (C) 1996-1997 Id Software, Inc. + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + +See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +*/ +// +// r_varsa.s +// + +#include "asm_i386.h" +#include "quakeasm.h" +#include "asm_draw.h" +#include "d_ifacea.h" + +#if id386 + + .data + +//------------------------------------------------------- +// ASM-only variables +//------------------------------------------------------- +.globl float_1, float_particle_z_clip, float_point5 +.globl float_minus_1, float_0 +float_0: .single 0.0 +float_1: .single 1.0 +float_minus_1: .single -1.0 +float_particle_z_clip: .single PARTICLE_Z_CLIP +float_point5: .single 0.5 + +.globl fp_16, fp_64k, fp_1m, fp_64kx64k +.globl fp_1m_minus_1 +.globl fp_8 +fp_1m: .single 1048576.0 +fp_1m_minus_1: .single 1048575.0 +fp_64k: .single 65536.0 +fp_8: .single 8.0 +fp_16: .single 16.0 +fp_64kx64k: .long 0x4f000000 // (float)0x8000*0x10000 + + +.globl FloatZero, Float2ToThe31nd, FloatMinus2ToThe31nd +FloatZero: .long 0 +Float2ToThe31nd: .long 0x4f000000 +FloatMinus2ToThe31nd: .long 0xcf000000 + +.globl C(r_bmodelactive) +C(r_bmodelactive): .long 0 + +#endif // id386 + diff --git a/source/snd_mixa.S b/source/snd_mixa.S new file mode 100644 index 0000000..6abb8a5 --- /dev/null +++ b/source/snd_mixa.S @@ -0,0 +1,218 @@ +/* +Copyright (C) 1996-1997 Id Software, Inc. 
+ +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + +See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +*/ +// +// snd_mixa.s +// x86 assembly-language sound code +// + +#include "asm_i386.h" +#include "quakeasm.h" + +#if id386 + + .text + +//---------------------------------------------------------------------- +// 8-bit sound-mixing code +//---------------------------------------------------------------------- + +#define ch 4+16 +#define sc 8+16 +#define count 12+16 + +.globl C(SND_PaintChannelFrom8) +C(SND_PaintChannelFrom8): + pushl %esi // preserve register variables + pushl %edi + pushl %ebx + pushl %ebp + +// int data; +// short *lscale, *rscale; +// unsigned char *sfx; +// int i; + + movl ch(%esp),%ebx + movl sc(%esp),%esi + +// if (ch->leftvol > 255) +// ch->leftvol = 255; +// if (ch->rightvol > 255) +// ch->rightvol = 255; + movl ch_leftvol(%ebx),%eax + movl ch_rightvol(%ebx),%edx + cmpl $255,%eax + jna LLeftSet + movl $255,%eax +LLeftSet: + cmpl $255,%edx + jna LRightSet + movl $255,%edx +LRightSet: + +// lscale = snd_scaletable[ch->leftvol >> 3]; +// rscale = snd_scaletable[ch->rightvol >> 3]; +// sfx = (signed char *)sc->data + ch->pos; +// ch->pos += count; + andl $0xF8,%eax + addl $(sfxc_data),%esi + andl $0xF8,%edx + movl ch_pos(%ebx),%edi + movl count(%esp),%ecx + addl %edi,%esi + shll $7,%eax + addl %ecx,%edi + shll $7,%edx + movl %edi,ch_pos(%ebx) + addl $(C(snd_scaletable)),%eax + addl $(C(snd_scaletable)),%edx + subl %ebx,%ebx + movb -1(%esi,%ecx,1),%bl + + testl $1,%ecx + jz LMix8Loop + + movl (%eax,%ebx,4),%edi + movl (%edx,%ebx,4),%ebp + addl C(paintbuffer)+psp_left-psp_size(,%ecx,psp_size),%edi + addl C(paintbuffer)+psp_right-psp_size(,%ecx,psp_size),%ebp + movl %edi,C(paintbuffer)+psp_left-psp_size(,%ecx,psp_size) + movl %ebp,C(paintbuffer)+psp_right-psp_size(,%ecx,psp_size) + movb -2(%esi,%ecx,1),%bl + + decl %ecx + jz LDone + +// for (i=0 ; i>8; +// if (val > 0x7fff) +// snd_out[i] = 0x7fff; +// else if (val < (short)0x8000) +// snd_out[i] = (short)0x8000; +// else +// snd_out[i] = val; + movl -8(%ebx,%ecx,4),%eax + imull %esi,%eax + sarl $8,%eax + cmpl $0x7FFF,%eax + jg LClampHigh + cmpl $0xFFFF8000,%eax + jnl LClampDone + movl $0xFFFF8000,%eax + jmp LClampDone +LClampHigh: + movl $0x7FFF,%eax +LClampDone: + +// val = (snd_p[i+1]*snd_vol)>>8; +// if (val > 0x7fff) +// snd_out[i+1] = 0x7fff; +// else if (val < (short)0x8000) +// snd_out[i+1] = (short)0x8000; +// else +// snd_out[i+1] = val; + movl -4(%ebx,%ecx,4),%edx + imull %esi,%edx + sarl $8,%edx + cmpl $0x7FFF,%edx + jg LClampHigh2 + cmpl $0xFFFF8000,%edx + jnl LClampDone2 + movl $0xFFFF8000,%edx + jmp LClampDone2 +LClampHigh2: + movl $0x7FFF,%edx +LClampDone2: + shll $16,%edx + andl $0xFFFF,%eax + orl %eax,%edx + movl %edx,-4(%edi,%ecx,2) + +// } + subl $2,%ecx + jnz LWLBLoopTop + +// snd_p += snd_linear_count; + + popl %ebx + popl %edi + popl %esi + + ret + + +#endif // id386 + diff --git a/source/surf16.S b/source/surf16.S 
diff --git a/source/surf16.S b/source/surf16.S
new file mode 100644
index 0000000..8fffc40
--- /dev/null
+++ b/source/surf16.S
@@ -0,0 +1,172 @@
+/*
+Copyright (C) 1996-1997 Id Software, Inc.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+
+See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+*/
+//
+// surf16.s
+// x86 assembly-language 16 bpp surface block drawing code.
+//
+
+#include "asm_i386.h"
+#include "quakeasm.h"
+#include "asm_draw.h"
+
+#if id386
+
+//----------------------------------------------------------------------
+// Surface block drawer
+//----------------------------------------------------------------------
+
+    .data
+
+k:          .long   0
+loopentry:  .long   0
+
+    .align  4
+blockjumptable16:
+    .long   LEnter2_16
+    .long   LEnter4_16
+    .long   0, LEnter8_16
+    .long   0, 0, 0, LEnter16_16
+
+
+    .text
+
+    .align  4
+.globl C(R_Surf16Start)
+C(R_Surf16Start):
+
+    .align  4
+.globl C(R_DrawSurfaceBlock16)
+C(R_DrawSurfaceBlock16):
+    pushl   %ebp                // preserve caller's stack frame
+    pushl   %edi
+    pushl   %esi                // preserve register variables
+    pushl   %ebx
+
+    movl    C(blocksize),%eax
+    movl    C(prowdestbase),%edi
+    movl    C(pbasesource),%esi
+    movl    C(sourcesstep),%ebx
+    movl    blockjumptable16-4(,%eax,2),%ecx
+    movl    %eax,k
+    movl    %ecx,loopentry
+    movl    C(lightleft),%edx
+    movl    C(lightright),%ebp
+
+Lblockloop16:
+
+    subl    %edx,%ebp
+    movb    C(blockdivshift),%cl
+    sarl    %cl,%ebp
+    jns     Lp1_16
+    testl   C(blockdivmask),%ebp
+    jz      Lp1_16
+    incl    %ebp
+Lp1_16:
+
+    subl    %eax,%eax
+    subl    %ecx,%ecx           // high words must be 0 in loop for addressing
+
+    jmp     *loopentry
+
+    .align  4
+
+#include "block16.h"
+
+    movl    C(pbasesource),%esi
+    movl    C(lightleft),%edx
+    movl    C(lightright),%ebp
+    movl    C(sourcetstep),%eax
+    movl    C(lightrightstep),%ecx
+    movl    C(prowdestbase),%edi
+
+    addl    %eax,%esi
+    addl    %ecx,%ebp
+
+    movl    C(lightleftstep),%eax
+    movl    C(surfrowbytes),%ecx
+
+    addl    %eax,%edx
+    addl    %ecx,%edi
+
+    movl    %esi,C(pbasesource)
+    movl    %ebp,C(lightright)
+    movl    k,%eax
+    movl    %edx,C(lightleft)
+    decl    %eax
+    movl    %edi,C(prowdestbase)
+    movl    %eax,k
+    jnz     Lblockloop16
+
+    popl    %ebx                // restore register variables
+    popl    %esi
+    popl    %edi
+    popl    %ebp                // restore the caller's stack frame
+    ret
+
+.globl C(R_Surf16End)
+C(R_Surf16End):
+
+//----------------------------------------------------------------------
+// Code patching routines
+//----------------------------------------------------------------------
+    .data
+
+    .align  4
+LPatchTable16:
+    .long   LBPatch0-4
+    .long   LBPatch1-4
+    .long   LBPatch2-4
+    .long   LBPatch3-4
+    .long   LBPatch4-4
+    .long   LBPatch5-4
+    .long   LBPatch6-4
+    .long   LBPatch7-4
+    .long   LBPatch8-4
+    .long   LBPatch9-4
+    .long   LBPatch10-4
+    .long   LBPatch11-4
+    .long   LBPatch12-4
+    .long   LBPatch13-4
+    .long   LBPatch14-4
+    .long   LBPatch15-4
+
+    .text
+
+    .align  4
+.globl C(R_Surf16Patch)
+C(R_Surf16Patch):
+    pushl   %ebx
+
+    movl    C(colormap),%eax
+    movl    $LPatchTable16,%ebx
+    movl    $16,%ecx
+LPatchLoop16:
+    movl    (%ebx),%edx
+    addl    $4,%ebx
+    movl    %eax,(%edx)
+    decl    %ecx
+    jnz     LPatchLoop16
+
+    popl    %ebx
+
+    ret
+
+
+#endif  // id386
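Two things in surf16.S deserve a note. First, the blockjumptable16 dispatch is a computed goto: blocksize is 2, 4, 8, or 16, and the addressing mode blockjumptable16-4(,%eax,2) maps those values to byte offsets 0, 4, 12, and 28, exactly the four live slots of the sparse table (the unrolled loop bodies behind LEnter2_16 through LEnter16_16 come from block16.h). A C sketch of the same lookup; blockentry_t and LookupBlockEntry are illustrative names, not engine symbols:

typedef void (*blockentry_t) (void);

extern blockentry_t blockjumptable16[8];    /* sparse: slots 0, 1, 3, 7 used */

static blockentry_t LookupBlockEntry (int blocksize)
{
    /* byte offset 2*blocksize - 4, divided by sizeof(.long) */
    return blockjumptable16[(2 * blocksize - 4) >> 2];
}

Second, R_Surf16Patch is self-modifying code; since surf8.S below uses the identical scheme on a larger scale, the patching itself is sketched after that file.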
diff --git a/source/surf8.S b/source/surf8.S
new file mode 100644
index 0000000..7229e15
--- /dev/null
+++ b/source/surf8.S
@@ -0,0 +1,783 @@
+/*
+Copyright (C) 1996-1997 Id Software, Inc.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+
+See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+*/
+//
+// surf8.s
+// x86 assembly-language 8 bpp surface block drawing code.
+//
+
+#include "asm_i386.h"
+#include "quakeasm.h"
+#include "asm_draw.h"
+
+#if id386
+
+    .data
+
+sb_v:   .long   0
+
+    .text
+
+    .align  4
+.globl C(R_Surf8Start)
+C(R_Surf8Start):
+
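For orientation before the four unrolled drawers that follow: each one walks r_numvblocks vertical blocks, interpolating light down the left and right edges and across each row, then looks up lit pixels through the colormap. Roughly the C shape, in the spirit of the C drawers in r_surf.c; DrawSurfaceBlock8_C and the extern declarations are an illustrative sketch, not the shipped code:

extern int            r_numvblocks, r_lightwidth;
extern int            sourcetstep, surfrowbytes, r_stepback;
extern int           *r_lightptr;
extern unsigned char *pbasesource, *prowdestbase, *r_sourcemax;
extern unsigned char *colormap;     /* 64 light levels x 256 colors */

static void DrawSurfaceBlock8_C (int mip)   /* mip 0..3 -> 16x16 .. 2x2 */
{
    int            v, i, b, size, shift;
    int            lightleft, lightright, lightleftstep, lightrightstep;
    int            light, lightstep;
    unsigned char *psource = pbasesource, *prowdest = prowdestbase;

    shift = 4 - mip;                /* blockdivshift */
    size  = 1 << shift;             /* pixels per block side */

    for (v = 0; v < r_numvblocks; v++)
    {
        lightleft  = r_lightptr[0];
        lightright = r_lightptr[1];
        r_lightptr += r_lightwidth;
        lightleftstep  = (r_lightptr[0] - lightleft)  >> shift;
        lightrightstep = (r_lightptr[1] - lightright) >> shift;

        for (i = 0; i < size; i++)  /* one output row per iteration */
        {
            lightstep = (lightleft - lightright) >> shift;
            light = lightright;

            for (b = size - 1; b >= 0; b--)   /* right to left, as in the asm */
            {
                prowdest[b] = colormap[(light & 0xFF00) + psource[b]];
                light += lightstep;
            }

            psource    += sourcetstep;
            lightright += lightrightstep;
            lightleft  += lightleftstep;
            prowdest   += surfrowbytes;
        }

        if (psource >= r_sourcemax) /* wrap when the source texture ends */
            psource -= r_stepback;
    }
}

The assembly versions fold the two inner loops into straight-line code per mip level, which is why the file carries four separate drawers.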
+//----------------------------------------------------------------------
+// Surface block drawer for mip level 0
+//----------------------------------------------------------------------
+
+    .align  4
+.globl C(R_DrawSurfaceBlock8_mip0)
+C(R_DrawSurfaceBlock8_mip0):
+    pushl   %ebp                // preserve caller's stack frame
+    pushl   %edi
+    pushl   %esi                // preserve register variables
+    pushl   %ebx
+
+//  for (v=0 ; v<numvblocks ; v++)
+//  {
+//      lightleft = lightptr[0];
+//      lightright = lightptr[1];
+//      lightdelta = (lightleft - lightright) & 0xFFFFF;
+    movl    C(r_lightptr),%ebx
+    movl    C(r_numvblocks),%eax
+
+    movl    %eax,sb_v
+    movl    C(prowdestbase),%edi
+
+    movl    C(pbasesource),%esi
+
+Lv_loop_mip0:
+    movl    (%ebx),%eax         // lightptr[0]
+    movl    4(%ebx),%edx        // lightptr[1]
+
+    movl    %eax,%ebp
+    movl    C(r_lightwidth),%ecx
+
+    movl    %edx,C(lightright)
+    subl    %edx,%ebp
+
+    andl    $0xFFFFF,%ebp
+    leal    (%ebx,%ecx,4),%ebx
+
+//      lightptr += lightwidth;
+    movl    %ebx,C(r_lightptr)
+
+//      lightleftstep = (lightptr[0] - lightleft) >> blockdivshift;
+//      lightrightstep = (lightptr[1] - lightright) >> blockdivshift;
+//      lightdeltastep = ((lightleftstep - lightrightstep) & 0xFFFFF) |
+//              0xF0000000;
+    movl    4(%ebx),%ecx        // lightptr[1]
+    movl    (%ebx),%ebx         // lightptr[0]
+
+    subl    %eax,%ebx
+    subl    %edx,%ecx
+
+    sarl    $4,%ecx
+    orl     $0xF0000000,%ebp
+
+    sarl    $4,%ebx
+    movl    %ecx,C(lightrightstep)
+
+    subl    %ecx,%ebx
+    andl    $0xFFFFF,%ebx
+
+    orl     $0xF0000000,%ebx
+    subl    %ecx,%ecx           // high word must be 0 in loop for addressing
+
+    movl    %ebx,C(lightdeltastep)
+    subl    %ebx,%ebx           // high word must be 0 in loop for addressing
+
+Lblockloop8_mip0:
+    movl    %ebp,C(lightdelta)
+    movb    14(%esi),%cl
+
+    sarl    $4,%ebp
+    movb    %dh,%bh
+
+    movb    15(%esi),%bl
+    addl    %ebp,%edx
+
+    movb    %dh,%ch
+    addl    %ebp,%edx
+
+    movb    0x12345678(%ebx),%ah
+LBPatch0:
+    movb    13(%esi),%bl
+
+    movb    0x12345678(%ecx),%al
+LBPatch1:
+    movb    12(%esi),%cl
+
+    movb    %dh,%bh
+    addl    %ebp,%edx
+
+    rorl    $16,%eax
+    movb    %dh,%ch
+
+    addl    %ebp,%edx
+    movb    0x12345678(%ebx),%ah
+LBPatch2:
+
+    movb    11(%esi),%bl
+    movb    0x12345678(%ecx),%al
+LBPatch3:
+
+    movb    10(%esi),%cl
+    movl    %eax,12(%edi)
+
+    movb    %dh,%bh
+    addl    %ebp,%edx
+
+    movb    %dh,%ch
+    addl    %ebp,%edx
+
+    movb    0x12345678(%ebx),%ah
+LBPatch4:
+    movb    9(%esi),%bl
+
+    movb    0x12345678(%ecx),%al
+LBPatch5:
+    movb    8(%esi),%cl
+
+    movb    %dh,%bh
+    addl    %ebp,%edx
+
+    rorl    $16,%eax
+    movb    %dh,%ch
+
+    addl    %ebp,%edx
+    movb    0x12345678(%ebx),%ah
+LBPatch6:
+
+    movb    7(%esi),%bl
+    movb    0x12345678(%ecx),%al
+LBPatch7:
+
+    movb    6(%esi),%cl
+    movl    %eax,8(%edi)
+
+    movb    %dh,%bh
+    addl    %ebp,%edx
+
+    movb    %dh,%ch
+    addl    %ebp,%edx
+
+    movb    0x12345678(%ebx),%ah
+LBPatch8:
+    movb    5(%esi),%bl
+
+    movb    0x12345678(%ecx),%al
+LBPatch9:
+    movb    4(%esi),%cl
+
+    movb    %dh,%bh
+    addl    %ebp,%edx
+
+    rorl    $16,%eax
+    movb    %dh,%ch
+
+    addl    %ebp,%edx
+    movb    0x12345678(%ebx),%ah
+LBPatch10:
+
+    movb    3(%esi),%bl
+    movb    0x12345678(%ecx),%al
+LBPatch11:
+
+    movb    2(%esi),%cl
+    movl    %eax,4(%edi)
+
+    movb    %dh,%bh
+    addl    %ebp,%edx
+
+    movb    %dh,%ch
+    addl    %ebp,%edx
+
+    movb    0x12345678(%ebx),%ah
+LBPatch12:
+    movb    1(%esi),%bl
+
+    movb    0x12345678(%ecx),%al
+LBPatch13:
+    movb    (%esi),%cl
+
+    movb    %dh,%bh
+    addl    %ebp,%edx
+
+    rorl    $16,%eax
+    movb    %dh,%ch
+
+    movb    0x12345678(%ebx),%ah
+LBPatch14:
+    movl    C(lightright),%edx
+
+    movb    0x12345678(%ecx),%al
+LBPatch15:
+    movl    C(lightdelta),%ebp
+
+    movl    %eax,(%edi)
+
+    addl    C(sourcetstep),%esi
+    addl    C(surfrowbytes),%edi
+
+    addl    C(lightrightstep),%edx
+    addl    C(lightdeltastep),%ebp
+
+    movl    %edx,C(lightright)
+    jc      Lblockloop8_mip0
+
+//      if (pbasesource >= r_sourcemax)
+//          pbasesource -= stepback;
+
+    cmpl    C(r_sourcemax),%esi
+    jb      LSkip_mip0
+    subl    C(r_stepback),%esi
+LSkip_mip0:
+
+    movl    C(r_lightptr),%ebx
+    decl    sb_v
+
+    jnz     Lv_loop_mip0
+
+    popl    %ebx                // restore register variables
+    popl    %esi
+    popl    %edi
+    popl    %ebp                // restore the caller's stack frame
+    ret
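Worth calling out in the drawer above: the orl $0xF0000000 / jc pair is the row counter. The low 20 bits of %ebp carry the fixed-point light delta, while the top nibble counts down one carry per row, so no separate counter register is needed. A self-contained sketch of the trick; CountRows is an illustrative name, and unsigned wraparound stands in for the x86 carry flag:

static int CountRows (unsigned lightdelta, unsigned lightdeltastep,
                      unsigned topnibble)
{
    unsigned delta = (lightdelta & 0xFFFFF) | topnibble;       /* orl ...,%ebp */
    unsigned step  = (lightdeltastep & 0xFFFFF) | 0xF0000000;  /* orl ...,%ebx */
    unsigned prev;
    int      rows = 0;

    do
    {
        rows++;              /* draw one row of the block */
        prev = delta;
        delta += step;       /* addl C(lightdeltastep),%ebp */
    } while (delta < prev);  /* unsigned wrap == carry set -> jc */

    return rows;
}

Seeding the nibble with 0xF0000000 yields 16 iterations for mip 0; the mip 1 and mip 2 drawers below seed it with 0x70000000 and 0x30000000 for 8 and 4 rows. The low 20 bits can never carry into the counter nibble, since at most 16 additions of a 20-bit value stay below bit 24.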
+
+
+//----------------------------------------------------------------------
+// Surface block drawer for mip level 1
+//----------------------------------------------------------------------
+
+    .align  4
+.globl C(R_DrawSurfaceBlock8_mip1)
+C(R_DrawSurfaceBlock8_mip1):
+    pushl   %ebp                // preserve caller's stack frame
+    pushl   %edi
+    pushl   %esi                // preserve register variables
+    pushl   %ebx
+
+//  for (v=0 ; v<numvblocks ; v++)
+//  {
+//      lightleft = lightptr[0];
+//      lightright = lightptr[1];
+//      lightdelta = (lightleft - lightright) & 0xFFFFF;
+    movl    C(r_lightptr),%ebx
+    movl    C(r_numvblocks),%eax
+
+    movl    %eax,sb_v
+    movl    C(prowdestbase),%edi
+
+    movl    C(pbasesource),%esi
+
+Lv_loop_mip1:
+    movl    (%ebx),%eax         // lightptr[0]
+    movl    4(%ebx),%edx        // lightptr[1]
+
+    movl    %eax,%ebp
+    movl    C(r_lightwidth),%ecx
+
+    movl    %edx,C(lightright)
+    subl    %edx,%ebp
+
+    andl    $0xFFFFF,%ebp
+    leal    (%ebx,%ecx,4),%ebx
+
+//      lightptr += lightwidth;
+    movl    %ebx,C(r_lightptr)
+
+//      lightleftstep = (lightptr[0] - lightleft) >> blockdivshift;
+//      lightrightstep = (lightptr[1] - lightright) >> blockdivshift;
+//      lightdeltastep = ((lightleftstep - lightrightstep) & 0xFFFFF) |
+//              0xF0000000;
+    movl    4(%ebx),%ecx        // lightptr[1]
+    movl    (%ebx),%ebx         // lightptr[0]
+
+    subl    %eax,%ebx
+    subl    %edx,%ecx
+
+    sarl    $3,%ecx
+    orl     $0x70000000,%ebp
+
+    sarl    $3,%ebx
+    movl    %ecx,C(lightrightstep)
+
+    subl    %ecx,%ebx
+    andl    $0xFFFFF,%ebx
+
+    orl     $0xF0000000,%ebx
+    subl    %ecx,%ecx           // high word must be 0 in loop for addressing
+
+    movl    %ebx,C(lightdeltastep)
+    subl    %ebx,%ebx           // high word must be 0 in loop for addressing
+
+Lblockloop8_mip1:
+    movl    %ebp,C(lightdelta)
+    movb    6(%esi),%cl
+
+    sarl    $3,%ebp
+    movb    %dh,%bh
+
+    movb    7(%esi),%bl
+    addl    %ebp,%edx
+
+    movb    %dh,%ch
+    addl    %ebp,%edx
+
+    movb    0x12345678(%ebx),%ah
+LBPatch22:
+    movb    5(%esi),%bl
+
+    movb    0x12345678(%ecx),%al
+LBPatch23:
+    movb    4(%esi),%cl
+
+    movb    %dh,%bh
+    addl    %ebp,%edx
+
+    rorl    $16,%eax
+    movb    %dh,%ch
+
+    addl    %ebp,%edx
+    movb    0x12345678(%ebx),%ah
+LBPatch24:
+
+    movb    3(%esi),%bl
+    movb    0x12345678(%ecx),%al
+LBPatch25:
+
+    movb    2(%esi),%cl
+    movl    %eax,4(%edi)
+
+    movb    %dh,%bh
+    addl    %ebp,%edx
+
+    movb    %dh,%ch
+    addl    %ebp,%edx
+
+    movb    0x12345678(%ebx),%ah
+LBPatch26:
+    movb    1(%esi),%bl
+
+    movb    0x12345678(%ecx),%al
+LBPatch27:
+    movb    (%esi),%cl
+
+    movb    %dh,%bh
+    addl    %ebp,%edx
+
+    rorl    $16,%eax
+    movb    %dh,%ch
+
+    movb    0x12345678(%ebx),%ah
+LBPatch28:
+    movl    C(lightright),%edx
+
+    movb    0x12345678(%ecx),%al
+LBPatch29:
+    movl    C(lightdelta),%ebp
+
+    movl    %eax,(%edi)
+    movl    C(sourcetstep),%eax
+
+    addl    %eax,%esi
+    movl    C(surfrowbytes),%eax
+
+    addl    %eax,%edi
+    movl    C(lightrightstep),%eax
+
+    addl    %eax,%edx
+    movl    C(lightdeltastep),%eax
+
+    addl    %eax,%ebp
+    movl    %edx,C(lightright)
+
+    jc      Lblockloop8_mip1
+
+//      if (pbasesource >= r_sourcemax)
+//          pbasesource -= stepback;
+
+    cmpl    C(r_sourcemax),%esi
+    jb      LSkip_mip1
+    subl    C(r_stepback),%esi
+LSkip_mip1:
+
+    movl    C(r_lightptr),%ebx
+    decl    sb_v
+
+    jnz     Lv_loop_mip1
+
+    popl    %ebx                // restore register variables
+    popl    %esi
+    popl    %edi
+    popl    %ebp                // restore the caller's stack frame
+    ret
+
+
+//----------------------------------------------------------------------
+// Surface block drawer for mip level 2
+//----------------------------------------------------------------------
+
+    .align  4
+.globl C(R_DrawSurfaceBlock8_mip2)
+C(R_DrawSurfaceBlock8_mip2):
+    pushl   %ebp                // preserve caller's stack frame
+    pushl   %edi
+    pushl   %esi                // preserve register variables
+    pushl   %ebx
+
+//  for (v=0 ; v<numvblocks ; v++)
+//  {
+//      lightleft = lightptr[0];
+//      lightright = lightptr[1];
+//      lightdelta = (lightleft - lightright) & 0xFFFFF;
+    movl    C(r_lightptr),%ebx
+    movl    C(r_numvblocks),%eax
+
+    movl    %eax,sb_v
+    movl    C(prowdestbase),%edi
+
+    movl    C(pbasesource),%esi
+
+Lv_loop_mip2:
+    movl    (%ebx),%eax         // lightptr[0]
+    movl    4(%ebx),%edx        // lightptr[1]
+
+    movl    %eax,%ebp
+    movl    C(r_lightwidth),%ecx
+
+    movl    %edx,C(lightright)
+    subl    %edx,%ebp
+
+    andl    $0xFFFFF,%ebp
+    leal    (%ebx,%ecx,4),%ebx
+
+//      lightptr += lightwidth;
+    movl    %ebx,C(r_lightptr)
+
+//      lightleftstep = (lightptr[0] - lightleft) >> blockdivshift;
+//      lightrightstep = (lightptr[1] - lightright) >> blockdivshift;
+//      lightdeltastep = ((lightleftstep - lightrightstep) & 0xFFFFF) |
+//              0xF0000000;
+    movl    4(%ebx),%ecx        // lightptr[1]
+    movl    (%ebx),%ebx         // lightptr[0]
+
+    subl    %eax,%ebx
+    subl    %edx,%ecx
+
+    sarl    $2,%ecx
+    orl     $0x30000000,%ebp
+
+    sarl    $2,%ebx
+    movl    %ecx,C(lightrightstep)
+
+    subl    %ecx,%ebx
+
+    andl    $0xFFFFF,%ebx
+
+    orl     $0xF0000000,%ebx
+    subl    %ecx,%ecx           // high word must be 0 in loop for addressing
+
+    movl    %ebx,C(lightdeltastep)
+    subl    %ebx,%ebx           // high word must be 0 in loop for addressing
+
+Lblockloop8_mip2:
+    movl    %ebp,C(lightdelta)
+    movb    2(%esi),%cl
+
+    sarl    $2,%ebp
+    movb    %dh,%bh
+
+    movb    3(%esi),%bl
+    addl    %ebp,%edx
+
+    movb    %dh,%ch
+    addl    %ebp,%edx
+
+    movb    0x12345678(%ebx),%ah
+LBPatch18:
+    movb    1(%esi),%bl
+
+    movb    0x12345678(%ecx),%al
+LBPatch19:
+    movb    (%esi),%cl
+
+    movb    %dh,%bh
+    addl    %ebp,%edx
+
+    rorl    $16,%eax
+    movb    %dh,%ch
+
+    movb    0x12345678(%ebx),%ah
+LBPatch20:
+    movl    C(lightright),%edx
+
+    movb    0x12345678(%ecx),%al
+LBPatch21:
+    movl    C(lightdelta),%ebp
+
+    movl    %eax,(%edi)
+    movl    C(sourcetstep),%eax
+
+    addl    %eax,%esi
+    movl    C(surfrowbytes),%eax
+
+    addl    %eax,%edi
+    movl    C(lightrightstep),%eax
+
+    addl    %eax,%edx
+    movl    C(lightdeltastep),%eax
+
+    addl    %eax,%ebp
+    movl    %edx,C(lightright)
+
+    jc      Lblockloop8_mip2
+
+//      if (pbasesource >= r_sourcemax)
+//          pbasesource -= stepback;
+
+    cmpl    C(r_sourcemax),%esi
+    jb      LSkip_mip2
+    subl    C(r_stepback),%esi
+LSkip_mip2:
+
+    movl    C(r_lightptr),%ebx
+    decl    sb_v
+
+    jnz     Lv_loop_mip2
+
+    popl    %ebx                // restore register variables
+    popl    %esi
+    popl    %edi
+    popl    %ebp                // restore the caller's stack frame
+    ret
+
+
+//----------------------------------------------------------------------
+// Surface block drawer for mip level 3
+//----------------------------------------------------------------------
+
+    .align  4
+.globl C(R_DrawSurfaceBlock8_mip3)
+C(R_DrawSurfaceBlock8_mip3):
+    pushl   %ebp                // preserve caller's stack frame
+    pushl   %edi
+    pushl   %esi                // preserve register variables
+    pushl   %ebx
+
+//  for (v=0 ; v<numvblocks ; v++)
+//  {
+//      lightleft = lightptr[0];
+//      lightright = lightptr[1];
+//      lightdelta = (lightleft - lightright) & 0xFFFFF;
+    movl    C(r_lightptr),%ebx
+    movl    C(r_numvblocks),%eax
+
+    movl    %eax,sb_v
+    movl    C(prowdestbase),%edi
+
+    movl    C(pbasesource),%esi
+
+Lv_loop_mip3:
+    movl    (%ebx),%eax         // lightptr[0]
+    movl    4(%ebx),%edx        // lightptr[1]
+
+    movl    %eax,%ebp
+    movl    C(r_lightwidth),%ecx
+
+    movl    %edx,C(lightright)
+    subl    %edx,%ebp
+
+    andl    $0xFFFFF,%ebp
+    leal    (%ebx,%ecx,4),%ebx
+
+    movl    %ebp,C(lightdelta)
+
+//      lightptr += lightwidth;
+    movl    %ebx,C(r_lightptr)
+
+//      lightleftstep = (lightptr[0] - lightleft) >> blockdivshift;
+//      lightrightstep = (lightptr[1] - lightright) >> blockdivshift;
+//      lightdeltastep = ((lightleftstep - lightrightstep) & 0xFFFFF) |
+//              0xF0000000;
+    movl    4(%ebx),%ecx        // lightptr[1]
+    movl    (%ebx),%ebx         // lightptr[0]
+
+    subl    %eax,%ebx
+    subl    %edx,%ecx
+
+    sarl    $1,%ecx
+
+    sarl    $1,%ebx
+    movl    %ecx,C(lightrightstep)
+
+    subl    %ecx,%ebx
+    andl    $0xFFFFF,%ebx
+
+    sarl    $1,%ebp
+    orl     $0xF0000000,%ebx
+
+    movl    %ebx,C(lightdeltastep)
+    subl    %ebx,%ebx           // high word must be 0 in loop for addressing
+
+    movb    1(%esi),%bl
+    subl    %ecx,%ecx           // high word must be 0 in loop for addressing
+
+    movb    %dh,%bh
+    movb    (%esi),%cl
+
+    addl    %ebp,%edx
+    movb    %dh,%ch
+
+    movb    0x12345678(%ebx),%al
+LBPatch16:
+    movl    C(lightright),%edx
+
+    movb    %al,1(%edi)
+    movb    0x12345678(%ecx),%al
+LBPatch17:
+
+    movb    %al,(%edi)
+    movl    C(sourcetstep),%eax
+
+    addl    %eax,%esi
+    movl    C(surfrowbytes),%eax
+
+    addl    %eax,%edi
+    movl    C(lightdeltastep),%eax
+
+    movl    C(lightdelta),%ebp
+    movb    (%esi),%cl
+
+    addl    %eax,%ebp
+    movl    C(lightrightstep),%eax
+
+    sarl    $1,%ebp
+    addl    %eax,%edx
+
+    movb    %dh,%bh
+    movb    1(%esi),%bl
+
+    addl    %ebp,%edx
+    movb    %dh,%ch
+
+    movb    0x12345678(%ebx),%al
+LBPatch30:
+    movl    C(sourcetstep),%edx
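+
+    movb    %al,1(%edi)
+    movb    0x12345678(%ecx),%al
+LBPatch31:
+
+    movb    %al,(%edi)
+    movl    C(surfrowbytes),%ebp
+
+    addl    %edx,%esi
+    addl    %ebp,%edi
+
+//      if (pbasesource >= r_sourcemax)
+//          pbasesource -= stepback;
+
+    cmpl    C(r_sourcemax),%esi
+    jb      LSkip_mip3
+    subl    C(r_stepback),%esi
+LSkip_mip3:
+
+    movl    C(r_lightptr),%ebx
+    decl    sb_v
+
+    jnz     Lv_loop_mip3
+
+    popl    %ebx                // restore register variables
+    popl    %esi
+    popl    %edi
+    popl    %ebp                // restore the caller's stack frame
+    ret
+
+
+.globl C(R_Surf8End)
+C(R_Surf8End):
+
+//----------------------------------------------------------------------
+// Code patching routines
+//----------------------------------------------------------------------
+    .data
+
+    .align  4
+LPatchTable8:
+    .long   LBPatch0-4
+    .long   LBPatch1-4
+    .long   LBPatch2-4
+    .long   LBPatch3-4
+    .long   LBPatch4-4
+    .long   LBPatch5-4
+    .long   LBPatch6-4
+    .long   LBPatch7-4
+    .long   LBPatch8-4
+    .long   LBPatch9-4
+    .long   LBPatch10-4
+    .long   LBPatch11-4
+    .long   LBPatch12-4
+    .long   LBPatch13-4
+    .long   LBPatch14-4
+    .long   LBPatch15-4
+    .long   LBPatch16-4
+    .long   LBPatch17-4
+    .long   LBPatch18-4
+    .long   LBPatch19-4
+    .long   LBPatch20-4
+    .long   LBPatch21-4
+    .long   LBPatch22-4
+    .long   LBPatch23-4
+    .long   LBPatch24-4
+    .long   LBPatch25-4
+    .long   LBPatch26-4
+    .long   LBPatch27-4
+    .long   LBPatch28-4
+    .long   LBPatch29-4
+    .long   LBPatch30-4
+    .long   LBPatch31-4
+
+    .text
+
+    .align  4
+.globl C(R_Surf8Patch)
+C(R_Surf8Patch):
+    pushl   %ebx
+
+    movl    C(colormap),%eax
+    movl    $LPatchTable8,%ebx
+    movl    $32,%ecx
+LPatchLoop8:
+    movl    (%ebx),%edx
+    addl    $4,%ebx
+    movl    %eax,(%edx)
+    decl    %ecx
+    jnz     LPatchLoop8
+
+    popl    %ebx
+
+    ret
+
+#endif  // id386

R_Surf8Patch above (and R_Surf16Patch earlier) is self-modifying code. Every movb 0x12345678(%reg) in the drawers is immediately followed by an LBPatchN label, so LBPatchN-4 is the address of that instruction's 32-bit displacement field; 0x12345678 is only a placeholder. Before the drawers run, the patch routine stamps the live colormap address into all the sites, which is legal here because the DOS-era targets keep the text segment writable. The same idea in C; PatchTable is an illustrative name, and 32-bit pointers are assumed as on the original x86 targets:

static void PatchTable (void **patchtable, int numpatches, void *colormap)
{
    int i;

    for (i = 0; i < numpatches; i++)
        *(unsigned *)patchtable[i] = (unsigned)colormap;
}

This is why the colormap lookups in the unrolled loops cost no extra register: the pointer lives inside the instruction stream itself.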
diff --git a/source/sv_misc.c b/source/sv_misc.c
index 1f01f95..2186573 100644
--- a/source/sv_misc.c
+++ b/source/sv_misc.c
@@ -7,3 +7,8 @@
 void Draw_EndDisc(void)
 {
 }
+
+void
+Cmd_ForwardToServer (void)
+{
+}
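The empty Cmd_ForwardToServer satisfies the linker for the dedicated server build: cmd.c calls it for any console command it does not recognize, but on the server there is no remote host to forward to. For contrast, a sketch of what the client-side version does, send the current command over the reliable channel; the disconnected check and exact argument handling here are assumptions, only the clc_stringcmd / SZ_Print plumbing is the engine's own:

#include "quakedef.h"

void Cmd_ForwardToServer (void)
{
    if (cls.state == ca_disconnected)
        return;                          /* no server to forward to */

    MSG_WriteByte (&cls.netchan.message, clc_stringcmd);
    SZ_Print (&cls.netchan.message, Cmd_Argv(0));
    if (Cmd_Argc() > 1)
    {
        SZ_Print (&cls.netchan.message, " ");
        SZ_Print (&cls.netchan.message, Cmd_Args());
    }
}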
diff --git a/source/sys_dosa.S b/source/sys_dosa.S
new file mode 100644
index 0000000..7b0ea61
--- /dev/null
+++ b/source/sys_dosa.S
@@ -0,0 +1,95 @@
+//
+// sys_dosa.s
+// x86 assembly-language DOS-dependent routines.
+
+#include "asm_i386.h"
+#include "quakeasm.h"
+
+
+    .data
+
+    .align  4
+fpenv:
+    .long   0, 0, 0, 0, 0, 0, 0, 0
+
+    .text
+
+.globl C(MaskExceptions)
+C(MaskExceptions):
+    fnstenv fpenv
+    orl     $0x3F,fpenv
+    fldenv  fpenv
+
+    ret
+
+#if 0
+.globl C(unmaskexceptions)
+C(unmaskexceptions):
+    fnstenv fpenv
+    andl    $0xFFFFFFE0,fpenv
+    fldenv  fpenv
+
+    ret
+#endif
+
+    .data
+
+    .align  4
+.globl ceil_cw, single_cw, full_cw, cw, pushed_cw
+ceil_cw:    .long   0
+single_cw:  .long   0
+full_cw:    .long   0
+cw:         .long   0
+pushed_cw:  .long   0
+
+    .text
+
+.globl C(Sys_LowFPPrecision)
+C(Sys_LowFPPrecision):
+    fldcw   single_cw
+
+    ret
+
+.globl C(Sys_HighFPPrecision)
+C(Sys_HighFPPrecision):
+    fldcw   full_cw
+
+    ret
+
+.globl C(Sys_PushFPCW_SetHigh)
+C(Sys_PushFPCW_SetHigh):
+    fnstcw  pushed_cw
+    fldcw   full_cw
+
+    ret
+
+.globl C(Sys_PopFPCW)
+C(Sys_PopFPCW):
+    fldcw   pushed_cw
+
+    ret
+
+.globl C(Sys_SetFPCW)
+C(Sys_SetFPCW):
+    fnstcw  cw
+    movl    cw,%eax
+#if id386
+    andb    $0xF0,%ah
+    orb     $0x03,%ah           // round mode, 64-bit precision
+#endif
+    movl    %eax,full_cw
+
+#if id386
+    andb    $0xF0,%ah
+    orb     $0x0C,%ah           // chop mode, single precision
+#endif
+    movl    %eax,single_cw
+
+#if id386
+    andb    $0xF0,%ah
+    orb     $0x08,%ah           // ceil mode, single precision
+#endif
+    movl    %eax,ceil_cw
+
+    ret
+
diff --git a/source/sys_wina.S b/source/sys_wina.S
new file mode 100644
index 0000000..6de31c2
--- /dev/null
+++ b/source/sys_wina.S
@@ -0,0 +1,115 @@
+/*
+Copyright (C) 1996-1997 Id Software, Inc.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+
+See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+*/
+//
+// sys_wina.s
+// x86 assembly-language Win-dependent routines.
+
+#include "asm_i386.h"
+#include "quakeasm.h"
+
+//@@@ should be id386-dependent, and have an equivalent C path
+
+    .data
+
+    .align  4
+fpenv:
+    .long   0, 0, 0, 0, 0, 0, 0, 0
+
+    .text
+
+.globl C(MaskExceptions)
+C(MaskExceptions):
+    fnstenv fpenv
+    orl     $0x3F,fpenv
+    fldenv  fpenv
+
+    ret
+
+#if 0
+.globl C(unmaskexceptions)
+C(unmaskexceptions):
+    fnstenv fpenv
+    andl    $0xFFFFFFE0,fpenv
+    fldenv  fpenv
+
+    ret
+#endif
+
+    .data
+
+    .align  4
+.globl ceil_cw, single_cw, full_cw, cw, pushed_cw
+ceil_cw:    .long   0
+single_cw:  .long   0
+full_cw:    .long   0
+cw:         .long   0
+pushed_cw:  .long   0
+
+    .text
+
+.globl C(Sys_LowFPPrecision)
+C(Sys_LowFPPrecision):
+    fldcw   single_cw
+
+    ret
+
+.globl C(Sys_HighFPPrecision)
+C(Sys_HighFPPrecision):
+    fldcw   full_cw
+
+    ret
+
+.globl C(Sys_PushFPCW_SetHigh)
+C(Sys_PushFPCW_SetHigh):
+    fnstcw  pushed_cw
+    fldcw   full_cw
+
+    ret
+
+.globl C(Sys_PopFPCW)
+C(Sys_PopFPCW):
+    fldcw   pushed_cw
+
+    ret
+
+.globl C(Sys_SetFPCW)
+C(Sys_SetFPCW):
+    fnstcw  cw
+    movl    cw,%eax
+#if id386
+    andb    $0xF0,%ah
+    orb     $0x03,%ah           // round mode, 64-bit precision
+#endif
+    movl    %eax,full_cw
+
+#if id386
+    andb    $0xF0,%ah
+    orb     $0x0C,%ah           // chop mode, single precision
+#endif
+    movl    %eax,single_cw
+
+#if id386
+    andb    $0xF0,%ah
+    orb     $0x08,%ah           // ceil mode, single precision
+#endif
+    movl    %eax,ceil_cw
+
+    ret
+
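sys_wina.S carries the same FPU routines as sys_dosa.S, so one reading covers both. Sys_SetFPCW derives three control words from the current one: in the x87 control word, bits 8-9 are precision control and bits 10-11 rounding control, and %ah holds bits 8-15, so andb $0xF0,%ah clears both fields before each orb selects a mode (0x03 for round-to-nearest at 64-bit precision, 0x0C for truncation at 24-bit precision, 0x08 for round-toward-plus-infinity at 24-bit precision). A runnable sketch of the three values; the starting value 0x037F, the x87 power-on default, is just an example input:

#include <stdio.h>

int main (void)
{
    unsigned short cw = 0x037F;   /* example: power-on default control word */

    /* andb $0xF0,%ah == cw & 0xF0FF; each orb sets bits 8-11 */
    unsigned short full_cw   = (cw & 0xF0FF) | 0x0300; /* nearest, 64-bit   */
    unsigned short single_cw = (cw & 0xF0FF) | 0x0C00; /* chop, 24-bit      */
    unsigned short ceil_cw   = (cw & 0xF0FF) | 0x0800; /* round up, 24-bit  */

    printf ("full=%04X single=%04X ceil=%04X\n", full_cw, single_cw, ceil_cw);
    return 0;
}

The chop (truncate) mode matches C's float-to-int cast semantics, which is why the renderer runs in single_cw and only switches to full_cw around code that needs the extra precision.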