/*
	d_polysa.S

	x86 assembly-language polygon model drawing code

	Copyright (C) 1996-1997  Id Software, Inc.

	This program is free software; you can redistribute it and/or
	modify it under the terms of the GNU General Public License
	as published by the Free Software Foundation; either version 2
	of the License, or (at your option) any later version.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

	See the GNU General Public License for more details.

	You should have received a copy of the GNU General Public License
	along with this program; if not, write to:

		Free Software Foundation, Inc.
		59 Temple Place - Suite 330
		Boston, MA  02111-1307, USA

	$Id$
*/

#ifdef HAVE_CONFIG_H
# include <config.h>
#endif
#include "asm_i386.h"
#include "quakeasm.h"
#include "asm_draw.h"
#include "d_ifacea.h"

#ifdef PIC
#undef USE_INTEL_ASM //XXX asm pic hack
#endif

#ifdef USE_INTEL_ASM

// !!! if this is changed, it must be changed in d_polyse.c too !!!
#define DPS_MAXSPANS			MAXHEIGHT+1	
									// 1 extra for spanpackage that marks end

//#define	SPAN_SIZE	(((DPS_MAXSPANS + 1 + ((CACHE_SIZE - 1) / spanpackage_t_size)) + 1) * spanpackage_t_size)
#define SPAN_SIZE (1024+1+1+1)*32 


	.data

	.align	4
p10_minus_p20:	.single		0
p01_minus_p21:	.single		0
temp0:			.single		0
temp1:			.single		0
Ltemp:			.single		0

aff8entryvec_table:	.long	LDraw8, LDraw7, LDraw6, LDraw5
				.long	LDraw4, LDraw3, LDraw2, LDraw1

lzistepx:		.long	0


	.text

#ifndef NeXT
	.extern C(D_PolysetSetEdgeTable)
	.extern C(D_RasterizeAliasPolySmooth)
#endif

//----------------------------------------------------------------------
// affine triangle gradient calculation code
//----------------------------------------------------------------------

#define skinwidth	4+0

.globl C(D_PolysetCalcGradients)
C(D_PolysetCalcGradients):

//	p00_minus_p20 = r_p0[0] - r_p2[0];
//	p01_minus_p21 = r_p0[1] - r_p2[1];
//	p10_minus_p20 = r_p1[0] - r_p2[0];
//	p11_minus_p21 = r_p1[1] - r_p2[1];
//
//	xstepdenominv = 1.0 / (p10_minus_p20 * p01_minus_p21 -
//			     p00_minus_p20 * p11_minus_p21);
//
//	ystepdenominv = -xstepdenominv;

	fildl	C(r_p0)+0		// r_p0[0]
	fildl	C(r_p2)+0		// r_p2[0] | r_p0[0]
	fildl	C(r_p0)+4		// r_p0[1] | r_p2[0] | r_p0[0]
	fildl	C(r_p2)+4		// r_p2[1] | r_p0[1] | r_p2[0] | r_p0[0]
	fildl	C(r_p1)+0		// r_p1[0] | r_p2[1] | r_p0[1] | r_p2[0] | r_p0[0]
	fildl	C(r_p1)+4		// r_p1[1] | r_p1[0] | r_p2[1] | r_p0[1] |
							//  r_p2[0] | r_p0[0]
	fxch	%st(3)			// r_p0[1] | r_p1[0] | r_p2[1] | r_p1[1] |
							//  r_p2[0] | r_p0[0]
	fsub	%st(2),%st(0)	// p01_minus_p21 | r_p1[0] | r_p2[1] | r_p1[1] |
							//  r_p2[0] | r_p0[0]
	fxch	%st(1)			// r_p1[0] | p01_minus_p21 | r_p2[1] | r_p1[1] |
							//  r_p2[0] | r_p0[0]
	fsub	%st(4),%st(0)	// p10_minus_p20 | p01_minus_p21 | r_p2[1] |
							//  r_p1[1] | r_p2[0] | r_p0[0]
	fxch	%st(5)			// r_p0[0] | p01_minus_p21 | r_p2[1] |
							//  r_p1[1] | r_p2[0] | p10_minus_p20
	fsubp	%st(0),%st(4)	// p01_minus_p21 | r_p2[1] | r_p1[1] |
							//  p00_minus_p20 | p10_minus_p20
	fxch	%st(2)			// r_p1[1] | r_p2[1] | p01_minus_p21 |
							//  p00_minus_p20 | p10_minus_p20
	fsubp	%st(0),%st(1)	// p11_minus_p21 | p01_minus_p21 |
							//  p00_minus_p20 | p10_minus_p20
	fxch	%st(1)			// p01_minus_p21 | p11_minus_p21 |
							//  p00_minus_p20 | p10_minus_p20
	flds	C(d_xdenom)		// d_xdenom | p01_minus_p21 | p11_minus_p21 |
							//  p00_minus_p20 | p10_minus_p20
	fxch	%st(4)			// p10_minus_p20 | p01_minus_p21 | p11_minus_p21 |
							//  p00_minus_p20 | d_xdenom
	fstps	p10_minus_p20	// p01_minus_p21 | p11_minus_p21 |
							//  p00_minus_p20 | d_xdenom
	fstps	p01_minus_p21	// p11_minus_p21 | p00_minus_p20 | xstepdenominv
	fxch	%st(2)			// xstepdenominv | p00_minus_p20 | p11_minus_p21

//// ceil () for light so positive steps are exaggerated, negative steps
//// diminished,  pushing us away from underflow toward overflow. Underflow is
//// very visible, overflow is very unlikely, because of ambient lighting
//	t0 = r_p0[4] - r_p2[4];
//	t1 = r_p1[4] - r_p2[4];

	fildl	C(r_p2)+16		// r_p2[4] | xstepdenominv | p00_minus_p20 |
							//  p11_minus_p21
	fildl	C(r_p0)+16		// r_p0[4] | r_p2[4] | xstepdenominv |
							//  p00_minus_p20 | p11_minus_p21
	fildl	C(r_p1)+16		// r_p1[4] | r_p0[4] | r_p2[4] | xstepdenominv |
							//  p00_minus_p20 | p11_minus_p21
	fxch	%st(2)			// r_p2[4] | r_p0[4] | r_p1[4] | xstepdenominv |
							//  p00_minus_p20 | p11_minus_p21
	fld		%st(0)			// r_p2[4] | r_p2[4] | r_p0[4] | r_p1[4] |
							//  xstepdenominv | p00_minus_p20 | p11_minus_p21
	fsubrp	%st(0),%st(2)	// r_p2[4] | t0 | r_p1[4] | xstepdenominv |
							//  p00_minus_p20 | p11_minus_p21
	fsubrp	%st(0),%st(2)	// t0 | t1 | xstepdenominv | p00_minus_p20 |
							//  p11_minus_p21

//	r_lstepx = (int)
//			ceil((t1 * p01_minus_p21 - t0 * p11_minus_p21) * xstepdenominv);
//	r_lstepy = (int)
//			ceil((t1 * p00_minus_p20 - t0 * p10_minus_p20) * ystepdenominv);

	fld		%st(0)			// t0 | t0 | t1 | xstepdenominv | p00_minus_p20 |
							//  p11_minus_p21
	fmul	%st(5),%st(0)	// t0*p11_minus_p21 | t0 | t1 | xstepdenominv |
							//  p00_minus_p20 | p11_minus_p21
	fxch	%st(2)			// t1 | t0 | t0*p11_minus_p21 | xstepdenominv |
							//  p00_minus_p20 | p11_minus_p21
	fld		%st(0)			// t1 | t1 | t0 | t0*p11_minus_p21 |
							//  xstepdenominv | p00_minus_p20 | p11_minus_p21
	fmuls	p01_minus_p21	// t1*p01_minus_p21 | t1 | t0 | t0*p11_minus_p21 |
							//  xstepdenominv | p00_minus_p20 | p11_minus_p21
	fxch	%st(2)			// t0 | t1 | t1*p01_minus_p21 | t0*p11_minus_p21 |
							//  xstepdenominv | p00_minus_p20 | p11_minus_p21
	fmuls	p10_minus_p20	// t0*p10_minus_p20 | t1 | t1*p01_minus_p21 |
							//  t0*p11_minus_p21 | xstepdenominv |
							//  p00_minus_p20 | p11_minus_p21
	fxch	%st(1)			// t1 | t0*p10_minus_p20 | t1*p01_minus_p21 |
							//  t0*p11_minus_p21 | xstepdenominv |
							//  p00_minus_p20 | p11_minus_p21
	fmul	%st(5),%st(0)	// t1*p00_minus_p20 | t0*p10_minus_p20 |
							//  t1*p01_minus_p21 | t0*p11_minus_p21 |
							//  xstepdenominv | p00_minus_p20 | p11_minus_p21
	fxch	%st(2)			// t1*p01_minus_p21 | t0*p10_minus_p20 |
							//  t1*p00_minus_p20 | t0*p11_minus_p21 |
							//  xstepdenominv | p00_minus_p20 | p11_minus_p21
	fsubp	%st(0),%st(3)	// t0*p10_minus_p20 | t1*p00_minus_p20 |
							//  t1*p01_minus_p21 - t0*p11_minus_p21 |
							//  xstepdenominv | p00_minus_p20 | p11_minus_p21
	fsubrp	%st(0),%st(1)	// t1*p00_minus_p20 - t0*p10_minus_p20 |
							//  t1*p01_minus_p21 - t0*p11_minus_p21 |
							//  xstepdenominv | p00_minus_p20 | p11_minus_p21
	fld		%st(2)			// xstepdenominv |
							//  t1*p00_minus_p20 - t0*p10_minus_p20 |
							//  t1*p01_minus_p21 - t0*p11_minus_p21 |
							//  xstepdenominv | p00_minus_p20 | p11_minus_p21
	fmuls	float_minus_1	// ystepdenominv |
							//  t1*p00_minus_p20 - t0*p10_minus_p20 |
							//  t1*p01_minus_p21 - t0*p11_minus_p21 |
							//  xstepdenominv | p00_minus_p20 | p11_minus_p21
	fxch	%st(2)			// t1*p01_minus_p21 - t0*p11_minus_p21 |
							//  t1*p00_minus_p20 - t0*p10_minus_p20 |
							//  ystepdenominv | xstepdenominv | p00_minus_p20 |
							//  p11_minus_p21
	fmul	%st(3),%st(0)	// (t1*p01_minus_p21 - t0*p11_minus_p21)*
							//   xstepdenominv |
							//  t1*p00_minus_p20 - t0*p10_minus_p20 |
							//   | ystepdenominv | xstepdenominv |
							//   p00_minus_p20 | p11_minus_p21
	fxch	%st(1)			// t1*p00_minus_p20 - t0*p10_minus_p20 |
							//  (t1*p01_minus_p21 - t0*p11_minus_p21)*
							//   xstepdenominv | ystepdenominv |
							//   xstepdenominv | p00_minus_p20 | p11_minus_p21
	fmul	%st(2),%st(0)	// (t1*p00_minus_p20 - t0*p10_minus_p20)*
							//  ystepdenominv |
							//  (t1*p01_minus_p21 - t0*p11_minus_p21)*
							//  xstepdenominv | ystepdenominv |
							//  xstepdenominv | p00_minus_p20 | p11_minus_p21
	fldcw	r_ceil_cw
	fistpl	C(r_lstepy)		// r_lstepx | ystepdenominv | xstepdenominv |
							//  p00_minus_p20 | p11_minus_p21
	fistpl	C(r_lstepx)		// ystepdenominv | xstepdenominv | p00_minus_p20 |
							//  p11_minus_p21
	fldcw	r_single_cw

//	t0 = r_p0[2] - r_p2[2];
//	t1 = r_p1[2] - r_p2[2];

	fildl	C(r_p2)+8		// r_p2[2] | ystepdenominv | xstepdenominv |
							//  p00_minus_p20 | p11_minus_p21
	fildl	C(r_p0)+8		// r_p0[2] | r_p2[2] | ystepdenominv |
							//   xstepdenominv | p00_minus_p20 | p11_minus_p21
	fildl	C(r_p1)+8		// r_p1[2] | r_p0[2] | r_p2[2] | ystepdenominv |
							//  xstepdenominv | p00_minus_p20 | p11_minus_p21
	fxch	%st(2)			// r_p2[2] | r_p0[2] | r_p1[2] | ystepdenominv |
							//  xstepdenominv | p00_minus_p20 | p11_minus_p21
	fld		%st(0)			// r_p2[2] | r_p2[2] | r_p0[2] | r_p1[2] |
							//  ystepdenominv | xstepdenominv | p00_minus_p20 |
							//  p11_minus_p21
	fsubrp	%st(0),%st(2)	// r_p2[2] | t0 | r_p1[2] | ystepdenominv |
							//  xstepdenominv | p00_minus_p20 | p11_minus_p21
	fsubrp	%st(0),%st(2)	// t0 | t1 | ystepdenominv | xstepdenominv |
							//  p00_minus_p20 | p11_minus_p21

//	r_sstepx = (int)((t1 * p01_minus_p21 - t0 * p11_minus_p21) *
//			xstepdenominv);
//	r_sstepy = (int)((t1 * p00_minus_p20 - t0 * p10_minus_p20) *
//			ystepdenominv);

	fld		%st(0)			// t0 | t0 | t1 | ystepdenominv | xstepdenominv
	fmul	%st(6),%st(0)	// t0*p11_minus_p21 | t0 | t1 | ystepdenominv |
							//  xstepdenominv | p00_minus_p20 | p11_minus_p21
	fxch	%st(2)			// t1 | t0 | t0*p11_minus_p21 | ystepdenominv |
							//  xstepdenominv | p00_minus_p20 | p11_minus_p21
	fld		%st(0)			// t1 | t1 | t0 | t0*p11_minus_p21 |
							//  ystepdenominv | xstepdenominv | p00_minus_p20 |
							//  p11_minus_p21
	fmuls	p01_minus_p21	// t1*p01_minus_p21 | t1 | t0 | t0*p11_minus_p21 |
							//  ystepdenominv | xstepdenominv | p00_minus_p20 |
							//  p11_minus_p21
	fxch	%st(2)			// t0 | t1 | t1*p01_minus_p21 | t0*p11_minus_p21 |
							//  ystepdenominv | xstepdenominv | p00_minus_p20 |
							//  p11_minus_p21
	fmuls	p10_minus_p20	// t0*p10_minus_p20 | t1 | t1*p01_minus_p21 |
							//  t0*p11_minus_p21 | ystepdenominv |
							//  xstepdenominv | p00_minus_p20 | p11_minus_p21
	fxch	%st(1)			// t1 | t0*p10_minus_p20 | t1*p01_minus_p21 |
							//  t0*p11_minus_p21 | ystepdenominv |
							//  xstepdenominv | p00_minus_p20 | p11_minus_p21
	fmul	%st(6),%st(0)	// t1*p00_minus_p20 | t0*p10_minus_p20 |
							//  t1*p01_minus_p21 | t0*p11_minus_p21 |
							//  ystepdenominv | xstepdenominv | p00_minus_p20 |
							//  p11_minus_p21
	fxch	%st(2)			// t1*p01_minus_p21 | t0*p10_minus_p20 |
							//  t1*p00_minus_p20 | t0*p11_minus_p21 |
							//  ystepdenominv | xstepdenominv | p00_minus_p20 |
							//  p11_minus_p21
	fsubp	%st(0),%st(3)	// t0*p10_minus_p20 | t1*p00_minus_p20 |
							//  t1*p01_minus_p21 - t0*p11_minus_p21 |
							//  ystepdenominv | xstepdenominv | p00_minus_p20 |
							//  p11_minus_p21
	fsubrp	%st(0),%st(1)	// t1*p00_minus_p20 - t0*p10_minus_p20 |
							//  t1*p01_minus_p21 - t0*p11_minus_p21 |
							//  ystepdenominv | xstepdenominv | p00_minus_p20 |
							//  p11_minus_p21
	fmul	%st(2),%st(0)	// (t1*p00_minus_p20 - t0*p10_minus_p20)*
							//   ystepdenominv |
							//  t1*p01_minus_p21 - t0*p11_minus_p21 |
							//  ystepdenominv | xstepdenominv | p00_minus_p20 |
							//  p11_minus_p21
	fxch	%st(1)			// t1*p01_minus_p21 - t0*p11_minus_p21 |
							//  (t1*p00_minus_p20 - t0*p10_minus_p20)*
							//   ystepdenominv | ystepdenominv |
							//  xstepdenominv | p00_minus_p20 | p11_minus_p21
	fmul	%st(3),%st(0)	// (t1*p01_minus_p21 - t0*p11_minus_p21)*
							//  xstepdenominv |
							//  (t1*p00_minus_p20 - t0*p10_minus_p20)*
							//  ystepdenominv | ystepdenominv |
							//  xstepdenominv | p00_minus_p20 | p11_minus_p21
	fxch	%st(1)			// (t1*p00_minus_p20 - t0*p10_minus_p20)*
							//  ystepdenominv |
							//  (t1*p01_minus_p21 - t0*p11_minus_p21)*
							//  xstepdenominv | ystepdenominv |
							//  xstepdenominv | p00_minus_p20 | p11_minus_p21
	fistpl	C(r_sstepy)		// r_sstepx | ystepdenominv | xstepdenominv |
							//  p00_minus_p20 | p11_minus_p21
	fistpl	C(r_sstepx)		// ystepdenominv | xstepdenominv | p00_minus_p20 |
							//  p11_minus_p21

//	t0 = r_p0[3] - r_p2[3];
//	t1 = r_p1[3] - r_p2[3];

	fildl	C(r_p2)+12		// r_p2[3] | ystepdenominv | xstepdenominv |
							//  p00_minus_p20 | p11_minus_p21
	fildl	C(r_p0)+12		// r_p0[3] | r_p2[3] | ystepdenominv |
							//  xstepdenominv | p00_minus_p20 | p11_minus_p21
	fildl	C(r_p1)+12		// r_p1[3] | r_p0[3] | r_p2[3] | ystepdenominv |
							//  xstepdenominv | p00_minus_p20 | p11_minus_p21
	fxch	%st(2)			// r_p2[3] | r_p0[3] | r_p1[3] | ystepdenominv |
							//  xstepdenominv | p00_minus_p20 | p11_minus_p21
	fld		%st(0)			// r_p2[3] | r_p2[3] | r_p0[3] | r_p1[3] |
							//  ystepdenominv | xstepdenominv | p00_minus_p20 |
							//  p11_minus_p21
	fsubrp	%st(0),%st(2)	// r_p2[3] | t0 | r_p1[3] | ystepdenominv |
							//  xstepdenominv | p00_minus_p20 | p11_minus_p21
	fsubrp	%st(0),%st(2)	// t0 | t1 | ystepdenominv | xstepdenominv |
							//  p00_minus_p20 | p11_minus_p21

//	r_tstepx = (int)((t1 * p01_minus_p21 - t0 * p11_minus_p21) *
//			xstepdenominv);
//	r_tstepy = (int)((t1 * p00_minus_p20 - t0 * p10_minus_p20) *
//			ystepdenominv);

	fld		%st(0)			// t0 | t0 | t1 | ystepdenominv | xstepdenominv |
							//  p00_minus_p20 | p11_minus_p21
	fmul	%st(6),%st(0)	// t0*p11_minus_p21 | t0 | t1 | ystepdenominv |
							//  xstepdenominv | p00_minus_p20 | p11_minus_p21
	fxch	%st(2)			// t1 | t0 | t0*p11_minus_p21 | ystepdenominv |
							//  xstepdenominv | p00_minus_p20 | p11_minus_p21
	fld		%st(0)			// t1 | t1 | t0 | t0*p11_minus_p21 |
							//  ystepdenominv | xstepdenominv | p00_minus_p20 |
							//  p11_minus_p21
	fmuls	p01_minus_p21	// t1*p01_minus_p21 | t1 | t0 | t0*p11_minus_p21 |
							//  ystepdenominv | xstepdenominv | p00_minus_p20 |
							//  p11_minus_p21
	fxch	%st(2)			// t0 | t1 | t1*p01_minus_p21 | t0*p11_minus_p21 |
							//  ystepdenominv | xstepdenominv | p00_minus_p20 |
							//  p11_minus_p21
	fmuls	p10_minus_p20	// t0*p10_minus_p20 | t1 | t1*p01_minus_p21 |
							//  t0*p11_minus_p21 | ystepdenominv |
							//  xstepdenominv | p00_minus_p20 | p11_minus_p21
	fxch	%st(1)			// t1 | t0*p10_minus_p20 | t1*p01_minus_p21 |
							//  t0*p11_minus_p21 | ystepdenominv |
							//  xstepdenominv | p00_minus_p20 | p11_minus_p21
	fmul	%st(6),%st(0)	// t1*p00_minus_p20 | t0*p10_minus_p20 |
							//  t1*p01_minus_p21 | t0*p11_minus_p21 |
							//  ystepdenominv | xstepdenominv | p00_minus_p20 |
							//  p11_minus_p21
	fxch	%st(2)			// t1*p01_minus_p21 | t0*p10_minus_p20 |
							//  t1*p00_minus_p20 | t0*p11_minus_p21 |
							//  ystepdenominv | xstepdenominv | p00_minus_p20 |
							//  p11_minus_p21
	fsubp	%st(0),%st(3)	// t0*p10_minus_p20 | t1*p00_minus_p20 |
							//  t1*p01_minus_p21 - t0*p11_minus_p21 |
							//  ystepdenominv | xstepdenominv | p00_minus_p20 |
							//  p11_minus_p21
	fsubrp	%st(0),%st(1)	// t1*p00_minus_p20 - t0*p10_minus_p20 |
							//  t1*p01_minus_p21 - t0*p11_minus_p21 |
							//  ystepdenominv | xstepdenominv | p00_minus_p20 |
							//  p11_minus_p21
	fmul	%st(2),%st(0)	// (t1*p00_minus_p20 - t0*p10_minus_p20)*
							//   ystepdenominv |
							//  t1*p01_minus_p21 - t0*p11_minus_p21 |
							//  ystepdenominv | xstepdenominv | p00_minus_p20 |
							//  p11_minus_p21
	fxch	%st(1)			// t1*p01_minus_p21 - t0*p11_minus_p21 |
							//  (t1*p00_minus_p20 - t0*p10_minus_p20)*
							//  ystepdenominv | ystepdenominv |
							//  xstepdenominv | p00_minus_p20 | p11_minus_p21
	fmul	%st(3),%st(0)	// (t1*p01_minus_p21 - t0*p11_minus_p21)*
							//  xstepdenominv |
							//  (t1*p00_minus_p20 - t0*p10_minus_p20)*
							//  ystepdenominv | ystepdenominv |
							//  xstepdenominv | p00_minus_p20 | p11_minus_p21
	fxch	%st(1)			// (t1*p00_minus_p20 - t0*p10_minus_p20)*
							//  ystepdenominv |
							//  (t1*p01_minus_p21 - t0*p11_minus_p21)*
							//  xstepdenominv | ystepdenominv |
							//  xstepdenominv | p00_minus_p20 | p11_minus_p21
	fistpl	C(r_tstepy)		// r_tstepx | ystepdenominv | xstepdenominv |
							//  p00_minus_p20 | p11_minus_p21
	fistpl	C(r_tstepx)		// ystepdenominv | xstepdenominv | p00_minus_p20 |
							//  p11_minus_p21

//	t0 = r_p0[5] - r_p2[5];
//	t1 = r_p1[5] - r_p2[5];

	fildl	C(r_p2)+20		// r_p2[5] | ystepdenominv | xstepdenominv |
							//  p00_minus_p20 | p11_minus_p21
	fildl	C(r_p0)+20		// r_p0[5] | r_p2[5] | ystepdenominv |
							//  xstepdenominv | p00_minus_p20 | p11_minus_p21
	fildl	C(r_p1)+20		// r_p1[5] | r_p0[5] | r_p2[5] | ystepdenominv |
							//  xstepdenominv | p00_minus_p20 | p11_minus_p21
	fxch	%st(2)			// r_p2[5] | r_p0[5] | r_p1[5] | ystepdenominv |
							//  xstepdenominv | p00_minus_p20 | p11_minus_p21
	fld		%st(0)			// r_p2[5] | r_p2[5] | r_p0[5] | r_p1[5] |
							//  ystepdenominv | xstepdenominv | p00_minus_p20 |
							//  p11_minus_p21
	fsubrp	%st(0),%st(2)	// r_p2[5] | t0 | r_p1[5] | ystepdenominv |
							//  xstepdenominv | p00_minus_p20 | p11_minus_p21
	fsubrp	%st(0),%st(2)	// t0 | t1 | ystepdenominv | xstepdenominv |
							//  p00_minus_p20 | p11_minus_p21

//	r_zistepx = (int)((t1 * p01_minus_p21 - t0 * p11_minus_p21) *
//			xstepdenominv);
//	r_zistepy = (int)((t1 * p00_minus_p20 - t0 * p10_minus_p20) *
//			ystepdenominv);

	fld		%st(0)			// t0 | t0 | t1 | ystepdenominv | xstepdenominv |
							//  p00_minus_p20 | p11_minus_p21
	fmulp	%st(0),%st(6)	// t0 | t1 | ystepdenominv | xstepdenominv |
							//  p00_minus_p20 | t0*p11_minus_p21
	fxch	%st(1)			// t1 | t0 | ystepdenominv | xstepdenominv |
							//  p00_minus_p20 | t0*p11_minus_p21
	fld		%st(0)			// t1 | t1 | t0 | ystepdenominv | xstepdenominv |
							//  p00_minus_p20 | t0*p11_minus_p21
	fmuls	p01_minus_p21	// t1*p01_minus_p21 | t1 | t0 | ystepdenominv |
							//  xstepdenominv | p00_minus_p20 |
							//  t0*p11_minus_p21
	fxch	%st(2)			// t0 | t1 | t1*p01_minus_p21 | ystepdenominv |
							//  xstepdenominv | p00_minus_p20 |
							//  t0*p11_minus_p21
	fmuls	p10_minus_p20	// t0*p10_minus_p20 | t1 | t1*p01_minus_p21 |
							//  ystepdenominv | xstepdenominv | p00_minus_p20 |
							//  t0*p11_minus_p21
	fxch	%st(1)			// t1 | t0*p10_minus_p20 | t1*p01_minus_p21 |
							//  ystepdenominv | xstepdenominv | p00_minus_p20 |
							//  t0*p11_minus_p21
	fmulp	%st(0),%st(5)	// t0*p10_minus_p20 | t1*p01_minus_p21 |
							//  ystepdenominv | xstepdenominv |
							//  t1*p00_minus_p20 | t0*p11_minus_p21
	fxch	%st(5)			// t0*p11_minus_p21 | t1*p01_minus_p21 |
							//  ystepdenominv | xstepdenominv |
							//  t1*p00_minus_p20 | t0*p10_minus_p20
	fsubrp	%st(0),%st(1)	// t1*p01_minus_p21 - t0*p11_minus_p21 |
							//  ystepdenominv | xstepdenominv |
							//  t1*p00_minus_p20 | t0*p10_minus_p20
	fxch	%st(3)			// t1*p00_minus_p20 | ystepdenominv |
							//  xstepdenominv |
							//  t1*p01_minus_p21 - t0*p11_minus_p21 |
							//  t0*p10_minus_p20
	fsubp	%st(0),%st(4)	// ystepdenominv | xstepdenominv |
							//  t1*p01_minus_p21 - t0*p11_minus_p21 |
							//  t1*p00_minus_p20 - t0*p10_minus_p20
	fxch	%st(1)			// xstepdenominv | ystepdenominv |
							//  t1*p01_minus_p21 - t0*p11_minus_p21 |
							//  t1*p00_minus_p20 - t0*p10_minus_p20
	fmulp	%st(0),%st(2)	// ystepdenominv |
							//  (t1*p01_minus_p21 - t0*p11_minus_p21) *
							//  xstepdenominv |
							//  t1*p00_minus_p20 - t0*p10_minus_p20
	fmulp	%st(0),%st(2)	// (t1*p01_minus_p21 - t0*p11_minus_p21) *
							//  xstepdenominv |
							//  (t1*p00_minus_p20 - t0*p10_minus_p20) *
							//  ystepdenominv
	fistpl	C(r_zistepx)	// (t1*p00_minus_p20 - t0*p10_minus_p20) *
							//  ystepdenominv
	fistpl	C(r_zistepy)

//	a_sstepxfrac = r_sstepx << 16;
//	a_tstepxfrac = r_tstepx << 16;
//
//	a_ststepxwhole = r_affinetridesc.skinwidth * (r_tstepx >> 16) +
//			(r_sstepx >> 16);

	movl	C(r_sstepx),%eax
	movl	C(r_tstepx),%edx
	shll	$16,%eax
	shll	$16,%edx
	movl	%eax,C(a_sstepxfrac)
	movl	%edx,C(a_tstepxfrac)

	movl	C(r_sstepx),%ecx
	movl	C(r_tstepx),%eax
	sarl	$16,%ecx
	sarl	$16,%eax
	imull	skinwidth(%esp)
	addl	%ecx,%eax
	movl	%eax,C(a_ststepxwhole)

	ret


//----------------------------------------------------------------------
// recursive subdivision affine triangle drawing code
//
// not C-callable because of stdcall return
//----------------------------------------------------------------------

#define lp1	4+16
#define lp2	8+16
#define lp3	12+16

.globl C(D_PolysetRecursiveTriangle)
C(D_PolysetRecursiveTriangle):
	pushl	%ebp				// preserve caller stack frame pointer
	pushl	%esi				// preserve register variables
	pushl	%edi
	pushl	%ebx

//	int		*temp;
//	int		d;
//	int		new[6];
//	int		i;
//	int		z;
//	short	*zbuf;
	movl	lp2(%esp),%esi
	movl	lp1(%esp),%ebx
	movl	lp3(%esp),%edi

//	d = lp2[0] - lp1[0];
//	if (d < -1 || d > 1)
//		goto split;
	movl	0(%esi),%eax

	movl	0(%ebx),%edx
	movl	4(%esi),%ebp

	subl	%edx,%eax
	movl	4(%ebx),%ecx

	subl	%ecx,%ebp
	incl	%eax

	cmpl	$2,%eax
	ja		LSplit

//	d = lp2[1] - lp1[1];
//	if (d < -1 || d > 1)
//		goto split;
	movl	0(%edi),%eax
	incl	%ebp

	cmpl	$2,%ebp
	ja		LSplit

//	d = lp3[0] - lp2[0];
//	if (d < -1 || d > 1)
//		goto split2;
	movl	0(%esi),%edx
	movl	4(%edi),%ebp

	subl	%edx,%eax
	movl	4(%esi),%ecx

	subl	%ecx,%ebp
	incl	%eax

	cmpl	$2,%eax
	ja		LSplit2

//	d = lp3[1] - lp2[1];
//	if (d < -1 || d > 1)
//		goto split2;
	movl	0(%ebx),%eax
	incl	%ebp

	cmpl	$2,%ebp
	ja		LSplit2

//	d = lp1[0] - lp3[0];
//	if (d < -1 || d > 1)
//		goto split3;
	movl	0(%edi),%edx
	movl	4(%ebx),%ebp

	subl	%edx,%eax
	movl	4(%edi),%ecx

	subl	%ecx,%ebp
	incl	%eax

	incl	%ebp
	movl	%ebx,%edx

	cmpl	$2,%eax
	ja		LSplit3

//	d = lp1[1] - lp3[1];
//	if (d < -1 || d > 1)
//	{
//split3:
//		temp = lp1;
//		lp3 = lp2;
//		lp1 = lp3;
//		lp2 = temp;
//		goto split;
//	}
//
//	return;			// entire tri is filled
//
	cmpl	$2,%ebp
	jna		LDone

LSplit3:
	movl	%edi,%ebx
	movl	%esi,%edi
	movl	%edx,%esi
	jmp		LSplit

//split2:
LSplit2:

//	temp = lp1;
//	lp1 = lp2;
//	lp2 = lp3;
//	lp3 = temp;
	movl	%ebx,%eax
	movl	%esi,%ebx
	movl	%edi,%esi
	movl	%eax,%edi

//split:
LSplit:

	subl	$24,%esp		// allocate space for a new vertex

//// split this edge
//	new[0] = (lp1[0] + lp2[0]) >> 1;
//	new[1] = (lp1[1] + lp2[1]) >> 1;
//	new[2] = (lp1[2] + lp2[2]) >> 1;
//	new[3] = (lp1[3] + lp2[3]) >> 1;
//	new[5] = (lp1[5] + lp2[5]) >> 1;
	movl	8(%ebx),%eax

	movl	8(%esi),%edx
	movl	12(%ebx),%ecx

	addl	%edx,%eax
	movl	12(%esi),%edx

	sarl	$1,%eax
	addl	%edx,%ecx

	movl	%eax,8(%esp)
	movl	20(%ebx),%eax

	sarl	$1,%ecx
	movl	20(%esi),%edx

	movl	%ecx,12(%esp)
	addl	%edx,%eax

	movl	0(%ebx),%ecx
	movl	0(%esi),%edx

	sarl	$1,%eax
	addl	%ecx,%edx

	movl	%eax,20(%esp)
	movl	4(%ebx),%eax

	sarl	$1,%edx
	movl	4(%esi),%ebp

	movl	%edx,0(%esp)
	addl	%eax,%ebp

	sarl	$1,%ebp
	movl	%ebp,4(%esp)

//// draw the point if splitting a leading edge
//	if (lp2[1] > lp1[1])
//		goto nodraw;
	cmpl	%eax,4(%esi)
	jg		LNoDraw

//	if ((lp2[1] == lp1[1]) && (lp2[0] < lp1[0]))
//		goto nodraw;
	movl	0(%esi),%edx
	jnz		LDraw

	cmpl	%ecx,%edx
	jl		LNoDraw

LDraw:

// z = new[5] >> 16;
	movl	20(%esp),%edx
	movl	4(%esp),%ecx

	sarl	$16,%edx
	movl	0(%esp),%ebp

//	zbuf = zspantable[new[1]] + new[0];
	movl	C(zspantable)(,%ecx,4),%eax

//	if (z >= *zbuf)
//	{
	cmpw	(%eax,%ebp,2),%dx
	jnge	LNoDraw

//		int		pix;
//		
//		*zbuf = z;
	movw	%dx,(%eax,%ebp,2)

//		pix = d_pcolormap[skintable[new[3]>>16][new[2]>>16]];
	movl	12(%esp),%eax

	sarl	$16,%eax
	movl	8(%esp),%edx

	sarl	$16,%edx
	subl	%ecx,%ecx

	movl	C(skintable)(,%eax,4),%eax
	movl	4(%esp),%ebp

	movb	(%eax,%edx,),%cl
	movl	C(d_pcolormap),%edx

	movb	(%edx,%ecx,),%dl
	movl	0(%esp),%ecx

//		d_viewbuffer[d_scantable[new[1]] + new[0]] = pix;
	movl	C(d_scantable)(,%ebp,4),%eax
	addl	%eax,%ecx
	movl	C(d_viewbuffer),%eax
	movb	%dl,(%eax,%ecx,1)

//	}
//
//nodraw:
LNoDraw:

//// recursively continue
//	D_PolysetRecursiveTriangle (lp3, lp1, new);
	pushl	%esp
	pushl	%ebx
	pushl	%edi
	call	C(D_PolysetRecursiveTriangle)

//	D_PolysetRecursiveTriangle (lp3, new, lp2);
	movl	%esp,%ebx
	pushl	%esi
	pushl	%ebx
	pushl	%edi
	call	C(D_PolysetRecursiveTriangle)
	addl	$24,%esp

LDone:
	popl	%ebx				// restore register variables
	popl	%edi
	popl	%esi
	popl	%ebp				// restore caller stack frame pointer
	ret		$12


//----------------------------------------------------------------------
// 8-bpp horizontal span drawing code for affine polygons, with smooth
// shading and no transparency
//----------------------------------------------------------------------

#define pspans	4+8

.globl C(D_PolysetAff8Start)
C(D_PolysetAff8Start):

.globl C(D_PolysetDrawSpans8)
C(D_PolysetDrawSpans8):
	pushl	%esi				// preserve register variables
	pushl	%ebx

	movl	pspans(%esp),%esi	// point to the first span descriptor
	movl	C(r_zistepx),%ecx

	pushl	%ebp				// preserve caller's stack frame
	pushl	%edi

	rorl	$16,%ecx			// put high 16 bits of 1/z step in low word
	movl	spanpackage_t_count(%esi),%edx

	movl	%ecx,lzistepx

LSpanLoop:

//		lcount = d_aspancount - pspanpackage->count;
//
//		errorterm += erroradjustup;
//		if (errorterm >= 0)
//		{
//			d_aspancount += d_countextrastep;
//			errorterm -= erroradjustdown;
//		}
//		else
//		{
//			d_aspancount += ubasestep;
//		}
	movl	C(d_aspancount),%eax
	subl	%edx,%eax

	movl	C(erroradjustup),%edx
	movl	C(errorterm),%ebx
	addl	%edx,%ebx
	js		LNoTurnover

	movl	C(erroradjustdown),%edx
	movl	C(d_countextrastep),%edi
	subl	%edx,%ebx
	movl	C(d_aspancount),%ebp
	movl	%ebx,C(errorterm)
	addl	%edi,%ebp
	movl	%ebp,C(d_aspancount)
	jmp		LRightEdgeStepped

LNoTurnover:
	movl	C(d_aspancount),%edi
	movl	C(ubasestep),%edx
	movl	%ebx,C(errorterm)
	addl	%edx,%edi
	movl	%edi,C(d_aspancount)

LRightEdgeStepped:
	cmpl	$1,%eax

	jl		LNextSpan
	jz		LExactlyOneLong

//
// set up advancetable
//
	movl	C(a_ststepxwhole),%ecx
	movl	C(r_affinetridesc)+atd_skinwidth,%edx

	movl	%ecx,advancetable+4	// advance base in t
	addl	%edx,%ecx

	movl	%ecx,advancetable	// advance extra in t
	movl	C(a_tstepxfrac),%ecx

	movw	C(r_lstepx),%cx
	movl	%eax,%edx			// count

	movl	%ecx,tstep
	addl	$7,%edx

	shrl	$3,%edx				// count of full and partial loops
	movl	spanpackage_t_sfrac(%esi),%ebx

	movw	%dx,%bx
	movl	spanpackage_t_pz(%esi),%ecx

	negl	%eax

	movl	spanpackage_t_pdest(%esi),%edi
	andl	$7,%eax		// 0->0, 1->7, 2->6, ... , 7->1

	subl	%eax,%edi	// compensate for hardwired offsets
	subl	%eax,%ecx

	subl	%eax,%ecx
	movl	spanpackage_t_tfrac(%esi),%edx

	movw	spanpackage_t_light(%esi),%dx
	movl	spanpackage_t_zi(%esi),%ebp

	rorl	$16,%ebp	// put high 16 bits of 1/z in low word
	pushl	%esi

	movl	spanpackage_t_ptex(%esi),%esi
	jmp		*aff8entryvec_table(,%eax,4)

// %bx = count of full and partial loops
// %ebx high word = sfrac
// %ecx = pz
// %dx = light
// %edx high word = tfrac
// %esi = ptex
// %edi = pdest
// %ebp = 1/z
// tstep low word = C(r_lstepx)
// tstep high word = C(a_tstepxfrac)
// C(a_sstepxfrac) low word = 0
// C(a_sstepxfrac) high word = C(a_sstepxfrac)

LDrawLoop:

// FIXME: do we need to clamp light? We may need at least a buffer bit to
// keep it from poking into tfrac and causing problems

LDraw8:
	cmpw	(%ecx),%bp
	jl		Lp1
	xorl	%eax,%eax
	movb	%dh,%ah
	movb	(%esi),%al
	movw	%bp,(%ecx)
	movb	0x12345678(%eax),%al
LPatch8:
	movb	%al,(%edi)
Lp1:
	addl	tstep,%edx
	sbbl	%eax,%eax
	addl	lzistepx,%ebp
	adcl	$0,%ebp
	addl	C(a_sstepxfrac),%ebx
	adcl	advancetable+4(,%eax,4),%esi

LDraw7:
	cmpw	2(%ecx),%bp
	jl		Lp2
	xorl	%eax,%eax
	movb	%dh,%ah
	movb	(%esi),%al
	movw	%bp,2(%ecx)
	movb	0x12345678(%eax),%al
LPatch7:
	movb	%al,1(%edi)
Lp2:
	addl	tstep,%edx
	sbbl	%eax,%eax
	addl	lzistepx,%ebp
	adcl	$0,%ebp
	addl	C(a_sstepxfrac),%ebx
	adcl	advancetable+4(,%eax,4),%esi

LDraw6:
	cmpw	4(%ecx),%bp
	jl		Lp3
	xorl	%eax,%eax
	movb	%dh,%ah
	movb	(%esi),%al
	movw	%bp,4(%ecx)
	movb	0x12345678(%eax),%al
LPatch6:
	movb	%al,2(%edi)
Lp3:
	addl	tstep,%edx
	sbbl	%eax,%eax
	addl	lzistepx,%ebp
	adcl	$0,%ebp
	addl	C(a_sstepxfrac),%ebx
	adcl	advancetable+4(,%eax,4),%esi

LDraw5:
	cmpw	6(%ecx),%bp
	jl		Lp4
	xorl	%eax,%eax
	movb	%dh,%ah
	movb	(%esi),%al
	movw	%bp,6(%ecx)
	movb	0x12345678(%eax),%al
LPatch5:
	movb	%al,3(%edi)
Lp4:
	addl	tstep,%edx
	sbbl	%eax,%eax
	addl	lzistepx,%ebp
	adcl	$0,%ebp
	addl	C(a_sstepxfrac),%ebx
	adcl	advancetable+4(,%eax,4),%esi

LDraw4:
	cmpw	8(%ecx),%bp
	jl		Lp5
	xorl	%eax,%eax
	movb	%dh,%ah
	movb	(%esi),%al
	movw	%bp,8(%ecx)
	movb	0x12345678(%eax),%al
LPatch4:
	movb	%al,4(%edi)
Lp5:
	addl	tstep,%edx
	sbbl	%eax,%eax
	addl	lzistepx,%ebp
	adcl	$0,%ebp
	addl	C(a_sstepxfrac),%ebx
	adcl	advancetable+4(,%eax,4),%esi

LDraw3:
	cmpw	10(%ecx),%bp
	jl		Lp6
	xorl	%eax,%eax
	movb	%dh,%ah
	movb	(%esi),%al
	movw	%bp,10(%ecx)
	movb	0x12345678(%eax),%al
LPatch3:
	movb	%al,5(%edi)
Lp6:
	addl	tstep,%edx
	sbbl	%eax,%eax
	addl	lzistepx,%ebp
	adcl	$0,%ebp
	addl	C(a_sstepxfrac),%ebx
	adcl	advancetable+4(,%eax,4),%esi

LDraw2:
	cmpw	12(%ecx),%bp
	jl		Lp7
	xorl	%eax,%eax
	movb	%dh,%ah
	movb	(%esi),%al
	movw	%bp,12(%ecx)
	movb	0x12345678(%eax),%al
LPatch2:
	movb	%al,6(%edi)
Lp7:
	addl	tstep,%edx
	sbbl	%eax,%eax
	addl	lzistepx,%ebp
	adcl	$0,%ebp
	addl	C(a_sstepxfrac),%ebx
	adcl	advancetable+4(,%eax,4),%esi

LDraw1:
	cmpw	14(%ecx),%bp
	jl		Lp8
	xorl	%eax,%eax
	movb	%dh,%ah
	movb	(%esi),%al
	movw	%bp,14(%ecx)
	movb	0x12345678(%eax),%al
LPatch1:
	movb	%al,7(%edi)
Lp8:
	addl	tstep,%edx
	sbbl	%eax,%eax
	addl	lzistepx,%ebp
	adcl	$0,%ebp
	addl	C(a_sstepxfrac),%ebx
	adcl	advancetable+4(,%eax,4),%esi

	addl	$8,%edi
	addl	$16,%ecx

	decw	%bx
	jnz		LDrawLoop

	popl	%esi				// restore spans pointer
LNextSpan:
	addl	$(spanpackage_t_size),%esi	// point to next span
LNextSpanESISet:
	movl	spanpackage_t_count(%esi),%edx
	cmpl	$-999999,%edx		// any more spans?
	jnz		LSpanLoop			// yes

	popl	%edi
	popl	%ebp				// restore the caller's stack frame
	popl	%ebx				// restore register variables
	popl	%esi
	ret


// draw a one-long span

LExactlyOneLong:

	movl	spanpackage_t_pz(%esi),%ecx
	movl	spanpackage_t_zi(%esi),%ebp

	rorl	$16,%ebp	// put high 16 bits of 1/z in low word
	movl	spanpackage_t_ptex(%esi),%ebx

	cmpw	(%ecx),%bp
	jl		LNextSpan
	xorl	%eax,%eax
	movl	spanpackage_t_pdest(%esi),%edi
	movb	spanpackage_t_light+1(%esi),%ah
	addl	$(spanpackage_t_size),%esi	// point to next span
	movb	(%ebx),%al
	movw	%bp,(%ecx)
	movb	0x12345678(%eax),%al
LPatch9:
	movb	%al,(%edi)

	jmp		LNextSpanESISet

.globl C(D_PolysetAff8End)
C(D_PolysetAff8End):


#define pcolormap		4

.globl C(D_Aff8Patch)
C(D_Aff8Patch):
	movl	pcolormap(%esp),%eax
	movl	%eax,LPatch1-4
	movl	%eax,LPatch2-4
	movl	%eax,LPatch3-4
	movl	%eax,LPatch4-4
	movl	%eax,LPatch5-4
	movl	%eax,LPatch6-4
	movl	%eax,LPatch7-4
	movl	%eax,LPatch8-4
	movl	%eax,LPatch9-4

	ret


//----------------------------------------------------------------------
// Alias model polygon dispatching code, combined with subdivided affine
// triangle drawing code
//----------------------------------------------------------------------

.globl C(D_PolysetDraw)
C(D_PolysetDraw):

//	spanpackage_t	spans[DPS_MAXSPANS + 1 +
//			((CACHE_SIZE - 1) / sizeof(spanpackage_t)) + 1];
//						// one extra because of cache line pretouching
//
//	a_spans = (spanpackage_t *)
//			(((long)&spans[0] + CACHE_SIZE - 1) & ~(CACHE_SIZE - 1));
	subl	$(SPAN_SIZE),%esp
	movl	%esp,%eax
	addl	$(CACHE_SIZE - 1),%eax
	andl	$(~(CACHE_SIZE - 1)),%eax
	movl	%eax,C(a_spans)

//	if (r_affinetridesc.drawtype)
//		D_DrawSubdiv ();
//	else
//		D_DrawNonSubdiv ();
	movl	C(r_affinetridesc)+atd_drawtype,%eax
	testl	%eax,%eax
	jz		C(D_DrawNonSubdiv)

	pushl	%ebp				// preserve caller stack frame pointer

//	lnumtriangles = r_affinetridesc.numtriangles;
	movl	C(r_affinetridesc)+atd_numtriangles,%ebp

	pushl	%esi				// preserve register variables
	shll	$4,%ebp

	pushl	%ebx
//	ptri = r_affinetridesc.ptriangles;
	movl	C(r_affinetridesc)+atd_ptriangles,%ebx

	pushl	%edi

//	mtriangle_t		*ptri;
//	finalvert_t		*pfv, *index0, *index1, *index2;
//	int				i;
//	int				lnumtriangles;
//	int				s0, s1, s2;

//	pfv = r_affinetridesc.pfinalverts;
	movl	C(r_affinetridesc)+atd_pfinalverts,%edi

//	for (i=0 ; i<lnumtriangles ; i++)
//	{

Llooptop:

//		index0 = pfv + ptri[i].vertindex[0];
//		index1 = pfv + ptri[i].vertindex[1];
//		index2 = pfv + ptri[i].vertindex[2];
	movl	mtri_vertindex-16+0(%ebx,%ebp,),%ecx
	movl	mtri_vertindex-16+4(%ebx,%ebp,),%esi

	shll	$(fv_shift),%ecx
	movl	mtri_vertindex-16+8(%ebx,%ebp,),%edx

	shll	$(fv_shift),%esi
	addl	%edi,%ecx

	shll	$(fv_shift),%edx
	addl	%edi,%esi

	addl	%edi,%edx

//		if (((index0->v[1]-index1->v[1]) *
//				(index0->v[0]-index2->v[0]) -
//				(index0->v[0]-index1->v[0])*(index0->v[1]-index2->v[1])) >= 0)
//		{
//			continue;
//		}
//
//		d_pcolormap = &((byte *)acolormap)[index0->v[4] & 0xFF00];
	fildl	fv_v+4(%ecx)	// i0v1
	fildl	fv_v+4(%esi)	// i1v1 | i0v1
	fildl	fv_v+0(%ecx)	// i0v0 | i1v1 | i0v1
	fildl	fv_v+0(%edx)	// i2v0 | i0v0 | i1v1 | i0v1
	fxch	%st(2)			// i1v1 | i0v0 | i2v0 | i0v1
	fsubr	%st(3),%st(0)	// i0v1-i1v1 | i0v0 | i2v0 | i0v1
	fildl	fv_v+0(%esi)	// i1v0 | i0v1-i1v1 | i0v0 | i2v0 | i0v1
	fxch	%st(2)			// i0v0 | i0v1-i1v1 | i1v0 | i2v0 | i0v1
	fsub	%st(0),%st(3)	// i0v0 | i0v1-i1v1 | i1v0 | i0v0-i2v0 | i0v1
	fildl	fv_v+4(%edx)	// i2v1 | i0v0 | i0v1-i1v1 | i1v0 | i0v0-i2v0| i0v1
	fxch	%st(1)			// i0v0 | i2v1 | i0v1-i1v1 | i1v0 | i0v0-i2v0| i0v1
	fsubp	%st(0),%st(3)	// i2v1 | i0v1-i1v1 | i0v0-i1v0 | i0v0-i2v0 | i0v1
	fxch	%st(1)			// i0v1-i1v1 | i2v1 | i0v0-i1v0 | i0v0-i2v0 | i0v1
	fmulp	%st(0),%st(3)	// i2v1 | i0v0-i1v0 | i0v1-i1v1*i0v0-i2v0 | i0v1
	fsubrp	%st(0),%st(3)	// i0v0-i1v0 | i0v1-i1v1*i0v0-i2v0 | i0v1-i2v1
	movl	fv_v+16(%ecx),%eax
	andl	$0xFF00,%eax
	fmulp	%st(0),%st(2)	// i0v1-i1v1*i0v0-i2v0 | i0v0-i1v0*i0v1-i2v1
	addl	C(acolormap),%eax
	fsubp	%st(0),%st(1)	// (i0v1-i1v1)*(i0v0-i2v0)-(i0v0-i1v0)*(i0v1-i2v1)
	movl	%eax,C(d_pcolormap)
	fstps	Ltemp
	movl	Ltemp,%eax
	subl	$0x80000001,%eax
	jc		Lskip

//		if (ptri[i].facesfront)
//		{
//			D_PolysetRecursiveTriangle(index0->v, index1->v, index2->v);
	movl	mtri_facesfront-16(%ebx,%ebp,),%eax
	testl	%eax,%eax
	jz		Lfacesback

	pushl	%edx
	pushl	%esi
	pushl	%ecx
	call	C(D_PolysetRecursiveTriangle)

	subl	$16,%ebp
	jnz		Llooptop
	jmp		Ldone2

//		}
//		else
//		{
Lfacesback:

//			s0 = index0->v[2];
//			s1 = index1->v[2];
//			s2 = index2->v[2];
	movl	fv_v+8(%ecx),%eax
	pushl	%eax
	movl	fv_v+8(%esi),%eax
	pushl	%eax
	movl	fv_v+8(%edx),%eax
	pushl	%eax
	pushl	%ecx
	pushl	%edx

//			if (index0->flags & ALIAS_ONSEAM)
//				index0->v[2] += r_affinetridesc.seamfixupX16;
	movl	C(r_affinetridesc)+atd_seamfixupX16,%eax
	testl	$(ALIAS_ONSEAM),fv_flags(%ecx)
	jz		Lp11
	addl	%eax,fv_v+8(%ecx)
Lp11:

//			if (index1->flags & ALIAS_ONSEAM)
//				index1->v[2] += r_affinetridesc.seamfixupX16;
	testl	$(ALIAS_ONSEAM),fv_flags(%esi)
	jz		Lp12
	addl	%eax,fv_v+8(%esi)
Lp12:

//			if (index2->flags & ALIAS_ONSEAM)
//				index2->v[2] += r_affinetridesc.seamfixupX16;
	testl	$(ALIAS_ONSEAM),fv_flags(%edx)
	jz		Lp13
	addl	%eax,fv_v+8(%edx)
Lp13:

//			D_PolysetRecursiveTriangle(index0->v, index1->v, index2->v);
	pushl	%edx
	pushl	%esi
	pushl	%ecx
	call	C(D_PolysetRecursiveTriangle)

//			index0->v[2] = s0;
//			index1->v[2] = s1;
//			index2->v[2] = s2;
	popl	%edx
	popl	%ecx
	popl	%eax
	movl	%eax,fv_v+8(%edx)
	popl	%eax
	movl	%eax,fv_v+8(%esi)
	popl	%eax
	movl	%eax,fv_v+8(%ecx)

//		}
//	}
Lskip:
	subl	$16,%ebp
	jnz		Llooptop

Ldone2:
	popl	%edi				// restore the caller's stack frame
	popl	%ebx
	popl	%esi				// restore register variables
	popl	%ebp

	addl	$(SPAN_SIZE),%esp

	ret


//----------------------------------------------------------------------
// Alias model triangle left-edge scanning code
//----------------------------------------------------------------------

#define height	4+16

.globl C(D_PolysetScanLeftEdge)
C(D_PolysetScanLeftEdge):
	pushl	%ebp				// preserve caller stack frame pointer
	pushl	%esi				// preserve register variables
	pushl	%edi
	pushl	%ebx

	movl	height(%esp),%eax
	movl	C(d_sfrac),%ecx
	andl	$0xFFFF,%eax
	movl	C(d_ptex),%ebx
	orl		%eax,%ecx
	movl	C(d_pedgespanpackage),%esi
	movl	C(d_tfrac),%edx
	movl	C(d_light),%edi
	movl	C(d_zi),%ebp

// %eax: scratch
// %ebx: d_ptex
// %ecx: d_sfrac in high word, count in low word
// %edx: d_tfrac
// %esi: d_pedgespanpackage, errorterm, scratch alternately
// %edi: d_light
// %ebp: d_zi

//	do
//	{

LScanLoop:

//		d_pedgespanpackage->ptex = ptex;
//		d_pedgespanpackage->pdest = d_pdest;
//		d_pedgespanpackage->pz = d_pz;
//		d_pedgespanpackage->count = d_aspancount;
//		d_pedgespanpackage->light = d_light;
//		d_pedgespanpackage->zi = d_zi;
//		d_pedgespanpackage->sfrac = d_sfrac << 16;
//		d_pedgespanpackage->tfrac = d_tfrac << 16;
	movl	%ebx,spanpackage_t_ptex(%esi)
	movl	C(d_pdest),%eax
	movl	%eax,spanpackage_t_pdest(%esi)
	movl	C(d_pz),%eax
	movl	%eax,spanpackage_t_pz(%esi)
	movl	C(d_aspancount),%eax
	movl	%eax,spanpackage_t_count(%esi)
	movl	%edi,spanpackage_t_light(%esi)
	movl	%ebp,spanpackage_t_zi(%esi)
	movl	%ecx,spanpackage_t_sfrac(%esi)
	movl	%edx,spanpackage_t_tfrac(%esi)

// pretouch the next cache line
	movb	spanpackage_t_size(%esi),%al

//		d_pedgespanpackage++;
	addl	$(spanpackage_t_size),%esi
	movl	C(erroradjustup),%eax
	movl	%esi,C(d_pedgespanpackage)

//		errorterm += erroradjustup;
	movl	C(errorterm),%esi
	addl	%eax,%esi
	movl	C(d_pdest),%eax

//		if (errorterm >= 0)
//		{
	js		LNoLeftEdgeTurnover

//			errorterm -= erroradjustdown;
//			d_pdest += d_pdestextrastep;
	subl	C(erroradjustdown),%esi
	addl	C(d_pdestextrastep),%eax
	movl	%esi,C(errorterm)
	movl	%eax,C(d_pdest)

//			d_pz += d_pzextrastep;
//			d_aspancount += d_countextrastep;
//			d_ptex += d_ptexextrastep;
//			d_sfrac += d_sfracextrastep;
//			d_ptex += d_sfrac >> 16;
//			d_sfrac &= 0xFFFF;
//			d_tfrac += d_tfracextrastep;
	movl	C(d_pz),%eax
	movl	C(d_aspancount),%esi
	addl	C(d_pzextrastep),%eax
	addl	C(d_sfracextrastep),%ecx
	adcl	C(d_ptexextrastep),%ebx
	addl	C(d_countextrastep),%esi
	movl	%eax,C(d_pz)
	movl	C(d_tfracextrastep),%eax
	movl	%esi,C(d_aspancount)
	addl	%eax,%edx

//			if (d_tfrac & 0x10000)
//			{
	jnc		LSkip1

//				d_ptex += r_affinetridesc.skinwidth;
//				d_tfrac &= 0xFFFF;
	addl	C(r_affinetridesc)+atd_skinwidth,%ebx

//			}

LSkip1:

//			d_light += d_lightextrastep;
//			d_zi += d_ziextrastep;
	addl	C(d_lightextrastep),%edi
	addl	C(d_ziextrastep),%ebp

//		}
	movl	C(d_pedgespanpackage),%esi
	decl	%ecx
	testl	$0xFFFF,%ecx
	jnz		LScanLoop

	popl	%ebx
	popl	%edi
	popl	%esi
	popl	%ebp
	ret

//		else
//		{

LNoLeftEdgeTurnover:
	movl	%esi,C(errorterm)

//			d_pdest += d_pdestbasestep;
	addl	C(d_pdestbasestep),%eax
	movl	%eax,C(d_pdest)

//			d_pz += d_pzbasestep;
//			d_aspancount += ubasestep;
//			d_ptex += d_ptexbasestep;
//			d_sfrac += d_sfracbasestep;
//			d_ptex += d_sfrac >> 16;
//			d_sfrac &= 0xFFFF;
	movl	C(d_pz),%eax
	movl	C(d_aspancount),%esi
	addl	C(d_pzbasestep),%eax
	addl	C(d_sfracbasestep),%ecx
	adcl	C(d_ptexbasestep),%ebx
	addl	C(ubasestep),%esi
	movl	%eax,C(d_pz)
	movl	%esi,C(d_aspancount)

//			d_tfrac += d_tfracbasestep;
	movl	C(d_tfracbasestep),%esi
	addl	%esi,%edx

//			if (d_tfrac & 0x10000)
//			{
	jnc		LSkip2

//				d_ptex += r_affinetridesc.skinwidth;
//				d_tfrac &= 0xFFFF;
	addl	C(r_affinetridesc)+atd_skinwidth,%ebx

//			}

LSkip2:

//			d_light += d_lightbasestep;
//			d_zi += d_zibasestep;
	addl	C(d_lightbasestep),%edi
	addl	C(d_zibasestep),%ebp

//		}
//	} while (--height);
	movl	C(d_pedgespanpackage),%esi
	decl	%ecx
	testl	$0xFFFF,%ecx
	jnz		LScanLoop

	popl	%ebx
	popl	%edi
	popl	%esi
	popl	%ebp
	ret


//----------------------------------------------------------------------
// Alias model vertex drawing code
//----------------------------------------------------------------------

#define fv			4+8
#define	numverts	8+8

.globl C(D_PolysetDrawFinalVerts)
C(D_PolysetDrawFinalVerts):
	pushl	%ebp				// preserve caller stack frame pointer
	pushl	%ebx

//	int		i, z;
//	short	*zbuf;

	movl	numverts(%esp),%ecx
	movl	fv(%esp),%ebx

	pushl	%esi				// preserve register variables
	pushl	%edi

LFVLoop:

//	for (i=0 ; i<numverts ; i++, fv++)
//	{
//	// valid triangle coordinates for filling can include the bottom and
//	// right clip edges, due to the fill rule; these shouldn't be drawn
//		if ((fv->v[0] < r_refdef.vrectright) &&
//			(fv->v[1] < r_refdef.vrectbottom))
//		{
	movl	fv_v+0(%ebx),%eax
	movl	C(r_refdef)+rd_vrectright,%edx
	cmpl	%edx,%eax
	jge		LNextVert
	movl	fv_v+4(%ebx),%esi
	movl	C(r_refdef)+rd_vrectbottom,%edx
	cmpl	%edx,%esi
	jge		LNextVert

//			zbuf = zspantable[fv->v[1]] + fv->v[0];
	movl	C(zspantable)(,%esi,4),%edi

//			z = fv->v[5]>>16;
	movl	fv_v+20(%ebx),%edx
	shrl	$16,%edx

//			if (z >= *zbuf)
//			{
//				int		pix;
	cmpw	(%edi,%eax,2),%dx
	jl		LNextVert

//				*zbuf = z;
	movw	%dx,(%edi,%eax,2)

//				pix = skintable[fv->v[3]>>16][fv->v[2]>>16];
	movl	fv_v+12(%ebx),%edi
	shrl	$16,%edi
	movl	C(skintable)(,%edi,4),%edi
	movl	fv_v+8(%ebx),%edx
	shrl	$16,%edx
	movb	(%edi,%edx),%dl

//				pix = ((byte *)acolormap)[pix + (fv->v[4] & 0xFF00)];
	movl	fv_v+16(%ebx),%edi
	andl	$0xFF00,%edi
	andl	$0x00FF,%edx
	addl	%edx,%edi
	movl	C(acolormap),%edx
	movb	(%edx,%edi,1),%dl

//				d_viewbuffer[d_scantable[fv->v[1]] + fv->v[0]] = pix;
	movl	C(d_scantable)(,%esi,4),%edi
	movl	C(d_viewbuffer),%esi
	addl	%eax,%edi
	movb	%dl,(%esi,%edi)

//			}
//		}
//	}
LNextVert:
	addl	$(fv_size),%ebx
	decl	%ecx
	jnz		LFVLoop

	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret


//----------------------------------------------------------------------
// Alias model non-subdivided polygon dispatching code
//
// not C-callable because of stack buffer cleanup
//----------------------------------------------------------------------

.globl C(D_DrawNonSubdiv)
C(D_DrawNonSubdiv):
	pushl	%ebp				// preserve caller stack frame pointer
	movl	C(r_affinetridesc)+atd_numtriangles,%ebp
	pushl	%ebx
	shll	$(mtri_shift),%ebp
	pushl	%esi				// preserve register variables
	movl	C(r_affinetridesc)+atd_ptriangles,%esi
	pushl	%edi

//	mtriangle_t		*ptri;
//	finalvert_t		*pfv, *index0, *index1, *index2;
//	int				i;
//	int				lnumtriangles;

//	pfv = r_affinetridesc.pfinalverts;
//	ptri = r_affinetridesc.ptriangles;
//	lnumtriangles = r_affinetridesc.numtriangles;

LNDLoop:

//	for (i=0 ; i<lnumtriangles ; i++, ptri++)
//	{
//		index0 = pfv + ptri->vertindex[0];
//		index1 = pfv + ptri->vertindex[1];
//		index2 = pfv + ptri->vertindex[2];
	movl	C(r_affinetridesc)+atd_pfinalverts,%edi
	movl	mtri_vertindex+0-mtri_size(%esi,%ebp,1),%ecx
	shll	$(fv_shift),%ecx
	movl	mtri_vertindex+4-mtri_size(%esi,%ebp,1),%edx
	shll	$(fv_shift),%edx
	movl	mtri_vertindex+8-mtri_size(%esi,%ebp,1),%ebx
	shll	$(fv_shift),%ebx
	addl	%edi,%ecx
	addl	%edi,%edx
	addl	%edi,%ebx

//		d_xdenom = (index0->v[1]-index1->v[1]) *
//				(index0->v[0]-index2->v[0]) -
//				(index0->v[0]-index1->v[0])*(index0->v[1]-index2->v[1]);
	movl	fv_v+4(%ecx),%eax
	movl	fv_v+0(%ecx),%esi
	subl	fv_v+4(%edx),%eax
	subl	fv_v+0(%ebx),%esi
	imull	%esi,%eax
	movl	fv_v+0(%ecx),%esi
	movl	fv_v+4(%ecx),%edi
	subl	fv_v+0(%edx),%esi
	subl	fv_v+4(%ebx),%edi
	imull	%esi,%edi
	subl	%edi,%eax

//		if (d_xdenom >= 0)
//		{
//			continue;
	jns		LNextTri

//		}

	movl	%eax,C(d_xdenom)
	fildl	C(d_xdenom)

//		r_p0[0] = index0->v[0];		// u
//		r_p0[1] = index0->v[1];		// v
//		r_p0[2] = index0->v[2];		// s
//		r_p0[3] = index0->v[3];		// t
//		r_p0[4] = index0->v[4];		// light
//		r_p0[5] = index0->v[5];		// iz
	movl	fv_v+0(%ecx),%eax
	movl	fv_v+4(%ecx),%esi
	movl	%eax,C(r_p0)+0
	movl	%esi,C(r_p0)+4
	movl	fv_v+8(%ecx),%eax
	movl	fv_v+12(%ecx),%esi
	movl	%eax,C(r_p0)+8
	movl	%esi,C(r_p0)+12
	movl	fv_v+16(%ecx),%eax
	movl	fv_v+20(%ecx),%esi
	movl	%eax,C(r_p0)+16
	movl	%esi,C(r_p0)+20

	fdivrs	float_1

//		r_p1[0] = index1->v[0];
//		r_p1[1] = index1->v[1];
//		r_p1[2] = index1->v[2];
//		r_p1[3] = index1->v[3];
//		r_p1[4] = index1->v[4];
//		r_p1[5] = index1->v[5];
	movl	fv_v+0(%edx),%eax
	movl	fv_v+4(%edx),%esi
	movl	%eax,C(r_p1)+0
	movl	%esi,C(r_p1)+4
	movl	fv_v+8(%edx),%eax
	movl	fv_v+12(%edx),%esi
	movl	%eax,C(r_p1)+8
	movl	%esi,C(r_p1)+12
	movl	fv_v+16(%edx),%eax
	movl	fv_v+20(%edx),%esi
	movl	%eax,C(r_p1)+16
	movl	%esi,C(r_p1)+20

//		r_p2[0] = index2->v[0];
//		r_p2[1] = index2->v[1];
//		r_p2[2] = index2->v[2];
//		r_p2[3] = index2->v[3];
//		r_p2[4] = index2->v[4];
//		r_p2[5] = index2->v[5];
	movl	fv_v+0(%ebx),%eax
	movl	fv_v+4(%ebx),%esi
	movl	%eax,C(r_p2)+0
	movl	%esi,C(r_p2)+4
	movl	fv_v+8(%ebx),%eax
	movl	fv_v+12(%ebx),%esi
	movl	%eax,C(r_p2)+8
	movl	%esi,C(r_p2)+12
	movl	fv_v+16(%ebx),%eax
	movl	fv_v+20(%ebx),%esi
	movl	%eax,C(r_p2)+16
	movl	C(r_affinetridesc)+atd_ptriangles,%edi
	movl	%esi,C(r_p2)+20
	movl	mtri_facesfront-mtri_size(%edi,%ebp,1),%eax

//		if (!ptri->facesfront)
//		{
	testl	%eax,%eax
	jnz		LFacesFront

//			if (index0->flags & ALIAS_ONSEAM)
//				r_p0[2] += r_affinetridesc.seamfixupX16;
	movl	fv_flags(%ecx),%eax
	movl	fv_flags(%edx),%esi
	movl	fv_flags(%ebx),%edi
	testl	$(ALIAS_ONSEAM),%eax
	movl	C(r_affinetridesc)+atd_seamfixupX16,%eax
	jz		LOnseamDone0
	addl	%eax,C(r_p0)+8
LOnseamDone0:

//			if (index1->flags & ALIAS_ONSEAM)
// 				r_p1[2] += r_affinetridesc.seamfixupX16;
	testl	$(ALIAS_ONSEAM),%esi
	jz		LOnseamDone1
	addl	%eax,C(r_p1)+8
LOnseamDone1:

//			if (index2->flags & ALIAS_ONSEAM)
//				r_p2[2] += r_affinetridesc.seamfixupX16;
	testl	$(ALIAS_ONSEAM),%edi
	jz		LOnseamDone2
	addl	%eax,C(r_p2)+8
LOnseamDone2:

//		}

LFacesFront:

	fstps	C(d_xdenom)

//		D_PolysetSetEdgeTable ();
//		D_RasterizeAliasPolySmooth ();
		call	C(D_PolysetSetEdgeTable)
		call	C(D_RasterizeAliasPolySmooth)

LNextTri:
		movl	C(r_affinetridesc)+atd_ptriangles,%esi
		subl	$16,%ebp
		jnz		LNDLoop
//	}

	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp

	addl	$(SPAN_SIZE),%esp

	ret


#endif	// USE_INTEL_ASM