gravitybone/linux/d_polysa.s
2020-06-12 11:14:07 -07:00

1250 lines
33 KiB
ArmAsm

//
// d_polysa.s
// x86 assembly-language polygon model drawing code
//
#include "qasm.h"
#include "d_ifacea.h"
#if id386
// !!! if this is changed, it must be changed in d_polyse.c too !!!
#define DPS_MAXSPANS MAXHEIGHT+1
// 1 extra for spanpackage that marks end
//#define SPAN_SIZE (((DPS_MAXSPANS + 1 + ((CACHE_SIZE - 1) / spanpackage_t_size)) + 1) * spanpackage_t_size)
#define SPAN_SIZE (1024+1+1+1)*32
.data
.align 4
p10_minus_p20: .single 0
p01_minus_p21: .single 0
temp0: .single 0
temp1: .single 0
Ltemp: .single 0
aff8entryvec_table: .long LDraw8, LDraw7, LDraw6, LDraw5
.long LDraw4, LDraw3, LDraw2, LDraw1
lzistepx: .long 0
.text
#ifndef NeXT
.extern C(D_PolysetSetEdgeTable)
.extern C(D_RasterizeAliasPolySmooth)
#endif
//----------------------------------------------------------------------
// affine triangle gradient calculation code
//----------------------------------------------------------------------
#if 0
#define skinwidth 4+0
.globl C(R_PolysetCalcGradients)
C(R_PolysetCalcGradients):
// p00_minus_p20 = r_p0[0] - r_p2[0];
// p01_minus_p21 = r_p0[1] - r_p2[1];
// p10_minus_p20 = r_p1[0] - r_p2[0];
// p11_minus_p21 = r_p1[1] - r_p2[1];
//
// xstepdenominv = 1.0 / (p10_minus_p20 * p01_minus_p21 -
// p00_minus_p20 * p11_minus_p21);
//
// ystepdenominv = -xstepdenominv;
fildl C(r_p0)+0 // r_p0[0]
fildl C(r_p2)+0 // r_p2[0] | r_p0[0]
fildl C(r_p0)+4 // r_p0[1] | r_p2[0] | r_p0[0]
fildl C(r_p2)+4 // r_p2[1] | r_p0[1] | r_p2[0] | r_p0[0]
fildl C(r_p1)+0 // r_p1[0] | r_p2[1] | r_p0[1] | r_p2[0] | r_p0[0]
fildl C(r_p1)+4 // r_p1[1] | r_p1[0] | r_p2[1] | r_p0[1] |
// r_p2[0] | r_p0[0]
fxch %st(3) // r_p0[1] | r_p1[0] | r_p2[1] | r_p1[1] |
// r_p2[0] | r_p0[0]
fsub %st(2),%st(0) // p01_minus_p21 | r_p1[0] | r_p2[1] | r_p1[1] |
// r_p2[0] | r_p0[0]
fxch %st(1) // r_p1[0] | p01_minus_p21 | r_p2[1] | r_p1[1] |
// r_p2[0] | r_p0[0]
fsub %st(4),%st(0) // p10_minus_p20 | p01_minus_p21 | r_p2[1] |
// r_p1[1] | r_p2[0] | r_p0[0]
fxch %st(5) // r_p0[0] | p01_minus_p21 | r_p2[1] |
// r_p1[1] | r_p2[0] | p10_minus_p20
fsubp %st(0),%st(4) // p01_minus_p21 | r_p2[1] | r_p1[1] |
// p00_minus_p20 | p10_minus_p20
fxch %st(2) // r_p1[1] | r_p2[1] | p01_minus_p21 |
// p00_minus_p20 | p10_minus_p20
fsubp %st(0),%st(1) // p11_minus_p21 | p01_minus_p21 |
// p00_minus_p20 | p10_minus_p20
fxch %st(1) // p01_minus_p21 | p11_minus_p21 |
// p00_minus_p20 | p10_minus_p20
flds C(d_xdenom) // d_xdenom | p01_minus_p21 | p11_minus_p21 |
// p00_minus_p20 | p10_minus_p20
fxch %st(4) // p10_minus_p20 | p01_minus_p21 | p11_minus_p21 |
// p00_minus_p20 | d_xdenom
fstps p10_minus_p20 // p01_minus_p21 | p11_minus_p21 |
// p00_minus_p20 | d_xdenom
fstps p01_minus_p21 // p11_minus_p21 | p00_minus_p20 | xstepdenominv
fxch %st(2) // xstepdenominv | p00_minus_p20 | p11_minus_p21
//// ceil () for light so positive steps are exaggerated, negative steps
//// diminished, pushing us away from underflow toward overflow. Underflow is
//// very visible, overflow is very unlikely, because of ambient lighting
// t0 = r_p0[4] - r_p2[4];
// t1 = r_p1[4] - r_p2[4];
fildl C(r_p2)+16 // r_p2[4] | xstepdenominv | p00_minus_p20 |
// p11_minus_p21
fildl C(r_p0)+16 // r_p0[4] | r_p2[4] | xstepdenominv |
// p00_minus_p20 | p11_minus_p21
fildl C(r_p1)+16 // r_p1[4] | r_p0[4] | r_p2[4] | xstepdenominv |
// p00_minus_p20 | p11_minus_p21
fxch %st(2) // r_p2[4] | r_p0[4] | r_p1[4] | xstepdenominv |
// p00_minus_p20 | p11_minus_p21
fld %st(0) // r_p2[4] | r_p2[4] | r_p0[4] | r_p1[4] |
// xstepdenominv | p00_minus_p20 | p11_minus_p21
fsubrp %st(0),%st(2) // r_p2[4] | t0 | r_p1[4] | xstepdenominv |
// p00_minus_p20 | p11_minus_p21
fsubrp %st(0),%st(2) // t0 | t1 | xstepdenominv | p00_minus_p20 |
// p11_minus_p21
// r_lstepx = (int)
// ceil((t1 * p01_minus_p21 - t0 * p11_minus_p21) * xstepdenominv);
// r_lstepy = (int)
// ceil((t1 * p00_minus_p20 - t0 * p10_minus_p20) * ystepdenominv);
fld %st(0) // t0 | t0 | t1 | xstepdenominv | p00_minus_p20 |
// p11_minus_p21
fmul %st(5),%st(0) // t0*p11_minus_p21 | t0 | t1 | xstepdenominv |
// p00_minus_p20 | p11_minus_p21
fxch %st(2) // t1 | t0 | t0*p11_minus_p21 | xstepdenominv |
// p00_minus_p20 | p11_minus_p21
fld %st(0) // t1 | t1 | t0 | t0*p11_minus_p21 |
// xstepdenominv | p00_minus_p20 | p11_minus_p21
fmuls p01_minus_p21 // t1*p01_minus_p21 | t1 | t0 | t0*p11_minus_p21 |
// xstepdenominv | p00_minus_p20 | p11_minus_p21
fxch %st(2) // t0 | t1 | t1*p01_minus_p21 | t0*p11_minus_p21 |
// xstepdenominv | p00_minus_p20 | p11_minus_p21
fmuls p10_minus_p20 // t0*p10_minus_p20 | t1 | t1*p01_minus_p21 |
// t0*p11_minus_p21 | xstepdenominv |
// p00_minus_p20 | p11_minus_p21
fxch %st(1) // t1 | t0*p10_minus_p20 | t1*p01_minus_p21 |
// t0*p11_minus_p21 | xstepdenominv |
// p00_minus_p20 | p11_minus_p21
fmul %st(5),%st(0) // t1*p00_minus_p20 | t0*p10_minus_p20 |
// t1*p01_minus_p21 | t0*p11_minus_p21 |
// xstepdenominv | p00_minus_p20 | p11_minus_p21
fxch %st(2) // t1*p01_minus_p21 | t0*p10_minus_p20 |
// t1*p00_minus_p20 | t0*p11_minus_p21 |
// xstepdenominv | p00_minus_p20 | p11_minus_p21
fsubp %st(0),%st(3) // t0*p10_minus_p20 | t1*p00_minus_p20 |
// t1*p01_minus_p21 - t0*p11_minus_p21 |
// xstepdenominv | p00_minus_p20 | p11_minus_p21
fsubrp %st(0),%st(1) // t1*p00_minus_p20 - t0*p10_minus_p20 |
// t1*p01_minus_p21 - t0*p11_minus_p21 |
// xstepdenominv | p00_minus_p20 | p11_minus_p21
fld %st(2) // xstepdenominv |
// t1*p00_minus_p20 - t0*p10_minus_p20 |
// t1*p01_minus_p21 - t0*p11_minus_p21 |
// xstepdenominv | p00_minus_p20 | p11_minus_p21
fmuls float_minus_1 // ystepdenominv |
// t1*p00_minus_p20 - t0*p10_minus_p20 |
// t1*p01_minus_p21 - t0*p11_minus_p21 |
// xstepdenominv | p00_minus_p20 | p11_minus_p21
fxch %st(2) // t1*p01_minus_p21 - t0*p11_minus_p21 |
// t1*p00_minus_p20 - t0*p10_minus_p20 |
// ystepdenominv | xstepdenominv | p00_minus_p20 |
// p11_minus_p21
fmul %st(3),%st(0) // (t1*p01_minus_p21 - t0*p11_minus_p21)*
// xstepdenominv |
// t1*p00_minus_p20 - t0*p10_minus_p20 |
// | ystepdenominv | xstepdenominv |
// p00_minus_p20 | p11_minus_p21
fxch %st(1) // t1*p00_minus_p20 - t0*p10_minus_p20 |
// (t1*p01_minus_p21 - t0*p11_minus_p21)*
// xstepdenominv | ystepdenominv |
// xstepdenominv | p00_minus_p20 | p11_minus_p21
fmul %st(2),%st(0) // (t1*p00_minus_p20 - t0*p10_minus_p20)*
// ystepdenominv |
// (t1*p01_minus_p21 - t0*p11_minus_p21)*
// xstepdenominv | ystepdenominv |
// xstepdenominv | p00_minus_p20 | p11_minus_p21
fldcw ceil_cw
fistpl C(r_lstepy) // r_lstepx | ystepdenominv | xstepdenominv |
// p00_minus_p20 | p11_minus_p21
fistpl C(r_lstepx) // ystepdenominv | xstepdenominv | p00_minus_p20 |
// p11_minus_p21
fldcw single_cw
// t0 = r_p0[2] - r_p2[2];
// t1 = r_p1[2] - r_p2[2];
fildl C(r_p2)+8 // r_p2[2] | ystepdenominv | xstepdenominv |
// p00_minus_p20 | p11_minus_p21
fildl C(r_p0)+8 // r_p0[2] | r_p2[2] | ystepdenominv |
// xstepdenominv | p00_minus_p20 | p11_minus_p21
fildl C(r_p1)+8 // r_p1[2] | r_p0[2] | r_p2[2] | ystepdenominv |
// xstepdenominv | p00_minus_p20 | p11_minus_p21
fxch %st(2) // r_p2[2] | r_p0[2] | r_p1[2] | ystepdenominv |
// xstepdenominv | p00_minus_p20 | p11_minus_p21
fld %st(0) // r_p2[2] | r_p2[2] | r_p0[2] | r_p1[2] |
// ystepdenominv | xstepdenominv | p00_minus_p20 |
// p11_minus_p21
fsubrp %st(0),%st(2) // r_p2[2] | t0 | r_p1[2] | ystepdenominv |
// xstepdenominv | p00_minus_p20 | p11_minus_p21
fsubrp %st(0),%st(2) // t0 | t1 | ystepdenominv | xstepdenominv |
// p00_minus_p20 | p11_minus_p21
// r_sstepx = (int)((t1 * p01_minus_p21 - t0 * p11_minus_p21) *
// xstepdenominv);
// r_sstepy = (int)((t1 * p00_minus_p20 - t0 * p10_minus_p20) *
// ystepdenominv);
fld %st(0) // t0 | t0 | t1 | ystepdenominv | xstepdenominv
fmul %st(6),%st(0) // t0*p11_minus_p21 | t0 | t1 | ystepdenominv |
// xstepdenominv | p00_minus_p20 | p11_minus_p21
fxch %st(2) // t1 | t0 | t0*p11_minus_p21 | ystepdenominv |
// xstepdenominv | p00_minus_p20 | p11_minus_p21
fld %st(0) // t1 | t1 | t0 | t0*p11_minus_p21 |
// ystepdenominv | xstepdenominv | p00_minus_p20 |
// p11_minus_p21
fmuls p01_minus_p21 // t1*p01_minus_p21 | t1 | t0 | t0*p11_minus_p21 |
// ystepdenominv | xstepdenominv | p00_minus_p20 |
// p11_minus_p21
fxch %st(2) // t0 | t1 | t1*p01_minus_p21 | t0*p11_minus_p21 |
// ystepdenominv | xstepdenominv | p00_minus_p20 |
// p11_minus_p21
fmuls p10_minus_p20 // t0*p10_minus_p20 | t1 | t1*p01_minus_p21 |
// t0*p11_minus_p21 | ystepdenominv |
// xstepdenominv | p00_minus_p20 | p11_minus_p21
fxch %st(1) // t1 | t0*p10_minus_p20 | t1*p01_minus_p21 |
// t0*p11_minus_p21 | ystepdenominv |
// xstepdenominv | p00_minus_p20 | p11_minus_p21
fmul %st(6),%st(0) // t1*p00_minus_p20 | t0*p10_minus_p20 |
// t1*p01_minus_p21 | t0*p11_minus_p21 |
// ystepdenominv | xstepdenominv | p00_minus_p20 |
// p11_minus_p21
fxch %st(2) // t1*p01_minus_p21 | t0*p10_minus_p20 |
// t1*p00_minus_p20 | t0*p11_minus_p21 |
// ystepdenominv | xstepdenominv | p00_minus_p20 |
// p11_minus_p21
fsubp %st(0),%st(3) // t0*p10_minus_p20 | t1*p00_minus_p20 |
// t1*p01_minus_p21 - t0*p11_minus_p21 |
// ystepdenominv | xstepdenominv | p00_minus_p20 |
// p11_minus_p21
fsubrp %st(0),%st(1) // t1*p00_minus_p20 - t0*p10_minus_p20 |
// t1*p01_minus_p21 - t0*p11_minus_p21 |
// ystepdenominv | xstepdenominv | p00_minus_p20 |
// p11_minus_p21
fmul %st(2),%st(0) // (t1*p00_minus_p20 - t0*p10_minus_p20)*
// ystepdenominv |
// t1*p01_minus_p21 - t0*p11_minus_p21 |
// ystepdenominv | xstepdenominv | p00_minus_p20 |
// p11_minus_p21
fxch %st(1) // t1*p01_minus_p21 - t0*p11_minus_p21 |
// (t1*p00_minus_p20 - t0*p10_minus_p20)*
// ystepdenominv | ystepdenominv |
// xstepdenominv | p00_minus_p20 | p11_minus_p21
fmul %st(3),%st(0) // (t1*p01_minus_p21 - t0*p11_minus_p21)*
// xstepdenominv |
// (t1*p00_minus_p20 - t0*p10_minus_p20)*
// ystepdenominv | ystepdenominv |
// xstepdenominv | p00_minus_p20 | p11_minus_p21
fxch %st(1) // (t1*p00_minus_p20 - t0*p10_minus_p20)*
// ystepdenominv |
// (t1*p01_minus_p21 - t0*p11_minus_p21)*
// xstepdenominv | ystepdenominv |
// xstepdenominv | p00_minus_p20 | p11_minus_p21
fistpl C(r_sstepy) // r_sstepx | ystepdenominv | xstepdenominv |
// p00_minus_p20 | p11_minus_p21
fistpl C(r_sstepx) // ystepdenominv | xstepdenominv | p00_minus_p20 |
// p11_minus_p21
// t0 = r_p0[3] - r_p2[3];
// t1 = r_p1[3] - r_p2[3];
fildl C(r_p2)+12 // r_p2[3] | ystepdenominv | xstepdenominv |
// p00_minus_p20 | p11_minus_p21
fildl C(r_p0)+12 // r_p0[3] | r_p2[3] | ystepdenominv |
// xstepdenominv | p00_minus_p20 | p11_minus_p21
fildl C(r_p1)+12 // r_p1[3] | r_p0[3] | r_p2[3] | ystepdenominv |
// xstepdenominv | p00_minus_p20 | p11_minus_p21
fxch %st(2) // r_p2[3] | r_p0[3] | r_p1[3] | ystepdenominv |
// xstepdenominv | p00_minus_p20 | p11_minus_p21
fld %st(0) // r_p2[3] | r_p2[3] | r_p0[3] | r_p1[3] |
// ystepdenominv | xstepdenominv | p00_minus_p20 |
// p11_minus_p21
fsubrp %st(0),%st(2) // r_p2[3] | t0 | r_p1[3] | ystepdenominv |
// xstepdenominv | p00_minus_p20 | p11_minus_p21
fsubrp %st(0),%st(2) // t0 | t1 | ystepdenominv | xstepdenominv |
// p00_minus_p20 | p11_minus_p21
// r_tstepx = (int)((t1 * p01_minus_p21 - t0 * p11_minus_p21) *
// xstepdenominv);
// r_tstepy = (int)((t1 * p00_minus_p20 - t0 * p10_minus_p20) *
// ystepdenominv);
fld %st(0) // t0 | t0 | t1 | ystepdenominv | xstepdenominv |
// p00_minus_p20 | p11_minus_p21
fmul %st(6),%st(0) // t0*p11_minus_p21 | t0 | t1 | ystepdenominv |
// xstepdenominv | p00_minus_p20 | p11_minus_p21
fxch %st(2) // t1 | t0 | t0*p11_minus_p21 | ystepdenominv |
// xstepdenominv | p00_minus_p20 | p11_minus_p21
fld %st(0) // t1 | t1 | t0 | t0*p11_minus_p21 |
// ystepdenominv | xstepdenominv | p00_minus_p20 |
// p11_minus_p21
fmuls p01_minus_p21 // t1*p01_minus_p21 | t1 | t0 | t0*p11_minus_p21 |
// ystepdenominv | xstepdenominv | p00_minus_p20 |
// p11_minus_p21
fxch %st(2) // t0 | t1 | t1*p01_minus_p21 | t0*p11_minus_p21 |
// ystepdenominv | xstepdenominv | p00_minus_p20 |
// p11_minus_p21
fmuls p10_minus_p20 // t0*p10_minus_p20 | t1 | t1*p01_minus_p21 |
// t0*p11_minus_p21 | ystepdenominv |
// xstepdenominv | p00_minus_p20 | p11_minus_p21
fxch %st(1) // t1 | t0*p10_minus_p20 | t1*p01_minus_p21 |
// t0*p11_minus_p21 | ystepdenominv |
// xstepdenominv | p00_minus_p20 | p11_minus_p21
fmul %st(6),%st(0) // t1*p00_minus_p20 | t0*p10_minus_p20 |
// t1*p01_minus_p21 | t0*p11_minus_p21 |
// ystepdenominv | xstepdenominv | p00_minus_p20 |
// p11_minus_p21
fxch %st(2) // t1*p01_minus_p21 | t0*p10_minus_p20 |
// t1*p00_minus_p20 | t0*p11_minus_p21 |
// ystepdenominv | xstepdenominv | p00_minus_p20 |
// p11_minus_p21
fsubp %st(0),%st(3) // t0*p10_minus_p20 | t1*p00_minus_p20 |
// t1*p01_minus_p21 - t0*p11_minus_p21 |
// ystepdenominv | xstepdenominv | p00_minus_p20 |
// p11_minus_p21
fsubrp %st(0),%st(1) // t1*p00_minus_p20 - t0*p10_minus_p20 |
// t1*p01_minus_p21 - t0*p11_minus_p21 |
// ystepdenominv | xstepdenominv | p00_minus_p20 |
// p11_minus_p21
fmul %st(2),%st(0) // (t1*p00_minus_p20 - t0*p10_minus_p20)*
// ystepdenominv |
// t1*p01_minus_p21 - t0*p11_minus_p21 |
// ystepdenominv | xstepdenominv | p00_minus_p20 |
// p11_minus_p21
fxch %st(1) // t1*p01_minus_p21 - t0*p11_minus_p21 |
// (t1*p00_minus_p20 - t0*p10_minus_p20)*
// ystepdenominv | ystepdenominv |
// xstepdenominv | p00_minus_p20 | p11_minus_p21
fmul %st(3),%st(0) // (t1*p01_minus_p21 - t0*p11_minus_p21)*
// xstepdenominv |
// (t1*p00_minus_p20 - t0*p10_minus_p20)*
// ystepdenominv | ystepdenominv |
// xstepdenominv | p00_minus_p20 | p11_minus_p21
fxch %st(1) // (t1*p00_minus_p20 - t0*p10_minus_p20)*
// ystepdenominv |
// (t1*p01_minus_p21 - t0*p11_minus_p21)*
// xstepdenominv | ystepdenominv |
// xstepdenominv | p00_minus_p20 | p11_minus_p21
fistpl C(r_tstepy) // r_tstepx | ystepdenominv | xstepdenominv |
// p00_minus_p20 | p11_minus_p21
fistpl C(r_tstepx) // ystepdenominv | xstepdenominv | p00_minus_p20 |
// p11_minus_p21
// t0 = r_p0[5] - r_p2[5];
// t1 = r_p1[5] - r_p2[5];
fildl C(r_p2)+20 // r_p2[5] | ystepdenominv | xstepdenominv |
// p00_minus_p20 | p11_minus_p21
fildl C(r_p0)+20 // r_p0[5] | r_p2[5] | ystepdenominv |
// xstepdenominv | p00_minus_p20 | p11_minus_p21
fildl C(r_p1)+20 // r_p1[5] | r_p0[5] | r_p2[5] | ystepdenominv |
// xstepdenominv | p00_minus_p20 | p11_minus_p21
fxch %st(2) // r_p2[5] | r_p0[5] | r_p1[5] | ystepdenominv |
// xstepdenominv | p00_minus_p20 | p11_minus_p21
fld %st(0) // r_p2[5] | r_p2[5] | r_p0[5] | r_p1[5] |
// ystepdenominv | xstepdenominv | p00_minus_p20 |
// p11_minus_p21
fsubrp %st(0),%st(2) // r_p2[5] | t0 | r_p1[5] | ystepdenominv |
// xstepdenominv | p00_minus_p20 | p11_minus_p21
fsubrp %st(0),%st(2) // t0 | t1 | ystepdenominv | xstepdenominv |
// p00_minus_p20 | p11_minus_p21
// r_zistepx = (int)((t1 * p01_minus_p21 - t0 * p11_minus_p21) *
// xstepdenominv);
// r_zistepy = (int)((t1 * p00_minus_p20 - t0 * p10_minus_p20) *
// ystepdenominv);
fld %st(0) // t0 | t0 | t1 | ystepdenominv | xstepdenominv |
// p00_minus_p20 | p11_minus_p21
fmulp %st(0),%st(6) // t0 | t1 | ystepdenominv | xstepdenominv |
// p00_minus_p20 | t0*p11_minus_p21
fxch %st(1) // t1 | t0 | ystepdenominv | xstepdenominv |
// p00_minus_p20 | t0*p11_minus_p21
fld %st(0) // t1 | t1 | t0 | ystepdenominv | xstepdenominv |
// p00_minus_p20 | t0*p11_minus_p21
fmuls p01_minus_p21 // t1*p01_minus_p21 | t1 | t0 | ystepdenominv |
// xstepdenominv | p00_minus_p20 |
// t0*p11_minus_p21
fxch %st(2) // t0 | t1 | t1*p01_minus_p21 | ystepdenominv |
// xstepdenominv | p00_minus_p20 |
// t0*p11_minus_p21
fmuls p10_minus_p20 // t0*p10_minus_p20 | t1 | t1*p01_minus_p21 |
// ystepdenominv | xstepdenominv | p00_minus_p20 |
// t0*p11_minus_p21
fxch %st(1) // t1 | t0*p10_minus_p20 | t1*p01_minus_p21 |
// ystepdenominv | xstepdenominv | p00_minus_p20 |
// t0*p11_minus_p21
fmulp %st(0),%st(5) // t0*p10_minus_p20 | t1*p01_minus_p21 |
// ystepdenominv | xstepdenominv |
// t1*p00_minus_p20 | t0*p11_minus_p21
fxch %st(5) // t0*p11_minus_p21 | t1*p01_minus_p21 |
// ystepdenominv | xstepdenominv |
// t1*p00_minus_p20 | t0*p10_minus_p20
fsubrp %st(0),%st(1) // t1*p01_minus_p21 - t0*p11_minus_p21 |
// ystepdenominv | xstepdenominv |
// t1*p00_minus_p20 | t0*p10_minus_p20
fxch %st(3) // t1*p00_minus_p20 | ystepdenominv |
// xstepdenominv |
// t1*p01_minus_p21 - t0*p11_minus_p21 |
// t0*p10_minus_p20
fsubp %st(0),%st(4) // ystepdenominv | xstepdenominv |
// t1*p01_minus_p21 - t0*p11_minus_p21 |
// t1*p00_minus_p20 - t0*p10_minus_p20
fxch %st(1) // xstepdenominv | ystepdenominv |
// t1*p01_minus_p21 - t0*p11_minus_p21 |
// t1*p00_minus_p20 - t0*p10_minus_p20
fmulp %st(0),%st(2) // ystepdenominv |
// (t1*p01_minus_p21 - t0*p11_minus_p21) *
// xstepdenominv |
// t1*p00_minus_p20 - t0*p10_minus_p20
fmulp %st(0),%st(2) // (t1*p01_minus_p21 - t0*p11_minus_p21) *
// xstepdenominv |
// (t1*p00_minus_p20 - t0*p10_minus_p20) *
// ystepdenominv
fistpl C(r_zistepx) // (t1*p00_minus_p20 - t0*p10_minus_p20) *
// ystepdenominv
fistpl C(r_zistepy)
// a_sstepxfrac = r_sstepx << 16;
// a_tstepxfrac = r_tstepx << 16;
//
// a_ststepxwhole = r_affinetridesc.skinwidth * (r_tstepx >> 16) +
// (r_sstepx >> 16);
movl C(r_sstepx),%eax
movl C(r_tstepx),%edx
shll $16,%eax
shll $16,%edx
movl %eax,C(a_sstepxfrac)
movl %edx,C(a_tstepxfrac)
movl C(r_sstepx),%ecx
movl C(r_tstepx),%eax
sarl $16,%ecx
sarl $16,%eax
imull skinwidth(%esp)
addl %ecx,%eax
movl %eax,C(a_ststepxwhole)
ret
#endif
//----------------------------------------------------------------------
// recursive subdivision affine triangle drawing code
//
// not C-callable because of stdcall return
//----------------------------------------------------------------------
#define lp1 4+16
#define lp2 8+16
#define lp3 12+16
.globl C(D_PolysetRecursiveTriangle)
C(D_PolysetRecursiveTriangle):
pushl %ebp // preserve caller stack frame pointer
pushl %esi // preserve register variables
pushl %edi
pushl %ebx
// int *temp;
// int d;
// int new[6];
// int i;
// int z;
// short *zbuf;
movl lp2(%esp),%esi
movl lp1(%esp),%ebx
movl lp3(%esp),%edi
// d = lp2[0] - lp1[0];
// if (d < -1 || d > 1)
// goto split;
movl 0(%esi),%eax
movl 0(%ebx),%edx
movl 4(%esi),%ebp
subl %edx,%eax
movl 4(%ebx),%ecx
subl %ecx,%ebp
incl %eax
cmpl $2,%eax
ja LSplit
// d = lp2[1] - lp1[1];
// if (d < -1 || d > 1)
// goto split;
movl 0(%edi),%eax
incl %ebp
cmpl $2,%ebp
ja LSplit
// d = lp3[0] - lp2[0];
// if (d < -1 || d > 1)
// goto split2;
movl 0(%esi),%edx
movl 4(%edi),%ebp
subl %edx,%eax
movl 4(%esi),%ecx
subl %ecx,%ebp
incl %eax
cmpl $2,%eax
ja LSplit2
// d = lp3[1] - lp2[1];
// if (d < -1 || d > 1)
// goto split2;
movl 0(%ebx),%eax
incl %ebp
cmpl $2,%ebp
ja LSplit2
// d = lp1[0] - lp3[0];
// if (d < -1 || d > 1)
// goto split3;
movl 0(%edi),%edx
movl 4(%ebx),%ebp
subl %edx,%eax
movl 4(%edi),%ecx
subl %ecx,%ebp
incl %eax
incl %ebp
movl %ebx,%edx
cmpl $2,%eax
ja LSplit3
// d = lp1[1] - lp3[1];
// if (d < -1 || d > 1)
// {
//split3:
// temp = lp1;
// lp3 = lp2;
// lp1 = lp3;
// lp2 = temp;
// goto split;
// }
//
// return; // entire tri is filled
//
cmpl $2,%ebp
jna LDone
LSplit3:
movl %edi,%ebx
movl %esi,%edi
movl %edx,%esi
jmp LSplit
//split2:
LSplit2:
// temp = lp1;
// lp1 = lp2;
// lp2 = lp3;
// lp3 = temp;
movl %ebx,%eax
movl %esi,%ebx
movl %edi,%esi
movl %eax,%edi
//split:
LSplit:
subl $24,%esp // allocate space for a new vertex
//// split this edge
// new[0] = (lp1[0] + lp2[0]) >> 1;
// new[1] = (lp1[1] + lp2[1]) >> 1;
// new[2] = (lp1[2] + lp2[2]) >> 1;
// new[3] = (lp1[3] + lp2[3]) >> 1;
// new[5] = (lp1[5] + lp2[5]) >> 1;
movl 8(%ebx),%eax
movl 8(%esi),%edx
movl 12(%ebx),%ecx
addl %edx,%eax
movl 12(%esi),%edx
sarl $1,%eax
addl %edx,%ecx
movl %eax,8(%esp)
movl 20(%ebx),%eax
sarl $1,%ecx
movl 20(%esi),%edx
movl %ecx,12(%esp)
addl %edx,%eax
movl 0(%ebx),%ecx
movl 0(%esi),%edx
sarl $1,%eax
addl %ecx,%edx
movl %eax,20(%esp)
movl 4(%ebx),%eax
sarl $1,%edx
movl 4(%esi),%ebp
movl %edx,0(%esp)
addl %eax,%ebp
sarl $1,%ebp
movl %ebp,4(%esp)
//// draw the point if splitting a leading edge
// if (lp2[1] > lp1[1])
// goto nodraw;
cmpl %eax,4(%esi)
jg LNoDraw
// if ((lp2[1] == lp1[1]) && (lp2[0] < lp1[0]))
// goto nodraw;
movl 0(%esi),%edx
jnz LDraw
cmpl %ecx,%edx
jl LNoDraw
LDraw:
// z = new[5] >> 16;
movl 20(%esp),%edx
movl 4(%esp),%ecx
sarl $16,%edx
movl 0(%esp),%ebp
// zbuf = zspantable[new[1]] + new[0];
movl C(zspantable)(,%ecx,4),%eax
// if (z >= *zbuf)
// {
cmpw (%eax,%ebp,2),%dx
jnge LNoDraw
// int pix;
//
// *zbuf = z;
movw %dx,(%eax,%ebp,2)
// pix = d_pcolormap[skintable[new[3]>>16][new[2]>>16]];
movl 12(%esp),%eax
sarl $16,%eax
movl 8(%esp),%edx
sarl $16,%edx
subl %ecx,%ecx
movl C(skintable)(,%eax,4),%eax
movl 4(%esp),%ebp
movb (%eax,%edx,),%cl
movl C(d_pcolormap),%edx
movb (%edx,%ecx,),%dl
movl 0(%esp),%ecx
// d_viewbuffer[d_scantable[new[1]] + new[0]] = pix;
movl C(d_scantable)(,%ebp,4),%eax
addl %eax,%ecx
movl C(d_viewbuffer),%eax
movb %dl,(%eax,%ecx,1)
// }
//
//nodraw:
LNoDraw:
//// recursively continue
// D_PolysetRecursiveTriangle (lp3, lp1, new);
pushl %esp
pushl %ebx
pushl %edi
call C(D_PolysetRecursiveTriangle)
// D_PolysetRecursiveTriangle (lp3, new, lp2);
movl %esp,%ebx
pushl %esi
pushl %ebx
pushl %edi
call C(D_PolysetRecursiveTriangle)
addl $24,%esp
LDone:
popl %ebx // restore register variables
popl %edi
popl %esi
popl %ebp // restore caller stack frame pointer
ret $12
//----------------------------------------------------------------------
// 8-bpp horizontal span drawing code for affine polygons, with smooth
// shading and no transparency
//----------------------------------------------------------------------
#define pspans 4+8
.globl C(D_PolysetAff8Start)
C(D_PolysetAff8Start):
.globl C(R_PolysetDrawSpans8_Opaque)
C(R_PolysetDrawSpans8_Opaque):
pushl %esi // preserve register variables
pushl %ebx
movl pspans(%esp),%esi // point to the first span descriptor
movl C(r_zistepx),%ecx
pushl %ebp // preserve caller's stack frame
pushl %edi
rorl $16,%ecx // put high 16 bits of 1/z step in low word
movl spanpackage_t_count(%esi),%edx
movl %ecx,lzistepx
LSpanLoop:
// lcount = d_aspancount - pspanpackage->count;
//
// errorterm += erroradjustup;
// if (errorterm >= 0)
// {
// d_aspancount += d_countextrastep;
// errorterm -= erroradjustdown;
// }
// else
// {
// d_aspancount += ubasestep;
// }
movl C(d_aspancount),%eax
subl %edx,%eax
movl C(erroradjustup),%edx
movl C(errorterm),%ebx
addl %edx,%ebx
js LNoTurnover
movl C(erroradjustdown),%edx
movl C(d_countextrastep),%edi
subl %edx,%ebx
movl C(d_aspancount),%ebp
movl %ebx,C(errorterm)
addl %edi,%ebp
movl %ebp,C(d_aspancount)
jmp LRightEdgeStepped
LNoTurnover:
movl C(d_aspancount),%edi
movl C(ubasestep),%edx
movl %ebx,C(errorterm)
addl %edx,%edi
movl %edi,C(d_aspancount)
LRightEdgeStepped:
cmpl $1,%eax
jl LNextSpan
jz LExactlyOneLong
//
// set up advancetable
//
movl C(a_ststepxwhole),%ecx
movl C(r_affinetridesc)+atd_skinwidth,%edx
movl %ecx,advancetable+4 // advance base in t
addl %edx,%ecx
movl %ecx,advancetable // advance extra in t
movl C(a_tstepxfrac),%ecx
movw C(r_lstepx),%cx
movl %eax,%edx // count
movl %ecx,tstep
addl $7,%edx
shrl $3,%edx // count of full and partial loops
movl spanpackage_t_sfrac(%esi),%ebx
movw %dx,%bx
movl spanpackage_t_pz(%esi),%ecx
negl %eax
movl spanpackage_t_pdest(%esi),%edi
andl $7,%eax // 0->0, 1->7, 2->6, ... , 7->1
subl %eax,%edi // compensate for hardwired offsets
subl %eax,%ecx
subl %eax,%ecx
movl spanpackage_t_tfrac(%esi),%edx
movw spanpackage_t_light(%esi),%dx
movl spanpackage_t_zi(%esi),%ebp
rorl $16,%ebp // put high 16 bits of 1/z in low word
pushl %esi
movl spanpackage_t_ptex(%esi),%esi
jmp aff8entryvec_table(,%eax,4)
// %bx = count of full and partial loops
// %ebx high word = sfrac
// %ecx = pz
// %dx = light
// %edx high word = tfrac
// %esi = ptex
// %edi = pdest
// %ebp = 1/z
// tstep low word = C(r_lstepx)
// tstep high word = C(a_tstepxfrac)
// C(a_sstepxfrac) low word = 0
// C(a_sstepxfrac) high word = C(a_sstepxfrac)
LDrawLoop:
// FIXME: do we need to clamp light? We may need at least a buffer bit to
// keep it from poking into tfrac and causing problems
LDraw8:
cmpw (%ecx),%bp
jl Lp1
xorl %eax,%eax
movb %dh,%ah
movb (%esi),%al
movw %bp,(%ecx)
movb 0x12345678(%eax),%al
LPatch8:
movb %al,(%edi)
Lp1:
addl tstep,%edx
sbbl %eax,%eax
addl lzistepx,%ebp
adcl $0,%ebp
addl C(a_sstepxfrac),%ebx
adcl advancetable+4(,%eax,4),%esi
LDraw7:
cmpw 2(%ecx),%bp
jl Lp2
xorl %eax,%eax
movb %dh,%ah
movb (%esi),%al
movw %bp,2(%ecx)
movb 0x12345678(%eax),%al
LPatch7:
movb %al,1(%edi)
Lp2:
addl tstep,%edx
sbbl %eax,%eax
addl lzistepx,%ebp
adcl $0,%ebp
addl C(a_sstepxfrac),%ebx
adcl advancetable+4(,%eax,4),%esi
LDraw6:
cmpw 4(%ecx),%bp
jl Lp3
xorl %eax,%eax
movb %dh,%ah
movb (%esi),%al
movw %bp,4(%ecx)
movb 0x12345678(%eax),%al
LPatch6:
movb %al,2(%edi)
Lp3:
addl tstep,%edx
sbbl %eax,%eax
addl lzistepx,%ebp
adcl $0,%ebp
addl C(a_sstepxfrac),%ebx
adcl advancetable+4(,%eax,4),%esi
LDraw5:
cmpw 6(%ecx),%bp
jl Lp4
xorl %eax,%eax
movb %dh,%ah
movb (%esi),%al
movw %bp,6(%ecx)
movb 0x12345678(%eax),%al
LPatch5:
movb %al,3(%edi)
Lp4:
addl tstep,%edx
sbbl %eax,%eax
addl lzistepx,%ebp
adcl $0,%ebp
addl C(a_sstepxfrac),%ebx
adcl advancetable+4(,%eax,4),%esi
LDraw4:
cmpw 8(%ecx),%bp
jl Lp5
xorl %eax,%eax
movb %dh,%ah
movb (%esi),%al
movw %bp,8(%ecx)
movb 0x12345678(%eax),%al
LPatch4:
movb %al,4(%edi)
Lp5:
addl tstep,%edx
sbbl %eax,%eax
addl lzistepx,%ebp
adcl $0,%ebp
addl C(a_sstepxfrac),%ebx
adcl advancetable+4(,%eax,4),%esi
LDraw3:
cmpw 10(%ecx),%bp
jl Lp6
xorl %eax,%eax
movb %dh,%ah
movb (%esi),%al
movw %bp,10(%ecx)
movb 0x12345678(%eax),%al
LPatch3:
movb %al,5(%edi)
Lp6:
addl tstep,%edx
sbbl %eax,%eax
addl lzistepx,%ebp
adcl $0,%ebp
addl C(a_sstepxfrac),%ebx
adcl advancetable+4(,%eax,4),%esi
LDraw2:
cmpw 12(%ecx),%bp
jl Lp7
xorl %eax,%eax
movb %dh,%ah
movb (%esi),%al
movw %bp,12(%ecx)
movb 0x12345678(%eax),%al
LPatch2:
movb %al,6(%edi)
Lp7:
addl tstep,%edx
sbbl %eax,%eax
addl lzistepx,%ebp
adcl $0,%ebp
addl C(a_sstepxfrac),%ebx
adcl advancetable+4(,%eax,4),%esi
LDraw1:
cmpw 14(%ecx),%bp
jl Lp8
xorl %eax,%eax
movb %dh,%ah
movb (%esi),%al
movw %bp,14(%ecx)
movb 0x12345678(%eax),%al
LPatch1:
movb %al,7(%edi)
Lp8:
addl tstep,%edx
sbbl %eax,%eax
addl lzistepx,%ebp
adcl $0,%ebp
addl C(a_sstepxfrac),%ebx
adcl advancetable+4(,%eax,4),%esi
addl $8,%edi
addl $16,%ecx
decw %bx
jnz LDrawLoop
popl %esi // restore spans pointer
LNextSpan:
addl $(spanpackage_t_size),%esi // point to next span
LNextSpanESISet:
movl spanpackage_t_count(%esi),%edx
cmpl $-999999,%edx // any more spans?
jnz LSpanLoop // yes
popl %edi
popl %ebp // restore the caller's stack frame
popl %ebx // restore register variables
popl %esi
ret
// draw a one-long span
LExactlyOneLong:
movl spanpackage_t_pz(%esi),%ecx
movl spanpackage_t_zi(%esi),%ebp
rorl $16,%ebp // put high 16 bits of 1/z in low word
movl spanpackage_t_ptex(%esi),%ebx
cmpw (%ecx),%bp
jl LNextSpan
xorl %eax,%eax
movl spanpackage_t_pdest(%esi),%edi
movb spanpackage_t_light+1(%esi),%ah
addl $(spanpackage_t_size),%esi // point to next span
movb (%ebx),%al
movw %bp,(%ecx)
movb 0x12345678(%eax),%al
LPatch9:
movb %al,(%edi)
jmp LNextSpanESISet
.globl C(D_PolysetAff8End)
C(D_PolysetAff8End):
.extern C(alias_colormap)
// #define pcolormap 4
.globl C(D_Aff8Patch)
C(D_Aff8Patch):
movl C(alias_colormap),%eax
movl %eax,LPatch1-4
movl %eax,LPatch2-4
movl %eax,LPatch3-4
movl %eax,LPatch4-4
movl %eax,LPatch5-4
movl %eax,LPatch6-4
movl %eax,LPatch7-4
movl %eax,LPatch8-4
movl %eax,LPatch9-4
ret
//----------------------------------------------------------------------
// Alias model triangle left-edge scanning code
//----------------------------------------------------------------------
#define height 4+16
.globl C(R_PolysetScanLeftEdge)
C(R_PolysetScanLeftEdge):
pushl %ebp // preserve caller stack frame pointer
pushl %esi // preserve register variables
pushl %edi
pushl %ebx
movl height(%esp),%eax
movl C(d_sfrac),%ecx
andl $0xFFFF,%eax
movl C(d_ptex),%ebx
orl %eax,%ecx
movl C(d_pedgespanpackage),%esi
movl C(d_tfrac),%edx
movl C(d_light),%edi
movl C(d_zi),%ebp
// %eax: scratch
// %ebx: d_ptex
// %ecx: d_sfrac in high word, count in low word
// %edx: d_tfrac
// %esi: d_pedgespanpackage, errorterm, scratch alternately
// %edi: d_light
// %ebp: d_zi
// do
// {
LScanLoop:
// d_pedgespanpackage->ptex = ptex;
// d_pedgespanpackage->pdest = d_pdest;
// d_pedgespanpackage->pz = d_pz;
// d_pedgespanpackage->count = d_aspancount;
// d_pedgespanpackage->light = d_light;
// d_pedgespanpackage->zi = d_zi;
// d_pedgespanpackage->sfrac = d_sfrac << 16;
// d_pedgespanpackage->tfrac = d_tfrac << 16;
movl %ebx,spanpackage_t_ptex(%esi)
movl C(d_pdest),%eax
movl %eax,spanpackage_t_pdest(%esi)
movl C(d_pz),%eax
movl %eax,spanpackage_t_pz(%esi)
movl C(d_aspancount),%eax
movl %eax,spanpackage_t_count(%esi)
movl %edi,spanpackage_t_light(%esi)
movl %ebp,spanpackage_t_zi(%esi)
movl %ecx,spanpackage_t_sfrac(%esi)
movl %edx,spanpackage_t_tfrac(%esi)
// pretouch the next cache line
movb spanpackage_t_size(%esi),%al
// d_pedgespanpackage++;
addl $(spanpackage_t_size),%esi
movl C(erroradjustup),%eax
movl %esi,C(d_pedgespanpackage)
// errorterm += erroradjustup;
movl C(errorterm),%esi
addl %eax,%esi
movl C(d_pdest),%eax
// if (errorterm >= 0)
// {
js LNoLeftEdgeTurnover
// errorterm -= erroradjustdown;
// d_pdest += d_pdestextrastep;
subl C(erroradjustdown),%esi
addl C(d_pdestextrastep),%eax
movl %esi,C(errorterm)
movl %eax,C(d_pdest)
// d_pz += d_pzextrastep;
// d_aspancount += d_countextrastep;
// d_ptex += d_ptexextrastep;
// d_sfrac += d_sfracextrastep;
// d_ptex += d_sfrac >> 16;
// d_sfrac &= 0xFFFF;
// d_tfrac += d_tfracextrastep;
movl C(d_pz),%eax
movl C(d_aspancount),%esi
addl C(d_pzextrastep),%eax
addl C(d_sfracextrastep),%ecx
adcl C(d_ptexextrastep),%ebx
addl C(d_countextrastep),%esi
movl %eax,C(d_pz)
movl C(d_tfracextrastep),%eax
movl %esi,C(d_aspancount)
addl %eax,%edx
// if (d_tfrac & 0x10000)
// {
jnc LSkip1
// d_ptex += r_affinetridesc.skinwidth;
// d_tfrac &= 0xFFFF;
addl C(r_affinetridesc)+atd_skinwidth,%ebx
// }
LSkip1:
// d_light += d_lightextrastep;
// d_zi += d_ziextrastep;
addl C(d_lightextrastep),%edi
addl C(d_ziextrastep),%ebp
// }
movl C(d_pedgespanpackage),%esi
decl %ecx
testl $0xFFFF,%ecx
jnz LScanLoop
popl %ebx
popl %edi
popl %esi
popl %ebp
ret
// else
// {
LNoLeftEdgeTurnover:
movl %esi,C(errorterm)
// d_pdest += d_pdestbasestep;
addl C(d_pdestbasestep),%eax
movl %eax,C(d_pdest)
// d_pz += d_pzbasestep;
// d_aspancount += ubasestep;
// d_ptex += d_ptexbasestep;
// d_sfrac += d_sfracbasestep;
// d_ptex += d_sfrac >> 16;
// d_sfrac &= 0xFFFF;
movl C(d_pz),%eax
movl C(d_aspancount),%esi
addl C(d_pzbasestep),%eax
addl C(d_sfracbasestep),%ecx
adcl C(d_ptexbasestep),%ebx
addl C(ubasestep),%esi
movl %eax,C(d_pz)
movl %esi,C(d_aspancount)
// d_tfrac += d_tfracbasestep;
movl C(d_tfracbasestep),%esi
addl %esi,%edx
// if (d_tfrac & 0x10000)
// {
jnc LSkip2
// d_ptex += r_affinetridesc.skinwidth;
// d_tfrac &= 0xFFFF;
addl C(r_affinetridesc)+atd_skinwidth,%ebx
// }
LSkip2:
// d_light += d_lightbasestep;
// d_zi += d_zibasestep;
addl C(d_lightbasestep),%edi
addl C(d_zibasestep),%ebp
// }
// } while (--height);
movl C(d_pedgespanpackage),%esi
decl %ecx
testl $0xFFFF,%ecx
jnz LScanLoop
popl %ebx
popl %edi
popl %esi
popl %ebp
ret
#endif // id386