quakeforge/libs/video/renderer/sw32/d_scan.c

/*
	d_scan.c

	Portable C scan-level rasterization code, all pixel depths.

	Copyright (C) 1996-1997  Id Software, Inc.

	This program is free software; you can redistribute it and/or
	modify it under the terms of the GNU General Public License
	as published by the Free Software Foundation; either version 2
	of the License, or (at your option) any later version.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

	See the GNU General Public License for more details.

	You should have received a copy of the GNU General Public License
	along with this program; if not, write to:

		Free Software Foundation, Inc.
		59 Temple Place - Suite 330
		Boston, MA  02111-1307, USA

*/
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif

#define NH_DEFINE
#include "namehack.h"

#include "QF/qendian.h"
#include "QF/render.h"
#include "QF/sys.h"

#include "compat.h"
#include "d_local.h"
#include "r_internal.h"
#include "vid_internal.h"

// State shared between sw32_Turbulent (which sets it up for each run of
// pixels) and D_DrawTurbulentSpan (which consumes and advances it).

// source texels; sw32_Turbulent points this at sw32_cacheblock and
// D_DrawTurbulentSpan indexes it as a 64 texel wide, 8 bit texture
static byte       *r_turb_pbase;
// next destination pixel; interpreted according to sw32_r_pixbytes
static void       *r_turb_pdest;
// current texture coordinates and their per-pixel steps; the texel index is
// taken from the bits above bit 15 (value >> 16)
static fixed16_t   r_turb_s, r_turb_t, r_turb_sstep, r_turb_tstep;
// sine table position for the current frame (points into sw32_sintable)
static int        *r_turb_turb;
// number of pixels the next D_DrawTurbulentSpan call has to draw
static int         r_turb_spancount;

/*
	D_WarpScreen

	this performs a slight compression of the screen at the same time as
	the sine warp, to keep the edges from wrapping
*/
void
sw32_D_WarpScreen (void)
{
	switch(sw32_r_pixbytes) {
	case 1:
	{
		int         w, h;
		int         u, v;
		byte       *dest;
		int        *turb;
		int        *col;
		byte      **row;
		byte       *rowptr[MAXHEIGHT];
		int         column[MAXWIDTH];
		float       wratio, hratio;

		w = r_refdef.vrect.width;
		h = r_refdef.vrect.height;

		wratio = w / (float) scr_vrect.width;
		hratio = h / (float) scr_vrect.height;

		for (v = 0; v < scr_vrect.height + AMP2 * 2; v++) {
			rowptr[v] = (byte *) sw32_d_viewbuffer + (r_refdef.vrect.y *
												 sw32_screenwidth) +
				(sw32_screenwidth * (int) ((float) v * hratio * h /
									  (h + AMP2 * 2)));
		}

		for (u = 0; u < scr_vrect.width + AMP2 * 2; u++) {
			column[u] = r_refdef.vrect.x +
				(int) ((float) u * wratio * w / (w + AMP2 * 2));
		}

		turb = sw32_intsintable + ((int) (vr_data.realtime * SPEED) & (CYCLE - 1));
		dest = (byte *)vid.buffer + scr_vrect.y * vid.rowbytes +
						 scr_vrect.x;

		for (v = 0; v < scr_vrect.height; v++, dest += vid.rowbytes) {
			col = &column[turb[v]];
			row = &rowptr[v];
			for (u = 0; u < scr_vrect.width; u += 4) {
				dest[u + 0] = row[turb[u + 0]][col[u + 0]];
				dest[u + 1] = row[turb[u + 1]][col[u + 1]];
				dest[u + 2] = row[turb[u + 2]][col[u + 2]];
				dest[u + 3] = row[turb[u + 3]][col[u + 3]];
			}
		}
	}
	break;
	case 2:
	{
		int         w, h;
		int         u, v;
		short      *dest;
		int        *turb;
		int        *col;
		short     **row;
		short      *rowptr[MAXHEIGHT];
		int         column[MAXWIDTH];
		float       wratio, hratio;

		w = r_refdef.vrect.width;
		h = r_refdef.vrect.height;

		wratio = w / (float) scr_vrect.width;
		hratio = h / (float) scr_vrect.height;

		for (v = 0; v < scr_vrect.height + AMP2 * 2; v++) {
			rowptr[v] = (short *) sw32_d_viewbuffer +
				(r_refdef.vrect.y * sw32_screenwidth) +
				(sw32_screenwidth * (int) ((float) v * hratio * h /
									  (h + AMP2 * 2)));
		}

		for (u = 0; u < scr_vrect.width + AMP2 * 2; u++) {
			column[u] = r_refdef.vrect.x +
				(int) ((float) u * wratio * w / (w + AMP2 * 2));
		}

		turb = sw32_intsintable + ((int) (vr_data.realtime * SPEED) & (CYCLE - 1));
		dest = (short *) vid.buffer + scr_vrect.y * (vid.rowbytes >> 1) +
			scr_vrect.x;

		for (v = 0; v < scr_vrect.height; v++, dest += (vid.rowbytes >> 1)) {
			col = &column[turb[v]];
			row = &rowptr[v];
			for (u = 0; u < scr_vrect.width; u += 4) {
				dest[u + 0] = row[turb[u + 0]][col[u + 0]];
				dest[u + 1] = row[turb[u + 1]][col[u + 1]];
				dest[u + 2] = row[turb[u + 2]][col[u + 2]];
				dest[u + 3] = row[turb[u + 3]][col[u + 3]];
			}
		}
	}
	break;
	case 4:
	{
		int         w, h;
		int         u, v;
		int        *dest;
		int        *turb;
		int        *col;
		int       **row;
		int        *rowptr[MAXHEIGHT];
		int         column[MAXWIDTH];
		float       wratio, hratio;

		w = r_refdef.vrect.width;
		h = r_refdef.vrect.height;

		wratio = w / (float) scr_vrect.width;
		hratio = h / (float) scr_vrect.height;

		for (v = 0; v < scr_vrect.height + AMP2 * 2; v++) {
			rowptr[v] = (int *) sw32_d_viewbuffer +
				(r_refdef.vrect.y * sw32_screenwidth) +
				(sw32_screenwidth * (int) ((float) v * hratio * h /
									  (h + AMP2 * 2)));
		}

		for (u = 0; u < scr_vrect.width + AMP2 * 2; u++) {
			column[u] = r_refdef.vrect.x +
				(int) ((float) u * wratio * w / (w + AMP2 * 2));
		}

		turb = sw32_intsintable + ((int) (vr_data.realtime * SPEED) & (CYCLE - 1));
		dest = (int *) vid.buffer + scr_vrect.y * (vid.rowbytes >> 2) +
			scr_vrect.x;

		for (v = 0; v < scr_vrect.height; v++, dest += (vid.rowbytes >> 2)) {
			col = &column[turb[v]];
			row = &rowptr[v];
			for (u = 0; u < scr_vrect.width; u += 4) {
				dest[u + 0] = row[turb[u + 0]][col[u + 0]];
				dest[u + 1] = row[turb[u + 1]][col[u + 1]];
				dest[u + 2] = row[turb[u + 2]][col[u + 2]];
				dest[u + 3] = row[turb[u + 3]][col[u + 3]];
			}
		}
	}
	break;
	default:
		Sys_Error("D_WarpScreen: unsupported r_pixbytes %i", sw32_r_pixbytes);
	}
}

/*
	turb_next_texel

	Return the 8 bit texel for the current r_turb_s / r_turb_t position and
	advance both coordinates by their per-pixel steps.

	Each coordinate is offset by the sine table entry selected by the other
	coordinate before it is wrapped into the 64x64 texture.
*/
static byte
turb_next_texel (void)
{
	int         s_idx, t_idx;
	byte        texel;

	s_idx = ((r_turb_s + r_turb_turb[(r_turb_t >> 16) & (CYCLE - 1)])
			 >> 16) & 63;
	t_idx = ((r_turb_t + r_turb_turb[(r_turb_s >> 16) & (CYCLE - 1)])
			 >> 16) & 63;
	texel = r_turb_pbase[(t_idx << 6) + s_idx];

	r_turb_s += r_turb_sstep;
	r_turb_t += r_turb_tstep;

	return texel;
}

/*
	D_DrawTurbulentSpan

	Draw r_turb_spancount turbulent pixels (always at least one) starting at
	r_turb_pdest, in the pixel format selected by sw32_r_pixbytes.

	8 bit output stores the texel directly; 16 and 32 bit output translate it
	through sw32_8to16table and d_8to24table respectively.  On return
	r_turb_pdest points just past the last pixel written and r_turb_s /
	r_turb_t have been advanced once per pixel.
*/
static void
D_DrawTurbulentSpan (void)
{
	switch (sw32_r_pixbytes) {
	case 1:
	{
		byte       *out = (byte *) r_turb_pdest;

		do {
			*out++ = turb_next_texel ();
		} while (--r_turb_spancount > 0);
		r_turb_pdest = (byte *) out;
	}
	break;
	case 2:
	{
		short      *out = (short *) r_turb_pdest;

		do {
			*out++ = sw32_8to16table[turb_next_texel ()];
		} while (--r_turb_spancount > 0);
		r_turb_pdest = (byte *) out;
	}
	break;
	case 4:
	{
		int        *out = (int *) r_turb_pdest;

		do {
			*out++ = d_8to24table[turb_next_texel ()];
		} while (--r_turb_spancount > 0);
		r_turb_pdest = (byte *) out;
	}
	break;
	default:
		Sys_Error("D_DrawTurbulentSpan: unsupported r_pixbytes %i",
				  sw32_r_pixbytes);
	}
}

/*
	sw32_Turbulent

	Draw a linked list of spans with the turbulent (sine warped) texture in
	sw32_cacheblock.

	Every span is cut into runs of up to 16 pixels.  The texture coordinates
	are computed from the s/z, t/z and 1/z gradients at both ends of a run
	and stepped linearly in between by D_DrawTurbulentSpan.

	pspan must not be NULL; the list is followed through pnext until NULL.
*/
void
sw32_Turbulent (espan_t *pspan)
{
	int         remaining;
	fixed16_t   s_end, t_end;
	float       sdivz, tdivz, zi, z, du, dv, last;
	float       sdivz_run, tdivz_run, zi_run;

	r_turb_turb = sw32_sintable + ((int) (vr_data.realtime * SPEED) & (CYCLE - 1));
	r_turb_pbase = (byte *) sw32_cacheblock;

	// steps are only computed for runs longer than one pixel, make sure
	// they always hold a defined value
	r_turb_sstep = 0;
	r_turb_tstep = 0;

	// gradient change over a full 16 pixel run
	sdivz_run = sw32_d_sdivzstepu * 16;
	tdivz_run = sw32_d_tdivzstepu * 16;
	zi_run = d_zistepu * 16 * 65536;

	do {
		r_turb_pdest = (byte *) sw32_d_viewbuffer + ((sw32_screenwidth * pspan->v) +
												pspan->u) * sw32_r_pixbytes;
		remaining = pspan->count;

		// s/z, t/z and 1/z at the first pixel of the span
		du = (float) pspan->u;
		dv = (float) pspan->v;

		sdivz = sw32_d_sdivzorigin + dv * sw32_d_sdivzstepv + du * sw32_d_sdivzstepu;
		tdivz = sw32_d_tdivzorigin + dv * sw32_d_tdivzstepv + du * sw32_d_tdivzstepu;
		zi = (d_ziorigin + dv * d_zistepv + du * d_zistepu) * 65536.0f;
		z = sw32_d_zitable[(unsigned short) zi];

		// starting s and t, clamped to the texture extents
		r_turb_s = (int) (sdivz * z) + sw32_sadjust;
		r_turb_s = (r_turb_s > sw32_bbextents) ? sw32_bbextents
			: ((r_turb_s < 0) ? 0 : r_turb_s);

		r_turb_t = (int) (tdivz * z) + sw32_tadjust;
		r_turb_t = (r_turb_t > sw32_bbextentt) ? sw32_bbextentt
			: ((r_turb_t < 0) ? 0 : r_turb_t);

		do {
			r_turb_spancount = (remaining >= 16) ? 16 : remaining;
			remaining -= r_turb_spancount;

			if (remaining) {
				// more runs follow: evaluate the gradients at the first
				// pixel of the next run
				sdivz += sdivz_run;
				tdivz += tdivz_run;
				zi += zi_run;
			} else {
				// final run: evaluate the gradients at the last pixel of
				// the span, so we can't step off the polygon
				last = (float) (r_turb_spancount - 1);
				sdivz += sw32_d_sdivzstepu * last;
				tdivz += sw32_d_tdivzstepu * last;
				zi += d_zistepu * 65536.0f * last;
			}
			z = sw32_d_zitable[(unsigned short) zi];

			// s and t at the far end of the run; the lower bound of 16
			// prevents round-off error on negative steps from causing
			// overstepping and running off the edge of the texture
			s_end = (int) (sdivz * z) + sw32_sadjust;
			s_end = (s_end > sw32_bbextents) ? sw32_bbextents
				: ((s_end < 16) ? 16 : s_end);

			t_end = (int) (tdivz * z) + sw32_tadjust;
			t_end = (t_end > sw32_bbextentt) ? sw32_bbextentt
				: ((t_end < 16) ? 16 : t_end);

			if (remaining) {
				// full 16 pixel run, the division is a shift
				r_turb_sstep = (s_end - r_turb_s) >> 4;
				r_turb_tstep = (t_end - r_turb_t) >> 4;
			} else if (r_turb_spancount > 1) {
				// partial run: divide by the number of steps between the
				// first and the last pixel
				r_turb_sstep = (s_end - r_turb_s) / (r_turb_spancount - 1);
				r_turb_tstep = (t_end - r_turb_t) / (r_turb_spancount - 1);
			}

			// wrap the coordinates into the sine table range
			r_turb_s = r_turb_s & ((CYCLE << 16) - 1);
			r_turb_t = r_turb_t & ((CYCLE << 16) - 1);

			D_DrawTurbulentSpan ();

			// the next run starts where this one ended
			r_turb_s = s_end;
			r_turb_t = t_end;
		} while (remaining > 0);

		pspan = pspan->pnext;
	} while (pspan != NULL);
}

/*
	draw_spans_byte

	Draw a list of textured spans with 8 bit pixels.

	The texture coordinates are perspective corrected every 8 pixels (1/z is
	looked up in sw32_d_zitable) and stepped linearly in between.  The
	texture is read from sw32_cacheblock, sw32_cachewidth texels per row.
*/
static void
draw_spans_byte (espan_t *pspan)
{
	byte       *texels = (byte *) sw32_cacheblock;
	byte       *out;
	int         remaining, i;
	fixed16_t   s, t, s_end, t_end, ds, dt;
	float       sdivz, tdivz, zi, z, du, dv;
	float       sdivz8stepu, tdivz8stepu, zi8stepu;

	ds = 0;								// keep compiler happy
	dt = 0;								// ditto

	sdivz8stepu = sw32_d_sdivzstepu * 8;
	tdivz8stepu = sw32_d_tdivzstepu * 8;
	zi8stepu = d_zistepu * 8 * 65536;

	do {
		out = (byte *) sw32_d_viewbuffer + (sw32_screenwidth * pspan->v) +
			pspan->u;
		remaining = pspan->count;

		// initial s/z, t/z and 1/z, then s and t clamped to the texture
		du = (float) pspan->u;
		dv = (float) pspan->v;

		sdivz = sw32_d_sdivzorigin + dv * sw32_d_sdivzstepv + du * sw32_d_sdivzstepu;
		tdivz = sw32_d_tdivzorigin + dv * sw32_d_tdivzstepv + du * sw32_d_tdivzstepu;
		zi = (d_ziorigin + dv * d_zistepv + du * d_zistepu) * 65536.0f;
		z = sw32_d_zitable[(unsigned short) zi];

		s = (int) (sdivz * z) + sw32_sadjust;
		s = bound(0, s, sw32_bbextents);
		t = (int) (tdivz * z) + sw32_tadjust;
		t = bound(0, t, sw32_bbextentt);

		// full runs of 8 pixels
		for (; remaining >= 8; remaining -= 8, out += 8) {
			// s and t at the start of the next run
			sdivz += sdivz8stepu;
			tdivz += tdivz8stepu;
			zi += zi8stepu;
			z = sw32_d_zitable[(unsigned short) zi];

			// the lower bound prevents round-off error on negative steps
			// from causing overstepping and running off the texture
			s_end = (int) (sdivz * z) + sw32_sadjust;
			s_end = bound(8, s_end, sw32_bbextents);
			t_end = (int) (tdivz * z) + sw32_tadjust;
			t_end = bound(8, t_end, sw32_bbextentt);

			ds = (s_end - s) >> 3;
			dt = (t_end - t) >> 3;

			out[0] = texels[(t >> 16) * sw32_cachewidth + (s >> 16)];
			for (i = 1; i < 8; i++) {
				s += ds;
				t += dt;
				out[i] = texels[(t >> 16) * sw32_cachewidth + (s >> 16)];
			}

			s = s_end;
			t = t_end;
		}

		// left over run of 1 to 7 pixels
		if (remaining) {
			// the gradients are advanced by the whole remaining count
			// (not count - 1) and the step is the difference divided by
			// that count
			sdivz += sw32_d_sdivzstepu * remaining;
			tdivz += sw32_d_tdivzstepu * remaining;
			zi += d_zistepu * 65536.0f * remaining;
			z = sw32_d_zitable[(unsigned short) zi];

			s_end = (int) (sdivz * z) + sw32_sadjust;
			s_end = bound(remaining, s_end, sw32_bbextents);
			t_end = (int) (tdivz * z) + sw32_tadjust;
			t_end = bound(remaining, t_end, sw32_bbextentt);

			if (remaining > 1) {
				ds = (s_end - s) / remaining;
				dt = (t_end - t) / remaining;
			}

			out[0] = texels[(t >> 16) * sw32_cachewidth + (s >> 16)];
			for (i = 1; i < remaining; i++) {
				s += ds;
				t += dt;
				out[i] = texels[(t >> 16) * sw32_cachewidth + (s >> 16)];
			}
		}

		pspan = pspan->pnext;
	} while (pspan != NULL);
}

/*
	draw_spans_short

	Same as draw_spans_byte, for 16 bit pixels and 16 bit texels.
*/
static void
draw_spans_short (espan_t *pspan)
{
	short      *texels = (short *) sw32_cacheblock;
	short      *out;
	int         remaining, i;
	fixed16_t   s, t, s_end, t_end, ds, dt;
	float       sdivz, tdivz, zi, z, du, dv;
	float       sdivz8stepu, tdivz8stepu, zi8stepu;

	ds = 0;								// keep compiler happy
	dt = 0;								// ditto

	sdivz8stepu = sw32_d_sdivzstepu * 8;
	tdivz8stepu = sw32_d_tdivzstepu * 8;
	zi8stepu = d_zistepu * 8 * 65536;

	do {
		out = (short *) sw32_d_viewbuffer + (sw32_screenwidth * pspan->v) +
			pspan->u;
		remaining = pspan->count;

		// initial s/z, t/z and 1/z, then s and t clamped to the texture
		du = (float) pspan->u;
		dv = (float) pspan->v;

		sdivz = sw32_d_sdivzorigin + dv * sw32_d_sdivzstepv + du * sw32_d_sdivzstepu;
		tdivz = sw32_d_tdivzorigin + dv * sw32_d_tdivzstepv + du * sw32_d_tdivzstepu;
		zi = (d_ziorigin + dv * d_zistepv + du * d_zistepu) * 65536.0f;
		z = sw32_d_zitable[(unsigned short) zi];

		s = (int) (sdivz * z) + sw32_sadjust;
		s = bound(0, s, sw32_bbextents);
		t = (int) (tdivz * z) + sw32_tadjust;
		t = bound(0, t, sw32_bbextentt);

		// full runs of 8 pixels
		for (; remaining >= 8; remaining -= 8, out += 8) {
			// s and t at the start of the next run
			sdivz += sdivz8stepu;
			tdivz += tdivz8stepu;
			zi += zi8stepu;
			z = sw32_d_zitable[(unsigned short) zi];

			// the lower bound prevents round-off error on negative steps
			// from causing overstepping and running off the texture
			s_end = (int) (sdivz * z) + sw32_sadjust;
			s_end = bound(8, s_end, sw32_bbextents);
			t_end = (int) (tdivz * z) + sw32_tadjust;
			t_end = bound(8, t_end, sw32_bbextentt);

			ds = (s_end - s) >> 3;
			dt = (t_end - t) >> 3;

			out[0] = texels[(t >> 16) * sw32_cachewidth + (s >> 16)];
			for (i = 1; i < 8; i++) {
				s += ds;
				t += dt;
				out[i] = texels[(t >> 16) * sw32_cachewidth + (s >> 16)];
			}

			s = s_end;
			t = t_end;
		}

		// left over run of 1 to 7 pixels
		if (remaining) {
			// the gradients are advanced by the whole remaining count
			// (not count - 1) and the step is the difference divided by
			// that count
			sdivz += sw32_d_sdivzstepu * remaining;
			tdivz += sw32_d_tdivzstepu * remaining;
			zi += d_zistepu * 65536.0f * remaining;
			z = sw32_d_zitable[(unsigned short) zi];

			s_end = (int) (sdivz * z) + sw32_sadjust;
			s_end = bound(remaining, s_end, sw32_bbextents);
			t_end = (int) (tdivz * z) + sw32_tadjust;
			t_end = bound(remaining, t_end, sw32_bbextentt);

			if (remaining > 1) {
				ds = (s_end - s) / remaining;
				dt = (t_end - t) / remaining;
			}

			out[0] = texels[(t >> 16) * sw32_cachewidth + (s >> 16)];
			for (i = 1; i < remaining; i++) {
				s += ds;
				t += dt;
				out[i] = texels[(t >> 16) * sw32_cachewidth + (s >> 16)];
			}
		}

		pspan = pspan->pnext;
	} while (pspan != NULL);
}

/*
	draw_spans_int

	Same as draw_spans_byte, for 32 bit pixels and 32 bit texels.
*/
static void
draw_spans_int (espan_t *pspan)
{
	int        *texels = (int *) sw32_cacheblock;
	int        *out;
	int         remaining, i;
	fixed16_t   s, t, s_end, t_end, ds, dt;
	float       sdivz, tdivz, zi, z, du, dv;
	float       sdivz8stepu, tdivz8stepu, zi8stepu;

	ds = 0;								// keep compiler happy
	dt = 0;								// ditto

	sdivz8stepu = sw32_d_sdivzstepu * 8;
	tdivz8stepu = sw32_d_tdivzstepu * 8;
	zi8stepu = d_zistepu * 8 * 65536;

	do {
		out = (int *) sw32_d_viewbuffer + (sw32_screenwidth * pspan->v) + pspan->u;
		remaining = pspan->count;

		// initial s/z, t/z and 1/z, then s and t clamped to the texture
		du = (float) pspan->u;
		dv = (float) pspan->v;

		sdivz = sw32_d_sdivzorigin + dv * sw32_d_sdivzstepv + du * sw32_d_sdivzstepu;
		tdivz = sw32_d_tdivzorigin + dv * sw32_d_tdivzstepv + du * sw32_d_tdivzstepu;
		zi = (d_ziorigin + dv * d_zistepv + du * d_zistepu) * 65536.0f;
		z = sw32_d_zitable[(unsigned short) zi];

		s = (int) (sdivz * z) + sw32_sadjust;
		s = bound(0, s, sw32_bbextents);
		t = (int) (tdivz * z) + sw32_tadjust;
		t = bound(0, t, sw32_bbextentt);

		// full runs of 8 pixels
		for (; remaining >= 8; remaining -= 8, out += 8) {
			// s and t at the start of the next run
			sdivz += sdivz8stepu;
			tdivz += tdivz8stepu;
			zi += zi8stepu;
			z = sw32_d_zitable[(unsigned short) zi];

			// the lower bound prevents round-off error on negative steps
			// from causing overstepping and running off the texture
			s_end = (int) (sdivz * z) + sw32_sadjust;
			s_end = bound(8, s_end, sw32_bbextents);
			t_end = (int) (tdivz * z) + sw32_tadjust;
			t_end = bound(8, t_end, sw32_bbextentt);

			ds = (s_end - s) >> 3;
			dt = (t_end - t) >> 3;

			out[0] = texels[(t >> 16) * sw32_cachewidth + (s >> 16)];
			for (i = 1; i < 8; i++) {
				s += ds;
				t += dt;
				out[i] = texels[(t >> 16) * sw32_cachewidth + (s >> 16)];
			}

			s = s_end;
			t = t_end;
		}

		// left over run of 1 to 7 pixels
		if (remaining) {
			// the gradients are advanced by the whole remaining count
			// (not count - 1) and the step is the difference divided by
			// that count
			sdivz += sw32_d_sdivzstepu * remaining;
			tdivz += sw32_d_tdivzstepu * remaining;
			zi += d_zistepu * 65536.0f * remaining;
			z = sw32_d_zitable[(unsigned short) zi];

			s_end = (int) (sdivz * z) + sw32_sadjust;
			s_end = bound(remaining, s_end, sw32_bbextents);
			t_end = (int) (tdivz * z) + sw32_tadjust;
			t_end = bound(remaining, t_end, sw32_bbextentt);

			if (remaining > 1) {
				ds = (s_end - s) / remaining;
				dt = (t_end - t) / remaining;
			}

			out[0] = texels[(t >> 16) * sw32_cachewidth + (s >> 16)];
			for (i = 1; i < remaining; i++) {
				s += ds;
				t += dt;
				out[i] = texels[(t >> 16) * sw32_cachewidth + (s >> 16)];
			}
		}

		pspan = pspan->pnext;
	} while (pspan != NULL);
}

/*
	D_DrawSpans

	Draw a linked list of textured spans into sw32_d_viewbuffer, using the
	rasterizer that matches sw32_r_pixbytes (1, 2 or 4 bytes per pixel).
	Any other pixel size is reported through Sys_Error.

	pspan must not be NULL.
*/
void
sw32_D_DrawSpans (espan_t *pspan)
{
	switch(sw32_r_pixbytes) {
	case 1:
		draw_spans_byte (pspan);
		break;
	case 2:
		draw_spans_short (pspan);
		break;
	case 4:
		draw_spans_int (pspan);
		break;
	default:
		Sys_Error("D_DrawSpans: unsupported r_pixbytes %i", sw32_r_pixbytes);
	}
}

/*
	D_DrawZSpans

	Fill the z buffer (sw32_d_pzbuffer, sw32_d_zwidth entries per row) for a
	linked list of spans.

	1/z is stepped across each span as a fixed point integer whose upper 16
	bits are the value stored per pixel.

	pspan must not be NULL.
*/
void
sw32_D_DrawZSpans (espan_t *pspan)
{
	int         remaining, pairs;
	int         izi, izistep;
	short      *zout;
	unsigned int packed;
	double      zi;
	float       du, dv;

	// FIXME: check for clamping/range problems
	// we count on FP exceptions being turned off to avoid range problems
	izistep = (int) (d_zistepu * 0x8000 * 0x10000);

	do {
		zout = sw32_d_pzbuffer + (sw32_d_zwidth * pspan->v) + pspan->u;
		remaining = pspan->count;

		// 1/z at the first pixel of the span
		du = (float) pspan->u;
		dv = (float) pspan->v;
		zi = d_ziorigin + dv * d_zistepv + du * d_zistepu;

		// we count on FP exceptions being turned off to avoid range problems
		izi = (int) (zi * 0x8000 * 0x10000);

		if (bigendien) {
			// LordHavoc: the paired write below is not correct on
			// big-endian (results in swapped depth pairs) and is tuned more
			// for x86, so big-endian machines simply store one value at a
			// time
			do {
				*zout++ = (short) (izi >> 16);
				izi += izistep;
			} while (--remaining);
		} else {
			// store a single value first if needed, so that the paired
			// writes start on a 4 byte boundary
			if ((intptr_t) zout & 0x02) {
				*zout++ = (short) (izi >> 16);
				izi += izistep;
				remaining--;
			}

			// two depth values per 32 bit store: the first in the low
			// half, the second in the high half
			for (pairs = remaining >> 1; pairs > 0; pairs--) {
				packed = izi >> 16;
				izi += izistep;
				packed |= izi & 0xFFFF0000;
				izi += izistep;
				*(int *) zout = packed;
				zout += 2;
			}

			// odd pixel at the end
			if (remaining & 1) {
				*zout = (short) (izi >> 16);
			}
		}

		pspan = pspan->pnext;
	} while (pspan != NULL);
}