gzdoom/src/asm_ia32/tmap.asm

;*
;* tmap.nas
;* The texture-mapping inner loops in pure assembly language.
;*
;*---------------------------------------------------------------------------
;* Copyright 1998-2006 Randy Heit
;* All rights reserved.
;*
;* Redistribution and use in source and binary forms, with or without
;* modification, are permitted provided that the following conditions
;* are met:
;*
;* 1. Redistributions of source code must retain the above copyright
;*    notice, this list of conditions and the following disclaimer.
;* 2. Redistributions in binary form must reproduce the above copyright
;*    notice, this list of conditions and the following disclaimer in the
;*    documentation and/or other materials provided with the distribution.
;* 3. The name of the author may not be used to endorse or promote products
;*    derived from this software without specific prior written permission.
;*
;* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
;* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
;* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
;* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
;* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
;* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;*---------------------------------------------------------------------------
;*

BITS 32

%include "valgrind.inc"

; Segment/section definition macros. 

	SECTION .data

%define SPACEFILLER4 (0x44444444)

; If you change this in r_draw.c, be sure to change it here, too!
FUZZTABLE	equ	50

%ifndef M_TARGET_LINUX

%define ylookup			_ylookup
%define centery			_centery
%define fuzzpos			_fuzzpos
%define fuzzoffset		_fuzzoffset
%define NormalLight		_NormalLight
%define viewheight		_viewheight
%define fuzzviewheight	_fuzzviewheight
%define CPU				_CPU

%define dc_pitch		_dc_pitch
%define dc_colormap		_dc_colormap
%define dc_color		_dc_color
%define dc_iscale		_dc_iscale
%define dc_texturefrac	_dc_texturefrac
%define dc_srcblend		_dc_srcblend
%define dc_destblend	_dc_destblend
%define dc_source		_dc_source
%define dc_yl			_dc_yl
%define dc_yh			_dc_yh
%define dc_x			_dc_x
%define dc_count		_dc_count
%define dc_dest			_dc_dest
%define dc_destorg		_dc_destorg

%define Col2RGB8		_Col2RGB8
%define RGB32k			_RGB32k

%define dc_ctspan		_dc_ctspan
%define dc_temp			_dc_temp

%define ds_xstep		_ds_xstep
%define ds_ystep		_ds_ystep
%define ds_colormap		_ds_colormap
%define ds_source		_ds_source
%define ds_x1			_ds_x1
%define ds_x2			_ds_x2
%define ds_xfrac		_ds_xfrac
%define ds_yfrac		_ds_yfrac
%define ds_y			_ds_y

%define ds_cursource	_ds_cursource
%define ds_curcolormap	_ds_curcolormap

%define R_SetSpanSource_ASM		_R_SetSpanSource_ASM
%define R_SetSpanSize_ASM		_R_SetSpanSize_ASM
%define R_SetSpanColormap_ASM	_R_SetSpanColormap_ASM
%define R_SetupShadedCol		_R_SetupShadedCol
%define R_SetupAddCol			_R_SetupAddCol
%define R_SetupAddClampCol		_R_SetupAddClampCol

%endif

EXTERN ylookup
EXTERN centery
EXTERN fuzzpos
EXTERN fuzzoffset
EXTERN NormalLight
EXTERN viewheight
EXTERN fuzzviewheight
EXTERN CPU

EXTERN dc_pitch
EXTERN dc_colormap
EXTERN dc_color
EXTERN dc_iscale
EXTERN dc_texturefrac
EXTERN dc_srcblend
EXTERN dc_destblend
EXTERN dc_source
EXTERN dc_yl
EXTERN dc_yh
EXTERN dc_x
EXTERN dc_count
EXTERN dc_dest
EXTERN dc_destorg

EXTERN dc_ctspan
EXTERN dc_temp

EXTERN Col2RGB8
EXTERN RGB32k

EXTERN ds_xstep
EXTERN ds_ystep
EXTERN ds_colormap
EXTERN ds_source
EXTERN ds_x1
EXTERN ds_x2
EXTERN ds_xfrac
EXTERN ds_yfrac
EXTERN ds_y

GLOBAL ds_cursource
GLOBAL ds_curcolormap


ds_cursource:
	DD 0

ds_curcolormap:
	DD 0


; Local stuff:
lastAddress	DD 0
pixelcount	DD 0

	SECTION .text


GLOBAL @R_SetSpanSource_ASM@4
GLOBAL R_SetSpanSource_ASM

R_SetSpanSource_ASM:
	mov	ecx,[esp+4]

@R_SetSpanSource_ASM@4:
	mov	[spreada+2],ecx
	mov	[spreadb+2],ecx
	mov	[spreadc+2],ecx
	mov	[spreadd+2],ecx
	mov	[spreade+2],ecx
	mov	[spreadf+2],ecx
	mov	[spreadg+2],ecx

	mov	[mspreada+2],ecx
	mov	[mspreadb+2],ecx
	mov	[mspreadc+2],ecx
	mov	[mspreadd+2],ecx
	mov	[mspreade+2],ecx
	mov	[mspreadf+2],ecx
	mov	[mspreadg+2],ecx
	
	selfmod spreada, mspreadg+6

	mov	[ds_cursource],ecx
	ret

GLOBAL @R_SetSpanColormap_ASM@4
GLOBAL R_SetSpanColormap_ASM

R_SetSpanColormap_ASM:
	mov ecx,[esp+4]

@R_SetSpanColormap_ASM@4:
	mov	[spmapa+2],ecx
	mov	[spmapb+2],ecx
	mov	[spmapc+2],ecx
	mov	[spmapd+2],ecx
	mov	[spmape+2],ecx
	mov	[spmapf+2],ecx
	mov	[spmapg+2],ecx

	mov	[mspmapa+2],ecx
	mov	[mspmapb+2],ecx
	mov	[mspmapc+2],ecx
	mov	[mspmapd+2],ecx
	mov	[mspmape+2],ecx
	mov	[mspmapf+2],ecx
	mov	[mspmapg+2],ecx
	
	selfmod spmapa, mspmapg+6

	mov	[ds_curcolormap],ecx
	ret

GLOBAL R_SetSpanSize_ASM

EXTERN	SetTiltedSpanSize

R_SetSpanSize_ASM:
	mov	edx,[esp+4]
	mov	ecx,[esp+8]
	call	SetTiltedSpanSize
	
	mov	[dsy1+2],dl
	mov	[dsy2+2],dl
	
	mov	[dsx1+2],cl
	mov	[dsx2+2],cl
	mov	[dsx3+2],cl
	mov	[dsx4+2],cl
	mov	[dsx5+2],cl
	mov	[dsx6+2],cl
	mov	[dsx7+2],cl
	
	mov	[dmsy1+2],dl
	mov	[dmsy2+2],dl
	
	mov	[dmsx1+2],cl
	mov	[dmsx2+2],cl
	mov	[dmsx3+2],cl
	mov	[dmsx4+2],cl
	mov	[dmsx5+2],cl
	mov	[dmsx6+2],cl
	mov	[dmsx7+2],cl

	push	ecx
	add	ecx,edx
	mov	eax,1
	shl	eax,cl
	dec	eax
	mov	[dsm1+2],eax
	mov	[dsm5+1],eax
	mov	[dsm6+1],eax
	mov	[dsm7+1],eax
	
	mov	[dmsm1+2],eax
	mov	[dmsm5+1],eax
	mov	[dmsm6+1],eax
	mov	[dmsm7+1],eax
	pop	ecx
	ror	eax,cl
	mov	[dsm2+2],eax
	mov	[dsm3+2],eax
	mov	[dsm4+2],eax

	mov	[dmsm2+2],eax
	mov	[dmsm3+2],eax
	mov	[dmsm4+2],eax
	and	eax,0xffff
	not	eax
	mov	[dsm8+2],eax
	mov	[dsm9+2],eax

	mov	[dmsm8+2],eax
	mov	[dmsm9+2],eax
	
	neg	dl
	mov	[dsy3+2],dl
	mov	[dsy4+2],dl

	mov	[dmsy3+2],dl
	mov	[dmsy4+2],dl
	
	selfmod dsy1, dmsm7+6
	
aret:	ret

%ifdef M_TARGET_MACHO
	SECTION .text align=64
%else
	SECTION .rtext	progbits alloc exec write align=64
%endif

%ifdef M_TARGET_MACHO
GLOBAL _rtext_tmap_start
_rtext_tmap_start:
%endif

rtext_start:

GLOBAL @R_DrawSpanP_ASM@0
GLOBAL _R_DrawSpanP_ASM
GLOBAL R_DrawSpanP_ASM

; eax: scratch
; ebx: zero
; ecx: yfrac at top end, xfrac int part at low end
; edx: xfrac frac part at top end
; edi: dest
; ebp: scratch
; esi: count
; [esp]: xstep
; [esp+4]: ystep

	align	16

@R_DrawSpanP_ASM@0:
_R_DrawSpanP_ASM:
R_DrawSpanP_ASM:
	mov	eax,[ds_x2]
	 mov	ecx,[ds_x1]
	sub	eax,ecx
	 jl	near rdspret		; count < 0: nothing to do, so leave

	push	ebx
	push	edi
	push	ebp
	push	esi
	sub		esp, 8

	mov	edi,ecx
	add	edi,[dc_destorg]
	 mov	ecx,[ds_y]
	add	edi,[ylookup+ecx*4]
	 mov	edx,[ds_xstep]
dsy1:	shl	edx,6
	 mov	ebp,[ds_xstep]
dsy3:	shr	ebp,26
	 xor	ebx,ebx
	lea	esi,[eax+1]
	 mov	[esp],edx
	mov	edx,[ds_ystep]
	 mov	ecx,[ds_xfrac]
dsy4:	shr	ecx,26
dsm8:	 and	edx,strict dword 0xffffffc0
	or	ebp,edx
	 mov	[esp+4],ebp
	mov	ebp,[ds_yfrac]
	 mov	edx,[ds_xfrac]
dsy2:	shl	edx,6
dsm9:	 and	ebp,strict dword 0xffffffc0
	or	ecx,ebp
	 shr	esi,1
	jnc	dseven1

; do odd pixel

		mov	ebp,ecx
dsx1:		rol	ebp,6
dsm1:		and	ebp,0xfff
		 add	edx,[esp]
		adc	ecx,[esp+4]
spreada		 mov	bl,[ebp+SPACEFILLER4]
spmapa		mov	bl,[ebx+SPACEFILLER4]
		mov	[edi],bl
		 inc	edi

dseven1		shr	esi,1
		 jnc	dsrest

; do two more pixels
		mov	ebp,ecx
		 add	edx,[esp]
		adc	ecx,[esp+4]
dsm2:		 and	ebp,0xfc00003f
dsx2:		rol	ebp,6
		mov	eax,ecx
		 add	edx,[esp]
		adc	ecx,[esp+4]
spreadb		 mov	bl,[ebp+SPACEFILLER4]	;read texel1
dsx3:		rol	eax,6
dsm6:		and	eax,0xfff
spmapb		 mov	bl,[ebx+SPACEFILLER4]	;map texel1
		mov	[edi],bl		;store texel1
		 add	edi,2
spreadc		mov	bl,[eax+SPACEFILLER4]	;read texel2
spmapc		mov	bl,[ebx+SPACEFILLER4]	;map texel2
		mov	[edi-1],bl		;store texel2

; do the rest

dsrest		test	esi,esi
		jz near	dsdone

		align 16

dsloop		mov	ebp,ecx
spstep1d	 add	edx,[esp]
spstep2d	adc	ecx,[esp+4]
dsm3:		 and	ebp,0xfc00003f
dsx4:		rol	ebp,6
		mov	eax,ecx
spstep1e	 add	edx,[esp]
spstep2e	adc	ecx,[esp+4]
spreadd		 mov	bl,[ebp+SPACEFILLER4]	;read texel1
dsx5:		rol	eax,6
dsm5:		and	eax,0xfff
spmapd		 mov	bl,[ebx+SPACEFILLER4]	;map texel1
		mov	[edi],bl		;store texel1
		 mov	ebp,ecx
spreade		mov	bl,[eax+SPACEFILLER4]	;read texel2
spstep1f	 add	edx,[esp]
spstep2f	adc	ecx,[esp+4]
dsm4:		 and	ebp,0xfc00003f
dsx6:		rol	ebp,6
spmape		mov	bl,[ebx+SPACEFILLER4]	;map texel2
		 mov	eax,ecx
		mov	[edi+1],bl		;store texel2
spreadf		 mov	bl,[ebp+SPACEFILLER4]	;read texel3
spmapf		mov	bl,[ebx+SPACEFILLER4]	;map texel3
		 add	edi,4
dsx7:		rol	eax,6
dsm7:		and	eax,0xfff
		 mov	[edi-2],bl		;store texel3
spreadg		mov	bl,[eax+SPACEFILLER4]	;read texel4
spstep1g	 add	edx,[esp]
spstep2g	adc	ecx,[esp+4]
spmapg		 mov	bl,[ebx+SPACEFILLER4]	;map texel4
		dec	esi
		 mov	[edi-1],bl		;store texel4
		jnz near dsloop

dsdone	add esp,8
	pop	esi
	pop	ebp
	pop	edi
	pop	ebx

rdspret	ret

; This is the same as the previous routine, except it doesn't draw pixels
; where the texture's color value is 0.

GLOBAL @R_DrawSpanMaskedP_ASM@0
GLOBAL _R_DrawSpanMaskedP_ASM
GLOBAL R_DrawSpanMaskedP_ASM

; eax: scratch
; ebx: zero
; ecx: yfrac at top end, xfrac int part at low end
; edx: xfrac frac part at top end
; edi: dest
; ebp: scratch
; esi: count
; [esp]: xstep
; [esp+4]: ystep

	align	16

@R_DrawSpanMaskedP_ASM@0:
_R_DrawSpanMaskedP_ASM:
R_DrawSpanMaskedP_ASM:
	mov	eax,[ds_x2]
	 mov	ecx,[ds_x1]
	sub	eax,ecx
	 jl	rdspret		; count < 0: nothing to do, so leave

	push	ebx
	push	edi
	push	ebp
	push	esi
	sub		esp,8

	mov	edi,ecx
	add	edi,[dc_destorg]
	 mov	ecx,[ds_y]
	add	edi,[ylookup+ecx*4]
	 mov	edx,[ds_xstep]
dmsy1:	shl	edx,6
	 mov	ebp,[ds_xstep]
dmsy3:	shr	ebp,26
	 xor	ebx,ebx
	lea	esi,[eax+1]
	 mov	[esp],edx
	mov	edx,[ds_ystep]
	 mov	ecx,[ds_xfrac]
dmsy4:	shr	ecx,26
dmsm8:	 and	edx,strict dword 0xffffffc0
	or	ebp,edx
	 mov	[esp+4],ebp
	mov	ebp,[ds_yfrac]
	 mov	edx,[ds_xfrac]
dmsy2:	shl	edx,6
dmsm9:	 and	ebp,strict dword 0xffffffc0
	or	ecx,ebp
	 shr	esi,1
	jnc	dmseven1

; do odd pixel

		mov	ebp,ecx
dmsx1:		rol	ebp,6
dmsm1:		and	ebp,0xfff
		 add	edx,[esp]
		adc	ecx,[esp+4]
mspreada	 mov	bl,[ebp+SPACEFILLER4]
		cmp	bl,0
		 je	mspskipa
mspmapa		mov	bl,[ebx+SPACEFILLER4]
		mov	[edi],bl
mspskipa:	 inc	edi

dmseven1	shr	esi,1
		 jnc	dmsrest

; do two more pixels
		mov	ebp,ecx
		 add	edx,[esp]
		adc	ecx,[esp+4]
dmsm2:		 and	ebp,0xfc00003f
dmsx2:		rol	ebp,6
		mov	eax,ecx
		 add	edx,[esp]
		adc	ecx,[esp+4]
mspreadb	 mov	bl,[ebp+SPACEFILLER4]	;read texel1
dmsx3:		rol	eax,6
dmsm6:		and	eax,0xfff
		cmp	bl,0
		 je	mspskipb
mspmapb		 mov	bl,[ebx+SPACEFILLER4]	;map texel1
		mov	[edi],bl		;store texel1
mspskipb	 add	edi,2
mspreadc	mov	bl,[eax+SPACEFILLER4]	;read texel2
		cmp	bl,0
		 je	dmsrest
mspmapc		mov	bl,[ebx+SPACEFILLER4]	;map texel2
		mov	[edi-1],bl		;store texel2

; do the rest

dmsrest		test	esi,esi
		jz near	dmsdone

		align 16

dmsloop		mov	ebp,ecx
mspstep1d	 add	edx,[esp]
mspstep2d	adc	ecx,[esp+4]
dmsm3:		 and	ebp,0xfc00003f
dmsx4:		rol	ebp,6
		mov	eax,ecx
mspstep1e	 add	edx,[esp]
mspstep2e	adc	ecx,[esp+4]
mspreadd	 mov	bl,[ebp+SPACEFILLER4]	;read texel1
dmsx5:		rol	eax,6
dmsm5:		and	eax,0xfff
		 cmp	bl,0
		mov	ebp,ecx
		 je	mspreade
mspmapd		mov	bl,[ebx+SPACEFILLER4]	;map texel1
		mov	[edi],bl		;store texel1
mspreade	mov	bl,[eax+SPACEFILLER4]	;read texel2
mspstep1f	 add	edx,[esp]
mspstep2f	adc	ecx,[esp+4]
dmsm4:		 and	ebp,0xfc00003f
dmsx6:		rol	ebp,6
		 cmp	bl,0
		mov	eax,ecx
		 je	mspreadf
mspmape		mov	bl,[ebx+SPACEFILLER4]	;map texel2
		mov	[edi+1],bl		;store texel2
mspreadf	 mov	bl,[ebp+SPACEFILLER4]	;read texel3
		add	edi,4
dmsx7:		rol	eax,6
dmsm7:		and	eax,0xfff
		cmp	bl,0
		 je	mspreadg
mspmapf		mov	bl,[ebx+SPACEFILLER4]	;map texel3
		 mov	[edi-2],bl		;store texel3
mspreadg	mov	bl,[eax+SPACEFILLER4]	;read texel4
mspstep1g	 add	edx,[esp]
mspstep2g	adc	ecx,[esp+4]
		cmp	bl,0
		 je	mspskipg
mspmapg		 mov	bl,[ebx+SPACEFILLER4]	;map texel4
		mov	[edi-1],bl		;store texel4
mspskipg	dec	esi
		 jnz near dmsloop

dmsdone	add esp,8
	pop	esi
	pop	ebp
	pop	edi
	pop	ebx

	ret


;*----------------------------------------------------------------------
;*
;* R_DrawColumnP
;*
;*----------------------------------------------------------------------

GLOBAL	@R_DrawColumnP_ASM@0
GLOBAL	_R_DrawColumnP_ASM
GLOBAL	R_DrawColumnP_ASM

	align 16

R_DrawColumnP_ASM:
_R_DrawColumnP_ASM:
@R_DrawColumnP_ASM@0:

; count = dc_yh - dc_yl;

	mov	ecx,[dc_count]
	test	ecx,ecx
	jle	near rdcpret		; count <= 0: nothing to do, so leave

	push	ebp			; save registers
	 push	ebx
	push	edi
	 push	esi

; dest = ylookup[dc_yl] + dc_x + dc_destorg;

	mov	edi,[dc_dest]
	mov	ebp,ecx
	mov	ebx,[dc_texturefrac]	; ebx = frac
rdcp1:	sub	edi,SPACEFILLER4
	mov	ecx,ebx
	shr	ecx,16
	mov	esi,[dc_source]
	mov	edx,[dc_iscale]
	mov	eax,[dc_colormap]

	cmp	BYTE [CPU+66],byte 5
	jg	rdcploop2

	align 16

; The registers should now look like this:
;
;	[31  ..  16][15 .. 8][7 .. 0]
; eax	[colormap		    ]
; ebx	[yi	   ][yf		    ]
; ecx	[scratch		    ]
; edx	[dyi	   ][dyf	    ]
; esi	[source texture column	    ]
; edi	[destination screen pointer ]
; ebp	[counter		    ]
;


; Note the partial register stalls on anything better than a Pentium
; That's why there are two versions of this loop.

rdcploop:
	mov	cl,[esi+ecx]		; Fetch texel
	 xor	ch,ch
	add	ebx,edx			; increment frac
rdcp2:	 add	edi,SPACEFILLER4	; increment destination pointer
	mov	cl,[eax+ecx]		; colormap texel
	mov	[edi],cl		; Store texel
	 mov	ecx,ebx
	shr	ecx,16
	 dec	ebp
	jnz	rdcploop		; loop

	pop	esi
	 pop	edi
	pop	ebx
	 pop	ebp
rdcpret:
	ret
	
	align 16

rdcploop2:
	movzx	ecx,byte [esi+ecx]	; Fetch texel
	add	ebx,edx			; increment frac
	mov	cl,[eax+ecx]		; colormap texel
rdcp3:	add	edi,SPACEFILLER4	; increment destination pointer
	mov	[edi],cl		; Store texel
	mov	ecx,ebx
	shr	ecx,16
	dec	ebp
	jnz	rdcploop2		; loop

	pop	esi
	pop	edi
	pop	ebx
	pop	ebp
	ret
	

;*----------------------------------------------------------------------
;*
;* R_DrawFuzzColumnP
;*
;*----------------------------------------------------------------------

GLOBAL	@R_DrawFuzzColumnP_ASM@0
GLOBAL	_R_DrawFuzzColumnP_ASM
GLOBAL	R_DrawFuzzColumnP_ASM

	align 16

R_DrawFuzzColumnP_ASM:
_R_DrawFuzzColumnP_ASM:
@R_DrawFuzzColumnP_ASM@0:

; Adjust borders. Low...
	mov	eax,[dc_yl]
	 push	ebx
	push	esi
	 push	edi
	push	ebp

	 cmp	eax,0
	jg	.ylok

	mov	eax,1
	nop

; ...and high.
.ylok	mov	edx,[fuzzviewheight]
	 mov	esi,[dc_yh]
	cmp	esi,edx
	 jle	.yhok

	mov	esi,edx
	nop

.yhok	mov	edx,[dc_x]
	 sub	esi,eax			; esi = count
	js	near .dfcdone		; Zero length (or less)

	mov	edi,[ylookup+eax*4]
	 mov	ebx,edx
	add	edi,[dc_destorg]
	 mov	eax,[NormalLight]
	mov	ecx,[fuzzpos]
	 add	edi,ebx
	add	eax,256*6
	 inc	esi
	mov	ebp,[dc_pitch]
	 mov	edx,FUZZTABLE
	test	ecx,ecx
	 je	.fuzz0

;
; esi = count
; edi = dest
; ecx = fuzzpos
; eax = colormap 6
;

; first loop: end with fuzzpos or count 0, whichever happens first

	sub	edx,ecx			; edx = # of entries left in fuzzoffset
	 mov	ebx,esi
	cmp	esi,edx
	 jle	.enuf
	mov	esi,edx
.enuf	sub	ebx,esi
	 mov	edx,[fuzzoffset+ecx*4]
	push	ebx
	 xor	ebx,ebx

.loop1	inc	ecx
	 mov	bl,[edi+edx]
	dec	esi
	 mov	bl,[eax+ebx]
	mov	[edi],bl
	 lea	edi,[edi+ebp]
	mov	edx,[fuzzoffset+ecx*4]
	 jnz	.loop1

; second loop: Chunk it into groups of FUZZTABLE-sized spans and do those

	pop	esi
	 cmp	ecx,FUZZTABLE
	jl	.savefuzzpos
	xor	ecx,ecx
	 nop
.fuzz0	cmp	esi,FUZZTABLE
	 jl	.chunked

.oloop	lea	edx,[esi-FUZZTABLE]
	 mov	esi,FUZZTABLE
	push	edx
	 mov	edx,[fuzzoffset+ecx*4]

.iloop	inc	ecx
	 mov	bl,[edi+edx]
	dec	esi
	 mov	bl,[eax+ebx]
	mov	[edi],bl
	 lea	edi,[edi+ebp]
	mov	edx,[fuzzoffset+ecx*4]
	 jnz	.iloop

	pop	esi
	 xor	ecx,ecx
	cmp	esi,FUZZTABLE
	 jge	.oloop

; third loop: Do whatever is left

.chunked:
	test	esi,esi
	 jle	.savefuzzpos
	mov	edx,[fuzzoffset+ecx*4]
	 nop

.loop3	inc	ecx
	 mov	bl,[edi+edx]
	dec	esi
	 mov	bl,[eax+ebx]
	mov	[edi],bl
	 lea	edi,[edi+ebp]
	mov	edx,[fuzzoffset+ecx*4]
	 jnz	.loop3

.savefuzzpos:
	mov	[fuzzpos],ecx
.dfcdone:
	pop	ebp
	pop	edi
	pop	esi
	pop	ebx
	ret


;*----------------------------------------------------------------------
;*
;* R_DrawColumnHorizP_ASM
;*
;*----------------------------------------------------------------------

GLOBAL	@R_DrawColumnHorizP_ASM@0
GLOBAL	_R_DrawColumnHorizP_ASM
GLOBAL	R_DrawColumnHorizP_ASM

	align 16

@R_DrawColumnHorizP_ASM@0:
_R_DrawColumnHorizP_ASM:
R_DrawColumnHorizP_ASM:

; count = dc_yh - dc_yl;

	mov	eax,[dc_yh]
	mov	ecx,[dc_yl]
	sub	eax,ecx
	mov	edx,[dc_x]

	jl	near .leave		; count < 0: nothing to do, so leave

	push	ebp			; save registers
	push	ebx
	push	edi
	push	esi

	inc	eax			; make 0 count mean 0 pixels
	 and	edx,3
	push	eax
	 mov	eax,[dc_temp]
	mov	esi,[dc_ctspan+edx*4]
	 add	eax,edx
	lea	eax,[eax+ecx*4] ; eax = top of column in buffer
	 mov	ebp,[dc_yh]
	mov	[esi],ecx
	 mov	[esi+4],ebp
	add	esi,8
	 mov	edi,[dc_source]
	mov	[dc_ctspan+edx*4],esi
	 mov	esi,[dc_iscale]
	mov	ecx,[dc_texturefrac]	; ecx = frac
	 mov	dl,[edi]		; load cache
	mov	ebx,[esp]
	 and	ebx,0xfffffff8
	jnz	.mthan8

; Register usage in the following code is:
;
; eax: dest
; edi: source
; ecx: frac (16.16)
; esi: fracstep (16.16)
; ebx: add1
; ebp: add2
;  dl: texel1
;  dh: texel2
;[esp] count

; there are fewer than 8 pixels to draw

	mov	ebx,[esp]
.lthan8	shr	ebx,1
	 jnc	.even

; do one pixel before loop (little opportunity for pairing)

	mov	ebp,ecx			; copy frac to ebx
	 add	ecx,esi			; increment frac
	shr	ebp,16			; shift frac over to low end
	 add	eax,4
	mov	dl,[edi+ebp]
	mov	[eax-4],dl

.even	test	ebx,ebx
	jz	near .done

.loop2	mov	[esp],ebx		; save counter
	 mov	ebx,ecx			; copy frac for texel1 to ebx
	shr	ebx,16			; shift frac for texel1 to low end
	 add	ecx,esi			; increment frac
	mov	ebp,ecx			; copy frac for texel2 to ebp
	shr	ebp,16			; shift frac for texel2 to low end
	 add	ecx,esi			; increment frac
	 mov	dl,[edi+ebx]		; read texel1
	mov	ebx,[esp]		; fetch counter
	 mov	dh,[edi+ebp]		; read texel2
	mov	[eax],dl		; write texel1
	 mov	[eax+4],dh		; write texel2
	add	eax,8			; increment dest
	 dec	ebx			; decrement counter
	jnz	.loop2			; loop until it hits 0

	jmp	.done

; there are more than 8 pixels to draw. position eax as close to a 32 byte
; boundary as possible, then do whatever is left.

.mthan8 test	eax,4
	jz	.try2

	mov	ebp,ecx			; frac: in ebp
	 add	ecx,esi			; step
	shr	ebp,16			; frac: shift
	 add	eax,4			; increment dest
	 mov	ebx,[esp]		; fetch counter
	mov	dl,[edi+ebp]		; tex:  read
	 dec	ebx			; decrement counter
	mov	[eax-4],dl		; tex:  write
	 mov	[esp],ebx		; store counter

.try2	test	eax,8
	jz	.try4

	mov	ebx,ecx			; frac1: in ebx
	 add	ecx,esi			; step
	shr	ebx,16			; frac1: shift
	 mov	ebp,ecx			; frac2: in ebp
	shr	ebp,16			; frac2: shift
	 add	ecx,esi			; step
	mov	dl,[edi+ebx]		; tex1:  read
	 mov	ebx,[esp]		; fetch counter
	mov	dh,[edi+ebp]		; tex2:  read
	 mov	[eax],dl		; tex1:  write
	mov	[eax+4],dh		; tex2:  write
	 sub	ebx,2			; decrement counter
	add	eax,8			; increment dest
	 mov	[esp],ebx		; store counter

.try4	test	eax,16
	jz	.try8

	mov	ebx,ecx			; frac1: in ebx
	 add	ecx,esi			; step
	shr	ebx,16			; frac1: shift
	 mov	ebp,ecx			; frac2: in ebp
	shr	ebp,16			; frac2: shift
	 add	ecx,esi			; step
	mov	dl,[edi+ebx]		; tex1:  read
	 mov	ebx,ecx			; frac3: in ebx
	shr	ebx,16			; frac3: shift
	 mov	dh,[edi+ebp]		; tex2:  read
	add	ecx,esi			; step
	 mov	[eax],dl		; tex1:  write
	mov	[eax+4],dh		; tex2:  write
	 mov	ebp,ecx			; frac4: in ebp
	shr	ebp,16			; frac4: shift
	 add	ecx,esi			; step
	mov	dl,[edi+ebx]		; tex3:  read
	 mov	ebx,[esp]		; fetch counter
	mov	dh,[edi+ebp]		; tex4:  read
	 sub	ebx,4			; decrement counter
	mov	[esp],ebx		; store counter
	 mov	[eax+8],dl		; tex3:  write
	mov	[eax+12],dh		; tex4:  write
	 add	eax,16			; increment dest

.try8	mov	ebx,[esp]		; make counter count groups of 8
	sub	esp,4
	shr	ebx,3
	jmp	.tail8

	align	16

.loop8	mov	[esp],ebx		; save counter
	 mov	ebx,ecx			; frac1: in ebx
	shr	ebx,16			; frac1: shift
	 add	ecx,esi			; step
	mov	ebp,ecx			; frac2: in ebp
	shr	ebp,16			; frac2: shift
	 add	ecx,esi			; step
	 mov	dl,[edi+ebx]		; tex1:  read
	mov	ebx,ecx			; frac3: in ebx
	 mov	dh,[edi+ebp]		; tex2:  read
	shr	ebx,16			; frac3: shift
	 add	ecx,esi			; step
	mov	[eax],dl		; tex1:  write
	 mov	[eax+4],dh		; tex2:  write
	mov	ebp,ecx			; frac4: in ebp
	shr	ebp,16			; frac4: shift
	 add	ecx,esi			; step
	 mov	dl,[edi+ebx]		; tex3:  read
	mov	ebx,ecx			; frac5: in ebx
	 mov	dh,[edi+ebp]		; tex4:  read
	shr	ebx,16			; frac5: shift
	 mov	[eax+8],dl		; tex3:  write
	mov	[eax+12],dh		; tex4:  write
	 add	ecx,esi			; step
	mov	ebp,ecx			; frac6: in ebp
	shr	ebp,16			; frac6: shift
	 mov	dl,[edi+ebx]		; tex5:  read
	 add	ecx,esi			; step
	mov	ebx,ecx			; frac7: in ebx
	 mov	[eax+16],dl		; tex5:  write
	shr	ebx,16			; frac7: shift
	 mov	dh,[edi+ebp]		; tex6:  read
	 add	ecx,esi			; step
	mov	ebp,ecx			; frac8: in ebp
	 mov	[eax+20],dh		; tex6:  write
	shr	ebp,16			; frac8: shift
	 add	eax,32			; increment dest pointer
	 mov	dl,[edi+ebx]		; tex7:  read
	mov	ebx,[esp]		; fetch counter
	 mov	[eax-8],dl		; tex7:  write
	mov	dh,[edi+ebp]		; tex8:  read
	 add	ecx,esi			; step
	mov	[eax-4],dh		; tex8:  write
	 mov	dl,[eax]		; load cache
	dec	ebx			; decrement counter
.tail8	 jnz	near .loop8		; loop if more to do
	
	pop	ebp
	mov	ebx,[esp]
	and	ebx,7
	jnz	near .lthan8

.done	pop	eax
	pop	esi
	pop	edi
	pop	ebx
	pop	ebp
.leave	ret


;*----------------------------------------------------------------------
;*
;* rt_copy1col_asm
;*
;* ecx = hx
;* edx = sx
;* [esp+4] = yl
;* [esp+8] = yh
;*
;*----------------------------------------------------------------------

GLOBAL	@rt_copy1col_asm@16
GLOBAL	_rt_copy1col_asm
GLOBAL	rt_copy1col_asm

	align 16

rt_copy1col_asm:
_rt_copy1col_asm:
	pop	eax
	mov	edx,[esp+4*3]
	mov	ecx,[esp+4*2]
	push	edx
	push	ecx
	mov	ecx,[esp+4*2]
	mov	edx,[esp+4*3]
	push	eax

@rt_copy1col_asm@16:
	mov	eax, [esp+4]
	push	ebx
	mov	ebx, [esp+12]
	push	esi
	sub	ebx, eax
	push	edi
	js	.done

	lea	esi,[eax*4]
	inc	ebx			; ebx = count
	mov	eax,edx
	add ecx,esi
	mov	edi,[ylookup+esi]
	add ecx,[dc_temp]	; ecx = source
	mov	esi,[dc_pitch]		; esi = pitch
	add	eax,edi			; eax = dest
	add	eax,[dc_destorg]

	shr	ebx,1
	jnc	.even

	mov	dl,[ecx]
	add	ecx,4
	mov	[eax],dl
	add	eax,esi

.even	and	ebx,ebx
	jz	.done

.loop	mov	dl,[ecx]
	mov	dh,[ecx+4]
	mov	[eax],dl
	mov	[eax+esi],dh
	add	ecx,8
	lea	eax,[eax+esi*2]
	dec	ebx
	jnz	.loop

.done	pop	edi
	pop	esi
	pop	ebx
	ret	8

;*----------------------------------------------------------------------
;*
;* rt_copy4cols_asm
;*
;* ecx = sx
;* edx = yl
;* [esp+4] = yh
;*
;*----------------------------------------------------------------------

GLOBAL	@rt_copy4cols_asm@12
GLOBAL	_rt_copy4cols_asm
GLOBAL	rt_copy4cols_asm

	align 16

rt_copy4cols_asm:
_rt_copy4cols_asm:
	pop	eax
	mov	ecx,[esp+8]
	mov	edx,[esp+4]
	push	ecx
	mov	ecx,[esp+4]
	push	eax

@rt_copy4cols_asm@12:
	push	ebx
	mov	ebx,[esp+8]
	push	esi
	sub	ebx,edx
	push	edi
	js	.done

	inc	ebx			; ebx = count
	mov	eax,ecx
	mov	esi,[ylookup+edx*4]
	mov ecx,[dc_temp]
	add	eax,esi			; eax = dest
	add	eax,[dc_destorg]
	lea	ecx,[ecx+edx*4]	; ecx = source
	mov	edx,[dc_pitch]		; edx = pitch

	shr	ebx,1
	jnc	.even

	mov	esi,[ecx]
	add	ecx,4
	mov	[eax],esi
	add	eax,edx

.even	and	ebx,ebx
	jz	.done

.loop	mov	esi,[ecx]
	mov	edi,[ecx+4]
	mov	[eax],esi
	mov	[eax+edx],edi
	add	ecx,8
	lea	eax,[eax+edx*2]
	dec	ebx
	jnz	.loop

.done	pop	edi
	pop	esi
	pop	ebx
	ret	4

;*----------------------------------------------------------------------
;*
;* rt_map1col_asm
;*
;* ecx = hx
;* edx = sx
;* [esp+4] = yl
;* [esp+8] = yh
;*
;*----------------------------------------------------------------------

GLOBAL	@rt_map1col_asm@16
GLOBAL	_rt_map1col_asm
GLOBAL	rt_map1col_asm

	align 16

rt_map1col_asm:
_rt_map1col_asm:
	pop	eax
	mov	edx,[esp+4*3]
	mov	ecx,[esp+4*2]
	push	edx
	push	ecx
	mov	ecx,[esp+4*2]
	mov	edx,[esp+4*3]
	push	eax

@rt_map1col_asm@16:	
	mov	eax,[esp+4]
	push	ebx
	mov	ebx,[esp+12]
	push	ebp
	push	esi
	sub	ebx, eax
	push	edi
	js	.done

	lea	edi,[eax*4]
	mov	esi,[dc_colormap]		; esi = colormap
	inc	ebx				; ebx = count
	mov	eax,edx
	lea	ebp,[ecx+edi]		; ebp = source
	add ebp,[dc_temp]
	mov	ecx,[ylookup+edi]
	mov	edi,[dc_pitch]			; edi = pitch
	add	eax,ecx				; eax = dest
	xor	ecx,ecx
	xor	edx,edx
	add	eax,[dc_destorg]

	shr	ebx,1
	jnc	.even

	mov	dl,[ebp]
	add	ebp,4
	mov	dl,[esi+edx]
	mov	[eax],dl
	add	eax,edi

.even	and	ebx,ebx
	jz	.done

.loop	mov	dl,[ebp]
	mov	cl,[ebp+4]
	add	ebp,8
	mov	dl,[esi+edx]
	mov	cl,[esi+ecx]
	mov	[eax],dl
	mov	[eax+edi],cl
	dec	ebx
	lea	eax,[eax+edi*2]
	jnz	.loop

.done	pop	edi
	pop	esi
	pop	ebp
	pop	ebx
	ret	8

;*----------------------------------------------------------------------
;*
;* rt_map4cols_asm
;*
;* rt_map4cols_asm1 is for PPro and above
;* rt_map4cols_asm2 is for Pentium and below
;*
;* ecx = sx
;* edx = yl
;* [esp+4] = yh
;*
;*----------------------------------------------------------------------

GLOBAL	@rt_map4cols_asm1@12
GLOBAL	_rt_map4cols_asm1
GLOBAL	rt_map4cols_asm1

	align 16

rt_map4cols_asm1:
_rt_map4cols_asm1:
	pop	eax
	mov	ecx,[esp+8]
	mov	edx,[esp+4]
	push	ecx
	mov	ecx,[esp+4]
	push	eax

@rt_map4cols_asm1@12:
	push	ebx
	mov	ebx,[esp+8]
	push	ebp
	push	esi
	sub	ebx,edx
	push	edi
	js	near .done

	mov	esi,[dc_colormap]	; esi = colormap
	shl	edx,2
	mov	eax,ecx
	inc	ebx			; ebx = count
	mov	edi,[ylookup+edx]
	mov	ebp,[dc_temp]
	add ebp,edx		; ebp = source
	add	eax,edi			; eax = dest
	mov	edi,[dc_pitch]		; edi = pitch
	add	eax,[dc_destorg]
	xor	ecx,ecx
	xor	edx,edx

	shr	ebx,1
	jnc	.even

	mov	dl,[ebp]
	 mov	cl,[ebp+1]
	add	ebp,4
	 mov	dl,[esi+edx]
	mov	cl,[esi+ecx]
	 mov	[eax],dl
	mov	[eax+1],cl
	 mov	dl,[ebp-2]
	mov	cl,[ebp-1]
	 mov	dl,[esi+edx]
	mov	cl,[esi+ecx]
	 mov	[eax+2],dl
	mov	[eax+3],cl
	 add	eax,edi

.even	and	ebx,ebx
	jz	.done

.loop:
	mov	dl,[ebp]
	 mov	cl,[ebp+1]
	add	ebp,8
	 mov	dl,[esi+edx]
	mov	cl,[esi+ecx]
	 mov	[eax],dl
	mov	[eax+1],cl
	 mov	dl,[ebp-6]
	mov	cl,[ebp-5]
	 mov	dl,[esi+edx]
	mov	cl,[esi+ecx]
	 mov	[eax+2],dl
	mov	[eax+3],cl
	 mov	dl,[ebp-4]
	mov	cl,[ebp-3]
	 mov	dl,[esi+edx]
	mov	cl,[esi+ecx]
	 mov	[eax+edi],dl
	mov	[eax+edi+1],cl
	 mov	dl,[ebp-2]
	mov	cl,[ebp-1]
	 mov	dl,[esi+edx]
	mov	cl,[esi+ecx]
	 mov	[eax+edi+2],dl
	mov	[eax+edi+3],cl
	 lea	eax,[eax+edi*2]
	dec	ebx

	jnz	.loop

.done	pop	edi
	pop	esi
	pop	ebp
	pop	ebx
	ret	4

GLOBAL	@rt_map4cols_asm2@12
GLOBAL	_rt_map4cols_asm2
GLOBAL	rt_map4cols_asm2

	align 16

rt_map4cols_asm2:
_rt_map4cols_asm2:
	pop	eax
	mov	ecx,[esp+8]
	mov	edx,[esp+4]
	push	ecx
	mov	ecx,[esp+4]
	push	eax

@rt_map4cols_asm2@12:
	push	ebx
	mov	ebx,[esp+8]
	push	ebp
	push	esi
	sub	ebx,edx
	push	edi
	js	near .done

	mov	esi,[dc_colormap]	; esi = colormap
	shl	edx,2
	mov	eax,ecx
	inc	ebx			; ebx = count
	mov	edi,[ylookup+edx]
	mov ebp,[dc_temp]
	add ebp,edx		; ebp = source
	add	eax,edi			; eax = dest
	mov	edi,[dc_pitch]		; edi = pitch
	add	eax,[dc_destorg]
	xor	ecx,ecx
	xor	edx,edx

	shr	ebx,1
	jnc	.even

	mov	dl,[ebp]
	 mov	cl,[ebp+1]
	add	ebp,4
	 mov	dl,[esi+edx]
	mov	cl,[esi+ecx]
	 mov	[eax],dl
	mov	[eax+1],cl
	 mov	dl,[ebp-2]
	mov	cl,[ebp-1]
	 mov	dl,[esi+edx]
	mov	cl,[esi+ecx]
	 mov	[eax+2],dl
	mov	[eax+3],cl
	 add	eax,edi

.even	and	ebx,ebx
	jz	.done

.loop:
	mov	dl,[ebp+3]
	mov	ch,[esi+edx]
	mov	dl,[ebp+2]
	mov	cl,[esi+edx]
	shl	ecx,16
	mov	dl,[ebp+1]
	mov	ch,[esi+edx]
	mov	dl,[ebp]
	mov	cl,[esi+edx]
	mov	[eax],ecx
	add	eax,edi

	mov	dl,[ebp+7]
	mov	ch,[esi+edx]
	mov	dl,[ebp+6]
	mov	cl,[esi+edx]
	shl	ecx,16
	mov	dl,[ebp+5]
	mov	ch,[esi+edx]
	mov	dl,[ebp+4]
	mov	cl,[esi+edx]
	mov	[eax],ecx
	add	eax,edi
	add	ebp,8
	dec	ebx

	jnz	.loop

.done	pop	edi
	pop	esi
	pop	ebp
	pop	ebx
	ret	4

	align 16

GLOBAL rt_shaded4cols_asm
GLOBAL _rt_shaded4cols_asm

rt_shaded4cols_asm:
_rt_shaded4cols_asm:
		mov		ecx,[esp+8]
		push	ebp
		mov		ebp,[esp+16]
		sub		ebp,ecx
		js		near s4nil
		mov		eax,[ylookup+ecx*4]
		add		eax,[dc_destorg]				; eax = destination
		push	ebx
		push	esi
		mov		esi,[dc_temp]
		inc		ebp								; ebp = count
		add		eax,[esp+16]
		push	edi
		lea		esi,[esi+ecx*4]				; esi = source

		align	16

s4loop:	movzx	edx,byte [esi]
		movzx	ecx,byte [esi+1]
s4cm1:	movzx	edx,byte [SPACEFILLER4+edx]		; colormap
s4cm2:	movzx	edi,byte [SPACEFILLER4+ecx]		; colormap
		shl		edx,8
		movzx	ebx,byte [eax]
		shl		edi,8
		movzx	ecx,byte [eax+1]
		sub		ebx,edx
		sub		ecx,edi
		mov		ebx,[Col2RGB8+0x10000+ebx*4]
		mov		ecx,[Col2RGB8+0x10000+ecx*4]
s4fg1:	add		ebx,[SPACEFILLER4+edx*4]
s4fg2:	add		ecx,[SPACEFILLER4+edi*4]
		or		ebx,0x1f07c1f
		or		ecx,0x1f07c1f
		mov		edx,ebx
		shr		ebx,15
		mov		edi,ecx
		shr		ecx,15
		and		edx,ebx
		and		ecx,edi
		mov		bl,[RGB32k+edx]
		movzx	edx,byte [esi+2]
		mov		bh,[RGB32k+ecx]
		movzx	ecx,byte [esi+3]
		mov		[eax],bl
		mov		[eax+1],bh

s4cm3:	movzx	edx,byte [SPACEFILLER4+edx]		; colormap
s4cm4:	movzx	edi,byte [SPACEFILLER4+ecx]		; colormap
		shl		edx,8
		movzx	ebx,byte [eax+2]
		shl		edi,8
		movzx	ecx,byte [eax+3]
		sub		ebx,edx
		sub		ecx,edi
		mov		ebx,[Col2RGB8+0x10000+ebx*4]
		mov		ecx,[Col2RGB8+0x10000+ecx*4]
s4fg3:	add		ebx,[SPACEFILLER4+edx*4]
s4fg4:	add		ecx,[SPACEFILLER4+edi*4]
		or		ebx,0x1f07c1f
		or		ecx,0x1f07c1f
		mov		edx,ebx
		shr		ebx,15
		mov		edi,ecx
		shr		ecx,15
		and		edx,ebx
		and		ecx,edi
s4p:	add		eax,320							; pitch
		add		esi,4
		mov		bl,[RGB32k+edx]
		mov		bh,[RGB32k+ecx]
s4p2:	mov		[eax-320+2],bl
s4p3:	mov		[eax-320+3],bh
		dec		ebp
		jne		s4loop

		pop		edi
		pop		esi
		pop		ebx
s4nil:	pop		ebp
		ret

		align 16

GLOBAL	rt_add4cols_asm
GLOBAL	_rt_add4cols_asm

rt_add4cols_asm:
_rt_add4cols_asm:
		mov		ecx,[esp+8]
		push	edi
		mov		edi,[esp+16]
		sub		edi,ecx
		js		near a4nil
		mov		eax,[ylookup+ecx*4]
		add		eax,[dc_destorg]
		push	ebx
		push	esi
		mov		esi,[dc_temp]
		push	ebp
		inc		edi
		add		eax,[esp+20]
		lea		esi,[esi+ecx*4]
		
		align 16
a4loop:
		movzx	ebx,byte [esi]
		movzx	edx,byte [esi+1]
		movzx	ecx,byte [eax]
		movzx	ebp,byte [eax+1]
a4cm1:	movzx	ebx,byte [SPACEFILLER4+ebx]		; colormap
a4cm2:	movzx	edx,byte [SPACEFILLER4+edx]		; colormap
a4bg1:	mov		ecx,[SPACEFILLER4+ecx*4]		; bg2rgb
a4bg2:	mov		ebp,[SPACEFILLER4+ebp*4]		; bg2rgb
a4fg1:	add		ecx,[SPACEFILLER4+ebx*4]		; fg2rgb
a4fg2:	add		ebp,[SPACEFILLER4+edx*4]		; fg2rgb
		or		ecx,0x01f07c1f
		or		ebp,0x01f07c1f
		mov		ebx,ecx
		shr		ecx,15
		mov		edx,ebp
		shr		ebp,15
		and		ecx,ebx
		and		ebp,edx
		movzx	ebx,byte [esi+2]
		movzx	edx,byte [esi+3]
		mov		cl,[RGB32k+ecx]
		mov		ch,[RGB32k+ebp]
		mov		[eax],cl
		mov		[eax+1],ch

		movzx	ecx,byte [eax+2]
		movzx	ebp,byte [eax+3]
a4cm3:	movzx	ebx,byte [SPACEFILLER4+ebx]		; colormap
a4cm4:	movzx	edx,byte [SPACEFILLER4+edx]		; colormap
a4bg3:	mov		ecx,[SPACEFILLER4+ecx*4]		; bg2rgb
a4bg4:	mov		ebp,[SPACEFILLER4+ebp*4]		; bg2rgb
a4fg3:	add		ecx,[SPACEFILLER4+ebx*4]		; fg2rgb
a4fg4:	add		ebp,[SPACEFILLER4+edx*4]		; fg2rgb
		or		ecx,0x01f07c1f
		or		ebp,0x01f07c1f
		mov		ebx,ecx
		shr		ecx,15
		mov		edx,ebp
		shr		ebp,15
		and		ebx,ecx
		and		edx,ebp
		mov		cl,[RGB32k+ebx]
		mov		ch,[RGB32k+edx]
		mov		[eax+2],cl
		mov		[eax+3],ch

		add		esi,4
a4p:	add		eax,320							; pitch
		sub		edi,1
		jne		a4loop
		pop		ebp
		pop		esi
		pop		ebx
a4nil:	pop		edi
		ret

		align 16

GLOBAL	rt_addclamp4cols_asm
GLOBAL	_rt_addclamp4cols_asm

rt_addclamp4cols_asm:
_rt_addclamp4cols_asm:
		mov		ecx,[esp+8]
		push	edi
		mov		edi,[esp+16]
		sub		edi,ecx
		js		near ac4nil
		mov		eax,[ylookup+ecx*4]
		add		eax,[dc_destorg]
		push	ebx
		push	esi
		mov		esi,[dc_temp]
		push	ebp
		inc		edi
		add		eax,[esp+20]
		lea		esi,[esi+ecx*4]
		push	edi
		
		align	16
ac4loop:
		movzx	ebx,byte [esi]
		movzx	edx,byte [esi+1]
		mov		[esp],edi
ac4cm1:	movzx	ebx,byte [SPACEFILLER4+ebx]		; colormap
ac4cm2:	movzx	edx,byte [SPACEFILLER4+edx]		; colormap
		movzx	ecx,byte [eax]
		movzx	ebp,byte [eax+1]
ac4fg1:	mov		ebx,[SPACEFILLER4+ebx*4]		; fg2rgb
ac4fg2:	mov		edx,[SPACEFILLER4+edx*4]		; fg2rgb
ac4bg1:	add		ebx,[SPACEFILLER4+ecx*4]		; bg2rgb
ac4bg2:	add		edx,[SPACEFILLER4+ebp*4]		; bg2rgb
		mov		ecx,ebx
		or		ebx,0x01f07c1f
		and		ecx,0x40100400
		and		ebx,0x3fffffff
		mov		edi,ecx
		shr		ecx,5
		mov		ebp,edx
		sub		edi,ecx
		or		edx,0x01f07c1f
		or		ebx,edi
		mov		ecx,ebx
		shr		ebx,15
		and		ebp,0x40100400
		and		ebx,ecx
		and		edx,0x3fffffff
		mov		edi,ebp
		shr		ebp,5
		mov		cl,[RGB32k+ebx]
		sub		edi,ebp
		mov		[eax],cl
		or		edx,edi
		mov		ebp,edx
		shr		edx,15
		movzx	ebx,byte [esi+2]
		and		ebp,edx
		movzx	edx,byte [esi+3]
ac4cm3:	movzx	ebx,byte [SPACEFILLER4+ebx]		; colormap
		mov		cl,[RGB32k+ebp]
ac4cm4:	movzx	edx,byte [SPACEFILLER4+edx]		; colormap
		mov		[eax+1],cl
		movzx	ecx,byte [eax+2]
		movzx	ebp,byte [eax+3]
ac4fg3:	mov		ebx,[SPACEFILLER4+ebx*4]		; fg2rgb
ac4fg4:	mov		edx,[SPACEFILLER4+edx*4]		; fg2rgb
ac4bg3:	add		ebx,[SPACEFILLER4+ecx*4]		; bg2rgb
ac4bg4:	add		edx,[SPACEFILLER4+ebp*4]		; bg2rgb
		mov		ecx,ebx
		or		ebx,0x01f07c1f
		and		ecx,0x40100400
		and		ebx,0x3fffffff
		mov		edi,ecx
		shr		ecx,5
		mov		ebp,edx
		sub		edi,ecx
		or		edx,0x01f07c1f
		or		ebx,edi
		mov		ecx,ebx
		shr		ebx,15
		and		ebp,0x40100400
		and		ebx,ecx
		and		edx,0x3fffffff
		mov		edi,ebp
		shr		ebp,5
		mov		cl,[RGB32k+ebx]
		sub		edi,ebp
		mov		[eax+2],cl
		or		edx,edi
		mov		edi,[esp]
		mov		ebp,edx
		shr		edx,15
		add		esi,4
		and		edx,ebp
		mov		cl,[RGB32k+edx]
		mov		[eax+3],cl

ac4p:	add		eax,320							; pitch
		sub		edi,1
		jne		ac4loop
		pop		edi

		pop		ebp
		pop		esi
		pop		ebx
ac4nil:	pop		edi
		ret

rtext_end:
%ifdef M_TARGET_MACHO
GLOBAL _rtext_tmap_end
_rtext_tmap_end:
%endif
		align	16

;************************

	SECTION .text

GLOBAL	R_SetupShadedCol
GLOBAL	@R_SetupShadedCol@0

# Patch the values of dc_colormap and dc_color into the shaded column drawer.

R_SetupShadedCol:
@R_SetupShadedCol@0:
		mov		eax,[dc_colormap]
		cmp		[s4cm1+3],eax
		je		.cmdone
		mov		[s4cm1+3],eax
		mov		[s4cm2+3],eax
		mov		[s4cm3+3],eax
		mov		[s4cm4+3],eax
.cmdone	mov		eax,[dc_color]
		lea		eax,[Col2RGB8+eax*4]
		cmp		[s4fg1+3],eax
		je		.cdone
		mov		[s4fg1+3],eax
		mov		[s4fg2+3],eax
		mov		[s4fg3+3],eax
		mov		[s4fg4+3],eax
		selfmod s4cm1, s4fg4+7
.cdone	ret

GLOBAL	R_SetupAddCol
GLOBAL	@R_SetupAddCol@0

# Patch the values of dc_colormap, dc_srcblend, and dc_destblend into the
# unclamped adding column drawer.

R_SetupAddCol:
@R_SetupAddCol@0:
		mov		eax,[dc_colormap]
		cmp		[a4cm1+3],eax
		je		.cmdone
		mov		[a4cm1+3],eax
		mov		[a4cm2+3],eax
		mov		[a4cm3+3],eax
		mov		[a4cm4+3],eax
.cmdone	mov		eax,[dc_srcblend]
		cmp		[a4fg1+3],eax
		je		.sbdone
		mov		[a4fg1+3],eax
		mov		[a4fg2+3],eax
		mov		[a4fg3+3],eax
		mov		[a4fg4+3],eax
.sbdone	mov		eax,[dc_destblend]
		cmp		[a4bg1+3],eax
		je		.dbdone
		mov		[a4bg1+3],eax
		mov		[a4bg2+3],eax
		mov		[a4bg3+3],eax
		mov		[a4bg4+3],eax
		selfmod a4cm1, a4bg4+7
.dbdone	ret

GLOBAL	R_SetupAddClampCol
GLOBAL	@R_SetupAddClampCol@0

# Patch the values of dc_colormap, dc_srcblend, and dc_destblend into the
# add with clamping column drawer.

R_SetupAddClampCol:
@R_SetupAddClampCol@0:
		mov		eax,[dc_colormap]
		cmp		[ac4cm1+3],eax
		je		.cmdone
		mov		[ac4cm1+3],eax
		mov		[ac4cm2+3],eax
		mov		[ac4cm3+3],eax
		mov		[ac4cm4+3],eax
.cmdone	mov		eax,[dc_srcblend]
		cmp		[ac4fg1+3],eax
		je		.sbdone
		mov		[ac4fg1+3],eax
		mov		[ac4fg2+3],eax
		mov		[ac4fg3+3],eax
		mov		[ac4fg4+3],eax
.sbdone	mov		eax,[dc_destblend]
		cmp		[ac4bg1+3],eax
		je		.dbdone
		mov		[ac4bg1+3],eax
		mov		[ac4bg2+3],eax
		mov		[ac4bg3+3],eax
		mov		[ac4bg4+3],eax
		selfmod ac4cm1, ac4bg4+7
.dbdone	ret

EXTERN setvlinebpl_
EXTERN setpitch3

GLOBAL	@ASM_PatchPitch@0
GLOBAL	_ASM_PatchPitch
GLOBAL	ASM_PatchPitch

ASM_PatchPitch:
_ASM_PatchPitch:
@ASM_PatchPitch@0:
		mov		eax,[dc_pitch]
		mov		[rdcp1+2],eax
		mov		[rdcp2+2],eax
		mov		[rdcp3+2],eax
		mov		[s4p+1],eax
		mov		[a4p+1],eax
		mov		[ac4p+1],eax
		mov		ecx,eax
		neg		ecx
		inc		ecx
		inc		ecx
		mov		[s4p2+2],ecx
		inc		ecx
		mov		[s4p3+2],ecx
		selfmod rtext_start, rtext_end
		call	setpitch3
		jmp		setvlinebpl_