; qzdoom-gpl/src/blocks.nas

;*
;* blocks.nas
;* Draws simple blocks to the screen, possibly with masking
;*
;*---------------------------------------------------------------------------
;* Copyright 1998-2001 Randy Heit
;* All rights reserved.
;*
;* Redistribution and use in source and binary forms, with or without
;* modification, are permitted provided that the following conditions
;* are met:
;*
;* 1. Redistributions of source code must retain the above copyright
;*    notice, this list of conditions and the following disclaimer.
;* 2. Redistributions in binary form must reproduce the above copyright
;*    notice, this list of conditions and the following disclaimer in the
;*    documentation and/or other materials provided with the distribution.
;* 3. The name of the author may not be used to endorse or promote products
;*    derived from this software without specific prior written permission.
;*
;* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
;* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
;* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
;* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
;* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
;* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;*---------------------------------------------------------------------------
;*

BITS 32

; A recognizable dummy value for 32-bit address fields that are filled in
; at run time.
%define SPACEFILLER4 (0x44444444)

%ifndef M_TARGET_LINUX

; On every target except Linux, each external symbol name used in this file
; is mapped to the same name with a leading underscore.
%macro C_SYMBOL 1
  %define %1 _%1
%endmacro

C_SYMBOL RGB32k
C_SYMBOL Col2RGB8

; Key to naming (because more descriptive names mess up the linker, sadly):
; 1st char: D = Draw, S = Scale
; 2nd char: M = Masked
; 3rd char: P = Plain, T = Translucent, S = Shadowed, A = Alpha
; 4th char: R = Remapped, U = Unmapped
; 5th char: P = Palettized output

C_SYMBOL DMPRP
C_SYMBOL DMPUP
C_SYMBOL DMTRP
C_SYMBOL DMTUP
C_SYMBOL DMSRP
C_SYMBOL DMSUP
C_SYMBOL DMAUP

C_SYMBOL SMPUP
C_SYMBOL SMPRP
C_SYMBOL SMTRP
C_SYMBOL SMTUP
C_SYMBOL SMSRP
C_SYMBOL SMSUP
C_SYMBOL SMAUP

C_SYMBOL MaskedBlockFunctions

%endif

%ifdef M_TARGET_WATCOM
  SEGMENT DATA PUBLIC ALIGN=16 CLASS=DATA USE32
  SEGMENT DATA
%else
  SECTION .data
%endif

GLOBAL	MaskedBlockFunctions

; Table of the fourteen routine addresses, exported as a single symbol to
; get around linker problems. The order of the entries is significant:
; each row below is draw-remapped, draw-unmapped, scale-remapped,
; scale-unmapped (the alpha row only has the two unmapped variants).
MaskedBlockFunctions:
	dd	DMPRP, DMPUP, SMPRP, SMPUP	; plain
	dd	DMTRP, DMTUP, SMTRP, SMTUP	; translucent
	dd	DMSRP, DMSUP, SMSRP, SMSUP	; shadowed
	dd	DMAUP, SMAUP			; alpha

; Lookup tables defined outside this file.
EXTERN	RGB32k
EXTERN	Col2RGB8

%ifdef M_TARGET_WATCOM
  SEGMENT CODE PUBLIC ALIGN=16 CLASS=CODE USE32
  SEGMENT CODE
%else
  SECTION .text
%endif

;-----------------------------------------------------------------------
; __declspec(naked) void STACK_ARGS DMPRP (
;	const byte *src, byte *dest, const byte *remap,
;	int srcpitch, int destpitch, int width, int height)
;
; Draw, masked, plain, remapped. Walks a width x height block of bytes.
; A source byte of 255 leaves the destination byte untouched; any other
; source byte is looked up in remap and the result is stored.
;
; All arguments are read from the stack. The width slot is overwritten
; with -width so it can be reloaded at the start of every row.
; ebx, edi and ebp are saved and restored; eax, ecx, edx are not.
;-----------------------------------------------------------------------

GLOBAL	DMPRP

	align 16

DMPRP:
.frame		equ	3*4+4		; three saved registers + return address
.a_src		equ	.frame+4*0
.a_dest		equ	.frame+4*1
.a_remap	equ	.frame+4*2
.a_srcp		equ	.frame+4*3
.a_destp	equ	.frame+4*4
.a_width	equ	.frame+4*5
.a_height	equ	.frame+4*6

	push	ebx
	push	edi
	push	ebp

	xor	eax,eax			; upper 24 bits of eax stay zero throughout
	mov	ebx,[esp+.a_width]
	mov	ecx,[esp+.a_src]
	mov	edx,[esp+.a_dest]
	mov	ebp,[esp+.a_remap]
	mov	edi,[esp+.a_height]	; edi = rows remaining
	add	ecx,ebx			; ecx = end of the current source row
	add	edx,ebx			; edx = end of the current dest row
	neg	ebx			; ebx = -width, counts up to zero
	mov	[esp+.a_width],ebx

.pixel:
	mov	al,[ecx+ebx]
	cmp	eax,255
	je	.next			; 255 = leave dest alone

	mov	al,[eax+ebp]		; translate through remap
	mov	[edx+ebx],al

.next:
	inc	ebx
	jnz	.pixel

	; End of row: step both pointers by their pitch and restart the column count.
	add	ecx,[esp+.a_srcp]
	add	edx,[esp+.a_destp]
	mov	ebx,[esp+.a_width]
	dec	edi
	jnz	.pixel

	pop	ebp
	pop	edi
	pop	ebx
	ret

;-----------------------------------------------------------------------
; __declspec(naked) void STACK_ARGS DMPUP (
;	const byte *src, byte *dest, int srcpitch, int destpitch,
;	int _width, int _height)
;
; Draw, masked, plain, unmapped. Copies a _width x _height block of bytes,
; skipping every source byte that equals 255.
;
; The _width slot on the stack is overwritten with -_width.
; ebx and ebp are saved and restored; eax, ecx, edx are not.
;-----------------------------------------------------------------------

	align 16

GLOBAL	DMPUP

DMPUP:
.frame		equ	2*4+4		; two saved registers + return address
.a_src		equ	.frame+4*0
.a_dest		equ	.frame+4*1
.a_srcp		equ	.frame+4*2
.a_destp	equ	.frame+4*3
.a_width	equ	.frame+4*4
.a_height	equ	.frame+4*5

	push	ebx
	push	ebp

	xor	eax,eax
	mov	ecx,[esp+.a_src]
	mov	edx,[esp+.a_dest]
	mov	ebx,[esp+.a_width]
	mov	ebp,[esp+.a_height]	; ebp = rows remaining
	add	ecx,ebx			; ecx = end of the current source row
	add	edx,ebx			; edx = end of the current dest row
	neg	ebx			; ebx = -width, counts up to zero
	mov	[esp+.a_width],ebx

.pixel:
	mov	al,[ecx+ebx]
	cmp	al,255
	je	.next			; 255 = leave dest alone

	mov	[edx+ebx],al

.next:
	inc	ebx
	jnz	.pixel

	; End of row: step both pointers by their pitch and restart the column count.
	add	ecx,[esp+.a_srcp]
	add	edx,[esp+.a_destp]
	mov	ebx,[esp+.a_width]
	dec	ebp
	jnz	.pixel

	pop	ebp
	pop	ebx
	ret

;-----------------------------------------------------------------------
; __declspec(naked) void STACK_ARGS DMTRP (
;	src, dest, remap, srcpitch, destpitch, _width, _height, fg2rgb, bg2rgb);
;
; Draw, masked, translucent, remapped. For every source byte other than
; 255: the byte is translated through remap, the dword fg2rgb[remapped] is
; added to the dword bg2rgb[dest byte], the sum is ORed with 0x1f07c1f and
; ANDed with itself shifted right by 15, and the result indexes the byte
; table RGB32k. That byte replaces the destination byte.
;
; The _width slot is overwritten with -_width and the _height slot is
; counted down to zero in place.
; ebx, esi, edi, ebp are saved and restored; eax, ecx, edx are not.
;-----------------------------------------------------------------------

GLOBAL	DMTRP

	align 16

DMTRP:
.frame		equ	4*4+4		; four saved registers + return address
.a_src		equ	.frame+4*0
.a_dest		equ	.frame+4*1
.a_remap	equ	.frame+4*2
.a_srcp		equ	.frame+4*3
.a_destp	equ	.frame+4*4
.a_width	equ	.frame+4*5
.a_height	equ	.frame+4*6
.a_fg2rgb	equ	.frame+4*7
.a_bg2rgb	equ	.frame+4*8

	push	ebx
	push	esi
	push	edi
	push	ebp

	xor	eax,eax
	mov	esi,[esp+.a_src]
	mov	edi,[esp+.a_dest]
	mov	ebp,[esp+.a_width]
	mov	ebx,[esp+.a_remap]	; ebx = remap for the whole routine
	mov	ecx,[esp+.a_fg2rgb]	; ecx = fg2rgb, reused as scratch per pixel
	mov	edx,[esp+.a_bg2rgb]	; edx = bg2rgb, reused as scratch per pixel
	add	esi,ebp			; esi = end of the current source row
	add	edi,ebp			; edi = end of the current dest row
	neg	ebp			; ebp = -width, counts up to zero
	mov	[esp+.a_width],ebp

.pixel:
	mov	al,[esi+ebp]
	cmp	eax,255
	je	.next			; 255 = leave dest alone

	mov	al,[ebx+eax]		; remap the foreground byte
	mov	ecx,[ecx+eax*4]		; foreground RGB
	mov	al,[edi+ebp]		; background byte
	add	ecx,[edx+eax*4]		; + background RGB
	or	ecx,0x1f07c1f
	mov	edx,ecx
	shr	edx,15
	and	edx,ecx			; edx = index into RGB32k
	mov	al,[RGB32k+edx]
	mov	ecx,[esp+.a_fg2rgb]	; put the table pointers back
	mov	edx,[esp+.a_bg2rgb]
	mov	[edi+ebp],al

.next:
	inc	ebp
	jnz	.pixel

	; End of row.
	add	esi,[esp+.a_srcp]
	add	edi,[esp+.a_destp]
	mov	ebp,[esp+.a_width]
	dec	dword [esp+.a_height]
	jnz	.pixel

	pop	ebp
	pop	edi
	pop	esi
	pop	ebx
	ret

;-----------------------------------------------------------------------
; __declspec(naked) void STACK_ARGS DMTUP (
;	src, dest, srcpitch, destpitch, _width, _height, fg2rgb, bg2rgb);
;
; Draw, masked, translucent, unmapped. For every source byte other than
; 255: the dword fg2rgb[source byte] is added to the dword
; bg2rgb[dest byte], the sum is ORed with 0x1f07c1f and ANDed with itself
; shifted right by 15, and the result indexes the byte table RGB32k.
; That byte replaces the destination byte.
;
; The _width slot is overwritten with -_width.
; ebx, esi, edi, ebp are saved and restored; eax, ecx, edx are not.
;-----------------------------------------------------------------------

GLOBAL	DMTUP

	align 16

DMTUP:
.frame		equ	4*4+4		; four saved registers + return address
.a_src		equ	.frame+4*0
.a_dest		equ	.frame+4*1
.a_srcp		equ	.frame+4*2
.a_destp	equ	.frame+4*3
.a_width	equ	.frame+4*4
.a_height	equ	.frame+4*5
.a_fg2rgb	equ	.frame+4*6
.a_bg2rgb	equ	.frame+4*7

	push	ebx
	push	esi
	push	edi
	push	ebp

	xor	eax,eax
	mov	esi,[esp+.a_src]
	mov	edi,[esp+.a_dest]
	mov	ebp,[esp+.a_width]
	mov	ebx,[esp+.a_height]	; ebx = rows remaining
	mov	ecx,[esp+.a_fg2rgb]	; ecx = fg2rgb, reused as scratch per pixel
	mov	edx,[esp+.a_bg2rgb]	; edx = bg2rgb, reused as scratch per pixel
	add	esi,ebp			; esi = end of the current source row
	add	edi,ebp			; edi = end of the current dest row
	neg	ebp			; ebp = -width, counts up to zero
	mov	[esp+.a_width],ebp

.pixel:
	mov	al,[esi+ebp]
	cmp	eax,255
	je	.next			; 255 = leave dest alone

	mov	ecx,[ecx+eax*4]		; foreground RGB
	mov	al,[edi+ebp]		; background byte
	add	ecx,[edx+eax*4]		; + background RGB
	or	ecx,0x1f07c1f
	mov	edx,ecx
	shr	edx,15
	and	edx,ecx			; edx = index into RGB32k
	mov	al,[RGB32k+edx]
	mov	ecx,[esp+.a_fg2rgb]	; put the table pointers back
	mov	edx,[esp+.a_bg2rgb]
	mov	[edi+ebp],al

.next:
	inc	ebp
	jnz	.pixel

	; End of row.
	add	esi,[esp+.a_srcp]
	add	edi,[esp+.a_destp]
	mov	ebp,[esp+.a_width]
	dec	ebx
	jnz	.pixel

	pop	ebp
	pop	edi
	pop	esi
	pop	ebx
	ret

;-----------------------------------------------------------------------
; __declspec(naked) void STACK_ARGS DMSRP (
;	const byte *src, byte *dest, const byte *remap, int srcpitch,
;	int destpitch, int w, int h, DWORD fg, DWORD *bg2rgb);
;
; Draw, masked, shadowed, remapped. For every source byte other than 255:
;   1. remap[source byte] is stored at the destination pixel;
;   2. the byte located 2*destpitch+2 bytes after that pixel is read,
;      bg2rgb[that byte] + fg is ORed with 0x1f07c1f and ANDed with itself
;      shifted right by 15, and RGB32k[result] is written back to that
;      same offset location.
;
; The w slot is overwritten with -w and the h slot is counted down in place.
; ebx, esi, edi, ebp are saved and restored; eax, ecx, edx are not.
;-----------------------------------------------------------------------

GLOBAL	DMSRP

	align 16

DMSRP:
.frame		equ	4*4+4		; four saved registers + return address
.a_src		equ	.frame+4*0
.a_dest		equ	.frame+4*1
.a_remap	equ	.frame+4*2
.a_srcp		equ	.frame+4*3
.a_destp	equ	.frame+4*4
.a_width	equ	.frame+4*5
.a_height	equ	.frame+4*6
.a_fg		equ	.frame+4*7
.a_bg2rgb	equ	.frame+4*8

	push	ebx
	push	esi
	push	edi
	push	ebp

	xor	eax,eax
	mov	ebp,[esp+.a_width]
	mov	esi,[esp+.a_src]
	mov	edi,[esp+.a_dest]
	mov	ebx,[esp+.a_destp]
	add	esi,ebp			; esi = end of the current source row
	add	edi,ebp			; edi = end of the current dest row
	neg	ebp			; ebp = -width, counts up to zero
	mov	[esp+.a_width],ebp
	lea	ebx,[ebx*2+edi+2]	; ebx = edi + destpitch*2 + 2 (shadow row)
	mov	edx,[esp+.a_remap]	; edx = remap, reused as scratch per pixel
	mov	ecx,[esp+.a_bg2rgb]	; ecx = bg2rgb, reused as scratch per pixel

.pixel:
	mov	al,[esi+ebp]
	cmp	eax,255
	je	.next			; 255 = leave dest alone

	mov	al,[edx+eax]		; remap the colour
	mov	edx,[esp+.a_fg]		; edx = fg
	mov	[edi+ebp],al		; store the colour itself
	mov	al,[ebx+ebp]		; byte under the shadow
	mov	ecx,[ecx+eax*4]		; its background RGB
	add	ecx,edx			; + fg
	or	ecx,0x1f07c1f
	mov	edx,ecx
	shr	edx,15
	and	edx,ecx			; edx = index into RGB32k
	mov	ecx,[esp+.a_bg2rgb]	; put the table pointers back
	mov	al,[RGB32k+edx]
	mov	edx,[esp+.a_remap]
	mov	[ebx+ebp],al		; store the shadow pixel

.next:
	inc	ebp
	jnz	.pixel

	; End of row. mov does not touch the flags, so the column counter can
	; be reloaded either side of the dec; dec stays right before the jnz.
	add	esi,[esp+.a_srcp]
	add	edi,[esp+.a_destp]
	add	ebx,[esp+.a_destp]
	mov	ebp,[esp+.a_width]
	dec	dword [esp+.a_height]
	jnz	.pixel

	pop	ebp
	pop	edi
	pop	esi
	pop	ebx
	ret

;-----------------------------------------------------------------------
; __declspec(naked) void STACK_ARGS DMSUP (
;	const byte *src, byte *dest, int srcpitch, int destpitch,
;	int w, int h, DWORD fg, DWORD *bg2rgb);
;
; Draw, masked, shadowed, unmapped. For every source byte other than 255:
;   1. the source byte is stored at the destination pixel;
;   2. the byte located 2*destpitch+2 bytes after that pixel is read,
;      bg2rgb[that byte] + fg is ORed with 0x1f07c1f and ANDed with itself
;      shifted right by 15, and RGB32k[result] is written back to that
;      same offset location.
;
; The w slot is overwritten with -w and the h slot is counted down in place.
; ebx, esi, edi, ebp are saved and restored; eax, ecx, edx are not.
;-----------------------------------------------------------------------

GLOBAL	DMSUP

	align 16

DMSUP:
.frame		equ	4*4+4		; four saved registers + return address
.a_src		equ	.frame+4*0
.a_dest		equ	.frame+4*1
.a_srcp		equ	.frame+4*2
.a_destp	equ	.frame+4*3
.a_width	equ	.frame+4*4
.a_height	equ	.frame+4*5
.a_fg		equ	.frame+4*6
.a_bg2rgb	equ	.frame+4*7

	push	ebx
	push	esi
	push	edi
	push	ebp

	xor	eax,eax
	mov	ebp,[esp+.a_width]
	mov	esi,[esp+.a_src]
	mov	edi,[esp+.a_dest]
	mov	ebx,[esp+.a_destp]
	add	esi,ebp			; esi = end of the current source row
	add	edi,ebp			; edi = end of the current dest row
	neg	ebp			; ebp = -width, counts up to zero
	mov	[esp+.a_width],ebp
	lea	ebx,[ebx*2+edi+2]	; ebx = edi + destpitch*2 + 2 (shadow row)
	mov	ecx,[esp+.a_bg2rgb]	; ecx = bg2rgb, reused as scratch per pixel

.pixel:
	mov	al,[esi+ebp]
	cmp	al,255
	je	.next			; 255 = leave dest alone

	mov	[edi+ebp],al		; store the colour itself
	mov	al,[ebx+ebp]		; byte under the shadow
	mov	edx,[esp+.a_fg]		; edx = fg
	mov	ecx,[ecx+eax*4]		; background RGB
	add	ecx,edx			; + fg
	or	ecx,0x1f07c1f
	mov	edx,ecx
	shr	edx,15
	and	edx,ecx			; edx = index into RGB32k
	mov	ecx,[esp+.a_bg2rgb]	; put the table pointer back
	mov	al,[RGB32k+edx]
	mov	[ebx+ebp],al		; store the shadow pixel

.next:
	inc	ebp
	jnz	.pixel

	; End of row. mov does not touch the flags, so the column counter can
	; be reloaded either side of the dec; dec stays right before the jnz.
	add	esi,[esp+.a_srcp]
	add	edi,[esp+.a_destp]
	add	ebx,[esp+.a_destp]
	mov	ebp,[esp+.a_width]
	dec	dword [esp+.a_height]
	jnz	.pixel

	pop	ebp
	pop	edi
	pop	esi
	pop	ebx
	ret

;-----------------------------------------------------------------------
; __declspec(naked) void STACK_ARGS DMAUP (
;	const byte *src, byte *dest, int srcpitch, int destpitch,
;	int w, int h, DWORD *fgstart);
;
; Draw, masked, alpha, unmapped. For every source byte other than 255:
;   level = ((source byte + 1) << 6) & 0x7f00
;   sum   = fgstart[level] + dword at Col2RGB8 + 0x10000 + (dest - level)*4
; sum is ORed with 0x1f07c1f and ANDed with itself shifted right by 15, and
; RGB32k[result] replaces the destination byte.
;
; The w slot is overwritten with -w.
; ebx, esi, edi, ebp are saved and restored; eax, ecx, edx are not.
;-----------------------------------------------------------------------

GLOBAL DMAUP

	align 16

DMAUP:
.frame		equ	4*4+4		; four saved registers + return address
.a_src		equ	.frame+4*0
.a_dest		equ	.frame+4*1
.a_srcp		equ	.frame+4*2
.a_destp	equ	.frame+4*3
.a_width	equ	.frame+4*4
.a_height	equ	.frame+4*5
.a_fgstart	equ	.frame+4*6

	push	ebx
	push	esi
	push	edi
	push	ebp

	xor	eax,eax
	xor	ebx,ebx
	mov	ebp,[esp+.a_width]
	mov	esi,[esp+.a_src]
	mov	edi,[esp+.a_dest]
	mov	ecx,[esp+.a_fgstart]	; ecx = fgstart, reused as scratch per pixel
	mov	edx,[esp+.a_height]	; edx = rows remaining
	add	esi,ebp			; esi = end of the current source row
	add	edi,ebp			; edi = end of the current dest row
	neg	ebp			; ebp = -width, counts up to zero
	mov	[esp+.a_width],ebp

.pixel:
	mov	al,[esi+ebp]
	cmp	eax,255
	je	.next			; 255 = leave dest alone

	add	eax,1
	shl	eax,6
	and	eax,0x7f00		; eax = level (a multiple of 256)
	mov	bl,[edi+ebp]		; ebx = background byte
	sub	ebx,eax			; ebx = background byte - level
	mov	ecx,[ecx+eax*4]		; foreground RGB for this level
	add	ecx,[Col2RGB8+ebx*4+0x10000] ; + background RGB from Col2RGB8
	or	ecx,0x1f07c1f
	mov	ebx,ecx
	shr	ebx,15
	and	ebx,ecx			; ebx = index into RGB32k
	xor	eax,eax
	mov	al,[RGB32k+ebx]
	mov	ecx,[esp+.a_fgstart]	; put the table pointer back
	xor	ebx,ebx			; ebx must be clear for the next mov bl
	mov	[edi+ebp],al

.next:
	inc	ebp
	jnz	.pixel

	; End of row.
	add	esi,[esp+.a_srcp]
	add	edi,[esp+.a_destp]
	mov	ebp,[esp+.a_width]
	dec	edx
	jnz	.pixel

	pop	ebp
	pop	edi
	pop	esi
	pop	ebx
	ret


; Learn from my mistake. In the following routines, I originally used the bit
; of code shown below to get two-cycle pairing on a Pentium. Unfortunately, the
; shr instruction can clobber the OF flag, even though it doesn't on a
; Pentium II (which is what I have ATM). So I had to swap the middle two
; instructions (shr now comes before inc), and now it only gets three cycles
; on a Pentium. (Other processors should be smart enough to reorder this to
; run in two or fewer cycles, I would hope.)
;
;.skip	mov	ebx,edx
;	inc	ebp
;	shr	ebx,16
;	jno	.loop

;-----------------------------------------------------------------------
; __declspec(naked) void STACK_ARGS SMPRP (
;	src, dest, remap, srcpitch, destpitch, xinc, yinc, xstart, yerr,
;	width, height)
;
; Scale, masked, plain, remapped. Writes width x height destination bytes.
; The source column for each destination pixel is the running x position
; shifted right by 16; the x position starts at xstart on every row and
; grows by xinc per pixel. A source byte of 255 leaves the destination
; alone, any other byte is stored after translation through remap.
; (remap[source byte] is read for every pixel, including skipped ones.)
;
; After each row yerr += yinc, and the source pointer advances by srcpitch
; once for every 0x10000 contained in yerr.
;
; The column counter is kept as 0x80000000 - width with the dest pointer
; biased the other way, so "inc ebp" raises OF exactly when the row is done.
;
; The width, yerr and height slots on the stack are modified in place.
; ebx, esi, edi, ebp are saved and restored; eax, ecx, edx are not.
;-----------------------------------------------------------------------

GLOBAL	SMPRP

	align 16

SMPRP:
.frame		equ	4*4+4		; four saved registers + return address
.a_src		equ	.frame+4*0
.a_dest		equ	.frame+4*1
.a_remap	equ	.frame+4*2
.a_srcp		equ	.frame+4*3
.a_destp	equ	.frame+4*4
.a_xinc		equ	.frame+4*5
.a_yinc		equ	.frame+4*6
.a_xstart	equ	.frame+4*7
.a_yerr		equ	.frame+4*8
.a_width	equ	.frame+4*9
.a_height	equ	.frame+4*10

	push	ebx
	push	esi
	push	edi
	push	ebp

	xor	eax,eax
	mov	ebp,[esp+.a_width]
	mov	edi,[esp+.a_dest]
	mov	esi,[esp+.a_src]
	mov	edx,[esp+.a_xstart]	; edx = x position
	mov	ecx,[esp+.a_xinc]	; ecx = xinc
	lea	edi,[edi+ebp+0x80000000]
	neg	ebp
	add	ebp,0x80000000		; edi+ebp = dest; ebp overflows after width incs
	mov	[esp+.a_width],ebp
	mov	ebx,edx
	shr	ebx,16			; ebx = source column

.pixel:
	mov	al,[esi+ebx]
	mov	ebx,[esp+.a_remap]
	add	edx,ecx			; step the x position
	cmp	eax,255
	mov	al,[ebx+eax]		; mov leaves the flags from cmp intact
	je	.next			; source byte was 255: leave dest alone

	mov	[edi+ebp],al

.next:
	mov	ebx,edx
	shr	ebx,16			; must come before inc: shr may disturb OF
	inc	ebp
	jno	.pixel

	; End of row.
	mov	ebp,[esp+.a_yerr]
	mov	edx,[esp+.a_xstart]
	add	ebp,[esp+.a_yinc]
	add	edi,[esp+.a_destp]
	cmp	ebp,0x10000
	jb	.same_src_row

.next_src_row:
	add	esi,[esp+.a_srcp]
	sub	ebp,0x10000
	cmp	ebp,0x10000
	jae	.next_src_row

.same_src_row:
	mov	[esp+.a_yerr],ebp
	mov	ebx,edx
	shr	ebx,16
	mov	ebp,[esp+.a_width]
	dec	dword [esp+.a_height]
	jnz	.pixel

	pop	ebp
	pop	edi
	pop	esi
	pop	ebx
	ret

;-----------------------------------------------------------------------
; __declspec(naked) void STACK_ARGS SMPUP (
;	src, dest, srcpitch, destpitch, xinc, yinc, xstart, yerr,
;	width, height)
;
; Scale, masked, plain, unmapped. Writes width x height destination bytes.
; The source column for each destination pixel is the running x position
; shifted right by 16; the x position starts at xstart on every row and
; grows by xinc per pixel. Source bytes equal to 255 are skipped, all
; others are copied unchanged.
;
; After each row yerr += yinc, and the source pointer advances by srcpitch
; once for every 0x10000 contained in yerr.
;
; The column counter is kept as 0x80000000 - width with the dest pointer
; biased the other way, so "inc ebp" raises OF exactly when the row is done.
;
; The width, yerr and height slots on the stack are modified in place.
; ebx, esi, edi, ebp are saved and restored; eax, ecx, edx are not.
;-----------------------------------------------------------------------

GLOBAL	SMPUP

	align 16

SMPUP:
.frame		equ	4*4+4		; four saved registers + return address
.a_src		equ	.frame+4*0
.a_dest		equ	.frame+4*1
.a_srcp		equ	.frame+4*2
.a_destp	equ	.frame+4*3
.a_xinc		equ	.frame+4*4
.a_yinc		equ	.frame+4*5
.a_xstart	equ	.frame+4*6
.a_yerr		equ	.frame+4*7
.a_width	equ	.frame+4*8
.a_height	equ	.frame+4*9

	push	ebx
	push	esi
	push	edi
	push	ebp

	xor	eax,eax
	mov	ebp,[esp+.a_width]
	mov	edi,[esp+.a_dest]
	mov	esi,[esp+.a_src]
	mov	edx,[esp+.a_xstart]	; edx = x position
	mov	ecx,[esp+.a_xinc]	; ecx = xinc
	lea	edi,[edi+ebp+0x80000000]
	neg	ebp
	add	ebp,0x80000000		; edi+ebp = dest; ebp overflows after width incs
	mov	[esp+.a_width],ebp
	mov	ebx,edx
	shr	ebx,16			; ebx = source column

.pixel:
	mov	al,[esi+ebx]
	add	edx,ecx			; step the x position
	cmp	al,255
	je	.next			; 255 = leave dest alone

	mov	[edi+ebp],al

.next:
	mov	ebx,edx
	shr	ebx,16			; must come before inc: shr may disturb OF
	inc	ebp
	jno	.pixel

	; End of row.
	mov	ebp,[esp+.a_yerr]
	mov	edx,[esp+.a_xstart]
	add	ebp,[esp+.a_yinc]
	add	edi,[esp+.a_destp]
	cmp	ebp,0x10000
	jb	.same_src_row

.next_src_row:
	add	esi,[esp+.a_srcp]
	sub	ebp,0x10000
	cmp	ebp,0x10000
	jae	.next_src_row

.same_src_row:
	mov	[esp+.a_yerr],ebp
	mov	ebx,edx
	shr	ebx,16
	mov	ebp,[esp+.a_width]
	dec	dword [esp+.a_height]
	jnz	.pixel

	pop	ebp
	pop	edi
	pop	esi
	pop	ebx
	ret

;-----------------------------------------------------------------------
; SMAUP (src, dest, srcpitch, destpitch, xinc, yinc, xstart, yerr,
;	dwidth, dheight, fgstart);
;
; Scale, masked, alpha, unmapped. Writes dwidth x dheight destination
; bytes. The source column for each destination pixel is the running x
; position shifted right by 16 (starts at xstart each row, grows by xinc).
; For every source byte other than 255:
;   level = ((source byte + 2) << 6) & 0x7f00
;   sum   = fgstart[level] + dword at Col2RGB8 + 0x10000 + (dest - level)*4
; sum is ORed with 0x1f07c1f and ANDed with itself shifted right by 15, and
; RGB32k[result] replaces the destination byte.
; NOTE(review): the unscaled DMAUP adds 1 to the source byte where this
; routine adds 2 - confirm which of the two is intended.
;
; After each row yerr += yinc, and the source pointer advances by srcpitch
; once for every 0x10000 contained in yerr.
;
; The dwidth, yerr and dheight slots on the stack are modified in place.
; ebx, esi, edi, ebp are saved and restored; eax, ecx, edx are not.
;-----------------------------------------------------------------------

GLOBAL	SMAUP

	align 16

SMAUP:
.frame		equ	4*4+4		; four saved registers + return address
.a_src		equ	.frame+4*0
.a_dest		equ	.frame+4*1
.a_srcp		equ	.frame+4*2
.a_destp	equ	.frame+4*3
.a_xinc		equ	.frame+4*4
.a_yinc		equ	.frame+4*5
.a_xstart	equ	.frame+4*6
.a_yerr		equ	.frame+4*7
.a_width	equ	.frame+4*8
.a_height	equ	.frame+4*9
.a_fgstart	equ	.frame+4*10

	push	ebx
	push	esi
	push	edi
	push	ebp

	xor	eax,eax
	mov	ebp,[esp+.a_width]
	mov	edi,[esp+.a_dest]
	mov	esi,[esp+.a_src]
	mov	edx,[esp+.a_xstart]	; edx = x position
	mov	ecx,[esp+.a_xinc]	; ecx = xinc (also scratch per pixel)
	lea	edi,[edi+ebp+0x80000000]
	neg	ebp
	add	ebp,0x80000000		; edi+ebp = dest; ebp overflows after dwidth incs
	mov	[esp+.a_width],ebp
	mov	ebx,edx
	shr	ebx,16			; ebx = source column

.pixel:
	mov	al,[esi+ebx]
	add	edx,ecx			; step the x position
	cmp	eax,255
	je	.next			; 255 = leave dest alone

	add	eax,2
	shl	eax,6
	and	eax,0x7f00		; eax = level (a multiple of 256)
	xor	ebx,ebx
	mov	bl,[edi+ebp]		; ebx = background byte
	sub	ebx,eax			; ebx = background byte - level
	mov	ecx,[esp+.a_fgstart]
	mov	ecx,[ecx+eax*4]		; foreground RGB for this level
	add	ecx,[Col2RGB8+ebx*4+0x10000] ; + background RGB from Col2RGB8
	or	ecx,0x1f07c1f
	mov	ebx,ecx
	shr	ebx,15
	and	ebx,ecx			; ebx = index into RGB32k
	xor	eax,eax
	mov	al,[RGB32k+ebx]
	mov	ecx,[esp+.a_xinc]	; ecx back to xinc for the next pixel
	mov	[edi+ebp],al

.next:
	mov	ebx,edx
	shr	ebx,16			; must come before inc: shr may disturb OF
	inc	ebp
	jno	.pixel

	; End of row.
	mov	ebp,[esp+.a_width]
	mov	ebx,[esp+.a_yerr]
	add	edi,[esp+.a_destp]	; dest to the next line
	add	ebx,[esp+.a_yinc]	; advance yerr
	cmp	ebx,0x10000
	jb	.same_src_row
.next_src_row:
	add	esi,[esp+.a_srcp]	; source to the next line
	sub	ebx,0x10000
	cmp	ebx,0x10000
	jae	.next_src_row
.same_src_row:
	mov	[esp+.a_yerr],ebx
	mov	edx,[esp+.a_xstart]	; restart the x position
	mov	ebx,edx
	shr	ebx,16
	dec	dword [esp+.a_height]
	jnz	near .pixel

	pop	ebp
	pop	edi
	pop	esi
	pop	ebx
	ret

%ifdef M_TARGET_WATCOM
  SEGMENT CODE
%else
  SECTION .text
%endif

;-----------------------------------------------------------------------
; SMTRP (src, dest, remap, srcpitch, destpitch, xinc, yinc, xstart, yerr,
;	dwidth, dheight, fg2rgb, bg2rgb);
;
; Scale, masked, translucent, remapped. Writes dwidth x dheight
; destination bytes. The source column for each destination pixel is the
; running x position shifted right by 16 (starts at xstart each row, grows
; by xinc). For every source byte other than 255: the byte is translated
; through remap, fg2rgb[remapped] + bg2rgb[dest byte] is ORed with
; 0x1f07c1f and ANDed with itself shifted right by 15, and RGB32k[result]
; replaces the destination byte.
;
; After each row yerr += yinc, and the source pointer advances by srcpitch
; once for every 0x10000 contained in yerr.
;
; The remap, fg2rgb and bg2rgb pointers are fetched from the stack
; arguments for every drawn pixel instead of being patched into the
; instruction stream, so this routine never writes to its own code and is
; assembled into the ordinary code segment.
;
; The dwidth, yerr and dheight slots on the stack are modified in place.
; ebx, esi, edi, ebp are saved and restored; eax, ecx, edx are not.
;-----------------------------------------------------------------------

GLOBAL	SMTRP

	align 16

SMTRP:
.frame		equ	4*4+4		; four saved registers + return address
.a_src		equ	.frame+4*0
.a_dest		equ	.frame+4*1
.a_remap	equ	.frame+4*2
.a_srcp		equ	.frame+4*3
.a_destp	equ	.frame+4*4
.a_xinc		equ	.frame+4*5
.a_yinc		equ	.frame+4*6
.a_xstart	equ	.frame+4*7
.a_yerr		equ	.frame+4*8
.a_width	equ	.frame+4*9
.a_height	equ	.frame+4*10
.a_fg2rgb	equ	.frame+4*11
.a_bg2rgb	equ	.frame+4*12

	push	ebx
	push	esi
	push	edi
	push	ebp

	mov	ebp,[esp+.a_width]	; ebp = dest index/counter
	mov	edi,[esp+.a_dest]	; edi = dest
	mov	esi,[esp+.a_src]	; esi = source
	xor	eax,eax
	lea	edi,[edi+ebp+0x80000000]
	neg	ebp
	add	ebp,0x80000000		; edi+ebp = dest; ebp overflows after dwidth incs
	mov	edx,[esp+.a_xstart]	; edx = x position
	mov	[esp+.a_width],ebp
	mov	ebx,edx
	shr	ebx,16			; ebx = source column
	mov	ecx,[esp+.a_xinc]	; ecx = xinc (also scratch per pixel)

.loop:
	mov	al,[esi+ebx]
	add	edx,ecx
	cmp	eax,255
	jz	.skip			; 255 = leave dest alone

	mov	ecx,[esp+.a_remap]
	mov	al,[ecx+eax]		; remap
	mov	ecx,[esp+.a_fg2rgb]
	mov	ebx,[ecx+eax*4]		; foreground RGB
	mov	al,[edi+ebp]		; background byte
	mov	ecx,[esp+.a_bg2rgb]
	add	ebx,[ecx+eax*4]		; + background RGB
	or	ebx,0x1f07c1f
	mov	ecx,ebx
	shr	ebx,15
	and	ebx,ecx			; ebx = index into RGB32k
	mov	ecx,[esp+.a_xinc]	; ecx back to xinc for the next pixel
	mov	al,[RGB32k+ebx]
	mov	[edi+ebp],al

.skip:
	mov	ebx,edx
	shr	ebx,16			; must come before inc: OF is not reliable after shr
	inc	ebp
	jno	.loop			; inc is the last flag writer before this test

	mov	ebp,[esp+.a_width]
	mov	ebx,[esp+.a_yerr]	; get yerr
	add	edi,[esp+.a_destp]	; advance dest to next line
	add	ebx,[esp+.a_yinc]	; advance yerr
	cmp	ebx,0x10000
	jb	.noadv
.adv:
	add	esi,[esp+.a_srcp]	; advance src to next line
	sub	ebx,0x10000
	cmp	ebx,0x10000
	jae	.adv
.noadv:
	mov	[esp+.a_yerr],ebx
	mov	edx,[esp+.a_xstart]	; restart the x position
	mov	ebx,edx
	shr	ebx,16
	dec	dword [esp+.a_height]
	jnz	near .loop

	pop	ebp
	pop	edi
	pop	esi
	pop	ebx
	ret

; Any routine that patches its own instructions has to be assembled into a
; writable segment, so switch to the data segment for whatever follows.

%ifdef M_TARGET_WATCOM
  SEGMENT DATA PUBLIC ALIGN=16 CLASS=CODE USE32
  SEGMENT DATA
%else
  SECTION .data
%endif

%ifdef M_TARGET_WATCOM
  SEGMENT CODE
%else
  SECTION .text
%endif

;-----------------------------------------------------------------------
; SMTUP (src, dest, srcpitch, destpitch, xinc, yinc, xstart, yerr,
;	dwidth, dheight, fg2rgb, bg2rgb);
;
; Scale, masked, translucent, unmapped. Writes dwidth x dheight
; destination bytes. The source column for each destination pixel is the
; running x position shifted right by 16 (starts at xstart each row, grows
; by xinc). For every source byte other than 255:
; fg2rgb[source byte] + bg2rgb[dest byte] is ORed with 0x1f07c1f and ANDed
; with itself shifted right by 15, and RGB32k[result] replaces the
; destination byte.
;
; After each row yerr += yinc, and the source pointer advances by srcpitch
; once for every 0x10000 contained in yerr.
;
; The fg2rgb and bg2rgb pointers are fetched from the stack arguments for
; every drawn pixel instead of being patched into the instruction stream,
; so this routine never writes to its own code and is assembled into the
; ordinary code segment.
;
; The dwidth, yerr and dheight slots on the stack are modified in place.
; ebx, esi, edi, ebp are saved and restored; eax, ecx, edx are not.
;-----------------------------------------------------------------------

GLOBAL	SMTUP

	align 16

SMTUP:
.frame		equ	4*4+4		; four saved registers + return address
.a_src		equ	.frame+4*0
.a_dest		equ	.frame+4*1
.a_srcp		equ	.frame+4*2
.a_destp	equ	.frame+4*3
.a_xinc		equ	.frame+4*4
.a_yinc		equ	.frame+4*5
.a_xstart	equ	.frame+4*6
.a_yerr		equ	.frame+4*7
.a_width	equ	.frame+4*8
.a_height	equ	.frame+4*9
.a_fg2rgb	equ	.frame+4*10
.a_bg2rgb	equ	.frame+4*11

	push	ebx
	push	esi
	push	edi
	push	ebp

	mov	ebp,[esp+.a_width]	; ebp = dest index/counter
	mov	edi,[esp+.a_dest]	; edi = dest
	mov	esi,[esp+.a_src]	; esi = source
	xor	eax,eax
	lea	edi,[edi+ebp+0x80000000]
	neg	ebp
	add	ebp,0x80000000		; edi+ebp = dest; ebp overflows after dwidth incs
	mov	edx,[esp+.a_xstart]	; edx = x position
	mov	[esp+.a_width],ebp
	mov	ebx,edx
	shr	ebx,16			; ebx = source column
	mov	ecx,[esp+.a_xinc]	; ecx = xinc (also scratch per pixel)

.loop:
	mov	al,[esi+ebx]
	add	edx,ecx
	cmp	eax,255
	jz	.skip			; 255 = leave dest alone

	mov	ecx,[esp+.a_fg2rgb]
	mov	ebx,[ecx+eax*4]		; foreground RGB
	mov	al,[edi+ebp]		; background byte
	mov	ecx,[esp+.a_bg2rgb]
	add	ebx,[ecx+eax*4]		; + background RGB
	or	ebx,0x1f07c1f
	mov	ecx,ebx
	shr	ebx,15
	and	ebx,ecx			; ebx = index into RGB32k
	mov	ecx,[esp+.a_xinc]	; ecx back to xinc for the next pixel
	mov	al,[RGB32k+ebx]
	mov	[edi+ebp],al

.skip:
	mov	ebx,edx
	shr	ebx,16			; must come before inc: OF is not reliable after shr
	inc	ebp
	jno	.loop			; inc is the last flag writer before this test

	mov	ebp,[esp+.a_width]
	mov	ebx,[esp+.a_yerr]	; get yerr
	add	edi,[esp+.a_destp]	; advance dest to next line
	add	ebx,[esp+.a_yinc]	; advance yerr
	cmp	ebx,0x10000
	jb	.noadv
.adv:
	add	esi,[esp+.a_srcp]	; advance src to next line
	sub	ebx,0x10000
	cmp	ebx,0x10000
	jae	.adv
.noadv:
	mov	[esp+.a_yerr],ebx
	mov	edx,[esp+.a_xstart]	; restart the x position
	mov	ebx,edx
	shr	ebx,16
	dec	dword [esp+.a_height]
	jnz	near .loop

	pop	ebp
	pop	edi
	pop	esi
	pop	ebx
	ret

; Return to the data segment so that anything assembled after this routine
; lands where it did before.

%ifdef M_TARGET_WATCOM
  SEGMENT DATA
%else
  SECTION .data
%endif

%ifdef M_TARGET_WATCOM
  SEGMENT CODE
%else
  SECTION .text
%endif

;-----------------------------------------------------------------------
; SMSRP (src, dest, remap, srcpitch, destpitch, xinc, yinc, xstart, err,
;	dwidth, dheight, fg, bg2rgb);
;
; Scale, masked, shadowed, remapped. Writes dwidth x dheight destination
; bytes. The source column for each destination pixel is the running x
; position shifted right by 16 (starts at xstart each row, grows by xinc).
; For every source byte other than 255:
;   1. remap[source byte] is stored at the destination pixel;
;   2. the byte located 2*destpitch+2 bytes after that pixel is read,
;      bg2rgb[that byte] + fg is ORed with 0x1f07c1f and ANDed with itself
;      shifted right by 15, and RGB32k[result] is written back to that
;      same offset location.
;
; After each row err += yinc, and the source pointer advances by srcpitch
; once for every 0x10000 contained in err.
;
; remap, bg2rgb and destpitch are fetched from the stack arguments for
; every drawn pixel instead of being patched into the instruction stream,
; so this routine never writes to its own code and is assembled into the
; ordinary code segment.
;
; The dwidth, err and dheight slots on the stack are modified in place.
; ebx, esi, edi, ebp are saved and restored; eax, ecx, edx are not.
;-----------------------------------------------------------------------

GLOBAL	SMSRP

	align 16

SMSRP:
.frame		equ	4*4+4		; four saved registers + return address
.a_src		equ	.frame+4*0
.a_dest		equ	.frame+4*1
.a_remap	equ	.frame+4*2
.a_srcp		equ	.frame+4*3
.a_destp	equ	.frame+4*4
.a_xinc		equ	.frame+4*5
.a_yinc		equ	.frame+4*6
.a_xstart	equ	.frame+4*7
.a_yerr		equ	.frame+4*8
.a_width	equ	.frame+4*9
.a_height	equ	.frame+4*10
.a_fg		equ	.frame+4*11
.a_bg2rgb	equ	.frame+4*12

	push	ebx
	push	esi
	push	edi
	push	ebp

	mov	ebp,[esp+.a_width]	; ebp = dest index/counter
	mov	edi,[esp+.a_dest]	; edi = dest
	mov	esi,[esp+.a_src]	; esi = source
	xor	eax,eax
	lea	edi,[edi+ebp+0x80000000]
	neg	ebp
	add	ebp,0x80000000		; edi+ebp = dest; ebp overflows after dwidth incs
	mov	edx,[esp+.a_xstart]	; edx = x position
	mov	[esp+.a_width],ebp
	mov	ebx,edx
	shr	ebx,16			; ebx = source column
	mov	ecx,[esp+.a_xinc]	; ecx = xinc (also scratch per pixel)

.loop:
	mov	al,[esi+ebx]
	add	edx,ecx
	cmp	eax,255
	jz	.skip			; 255 = leave dest alone

	mov	ecx,[esp+.a_remap]
	mov	al,[ecx+eax]		; remap
	mov	[edi+ebp],al		; store the colour itself
	mov	ecx,[esp+.a_destp]
	lea	ecx,[edi+ecx*2+2]	; ecx+ebp = pixel + 2*destpitch + 2
	mov	al,[ecx+ebp]		; get pixel under shadow
	mov	ebx,[esp+.a_bg2rgb]
	mov	ebx,[ebx+eax*4]		; and convert to RGB
	add	ebx,[esp+.a_fg]		; add fg RGB to bg RGB
	or	ebx,0x1f07c1f
	mov	eax,ebx
	shr	ebx,15
	and	ebx,eax			; ebx = index into RGB32k
	xor	eax,eax			; eax must be clear above al for the next pixel
	mov	al,[RGB32k+ebx]
	mov	[ecx+ebp],al		; write shaded shadow
	mov	ecx,[esp+.a_xinc]	; ecx back to xinc for the next pixel

.skip:
	mov	ebx,edx
	shr	ebx,16			; must come before inc: OF is not reliable after shr
	inc	ebp
	jno	.loop			; inc is the last flag writer before this test

	mov	ebp,[esp+.a_width]
	mov	ebx,[esp+.a_yerr]	; get yerr
	add	edi,[esp+.a_destp]	; advance dest to next line
	add	ebx,[esp+.a_yinc]	; advance yerr
	cmp	ebx,0x10000
	jb	.noadv
.adv:
	add	esi,[esp+.a_srcp]	; advance src to next line
	sub	ebx,0x10000
	cmp	ebx,0x10000
	jae	.adv
.noadv:
	mov	[esp+.a_yerr],ebx
	mov	edx,[esp+.a_xstart]	; restart the x position
	mov	ebx,edx
	shr	ebx,16
	dec	dword [esp+.a_height]
	jnz	near .loop

	pop	ebp
	pop	edi
	pop	esi
	pop	ebx
	ret

; Return to the data segment so that anything assembled after this routine
; lands where it did before.

%ifdef M_TARGET_WATCOM
  SEGMENT DATA
%else
  SECTION .data
%endif

%ifdef M_TARGET_WATCOM
  SEGMENT CODE
%else
  SECTION .text
%endif

;-----------------------------------------------------------------------
; SMSUP (src, dest, srcpitch, destpitch, xinc, yinc, xstart, err,
;	dwidth, dheight, fg, bg2rgb);
;
; Scale, masked, shadowed, unmapped. Writes dwidth x dheight destination
; bytes. The source column for each destination pixel is the running x
; position shifted right by 16 (starts at xstart each row, grows by xinc).
; For every source byte other than 255:
;   1. the source byte is stored at the destination pixel;
;   2. the byte located 2*destpitch+2 bytes after that pixel is read,
;      bg2rgb[that byte] + fg is ORed with 0x1f07c1f and ANDed with itself
;      shifted right by 15, and RGB32k[result] is written back to that
;      same offset location.
;
; After each row err += yinc, and the source pointer advances by srcpitch
; once for every 0x10000 contained in err.
;
; bg2rgb and destpitch are fetched from the stack arguments for every
; drawn pixel instead of being patched into the instruction stream, so
; this routine never writes to its own code and is assembled into the
; ordinary code segment.
;
; The dwidth, err and dheight slots on the stack are modified in place.
; ebx, esi, edi, ebp are saved and restored; eax, ecx, edx are not.
;-----------------------------------------------------------------------

GLOBAL	SMSUP

	align 16

SMSUP:
.frame		equ	4*4+4		; four saved registers + return address
.a_src		equ	.frame+4*0
.a_dest		equ	.frame+4*1
.a_srcp		equ	.frame+4*2
.a_destp	equ	.frame+4*3
.a_xinc		equ	.frame+4*4
.a_yinc		equ	.frame+4*5
.a_xstart	equ	.frame+4*6
.a_yerr		equ	.frame+4*7
.a_width	equ	.frame+4*8
.a_height	equ	.frame+4*9
.a_fg		equ	.frame+4*10
.a_bg2rgb	equ	.frame+4*11

	push	ebx
	push	esi
	push	edi
	push	ebp

	mov	ebp,[esp+.a_width]	; ebp = dest index/counter
	mov	edi,[esp+.a_dest]	; edi = dest
	mov	esi,[esp+.a_src]	; esi = source
	xor	eax,eax
	lea	edi,[edi+ebp+0x80000000]
	neg	ebp
	add	ebp,0x80000000		; edi+ebp = dest; ebp overflows after dwidth incs
	mov	edx,[esp+.a_xstart]	; edx = x position
	mov	[esp+.a_width],ebp
	mov	ebx,edx
	shr	ebx,16			; ebx = source column
	mov	ecx,[esp+.a_xinc]	; ecx = xinc (also scratch per pixel)

.loop:
	mov	al,[esi+ebx]
	add	edx,ecx
	cmp	eax,255
	jz	.skip			; 255 = leave dest alone

	mov	[edi+ebp],al		; store the colour itself
	mov	ecx,[esp+.a_destp]
	lea	ecx,[edi+ecx*2+2]	; ecx+ebp = pixel + 2*destpitch + 2
	mov	al,[ecx+ebp]		; get pixel under shadow
	mov	ebx,[esp+.a_bg2rgb]
	mov	ebx,[ebx+eax*4]		; and convert to RGB
	add	ebx,[esp+.a_fg]		; add fg RGB to bg RGB
	or	ebx,0x1f07c1f
	mov	eax,ebx
	shr	ebx,15
	and	ebx,eax			; ebx = index into RGB32k
	xor	eax,eax			; eax must be clear above al for the next pixel
	mov	al,[RGB32k+ebx]
	mov	[ecx+ebp],al		; write shaded shadow
	mov	ecx,[esp+.a_xinc]	; ecx back to xinc for the next pixel

.skip:
	mov	ebx,edx
	shr	ebx,16			; must come before inc: OF is not reliable after shr
	inc	ebp
	jno	.loop			; inc is the last flag writer before this test

	mov	ebp,[esp+.a_width]
	mov	ebx,[esp+.a_yerr]	; get yerr
	add	edi,[esp+.a_destp]	; advance dest to next line
	add	ebx,[esp+.a_yinc]	; advance yerr
	cmp	ebx,0x10000
	jb	.noadv
.adv:
	add	esi,[esp+.a_srcp]	; advance src to next line
	sub	ebx,0x10000
	cmp	ebx,0x10000
	jae	.adv
.noadv:
	mov	[esp+.a_yerr],ebx
	mov	edx,[esp+.a_xstart]	; restart the x position
	mov	ebx,edx
	shr	ebx,16
	dec	dword [esp+.a_height]
	jnz	near .loop

	pop	ebp
	pop	edi
	pop	esi
	pop	ebx
	ret

; Return to the data segment so that anything assembled after this routine
; lands where it did before.

%ifdef M_TARGET_WATCOM
  SEGMENT DATA
%else
  SECTION .data
%endif