;; SONIC ROBO BLAST 2
;;-----------------------------------------------------------------------------
;; Copyright (C) 1998-2000 by DOSDOOM.
;; Copyright (C) 2010-2016 by Sonic Team Junior.
;;
;; This program is free software distributed under the
;; terms of the GNU General Public License, version 2.
;; See the 'LICENSE' file for more details.
;;-----------------------------------------------------------------------------
;; FILE:
;;      tmap_mmx.nas
;; DESCRIPTION:
;;      Assembler optimised rendering code for software mode, using SIMD
;;      instructions.
;;      Draw wall columns.


[BITS 32]

;; Shift count used by the column drawers to turn a fixed-point texture
;; coordinate into an integer texel row (frac >> FRACBITS).
%define FRACBITS 16
;; Source texel value that R_Draw2sMultiPatchColumn_8_MMX skips instead of
;; drawing.
%define TRANSPARENTPIXEL 247

;; cextern NAME / cglobal NAME
;;   Declare NAME as an imported / exported symbol.
;;   Unless LINUX is defined, NAME is first aliased to _NAME, so every later
;;   use of the bare name in this file assembles to the underscore-prefixed
;;   symbol.
%macro cextern 1
%ifndef LINUX
%define %1 _%1
%endif
[extern %1]
%endmacro

%macro cglobal 1
%ifndef LINUX
%define %1 _%1
%endif
[global %1]
%endmacro


; Layout of viddef_s as far as this file cares: only the dword .width at
; byte offset 12 is named. The remaining bytes are reserved anonymously so
; that the structure size stays at 60 bytes.
struc viddef_s
			resd 3			; bytes  0..11, not referenced in this file
.width:		resd 1			; bytes 12..15
			resd 11			; bytes 16..59, not referenced in this file
endstruc


;; externs
;; columns
cextern dc_colormap
cextern dc_x
cextern dc_yl
cextern dc_yh
cextern dc_iscale
cextern dc_texturemid
cextern dc_texheight
cextern dc_source
cextern dc_hires
cextern centery
cextern centeryfrac
cextern dc_transmap

cextern R_DrawColumn_8_ASM
cextern R_Draw2sMultiPatchColumn_8_ASM

;; spans
cextern nflatshiftup
cextern nflatxshift
cextern nflatyshift
cextern nflatmask
cextern ds_xfrac
cextern ds_yfrac
cextern ds_xstep
cextern ds_ystep
cextern ds_x1
cextern ds_x2
cextern ds_y
cextern ds_source
cextern ds_colormap

cextern ylookup
cextern columnofs
cextern vid

[SECTION .data]

;; 8-byte memory operand for pand: R_DrawSpan_8_MMX copies nflatmask into
;; both dwords before it starts drawing.
nflatmask64:	dd		0, 0


[SECTION .text]

;;----------------------------------------------------------------------
;;
;; R_DrawColumn_8_MMX : 8bpp column drawer, MMX version
;;
;; Takes no stack arguments. Reads dc_texheight, dc_yl, dc_yh, dc_x,
;; dc_iscale, dc_texturemid, dc_hires, dc_source, dc_colormap,
;; centeryfrac, ylookup, columnofs and vid.width.
;;
;; Draws (dc_yh - dc_yl + 1) pixels starting at
;; ylookup[dc_yl] + columnofs[dc_x], advancing the destination by
;; vid.width bytes per pixel. Each pixel is
;;     colormap[source[(frac >> FRACBITS) & heightmask]]
;; with frac advanced by dc_iscale per pixel.
;;
;; If dc_texheight is not a power of two the work is delegated to
;; R_DrawColumn_8_ASM.
;;
;;----------------------------------------------------------------------
;; eax = scratch / first texel of a pair
;; ebx = colormap
;; ecx = pixels left to draw
;; edx = scratch / second texel of a pair
;; esi = source
;; edi = dest
;; ebp = heightmask during setup, vid.width inside the pair loop
;; mm0 = scratch
;; mm1 = heightmask in both dwords
;; mm2 = fracstep in both dwords (doubled just before the pair loop)
;; mm3 = two consecutive fracs (low dword first)
;; mm4 = scratch, setup only
;;----------------------------------------------------------------------


cglobal R_DrawColumn_8_MMX
R_DrawColumn_8_MMX:
		push		ebp						;; save the registers we are
		push		esi						;; about to use as variables
		push		edi
		push		ebx

;;
;; heightmask = dc_texheight - 1. If it shares a bit with dc_texheight,
;; the height is not a power of two and the masking below would be wrong.
;;
		mov			edx, [dc_texheight]
		dec			edx						;; edx = heightmask
		test		edx, [dc_texheight]
		jnz			near .fallback

;;
;; mm1 = heightmask:heightmask, ebp = heightmask (for the odd pixel)
;;
		movd		mm1, edx
		mov			ebp, edx
		punpckldq	mm1, mm1

;;
;; edi = ylookup[dc_yl] + columnofs[dc_x]
;;
		mov			eax, [dc_yl]
		mov			ebx, [dc_x]
		mov			edi, [ylookup + eax*4]
		add			edi, [columnofs + ebx*4]

;;
;; ecx = dc_yh + 1 - dc_yl; leave if that is zero or negative
;;
		mov			ecx, [dc_yh]
		inc			ecx
		sub			ecx, eax
		jle			near .exit

;;
;; Table pointers and mm2 = fracstep:fracstep
;;
		mov			esi, [dc_source]
		mov			ebx, [dc_colormap]
		movd		mm2, [dc_iscale]
		punpckldq	mm2, mm2

;;
;; eax = dc_texturemid
;;       + FixedMul((dc_yl << FRACBITS) - centeryfrac, dc_iscale)
;;
		shl			eax, FRACBITS			;; eax still holds dc_yl
		sub			eax, [centeryfrac]
		imul		dword [dc_iscale]		;; edx:eax = 64-bit product
		shrd		eax, edx, FRACBITS		;; take the middle 32 bits
		add			eax, [dc_texturemid]

;;
;; When bit 0 of dc_hires is set, start from frac = 0 instead.
;;
		test		byte [dc_hires], 0x01
		jz			.oddpixel
		xor			eax, eax

;;
;; The main loop draws two pixels per pass, so an odd count gets its
;; first pixel drawn here.
;;
.oddpixel:
		test		ecx, 1
		jz			.setuppairs
		mov			edx, eax				;; edx = frac for this pixel
		sar			edx, FRACBITS
		and			edx, ebp				;; wrap into the texture
		add			eax, [dc_iscale]		;; frac for the next pixel
		movzx		edx, byte [esi + edx]	;; texel
		movzx		edx, byte [ebx + edx]	;; through the colormap
		mov			[edi], dl

		add			edi, [vid + viddef_s.width]
		dec			ecx
		jz			.exit					;; that was the only pixel

;;
;; mm3 = (frac, frac + fracstep); mm2 = 2 * fracstep in both dwords;
;; ebp switches from heightmask to vid.width.
;;
.setuppairs:
		movd		mm3, eax
		movq		mm4, mm3
		paddd		mm4, mm2
		punpckldq	mm3, mm4
		pslld		mm2, 1

		mov			ebp, [vid + viddef_s.width]

;;
;; Two pixels per pass. ecx is even here, so it reaches exactly zero.
;; The instruction order below is kept deliberately; the "IFETCH
;; boundary" markers note where instruction-fetch blocks are expected
;; to end.
;;
		align		16
.drawpair:
		movq		mm0, mm3				;; both fracs
		psrad		mm0, FRACBITS			;; -> integer rows
		pand		mm0, mm1				;; wrap into the texture
		paddd		mm3, mm2				;; advance both fracs by two steps

		movd		eax, mm0				;; row of the first pixel
;; IFETCH boundary
		movzx		eax, byte [esi + eax]	;; texel
		movzx		eax, byte [ebx + eax]	;; through the colormap

		punpckhdq	mm0, mm0				;; bring the high dword down
		movd		edx, mm0				;; row of the second pixel
		mov			[edi], al				;; store first pixel
;; IFETCH boundary

		movzx		edx, byte [esi + edx]	;; texel
		movzx		edx, byte [ebx + edx]	;; through the colormap
		mov			[edi + 1*ebp], dl		;; store second pixel, one row down

		lea			edi, [edi + 2*ebp]		;; dest moves down two rows
;; IFETCH boundary
		sub			ecx, 2
		jnz			.drawpair


.exit:
;;
;; Leave MMX mode so that later FPU code sees a clean register stack.
;;
		emms

		pop			ebx
		pop			edi
		pop			esi
		pop			ebp
		ret

.fallback:
		call		R_DrawColumn_8_ASM
		jmp			.exit


;;----------------------------------------------------------------------
;;
;; R_Draw2sMultiPatchColumn_8_MMX : 8bpp column drawer that leaves the
;;                                  destination untouched wherever the
;;                                  source texel is TRANSPARENTPIXEL.
;;
;; Takes no stack arguments. Reads dc_texheight, dc_yl, dc_yh, dc_x,
;; dc_iscale, dc_texturemid, dc_hires, dc_source, dc_colormap,
;; centeryfrac, ylookup, columnofs and vid.width.
;;
;; Walks (dc_yh - dc_yl + 1) pixels starting at
;; ylookup[dc_yl] + columnofs[dc_x], advancing the destination by
;; vid.width bytes per pixel. For each pixel the texel
;;     source[(frac >> FRACBITS) & heightmask]
;; is fetched; unless it equals TRANSPARENTPIXEL, colormap[texel] is
;; stored. frac advances by dc_iscale per pixel.
;;
;; If dc_texheight is not a power of two the work is delegated to
;; R_Draw2sMultiPatchColumn_8_ASM.
;;
;;----------------------------------------------------------------------
;; eax = scratch / first texel of a pair
;; ebx = colormap
;; ecx = pixels left to process
;; edx = scratch / second texel of a pair
;; esi = source
;; edi = dest
;; ebp = heightmask during setup, vid.width inside the pair loop
;; mm0 = scratch
;; mm1 = heightmask in both dwords
;; mm2 = fracstep in both dwords (doubled just before the pair loop)
;; mm3 = two consecutive fracs (low dword first)
;; mm4 = scratch, setup only
;;----------------------------------------------------------------------


cglobal R_Draw2sMultiPatchColumn_8_MMX
R_Draw2sMultiPatchColumn_8_MMX:
		push		ebp						;; save the registers we are
		push		esi						;; about to use as variables
		push		edi
		push		ebx

;;
;; heightmask = dc_texheight - 1. If it shares a bit with dc_texheight,
;; the height is not a power of two and the masking below would be wrong.
;;
		mov			edx, [dc_texheight]
		dec			edx						;; edx = heightmask
		test		edx, [dc_texheight]
		jnz			near .fallback

;;
;; mm1 = heightmask:heightmask, ebp = heightmask (for the odd pixel)
;;
		movd		mm1, edx
		mov			ebp, edx
		punpckldq	mm1, mm1

;;
;; edi = ylookup[dc_yl] + columnofs[dc_x]
;;
		mov			eax, [dc_yl]
		mov			ebx, [dc_x]
		mov			edi, [ylookup + eax*4]
		add			edi, [columnofs + ebx*4]

;;
;; ecx = dc_yh + 1 - dc_yl; leave if that is zero or negative
;;
		mov			ecx, [dc_yh]
		inc			ecx
		sub			ecx, eax
		jle			near .exit

;;
;; Table pointers and mm2 = fracstep:fracstep
;;
		mov			esi, [dc_source]
		mov			ebx, [dc_colormap]
		movd		mm2, [dc_iscale]
		punpckldq	mm2, mm2

;;
;; eax = dc_texturemid
;;       + FixedMul((dc_yl << FRACBITS) - centeryfrac, dc_iscale)
;;
		shl			eax, FRACBITS			;; eax still holds dc_yl
		sub			eax, [centeryfrac]
		imul		dword [dc_iscale]		;; edx:eax = 64-bit product
		shrd		eax, edx, FRACBITS		;; take the middle 32 bits
		add			eax, [dc_texturemid]

;;
;; When bit 0 of dc_hires is set, start from frac = 0 instead.
;;
		test		byte [dc_hires], 0x01
		jz			.oddpixel
		xor			eax, eax

;;
;; The main loop handles two pixels per pass, so an odd count gets its
;; first pixel handled here.
;;
.oddpixel:
		test		ecx, 1
		jz			.setuppairs
		mov			edx, eax				;; edx = frac for this pixel
		sar			edx, FRACBITS
		and			edx, ebp				;; wrap into the texture
		add			eax, [dc_iscale]		;; frac for the next pixel
		movzx		edx, byte [esi + edx]	;; texel
		cmp			dl, TRANSPARENTPIXEL
		je			.oddskipped				;; transparent: nothing to store
		movzx		edx, byte [ebx + edx]	;; through the colormap
		mov			[edi], dl

.oddskipped:
		add			edi, [vid + viddef_s.width]
		dec			ecx
		jz			.exit					;; that was the only pixel

;;
;; mm3 = (frac, frac + fracstep); mm2 = 2 * fracstep in both dwords;
;; ebp switches from heightmask to vid.width.
;;
.setuppairs:
		movd		mm3, eax
		movq		mm4, mm3
		paddd		mm4, mm2
		punpckldq	mm3, mm4
		pslld		mm2, 1

		mov			ebp, [vid + viddef_s.width]

;;
;; Two pixels per pass. ecx is even here, so it reaches exactly zero.
;; The instruction order below is kept deliberately; the "IFETCH
;; boundary" markers note where instruction-fetch blocks are expected
;; to end.
;;
		align		16
.drawpair:
		movq		mm0, mm3				;; both fracs
		psrad		mm0, FRACBITS			;; -> integer rows
		pand		mm0, mm1				;; wrap into the texture
		paddd		mm3, mm2				;; advance both fracs by two steps

		movd		eax, mm0				;; row of the first pixel
;; IFETCH boundary
		movzx		eax, byte [esi + eax]	;; first texel
		punpckhdq	mm0, mm0				;; bring the high dword down
		movd		edx, mm0				;; row of the second pixel
		cmp			al, TRANSPARENTPIXEL
		je			.lowerpixel				;; first texel transparent: skip store
;; IFETCH boundary
		movzx		eax, byte [ebx + eax]	;; through the colormap
		mov			[edi], al				;; store first pixel

.lowerpixel:
		movzx		edx, byte [esi + edx]	;; second texel
		cmp			dl, TRANSPARENTPIXEL
		je			.advance				;; second texel transparent: skip store
;; IFETCH boundary
		movzx		edx, byte [ebx + edx]	;; through the colormap
		mov			[edi + 1*ebp], dl		;; store second pixel, one row down

.advance:
		lea			edi, [edi + 2*ebp]		;; dest moves down two rows
		sub			ecx, 2
		jnz			.drawpair


.exit:
;;
;; Leave MMX mode so that later FPU code sees a clean register stack.
;;
		emms

		pop			ebx
		pop			edi
		pop			esi
		pop			ebp
		ret

.fallback:
		call		R_Draw2sMultiPatchColumn_8_ASM
		jmp			.exit


;;----------------------------------------------------------------------
;;
;; R_DrawSpan_8_MMX : 8bpp span drawer, MMX version
;;
;; Takes no stack arguments. Reads ds_source, ds_colormap, ds_y, ds_x1,
;; ds_x2, ds_xfrac, ds_yfrac, ds_xstep, ds_ystep, nflatshiftup,
;; nflatxshift, nflatyshift, nflatmask, ylookup and columnofs.
;; Overwrites nflatmask64.
;;
;; Draws (ds_x2 - ds_x1 + 1) consecutive bytes starting at
;; ylookup[ds_y] + columnofs[ds_x1]. Nothing is drawn when that count
;; is zero or negative. Each pixel is
;;     colormap[source[((ypos >> nflatyshift) & nflatmask)
;;                     | (xpos >> nflatxshift)]]
;; where xpos/ypos start at ds_xfrac/ds_yfrac << nflatshiftup and
;; advance by ds_xstep/ds_ystep << nflatshiftup per pixel (32-bit
;; wrapping arithmetic).
;;
;; Pixels are written one byte at a time until the destination is
;; dword-aligned, then four at a time as a single dword store, then one
;; at a time for whatever is left.
;;
;;----------------------------------------------------------------------
;; eax = accumulator
;; ebx = colormap
;; ecx = count
;; edx = scratch (setup only)
;; esi = source
;; edi = dest
;; ebp = four packed pixels (aligned loop)
;; mm0 = accumulator
;; mm1 = pair of consecutive xpositions
;; mm2 = pair of consecutive ypositions
;; mm3 = xstep in both dwords (2 * xstep inside the aligned loop)
;; mm4 = ystep in both dwords (2 * ystep inside the aligned loop)
;; mm5 = nflatxshift
;; mm6 = nflatyshift (also scratch during setup)
;; mm7 = accumulator
;;----------------------------------------------------------------------

cglobal R_DrawSpan_8_MMX
R_DrawSpan_8_MMX:
		push		ebp						;; preserve caller's stack frame pointer
		push		esi						;; preserve register variables
		push		edi
		push		ebx

;;
;; esi = ds_source
;; ebx = ds_colormap
;;
		mov			esi, [ds_source]
		mov			ebx, [ds_colormap]

;;
;; edi = ylookup[ds_y] + columnofs[ds_x1]
;;
		mov			eax, [ds_y]
		mov			edi, [ylookup + eax*4]
		mov			edx, [ds_x1]
		add			edi, [columnofs + edx*4]

;;
;; ecx = ds_x2 - ds_x1 + 1
;;
		mov			ecx, [ds_x2]
		sub			ecx, edx
		add			ecx, 1

;;
;; Needed for fracs and steps
;;
		movd		mm7, [nflatshiftup]

;;
;; mm3 = xstep
;;
		movd		mm3, [ds_xstep]
		pslld		mm3, mm7
		punpckldq	mm3, mm3

;;
;; mm4 = ystep
;;
		movd		mm4, [ds_ystep]
		pslld		mm4, mm7
		punpckldq	mm4, mm4

;;
;; mm1 = pair of consecutive xpositions
;;
		movd		mm1, [ds_xfrac]
		pslld		mm1, mm7
		movq		mm6, mm1
		paddd		mm6, mm3
		punpckldq	mm1, mm6

;;
;; mm2 = pair of consecutive ypositions
;;
		movd		mm2, [ds_yfrac]
		pslld		mm2, mm7
		movq		mm6, mm2
		paddd		mm6, mm4
		punpckldq	mm2, mm6

;;
;; mm5 = nflatxshift
;; mm6 = nflatyshift
;;
		movd		mm5, [nflatxshift]
		movd		mm6, [nflatyshift]

;;
;; Mask is in memory due to lack of registers.
;;
		mov			eax, [nflatmask]
		mov			[nflatmask64], eax
		mov			[nflatmask64 + 4], eax


;;
;; Go until we reach a dword boundary.
;;
.unaligned:
		test		edi, 3
		jz			.alignedprep
.stragglers:
;;
;; Signed test: a zero OR negative count means there is nothing (left)
;; to draw. Testing only for zero would let a negative count run away.
;;
		test		ecx, ecx
		jle			.done

;;
;; eax = ((yposition >> nflatyshift) & nflatmask) | (xposition >> nflatxshift)
;;
		movq		mm0, mm1				;; mm0 = xposition
		movq		mm7, mm2				;; mm7 = yposition
		paddd		mm1, mm3				;; xposition += xstep (once!)
		paddd		mm2, mm4				;; yposition += ystep (once!)
		psrld		mm0, mm5				;; shift
		psrld		mm7, mm6				;; shift
		pand		mm7, [nflatmask64]		;; mask
		por			mm0, mm7				;; or x and y together

		movd		eax, mm0				;; eax = index of first pixel
		movzx		eax, byte [esi + eax]	;; al = source[eax]
		movzx		eax, byte [ebx + eax]	;; al = colormap[al]

		mov			[edi], al
		add			edi, 1

		sub			ecx, 1
		jmp			.unaligned


.alignedprep:
;;
;; We can double the steps now: each position register holds two
;; consecutive pixels, so one add of 2*step advances the pair.
;;
		pslld		mm3, 1
		pslld		mm4, 1


;;
;; Generate chunks of four pixels.
;;
.alignedloop:

;;
;; Make sure we have at least four pixels.
;;
		cmp			ecx, 4
		jl			.prestragglers

;;
;; First two pixels.
;;
		movq		mm0, mm1				;; mm0 = xposition
		movq		mm7, mm2				;; mm7 = yposition
		paddd		mm1, mm3				;; xposition += xstep
		paddd		mm2, mm4				;; yposition += ystep
		psrld		mm0, mm5				;; shift
		psrld		mm7, mm6				;; shift
		pand		mm7, [nflatmask64]		;; mask
		por			mm0, mm7				;; or x and y together

		movd		eax, mm0				;; eax = index of first pixel
		movzx		eax, byte [esi + eax]	;; al = source[eax]
		movzx		ebp, byte [ebx + eax]	;; ebp = colormap[al]

		punpckhdq	mm0, mm0				;; both dwords = high dword
		movd		eax, mm0				;; eax = index of second pixel
		movzx		eax, byte [esi + eax]	;; al = source[eax]
		movzx		eax, byte [ebx + eax]	;; al = colormap[al]
		shl			eax, 8					;; get pixel in right byte
		or			ebp, eax				;; put pixel in ebp

;;
;; Next two pixels.
;;
		movq		mm0, mm1				;; mm0 = xposition
		movq		mm7, mm2				;; mm7 = yposition
		paddd		mm1, mm3				;; xposition += xstep
		paddd		mm2, mm4				;; yposition += ystep
		psrld		mm0, mm5				;; shift
		psrld		mm7, mm6				;; shift
		pand		mm7, [nflatmask64]		;; mask
		por			mm0, mm7				;; or x and y together

		movd		eax, mm0				;; eax = index of third pixel
		movzx		eax, byte [esi + eax]	;; al = source[eax]
		movzx		eax, byte [ebx + eax]	;; al = colormap[al]
		shl			eax, 16					;; get pixel in right byte
		or			ebp, eax				;; put pixel in ebp

		punpckhdq	mm0, mm0				;; both dwords = high dword
		movd		eax, mm0				;; eax = index of fourth pixel
		movzx		eax, byte [esi + eax]	;; al = source[eax]
		movzx		eax, byte [ebx + eax]	;; al = colormap[al]
		shl			eax, 24					;; get pixel in right byte
		or			ebp, eax				;; put pixel in ebp

;;
;; Write pixels (first pixel in the lowest byte).
;;
		mov			[edi], ebp
		add			edi, 4

		sub			ecx, 4
		jmp			.alignedloop

.prestragglers:
;;
;; Back to one step at a time.
;;
;; The single steps are rebuilt from memory rather than by shifting the
;; doubled values right: doubling discards bit 31 of the step, and an
;; arithmetic shift would rebuild it from bit 30, which is wrong
;; whenever those two bits of the step differ. mm7 is free here; it is
;; reloaded before its next use.
;;
		movd		mm7, [nflatshiftup]
		movd		mm3, [ds_xstep]
		movd		mm4, [ds_ystep]
		pslld		mm3, mm7
		pslld		mm4, mm7
		punpckldq	mm3, mm3
		punpckldq	mm4, mm4
		jmp			.stragglers

.done:
;;
;; Clear MMX state, or else FPU operations will go badly awry.
;;
		emms

		pop			ebx
		pop			edi
		pop			esi
		pop			ebp
		ret