gzdoom-gles/src/misc.nas

;*
;* misc.nas
;* Miscellaneous assembly functions
;*
;*---------------------------------------------------------------------------
;* Copyright 1998-2006 Randy Heit
;* All rights reserved.
;*
;* Redistribution and use in source and binary forms, with or without
;* modification, are permitted provided that the following conditions
;* are met:
;*
;* 1. Redistributions of source code must retain the above copyright
;*    notice, this list of conditions and the following disclaimer.
;* 2. Redistributions in binary form must reproduce the above copyright
;*    notice, this list of conditions and the following disclaimer in the
;*    documentation and/or other materials provided with the distribution.
;* 3. The name of the author may not be used to endorse or promote products
;*    derived from this software without specific prior written permission.
;*
;* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
;* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
;* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
;* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
;* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
;* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;*---------------------------------------------------------------------------
;*

BITS 32

%ifndef M_TARGET_LINUX

%define CheckMMX	_CheckMMX
%define EndMMX		_EndMMX
%define DoBlending_MMX	_DoBlending_MMX
%define BestColor_MMX	_BestColor_MMX
%define DoubleHoriz_MMX _DoubleHoriz_MMX
%define DoubleHorizVert_MMX _DoubleHorizVert_MMX
%define DoubleVert_ASM	_DoubleVert_ASM

%endif

%ifdef M_TARGET_WATCOM
  SEGMENT DATA PUBLIC ALIGN=16 CLASS=DATA USE32
  SEGMENT DATA
%else
  SECTION .data
%endif

Blending256:
	dd	0x01000100,0x00000100

%ifdef M_TARGET_WATCOM
  SEGMENT CODE PUBLIC ALIGN=16 CLASS=CODE USE32
  SEGMENT CODE
%else
  SECTION .text
%endif

;-----------------------------------------------------------
;
; CheckMMX
;
; Checks for the presence of MMX instructions on the
; current processor. This code is adapted from the samples
; in AMD's document entitled "AMD-K6<4B> MMX Processor
; Multimedia Extensions." Also fills in the vendor
; information string.
;
;-----------------------------------------------------------

GLOBAL	CheckMMX

; void CheckMMX (struct CPUInfo *)

CheckMMX:
	xor	eax,eax
	mov	ecx,92/4
	push	ebx
	push	edi
	mov	edi,[esp+12]
	rep	stosd
	sub	edi,92

	mov	[edi+88],byte 32; Assume a 32-byte cache line

	pushfd			; save EFLAGS
	pop	eax		; store EFLAGS in EAX
	mov	ebx,eax		; save in EBX for later testing
	xor	eax,0x00200000	; toggle bit 21
	push	eax		; put to stack
	popfd			; save changed EAX to EFLAGS
	pushfd			; push EFLAGS to TOS
	pop	eax		; store EFLAGS in EAX
	cmp	eax,ebx		; see if bit 21 has changed
	jz	near .noid	; if no change, then no CPUID

; Get vendor ID
	xor	eax,eax
	CPUID
	mov	[edi],ebx
	mov	[edi+4],edx
	mov	[edi+8],ecx

	cmp	ebx,0x68747541	; 'htuA'
	jne	.notamd
	cmp	edx,0x69746e65	; 'itne'
	jne	.notamd
	cmp	ecx,0x444d4163	; 'DMAc'
	jne	.notamd
	inc	byte [edi+87]
.notamd

; Get features flags and other info
	mov	eax,1
	CPUID
	mov	[edi+68],ebx	; Store brand index and other stuff
	mov	[edi+72],ecx	; Store extended feature flags
	mov	[edi+76],edx	; Store feature flags

	test	edx,(1<<19)	; If CLFLUSH instruction is supported,
	jz	.noclf
	shl	bh,3		; get the real cache line size.
	mov	[edi+88],bh

.noclf	mov	bl,al		; Extract stepping
	and	bl,0x0F
	mov	[edi+64],bl

	mov	bl,ah		; Extract processor type
	shr	bl,4		; (Valid for Intel only)
	and	bl,0x03
	mov	[edi+67],bl

	shr	al,4		; Extract model and family
	and	ah,0x0F		; model in al and family in ah
	cmp	ah,15
	jne	.noex

	mov	ebx,eax		; Add extended model and family
	shr	ebx,12
	and	bl,0xF0
	add	ah,bh
	or	al,bl

.noex	mov	[edi+65],al
	mov	[edi+66],ah

; Check for processor brand string
	mov	eax,0x80000000
	CPUID
	cmp	eax,0x80000001
	je	.feat2
	jb	near .noid
	cmp	eax,0x80000004
	jb	.feat2
	cmp	eax,0x80000005
	jb	.brand

; Get data L1 cache info
	mov	eax,0x80000005
	CPUID
	mov	[edi+88],ecx

; Get processor brand string
.brand	mov	eax,0x80000002
	CPUID
	mov	[edi+16],eax
	mov	[edi+20],ebx
	mov	[edi+24],ecx
	mov	[edi+28],edx
	mov	eax,0x80000003
	CPUID
	mov	[edi+32],eax
	mov	[edi+36],ebx
	mov	[edi+40],ecx
	mov	[edi+44],edx
	mov	eax,0x80000004
	CPUID
	mov	[edi+48],eax
	mov	[edi+52],ebx
	mov	[edi+56],ecx
	mov	[edi+60],edx

; Get AMD-specific feature flags
.feat2	cmp	byte [edi+87],0
	jz	.noid
	mov	eax,0x80000001
	CPUID
	mov	[edi+80],edx

	mov	bl,al		; Extract stepping
	and	bl,0x0F
	mov	[edi+84],bl

	shr	al,4		; Extract model and family
	and	ah,0x0F		; model in al and family in ah
	cmp	ah,15
	jne	.noex2

	mov	ebx,eax		; Add extended model and family
	shr	ebx,12
	and	bl,0xF0
	add	ah,bh
	or	al,bl

.noex2	mov	[edi+85],al
	mov	[edi+86],ah

.noid	pop	edi
	pop	ebx
	ret

;-----------------------------------------------------------
;
; EndMMX
;
; Signal the end of MMX code for compilers that can't
; do inline assembly. Currently unused.
;
;-----------------------------------------------------------

GLOBAL	EndMMX

EndMMX:
	emms
	ret

;-----------------------------------------------------------
;
; DoBlending_MMX
;
; MMX version of DoBlending
;
; (DWORD *from, DWORD *to, count, tor, tog, tob, toa)
;-----------------------------------------------------------

GLOBAL	DoBlending_MMX

DoBlending_MMX:
	pxor		mm0,mm0		; mm0 = 0
	mov		eax,[esp+4*4]
	shl		eax,16
	mov		edx,[esp+4*5]
	shl		edx,8
	or		eax,[esp+4*6]
	or		eax,edx
	mov		ecx,[esp+4*3]	; ecx = count
	movd		mm1,eax		; mm1 = 00000000 00RRGGBB
	mov		eax,[esp+4*7]
	shl		eax,16
	mov		edx,[esp+4*7]
	shl		edx,8
	or		eax,[esp+4*7]
	or		eax,edx
	mov		edx,[esp+4*2]	; edx = dest
	movd		mm6,eax		; mm6 = 00000000 00AAAAAA
	punpcklbw	mm1,mm0		; mm1 = 000000RR 00GG00BB
	movq		mm7,[Blending256]
	punpcklbw	mm6,mm0		; mm6 = 000000AA 00AA00AA
	mov		eax,[esp+4*1]	; eax = source
	pmullw		mm1,mm6		; mm1 = 000000RR 00GG00BB (multiplied by alpha)
	psubusw		mm7,mm6		; mm7 = 000000aa 00aa00aa (one minus alpha)
	nop	; Does this actually pair on a Pentium?

; Do two colors per iteration: Count must be even.

.loop	movq		mm2,[eax]	; mm2 = 00r2g2b2 00r1g1b1
	add		eax,8
	movq		mm3,mm2		; mm3 = 00r2g2b2 00r1g1b1
	punpcklbw	mm2,mm0		; mm2 = 000000r1 00g100b1
	movq		mm4,mm1
	punpckhbw	mm3,mm0		; mm3 = 000000r2 00g200b2
	pmullw		mm2,mm7		; mm2 = 0000r1rr g1ggb1bb
	add		edx,8
	pmullw		mm3,mm7		; mm3 = 0000r2rr g2ggb2bb
	sub		ecx,2
	paddusw		mm2,mm1
	paddusw		mm3,mm1
	psrlw		mm2,8
	psrlw		mm3,8
	packuswb	mm2,mm3		; mm2 = 00r2g2b2 00r1g1b1
	movq		[edx-8],mm2
	jnz		.loop

	emms
	ret

;-----------------------------------------------------------
;
; BestColor_MMX
;
; Picks the closest matching color from a palette
;
; Passed FFRRGGBB and palette array in same format
; FF is the index of the first palette entry to consider
;
;-----------------------------------------------------------

GLOBAL	BestColor_MMX
GLOBAL	@BestColor_MMX@8

BestColor_MMX:
	mov		ecx,[esp+4]
	mov		edx,[esp+8]
@BestColor_MMX@8:
	pxor		mm0,mm0
	movd		mm1,ecx		; mm1 = color searching for
	mov		eax,257*257+257*257+257*257	;eax = bestdist
	push		ebx
	punpcklbw	mm1,mm0
	mov		ebx,ecx		; ebx = best color
	shr		ecx,24		; ecx = count
	and		ebx,0xffffff
	push		esi
	push		ebp

.loop	movd		mm2,[edx+ecx*4]	; mm2 = color considering now
	inc		ecx
	punpcklbw	mm2,mm0
	movq		mm3,mm1
	psubsw		mm3,mm2
	pmullw		mm3,mm3		; mm3 = color distance squared

	movd		ebp,mm3		; add the three components
	psrlq		mm3,32		; into ebp to get the real
	mov		esi,ebp		; (squared) distance
	shr		esi,16
	and		ebp,0xffff
	add		ebp,esi
	movd		esi,mm3
	add		ebp,esi

	jz		.perf		; found a perfect match
	cmp		eax,ebp
	jb		.skip
	mov		eax,ebp
	lea		ebx,[ecx-1]
.skip	cmp		ecx,256
	jne		.loop
	mov		eax,ebx
	pop		ebp
	pop		esi
	pop		ebx
	emms
	ret

.perf	lea		eax,[ecx-1]
	pop		ebp
	pop		esi
	pop		ebx
	emms
	ret

;-----------------------------------------------------------
;
; DoubleHoriz_MMX
;
; Stretches an image horizontally using MMX instructions.
; The source image is assumed to occupy the right half
; of the destination image.
;
;	height of source
;	width of source
;	dest pointer (at end of row)
;	pitch
;
;-----------------------------------------------------------

GLOBAL	DoubleHoriz_MMX

DoubleHoriz_MMX:
	mov	edx,[esp+8]	; edx = width
	push	edi

	neg	edx		; make edx negative so we can count up
	mov	edi,[esp+16]	; edi = dest pointer

	sar	edx,2		; and make edx count groups of 4 pixels
	push	ebp

	mov	ebp,edx		; ebp = # of columns remaining in this row
	push	ebx

	mov	ebx,[esp+28]	; ebx = pitch
	mov	ecx,[esp+16]	; ecx = # of rows remaining

.loop	movq		mm0,[edi+ebp*4]

.loop2	movq		mm1,mm0
	punpcklbw	mm0,mm0			; double left 4 pixels

	movq		mm2,[edi+ebp*4+8]
	punpckhbw	mm1,mm1			; double right 4 pixels

	movq		[edi+ebp*8],mm0		; write left pixels
	movq		mm0,mm2

	movq		[edi+ebp*8+8],mm1	; write right pixels

	add		ebp,2			; increment counter
	jnz		.loop2			; repeat until done with this row


	add	edi,ebx		; move edi to next row
	dec	ecx		; decrease row counter

	mov	ebp,edx		; prep ebp for next row
	jnz	.loop		; repeat until every row is done

	emms
	pop	ebx
	pop	ebp
	pop	edi
	ret

;-----------------------------------------------------------
;
; DoubleHorizVert_MMX
;
; Stretches an image horizontally and vertically using
; MMX instructions. The source image is assumed to occupy
; the right half of the destination image and to leave
; every other line unused for expansion.
;
;	height of source
;	width of source
;	dest pointer (at end of row)
;	pitch
;
;-----------------------------------------------------------

GLOBAL	DoubleHorizVert_MMX

DoubleHorizVert_MMX:
	mov	edx,[esp+8]	; edx = width
	push	edi

	neg	edx		; make edx negative so we can count up
	mov	edi,[esp+16]	; edi = dest pointer

	sar	edx,2		; and make edx count groups of 4 pixels
	push	ebp

	mov	ebp,edx		; ebp = # of columns remaining in this row
	push	ebx

	mov	ebx,[esp+28]	; ebx = pitch
	mov	ecx,[esp+16]	; ecx = # of rows remaining

	push	esi
	lea	esi,[edi+ebx]

.loop	movq		mm0,[edi+ebp*4]		; get 8 pixels

	movq		mm1,mm0
	punpcklbw	mm0,mm0			; double left 4

	punpckhbw	mm1,mm1			; double right 4
	add		ebp,2			; increment counter

	movq		[edi+ebp*8-16],mm0	; write them back out

	movq		[edi+ebp*8-8],mm1

	movq		[esi+ebp*8-16],mm0

	movq		[esi+ebp*8-8],mm1

	jnz		.loop			; repeat until done with this row

	lea	edi,[edi+ebx*2]	; move edi and esi to next row
	lea	esi,[esi+ebx*2]

	dec	ecx		; decrease row counter
	mov	ebp,edx		; prep ebp for next row

	jnz	.loop		; repeat until every row is done

	emms
	pop	esi
	pop	ebx
	pop	ebp
	pop	edi
	ret

;-----------------------------------------------------------
;
; DoubleVert_ASM
;
; Stretches an image vertically using regular x86
; instructions. The source image should be interleaved.
;
;	height of source
;	width of source
;	source/dest pointer
;	pitch
;
;-----------------------------------------------------------

GLOBAL	DoubleVert_ASM

DoubleVert_ASM:
	mov	edx,[esp+16]	; edx = pitch
	mov	eax,[esp+4]	; eax = # of rows left

	push	esi
	mov	esi,[esp+16]

	push	edi
	lea	edi,[esi+edx]

	shl	edx,1		; edx = pitch*2
	mov	ecx,[esp+16]

	sub	edx,ecx		; edx = dist from end of one line to start of next
	shr	ecx,2

.loop	rep movsd
	
	mov	ecx,[esp+16]
	add	esi,edx

	add	edi,edx
	shr	ecx,2

	dec	eax
	jnz	.loop

	pop	edi
	pop	esi
	ret