qzdoom/src/asm_ia32/misc.asm

;*
;* misc.nas
;* Miscellaneous assembly functions
;*
;*---------------------------------------------------------------------------
;* Copyright 1998-2006 Randy Heit
;* All rights reserved.
;*
;* Redistribution and use in source and binary forms, with or without
;* modification, are permitted provided that the following conditions
;* are met:
;*
;* 1. Redistributions of source code must retain the above copyright
;*    notice, this list of conditions and the following disclaimer.
;* 2. Redistributions in binary form must reproduce the above copyright
;*    notice, this list of conditions and the following disclaimer in the
;*    documentation and/or other materials provided with the distribution.
;* 3. The name of the author may not be used to endorse or promote products
;*    derived from this software without specific prior written permission.
;*
;* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
;* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
;* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
;* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
;* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
;* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;*---------------------------------------------------------------------------
;*

BITS 32

%ifndef M_TARGET_LINUX

%define DoBlending_MMX	_DoBlending_MMX
%define BestColor_MMX	_BestColor_MMX

%endif

%ifdef M_TARGET_WATCOM
  SEGMENT DATA PUBLIC ALIGN=16 CLASS=DATA USE32
  SEGMENT DATA
%else
  SECTION .data
%endif

Blending256:
	dd	0x01000100,0x00000100

%ifdef M_TARGET_WATCOM
  SEGMENT CODE PUBLIC ALIGN=16 CLASS=CODE USE32
  SEGMENT CODE
%else
  SECTION .text
%endif

;-----------------------------------------------------------
;
; DoBlending_MMX
;
; MMX version of DoBlending
;
; (DWORD *from, DWORD *to, count, tor, tog, tob, toa)
;-----------------------------------------------------------

GLOBAL	DoBlending_MMX

DoBlending_MMX:
		pxor		mm0,mm0			; mm0 = 0
		mov			eax,[esp+4*4]
		shl			eax,16
		mov			edx,[esp+4*5]
		shl			edx,8
		or			eax,[esp+4*6]
		or			eax,edx
		mov			ecx,[esp+4*3]	; ecx = count
		movd		mm1,eax			; mm1 = 00000000 00RRGGBB
		mov			eax,[esp+4*7]
		shl			eax,16
		mov			edx,[esp+4*7]
		shl			edx,8
		or			eax,[esp+4*7]
		or			eax,edx
		mov			edx,[esp+4*2]	; edx = dest
		movd		mm6,eax			; mm6 = 00000000 00AAAAAA
		punpcklbw	mm1,mm0			; mm1 = 000000RR 00GG00BB
		movq		mm7,[Blending256]
		punpcklbw	mm6,mm0			; mm6 = 000000AA 00AA00AA
		mov			eax,[esp+4*1]	; eax = source
		pmullw		mm1,mm6			; mm1 = 000000RR 00GG00BB (multiplied by alpha)
		psubusw		mm7,mm6			; mm7 = 000000aa 00aa00aa (one minus alpha)
		nop							; Does this actually pair on a Pentium?

; Do four colors per iteration: Count must be a multiple of four.

.loop	movq		mm2,[eax]	; mm2 = 00r2g2b2 00r1g1b1
		add			eax,8
		movq		mm3,mm2		; mm3 = 00r2g2b2 00r1g1b1
		punpcklbw	mm2,mm0		; mm2 = 000000r1 00g100b1
		punpckhbw	mm3,mm0		; mm3 = 000000r2 00g200b2
		pmullw		mm2,mm7		; mm2 = 0000r1rr g1ggb1bb
		add			edx,8
		pmullw		mm3,mm7		; mm3 = 0000r2rr g2ggb2bb
		sub			ecx,2
		paddusw		mm2,mm1
		psrlw		mm2,8
		paddusw		mm3,mm1
		psrlw		mm3,8
		packuswb	mm2,mm3		; mm2 = 00r2g2b2 00r1g1b1
		movq		[edx-8],mm2

		movq		mm2,[eax]	; mm2 = 00r2g2b2 00r1g1b1
		add			eax,8
		movq		mm3,mm2		; mm3 = 00r2g2b2 00r1g1b1
		punpcklbw	mm2,mm0		; mm2 = 000000r1 00g100b1
		punpckhbw	mm3,mm0		; mm3 = 000000r2 00g200b2
		pmullw		mm2,mm7		; mm2 = 0000r1rr g1ggb1bb
		add			edx,8
		pmullw		mm3,mm7		; mm3 = 0000r2rr g2ggb2bb
		sub			ecx,2
		paddusw		mm2,mm1
		psrlw		mm2,8
		paddusw		mm3,mm1
		psrlw		mm3,8
		packuswb	mm2,mm3		; mm2 = 00r2g2b2 00r1g1b1
		movq		[edx-8],mm2

		jnz			.loop

		emms
		ret

;-----------------------------------------------------------
;
; BestColor_MMX
;
; Picks the closest matching color from a palette
;
; Passed FFRRGGBB and palette array in same format
; FF is the index of the first palette entry to consider
;
;-----------------------------------------------------------

GLOBAL	BestColor_MMX
GLOBAL	@BestColor_MMX@8

BestColor_MMX:
		mov			ecx,[esp+4]
		mov			edx,[esp+8]
@BestColor_MMX@8:
		pxor		mm0,mm0
		movd		mm1,ecx			; mm1 = color searching for
		mov			eax,257*257+257*257+257*257	;eax = bestdist
		push		ebx
		punpcklbw	mm1,mm0
		mov			ebx,ecx			; ebx = best color
		shr			ecx,24			; ecx = count
		and			ebx,0xffffff
		push		esi
		push		ebp

.loop	movd		mm2,[edx+ecx*4]	; mm2 = color considering now
		inc			ecx
		punpcklbw	mm2,mm0
		movq		mm3,mm1
		psubsw		mm3,mm2
		pmullw		mm3,mm3			; mm3 = color distance squared

		movd		ebp,mm3			; add the three components
		psrlq		mm3,32			; into ebp to get the real
		mov			esi,ebp			; (squared) distance
		shr			esi,16
		and			ebp,0xffff
		add			ebp,esi
		movd		esi,mm3
		add			ebp,esi

		jz			.perf			; found a perfect match
		cmp			eax,ebp
		jb			.skip
		mov			eax,ebp
		lea			ebx,[ecx-1]
.skip	cmp			ecx,256
		jne			.loop
		mov			eax,ebx
		pop			ebp
		pop			esi
		pop			ebx
		emms
		ret

.perf	lea		eax,[ecx-1]
		pop		ebp
		pop		esi
		pop		ebx
		emms
		ret