/*
===========================================================================

Doom 3 GPL Source Code
Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.

This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").

Doom 3 Source Code is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Doom 3 Source Code is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Doom 3 Source Code.  If not, see <http://www.gnu.org/licenses/>.

In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code.  If not, please request a copy in writing from id Software at the address below.

If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.

===========================================================================
*/

#include "sys/platform.h"

#include "idlib/math/Simd_MMX.h"

//===============================================================
//
//	MMX implementation of idSIMDProcessor
//
//===============================================================

#if defined(__GNUC__) && defined(__MMX__)
/*
============
idSIMD_MMX::GetName

  Returns the display name of this SIMD processor implementation.
  The returned string has static storage duration.
============
*/
const char * idSIMD_MMX::GetName( void ) const {
	static const char processorName[] = "MMX";
	return processorName;
}

#elif defined(_MSC_VER) && defined(_M_IX86)

#define EMMS_INSTRUCTION		__asm emms

/*
============
idSIMD_MMX::GetName

  Returns the display name of this SIMD processor implementation.
  The returned string has static storage duration.
============
*/
const char * idSIMD_MMX::GetName( void ) const {
	static const char processorName[] = "MMX";
	return processorName;
}

/*
================
MMX_Memcpy8B

  Copies count / 8 chunks of 8 bytes from src to dest using MMX loads and
  non-temporal (MOVNTQ) stores. The trailing count % 8 bytes are NOT copied;
  the caller handles them.

  Does nothing when count is smaller than one 8 byte chunk.

  No store fence is issued here; idSIMD_MMX::Memcpy issues SFENCE after
  calling this function.
================
*/
void MMX_Memcpy8B( void *dest, const void *src, const int count ) {
	// The copy loop below is bottom-tested (dec ecx / jnz): entering it with
	// ecx == 0 would wrap around and run 2^32 iterations, so bail out when
	// there is not at least one full chunk (this also rejects negative counts).
	if ( count < 8 ) {
		return;
	}

	_asm {
		mov		esi, src
		mov		edi, dest
		mov		ecx, count
		shr		ecx, 3			// 8 bytes per iteration

loop1:
		movq	mm1,  0[ESI]	// Read in source data
		movntq	0[EDI], mm1		// Non-temporal stores

		add		esi, 8
		add		edi, 8
		dec		ecx
		jnz		loop1

	}
	// leave MMX state so the FPU can be used again
	EMMS_INSTRUCTION
}

/*
================
MMX_Memcpy64B

  165MB/sec

  Copies count / 64 chunks of 64 bytes from src to dest. Each iteration
  prefetches ahead in the source, loads eight quadwords into mm0-mm7 and
  writes them out with non-temporal (MOVNTQ) stores. The trailing
  count % 64 bytes are NOT copied; the caller handles them.

  Does nothing when count is smaller than one 64 byte chunk.

  No store fence is issued here; idSIMD_MMX::Memcpy issues SFENCE after
  calling this function.
================
*/
void MMX_Memcpy64B( void *dest, const void *src, const int count ) {
	// The copy loop below is bottom-tested (dec ecx / jnz): entering it with
	// ecx == 0 would wrap around and run 2^32 iterations, so bail out when
	// there is not at least one full chunk (this also rejects negative counts).
	if ( count < 64 ) {
		return;
	}

	_asm {
		mov		esi, src
		mov		edi, dest
		mov		ecx, count
		shr		ecx, 6		// 64 bytes per iteration

loop1:
		prefetchnta 64[ESI]	// Prefetch next loop, non-temporal
		prefetchnta 96[ESI]

		movq mm1,  0[ESI]	// Read in source data
		movq mm2,  8[ESI]
		movq mm3, 16[ESI]
		movq mm4, 24[ESI]
		movq mm5, 32[ESI]
		movq mm6, 40[ESI]
		movq mm7, 48[ESI]
		movq mm0, 56[ESI]

		movntq  0[EDI], mm1	// Non-temporal stores
		movntq  8[EDI], mm2
		movntq 16[EDI], mm3
		movntq 24[EDI], mm4
		movntq 32[EDI], mm5
		movntq 40[EDI], mm6
		movntq 48[EDI], mm7
		movntq 56[EDI], mm0

		add		esi, 64
		add		edi, 64
		dec		ecx
		jnz		loop1
	}
	// leave MMX state so the FPU can be used again
	EMMS_INSTRUCTION
}

/*
================
MMX_Memcpy2kB

  240MB/sec

  Copies count / 2048 blocks of 2048 bytes from src to dest. Every block is
  moved in two passes of 32 x 64 bytes:
    1. source -> temporary 2kB stack buffer, using regular MOVQ stores
       (the original comments describe this as pulling the data into L1)
    2. temporary buffer -> dest, using non-temporal (MOVNTQ) stores
  The trailing count % 2048 bytes are NOT copied; the caller handles them.

  Does nothing when count is smaller than one 2048 byte block.

  No store fence is issued here; idSIMD_MMX::Memcpy issues SFENCE after
  calling this function.
================
*/
void MMX_Memcpy2kB( void *dest, const void *src, const int count ) {
	// The outer loop below is bottom-tested (dec ebx / jnz): entering it with
	// ebx == 0 would wrap around and run 2^32 iterations, so bail out when
	// there is not at least one full block (this also rejects negative counts).
	// Checked before the stack buffer is allocated.
	if ( count < 2048 ) {
		return;
	}

	byte *tbuf = (byte *)_alloca16(2048);
	__asm {
		push	ebx
		mov		esi, src
		mov		ebx, count
		shr		ebx, 11		// 2048 bytes at a time
		mov		edi, dest

loop2k:
		push	edi			// copy 2k into temporary buffer
		mov		edi, tbuf
		mov		ecx, 32

loopMemToL1:
		prefetchnta 64[ESI] // Prefetch next loop, non-temporal
		prefetchnta 96[ESI]

		movq mm1,  0[ESI]	// Read in source data
		movq mm2,  8[ESI]
		movq mm3, 16[ESI]
		movq mm4, 24[ESI]
		movq mm5, 32[ESI]
		movq mm6, 40[ESI]
		movq mm7, 48[ESI]
		movq mm0, 56[ESI]

		movq  0[EDI], mm1	// Store into L1
		movq  8[EDI], mm2
		movq 16[EDI], mm3
		movq 24[EDI], mm4
		movq 32[EDI], mm5
		movq 40[EDI], mm6
		movq 48[EDI], mm7
		movq 56[EDI], mm0
		add		esi, 64
		add		edi, 64
		dec		ecx
		jnz		loopMemToL1

		pop		edi			// Now copy from L1 to system memory
		push	esi
		mov		esi, tbuf
		mov		ecx, 32

loopL1ToMem:
		movq mm1, 0[ESI]	// Read in source data from L1
		movq mm2, 8[ESI]
		movq mm3, 16[ESI]
		movq mm4, 24[ESI]
		movq mm5, 32[ESI]
		movq mm6, 40[ESI]
		movq mm7, 48[ESI]
		movq mm0, 56[ESI]

		movntq 0[EDI], mm1	// Non-temporal stores
		movntq 8[EDI], mm2
		movntq 16[EDI], mm3
		movntq 24[EDI], mm4
		movntq 32[EDI], mm5
		movntq 40[EDI], mm6
		movntq 48[EDI], mm7
		movntq 56[EDI], mm0

		add		esi, 64
		add		edi, 64
		dec		ecx
		jnz		loopL1ToMem

		pop		esi			// Do next 2k block
		dec		ebx
		jnz		loop2k
		pop		ebx
	}
	// leave MMX state so the FPU can be used again
	EMMS_INSTRUCTION
}


/*
================
idSIMD_MMX::Memcpy

  optimized memory copy routine that handles all alignment cases and block sizes efficiently

  dest0  - destination buffer
  src0   - source buffer
  count0 - number of bytes to copy

  The MMX path is taken only when more than 16 bytes are copied and dest0 and
  src0 share the same offset within an 8 byte line (identical low three
  address bits), so both can be brought to an 8 byte boundary together.
  Everything else goes through the regular memcpy.

  The pointer-to-int casts rely on 32 bit pointers; this code is only built
  for _M_IX86.
================
*/
void VPCALL idSIMD_MMX::Memcpy( void *dest0, const void *src0, const int count0 ) {
	// if copying more than 16 bytes and we can copy 8 byte aligned
	if ( count0 > 16 && !( ( (int)dest0 ^ (int)src0 ) & 7 ) ) {
		byte *dest = (byte *)dest0;
		const byte *src = (const byte *)src0;

		// copy up to the first 8 byte aligned boundary:
		// the distance to the next boundary is ( 8 - offset ) and 0 when already aligned.
		// src has the same offset as dest (checked above), so it becomes aligned too.
		int count = ( 8 - ( ( (int)dest ) & 7 ) ) & 7;
		memcpy( dest, src, count );
		dest += count;
		src += count;
		count = count0 - count;		// at most 7 bytes were consumed, so count stays positive

		// if there are multiple blocks of 2kB
		if ( count & ~4095 ) {
			MMX_Memcpy2kB( dest, src, count );
			src += (count & ~2047);
			dest += (count & ~2047);
			count &= 2047;
		}

		// if there are blocks of 64 bytes
		if ( count & ~63 ) {
			MMX_Memcpy64B( dest, src, count );
			src += (count & ~63);
			dest += (count & ~63);
			count &= 63;
		}

		// if there are blocks of 8 bytes
		if ( count & ~7 ) {
			MMX_Memcpy8B( dest, src, count );
			src += (count & ~7);
			dest += (count & ~7);
			count &= 7;
		}

		// copy any remaining bytes
		memcpy( dest, src, count );
	} else {
		// use the regular one if we cannot copy 8 byte aligned
		memcpy( dest0, src0, count0 );
	}

	// the MMX_Memcpy* functions use MOVNTQ, issue a fence operation
	__asm {
		sfence
	}
}

/*
================
idSIMD_MMX::Memset

  Fills count0 bytes starting at dest0 with the low byte of val.

  Leading bytes are written one at a time until the destination is 8 byte
  aligned, the bulk is written with non-temporal (MOVNTQ) stores in runs of
  64 and then 8 bytes, and the last few bytes are written one at a time.
================
*/
void VPCALL idSIMD_MMX::Memset( void* dest0, const int val, const int count0 ) {
	// 8 byte fill pattern loaded into the MMX registers
	union {
		byte	bytes[8];
		word	words[4];
		dword	dwords[2];
	} pattern;

	byte *cursor = (byte *)dest0;
	int remaining = count0;

	// head: single bytes up to the first 8 byte boundary
	for ( ; remaining > 0 && ( ( (int)cursor ) & 7 ) != 0; remaining-- ) {
		*cursor++ = val;
	}
	if ( remaining == 0 ) {
		// everything fit into the head, no MMX state was touched
		return;
	}

	// replicate the fill byte across all 8 bytes of the pattern
	for ( int i = 0; i < 8; i++ ) {
		pattern.bytes[i] = val;
	}

	// bulk: 64 bytes per iteration
	if ( remaining >= 64 ) {
		__asm {
			mov edi, cursor
			mov ecx, remaining
			shr ecx, 6				// number of 64 byte runs
			movq mm1, pattern		// broadcast the pattern to all MMX registers
			movq mm2, mm1
			movq mm3, mm1
			movq mm4, mm1
			movq mm5, mm1
			movq mm6, mm1
			movq mm7, mm1
			movq mm0, mm1
fill64:
			movntq  0[EDI], mm1		// Non-temporal stores
			movntq  8[EDI], mm2
			movntq 16[EDI], mm3
			movntq 24[EDI], mm4
			movntq 32[EDI], mm5
			movntq 40[EDI], mm6
			movntq 48[EDI], mm7
			movntq 56[EDI], mm0

			add edi, 64
			dec ecx
			jnz fill64
		}
		const int written64 = remaining & ~63;
		cursor += written64;
		remaining -= written64;
	}

	// bulk: 8 bytes per iteration
	if ( remaining >= 8 ) {
		__asm {
			mov edi, cursor
			mov ecx, remaining
			shr ecx, 3				// number of 8 byte runs
			movq mm1, pattern		// reload the pattern
fill8:
			movntq  0[EDI], mm1		// Non-temporal stores

			add edi, 8
			dec ecx
			jnz fill8
		}
		const int written8 = remaining & ~7;
		cursor += written8;
		remaining -= written8;
	}

	// tail: whatever is left, one byte at a time
	for ( ; remaining > 0; remaining-- ) {
		*cursor++ = val;
	}

	// leave MMX state so the FPU can be used again
	EMMS_INSTRUCTION

	// the fill loops above use MOVNTQ, issue a fence operation
	__asm {
		sfence
	}
}

#endif /* _MSC_VER */