#ifdef _MSC_VER
#include <intrin.h>
#endif
#include <mmintrin.h>
#include <emmintrin.h>

#include "doomtype.h"
#include "i_system.h"

#ifdef __GNUC__
// GCC-style replacement for the __cpuid intrinsic used below: executes CPUID
// with EAX = func and stores the resulting EAX, EBX, ECX and EDX values in
// output[0], output[1], output[2] and output[3] respectively. ECX is not
// loaded before the instruction, so leaves that take a sub-leaf index are
// not supported by this macro.
//
// The expansion intentionally has no trailing semicolon: the caller's own
// ';' completes the statement, which keeps "__cpuid(x, n);" a single
// statement even inside an unbraced if/else.
#define __cpuid(output, func) __asm__ __volatile__("cpuid" : "=a" ((output)[0]),\
	"=b" ((output)[1]), "=c" ((output)[2]), "=d" ((output)[3]) : "a" (func))
#endif

// Fills *cpu with identification data read through the CPUID instruction.
//
// The structure is zeroed first and DataL1LineSize is preset to 32. On
// targets that are not x86/x64, or on a 32-bit x86 CPU where bit 21 of
// EFLAGS cannot be toggled (meaning CPUID is not available), the function
// returns with only those defaults in place.
//
// Otherwise it records:
//   - the 12-byte vendor ID and whether it spells "AuthenticAMD",
//   - EBX/ECX/EDX of CPUID function 1 (brand index and feature flags),
//   - the cache line size reported for CLFLUSH, when that bit is set,
//   - stepping, type, model and family (with the extended fields merged),
//   - the processor brand string, when functions 0x80000002-0x80000004 exist,
//   - on AMD parts, the L1 data cache word from function 0x80000005 and the
//     AMD-specific stepping/model/family from function 0x80000001.
//
// cpu must point to writable storage for one CPUInfo.
void CheckCPUID(CPUInfo *cpu)
{
	int foo[4];				// EAX, EBX, ECX, EDX from the most recent __cpuid
	unsigned int maxext;	// highest extended CPUID function reported

	memset(cpu, 0, sizeof(*cpu));

	cpu->DataL1LineSize = 32;	// Assume a 32-byte cache line

#if !defined(_M_IX86) && !defined(__i386__) && !defined(_M_X64) && !defined(__amd64__)
	// Not an x86 target: there is no CPUID to query, keep the defaults.
	return;
#else

#if defined(_M_IX86) || defined(__i386__)
	// Old 486s do not have CPUID, so we must test for its presence.
	// This code is adapted from the samples in AMD's document
	// entitled "AMD-K6™ MMX Processor Multimedia Extensions."
#ifndef __GNUC__
	__asm
	{
		pushfd				// save EFLAGS
		pop eax				// store EFLAGS in EAX
		mov ecx,eax			// save in ECX for later testing
		xor eax,0x00200000	// toggle bit 21
		push eax			// put to stack
		popfd				// save changed EAX to EFLAGS
		pushfd				// push EFLAGS to TOS
		pop eax				// store EFLAGS in EAX
		cmp eax,ecx			// see if bit 21 has changed
		jne haveid			// if no change, then no CPUID
	}
	return;
haveid:
#else
	// Same test as the MSVC version above: oldfd receives the original
	// EFLAGS, newfd receives EFLAGS as read back after toggling bit 21.
	int oldfd, newfd;

	__asm__ __volatile__("\t"
		"pushf\n\t"
		"popl %0\n\t"
		"movl %0,%1\n\t"
		"xorl $0x200000,%0\n\t"
		"pushl %0\n\t"
		"popf\n\t"
		"pushf\n\t"
		"popl %0\n\t"
		: "=r" (newfd), "=r" (oldfd));
	if (oldfd == newfd)
	{ // Bit 21 did not stick, so there is no CPUID.
		return;
	}
#endif
#endif

	// Get vendor ID. The string is returned in EBX, EDX, ECX order.
	__cpuid(foo, 0);
	((int *)cpu->VendorID)[0] = foo[1];
	((int *)cpu->VendorID)[1] = foo[3];
	((int *)cpu->VendorID)[2] = foo[2];
	if (foo[1] == MAKE_ID('A','u','t','h') &&
		foo[3] == MAKE_ID('e','n','t','i') &&
		foo[2] == MAKE_ID('c','A','M','D'))
	{
		cpu->bIsAMD = true;
	}

	// Get features flags and other info
	// NOTE(review): the raw indices 17-19 assume CPUInfo keeps these three
	// 32-bit words at byte offsets 68, 72 and 76 -- confirm against the
	// struct declaration whenever its layout changes.
	__cpuid(foo, 1);
	((int *)cpu)[17] = foo[1];	// Store brand index and other stuff
	((int *)cpu)[18] = foo[2];	// Store extended feature flags
	((int *)cpu)[19] = foo[3];	// Store feature flags

	// If CLFLUSH instruction is supported, get the real cache line size.
	// EBX bits 15-8 hold the line size in 8-byte units, so shifting right by
	// (8 - 3) both extracts the field and multiplies it by 8.
	if (foo[3] & (1 << 19))
	{
		cpu->DataL1LineSize = (foo[1] & 0xFF00) >> (8 - 3);
	}

	cpu->Stepping = foo[0] & 0x0F;
	cpu->Type = (foo[0] & 0x3000) >> 12;	// valid on Intel only
	cpu->Model = (foo[0] & 0xF0) >> 4;
	cpu->Family = (foo[0] & 0xF00) >> 8;

	// The extended model (EAX bits 19-16) supplies the high nibble of the
	// model number when the base family is 6 or 15. Checking only for 15
	// truncates the model of every family-6 Intel processor. This must be
	// done before the extended family is added, while cpu->Family still
	// holds the base family.
	if (cpu->Family == 6 || cpu->Family == 15)
	{ // Add extended model.
		cpu->Model |= (foo[0] >> 12) & 0xF0;
	}
	if (cpu->Family == 15)
	{ // Add extended family (EAX bits 27-20).
		cpu->Family += (foo[0] >> 20) & 0xFF;
	}

	// Check for extended functions.
	__cpuid(foo, 0x80000000);
	maxext = (unsigned int)foo[0];

	if (maxext >= 0x80000004)
	{ // Get processor brand string, 16 bytes per function.
		__cpuid((int *)&cpu->CPUString[0],  0x80000002);
		__cpuid((int *)&cpu->CPUString[16], 0x80000003);
		__cpuid((int *)&cpu->CPUString[32], 0x80000004);
	}

	if (cpu->bIsAMD)
	{
		if (maxext >= 0x80000005)
		{ // Get data L1 cache info.
			// All of ECX is written starting at DataL1LineSize, so this
			// overwrites that field and whatever follows it within the
			// same four bytes. Presumably the remaining L1 data cache
			// fields are declared right after it -- confirm against the
			// CPUInfo declaration.
			__cpuid(foo, 0x80000005);
			*(int *)(&cpu->DataL1LineSize) = foo[2];
		}
		if (maxext >= 0x80000001)
		{ // Get AMD-specific feature flags.
			__cpuid(foo, 0x80000001);
			cpu->AMDStepping = foo[0] & 0x0F;
			cpu->AMDModel = (foo[0] & 0xF0) >> 4;
			cpu->AMDFamily = (foo[0] & 0xF00) >> 8;

			if (cpu->AMDFamily == 15)
			{ // Add extended model and family.
				cpu->AMDFamily += (foo[0] >> 20) & 0xFF;
				cpu->AMDModel |= (foo[0] >> 12) & 0xF0;
			}
		}
	}

#endif
}

#if 0
// Compiler output for this function is crap compared to the assembly
// version, which is why it isn't used.
//
// Intrinsics version of the MMX palette blender, kept for reference only
// (this whole block is compiled out). For each of the three low channels it
// computes
//     out = (in * (256 - a) + blend * a) >> 8
// where blend is r, g or b. The fourth 16-bit lane of every constant is
// zero, so the top byte of each output entry is written as 0.
//
//   from   - source entries, read two at a time as one 64-bit load
//   to     - destination entries, written two at a time
//   count  - number of entries; halved before the loop, so it must be even
//   r,g,b  - blend color
//   a      - blend strength used as the multiplier described above
void DoBlending_MMX2(const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a)
{
	__m64 blendcolor;
	__m64 blendalpha;
	__m64 zero;
	__m64 blending256;
	__m64 color1;
	__m64 color2;

	zero = _mm_setzero_si64();
	// Four 16-bit lanes: 0x0000 0x0100 0x0100 0x0100 (256 in the low three).
#ifndef __GNUC__
	blending256.m64_i64 = 0x10001000100;
#else
	blending256 = (__m64)0x10001000100ll;
#endif

	// Widen the packed bytes to 16-bit lanes by interleaving with zero.
	blendcolor = _mm_unpacklo_pi8(_m_from_int((r << 16) | (g << 8) | b), zero);	// 000000RR 00GG00BB
	blendalpha = _mm_unpacklo_pi8(_m_from_int((a << 16) | (a << 8) | a), zero);	// 000000AA 00AA00AA

	blendcolor = _mm_mullo_pi16(blendcolor, blendalpha);	// premultiply blend by alpha
	blendalpha = _mm_subs_pu16(blending256, blendalpha);	// one minus alpha

	// Do two colors per iteration: Count must be even
	for (count >>= 1; count > 0; --count)
	{
		color1 = *(__m64 *)from;						// 00r2g2b2 00r1g1b1
		from += 2;
		color2 = _mm_unpackhi_pi8(color1, zero);		// 000000r2 00g200b2
		color1 = _mm_unpacklo_pi8(color1, zero);		// 000000r1 00g100b1
		color1 = _mm_mullo_pi16(blendalpha, color1);	// 0000r1rr g1ggb1bb
		color2 = _mm_mullo_pi16(blendalpha, color2);	// 0000r2rr g2ggb2bb
		// Add the premultiplied blend color, then drop the low 8 fraction bits.
		color1 = _mm_adds_pu16(blendcolor, color1);
		color2 = _mm_adds_pu16(blendcolor, color2);
		color1 = _mm_srli_pi16(color1, 8);
		color2 = _mm_srli_pi16(color2, 8);
		*(__m64 *)to = _mm_packs_pu16(color1, color2);	// 00r2g2b2 00r1g1b1
		to += 2;
	}
	// Leave MMX state so that later x87 floating point code works.
	_mm_empty();
}
#endif

#ifdef X86_ASM
// MMX palette blender with C linkage, declared only when X86_ASM is defined.
// It is not defined in this file (presumably it lives in an assembly
// source -- confirm). DoBlending_SSE2 forwards to it, with the same
// arguments, when either buffer is not 16-byte aligned.
extern "C" void STACK_ARGS DoBlending_MMX(const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a);
#endif

// Blends a run of palette entries toward a single color using SSE2.
//
// For each of the three low channels of every entry this computes
//     out = (in * (256 - a) + blend * a) >> 8
// where blend is b, g or r for the lowest, second and third byte of the
// entry respectively. The fourth 16-bit lane of every constant is zero, so
// the fourth byte of each output entry is written as 0.
//
//   from   - source entries; read 16 bytes at a time, advancing by 4 entries
//            (this assumes sizeof(PalEntry) == 4 -- confirm)
//   to     - destination entries; written 16 bytes at a time
//   count  - number of entries; it is divided by 4 before the loop, so any
//            remainder of 1-3 entries is not processed
//   r,g,b  - blend color
//   a      - blend strength used as the multiplier described above
//
// When X86_ASM is defined and either pointer is not 16-byte aligned, the
// whole job is handed to DoBlending_MMX instead. Without X86_ASM, unaligned
// buffers are handled here with unaligned loads and stores.
void DoBlending_SSE2(const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a)
{
	__m128i blendcolor;
	__m128i blendalpha;
	__m128i zero;
	__m128i blending256;
	__m128i color1;
	__m128i color2;
	size_t unaligned;

	// Non-zero if either buffer is not on a 16-byte boundary.
	unaligned = ((size_t)from | (size_t)to) & 0xF;

#ifdef X86_ASM
	// For unaligned accesses, the assembly MMX version is slightly faster.
	// Note that using unaligned SSE loads and stores is still faster than
	// the compiler-generated MMX version.
	if (unaligned)
	{
		DoBlending_MMX(from, to, count, r, g, b, a);
		return;
	}
#endif

	// Build the per-pixel constants as four 16-bit lanes (b, g, r, 0),
	// repeated for both pixels held in each half of the register. The two
	// branches produce identical register contents; the 64-bit one just
	// uses fewer, wider pieces. MSVC identifies x64 targets with _M_X64.
#if defined(__amd64__) || defined(_M_X64)
	long long color;

	blending256 = _mm_set_epi64x(0x10001000100ll, 0x10001000100ll);

	color = ((long long)r << 32) | (g << 16) | b;
	blendcolor = _mm_set_epi64x(color, color);

	color = ((long long)a << 32) | (a << 16) | a;
	blendalpha = _mm_set_epi64x(color, color);
#else
	int color;

	blending256 = _mm_set_epi32(0x100, 0x1000100, 0x100, 0x1000100);

	color = (g << 16) | b;
	blendcolor = _mm_set_epi32(r, color, r, color);

	color = (a << 16) | a;
	blendalpha = _mm_set_epi32(a, color, a, color);
#endif

	blendcolor = _mm_mullo_epi16(blendcolor, blendalpha);	// premultiply blend by alpha
	blendalpha = _mm_subs_epu16(blending256, blendalpha);	// one minus alpha

	zero = _mm_setzero_si128();

#ifndef X86_ASM
	if (unaligned)
	{
		// Same loop as the aligned one below, with unaligned load/store.
		for (count >>= 2; count > 0; --count)
		{
			color1 = _mm_loadu_si128((const __m128i *)from);
			from += 4;
			color2 = _mm_unpackhi_epi8(color1, zero);
			color1 = _mm_unpacklo_epi8(color1, zero);
			color1 = _mm_mullo_epi16(blendalpha, color1);
			color2 = _mm_mullo_epi16(blendalpha, color2);
			color1 = _mm_adds_epu16(blendcolor, color1);
			color2 = _mm_adds_epu16(blendcolor, color2);
			color1 = _mm_srli_epi16(color1, 8);
			color2 = _mm_srli_epi16(color2, 8);
			_mm_storeu_si128((__m128i *)to, _mm_packus_epi16(color1, color2));
			to += 4;
		}
	}
	else
#endif
	{
		// Four entries per iteration.
		for (count >>= 2; count > 0; --count)
		{
			color1 = _mm_load_si128((const __m128i *)from);
			from += 4;
			// Widen bytes to 16-bit lanes: high two entries, low two entries.
			color2 = _mm_unpackhi_epi8(color1, zero);
			color1 = _mm_unpacklo_epi8(color1, zero);
			// Scale the source by (256 - a) ...
			color1 = _mm_mullo_epi16(blendalpha, color1);
			color2 = _mm_mullo_epi16(blendalpha, color2);
			// ... add the premultiplied blend color ...
			color1 = _mm_adds_epu16(blendcolor, color1);
			color2 = _mm_adds_epu16(blendcolor, color2);
			// ... and drop the low 8 fraction bits.
			color1 = _mm_srli_epi16(color1, 8);
			color2 = _mm_srli_epi16(color2, 8);
			// Narrow back to bytes and write all four entries at once.
			_mm_store_si128((__m128i *)to, _mm_packus_epi16(color1, color2));
			to += 4;
		}
	}
}