diff --git a/src/x86.cpp b/src/x86.cpp
new file mode 100644
index 0000000000..1bd6782d8e
--- /dev/null
+++ b/src/x86.cpp
@@ -0,0 +1,276 @@
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+#include <mmintrin.h>
+#include <emmintrin.h>
+
+#include "doomtype.h"
+#include "i_system.h"
+
+#ifdef __GNUC__	// GCC path: define __cpuid ourselves (MSVC builds include <intrin.h> above)
+#define __cpuid(output, func) __asm__ __volatile__("cpuid" : "=a" ((output)[0]),\
+	"=b" ((output)[1]), "=c" ((output)[2]), "=d" ((output)[3]) : "a" (func));	// EAX..EDX -> output[0..3]; only EAX is set on input
+#endif	// __GNUC__
+
+// Fills *cpu with what CPUID reports: vendor, stepping/model/family, the
+// function-1 register words, brand string and L1 data cache line size. Only the
+// 32-byte line size default is set on non-x86 builds or when CPUID is absent.
+void CheckCPUID(CPUInfo *cpu)
+{
+	int foo[4];				// CPUID results in EAX, EBX, ECX, EDX order
+	unsigned int maxext;	// highest extended function number supported
+
+	memset(cpu, 0, sizeof(*cpu));
+	cpu->DataL1LineSize = 32;	// Assume a 32-byte cache line
+
+#if !defined(_M_IX86) && !defined(__i386__) && !defined(_M_X64) && !defined(__amd64__)
+	return;
+#else
+#if defined(_M_IX86) || defined(__i386__)
+	// Old 486s do not have CPUID, so we must test for its presence.
+	// This code is adapted from the samples in AMD's document
+	// entitled "AMD-K6™ MMX Processor Multimedia Extensions."
+#ifndef __GNUC__
+	__asm
+	{
+		pushfd				// save EFLAGS
+		pop eax				// store EFLAGS in EAX
+		mov ecx,eax			// save in ECX for later testing
+		xor eax,0x00200000	// toggle bit 21
+		push eax			// put to stack
+		popfd				// save changed EAX to EFLAGS
+		pushfd				// push EFLAGS to TOS
+		pop eax				// store EFLAGS in EAX
+		cmp eax,ecx			// see if bit 21 has changed
+		jne haveid			// if no change, then no CPUID
+	}
+	return;
+haveid:
+#else
+	int oldfd, newfd;
+	__asm__ __volatile__("\t"
+		"pushf\n\t"
+		"popl %0\n\t"
+		"movl %0,%1\n\t"
+		"xorl $0x200000,%0\n\t"
+		"pushl %0\n\t"
+		"popf\n\t"
+		"pushf\n\t"
+		"popl %0\n\t"
+		: "=r" (newfd), "=r" (oldfd));
+	if (oldfd == newfd)
+	{ // Bit 21 (ID) could not be toggled, so there is no CPUID.
+		return;
+	}
+#endif
+#endif
+
+	// Get vendor ID (the 12 characters come back in EBX, EDX, ECX order)
+	__cpuid(foo, 0);
+	((int *)cpu->VendorID)[0] = foo[1];
+	((int *)cpu->VendorID)[1] = foo[3];
+	((int *)cpu->VendorID)[2] = foo[2];
+	if (foo[1] == MAKE_ID('A','u','t','h') &&
+		foo[3] == MAKE_ID('e','n','t','i') &&
+		foo[2] == MAKE_ID('c','A','M','D'))
+	{
+		cpu->bIsAMD = true;
+	}
+
+	// Get feature flags and other info (stored by word index; relies on the CPUInfo layout)
+	__cpuid(foo, 1);
+	((int *)cpu)[17] = foo[1];	// Store brand index and other stuff
+	((int *)cpu)[18] = foo[2];	// Store extended feature flags
+	((int *)cpu)[19] = foo[3];	// Store feature flags
+
+	// If CLFLUSH instruction is supported, get the real cache line size.
+	if (foo[3] & (1 << 19))
+	{
+		cpu->DataL1LineSize = (foo[1] & 0xFF00) >> (8 - 3);	// EBX[15:8] counts 8-byte units
+	}
+
+	cpu->Stepping = foo[0] & 0x0F;
+	cpu->Type = (foo[0] & 0x3000) >> 12;	// valid on Intel only
+	cpu->Model = (foo[0] & 0xF0) >> 4;
+	cpu->Family = (foo[0] & 0xF00) >> 8;
+
+	if (cpu->Family == 15 || (cpu->Family == 6 && !cpu->bIsAMD))
+	{ // Extended model: base family 15, plus base family 6 on non-AMD parts.
+		cpu->Model |= (foo[0] >> 12) & 0xF0;
+	}
+	if (cpu->Family == 15)
+	{ // Add extended family (done last so the test above sees the base family).
+		cpu->Family += (foo[0] >> 20) & 0xFF;
+	}
+
+	// Check for extended functions.
+	__cpuid(foo, 0x80000000);
+	maxext = (unsigned int)foo[0];
+	if (maxext >= 0x80000004)
+	{ // Get processor brand string (16 bytes per call, 48 in total).
+		__cpuid((int *)&cpu->CPUString[0],  0x80000002);
+		__cpuid((int *)&cpu->CPUString[16], 0x80000003);
+		__cpuid((int *)&cpu->CPUString[32], 0x80000004);
+	}
+
+	if (cpu->bIsAMD)
+	{
+		if (maxext >= 0x80000005)
+		{ // Get data L1 cache info.
+			__cpuid(foo, 0x80000005);
+			*(int *)(&cpu->DataL1LineSize) = foo[2];	// whole ECX in one store; relies on the CPUInfo layout
+		}
+		if (maxext >= 0x80000001)
+		{ // Get AMD-specific feature flags.
+			__cpuid(foo, 0x80000001);
+			cpu->AMDStepping = foo[0] & 0x0F;
+			cpu->AMDModel = (foo[0] & 0xF0) >> 4;
+			cpu->AMDFamily = (foo[0] & 0xF00) >> 8;
+			if (cpu->AMDFamily == 15)
+			{ // Add extended model and family.
+				cpu->AMDFamily += (foo[0] >> 20) & 0xFF;
+				cpu->AMDModel |= (foo[0] >> 12) & 0xF0;
+			}
+		}
+	}
+#endif
+}
+
+#if 0
+// MMX-intrinsics version of the palette blend. The compiler's output for it is
+// far worse than the assembly version, which is why it isn't used.
+void DoBlending_MMX2(const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a)
+{
+	__m64 blendcolor;
+	__m64 blendalpha;
+	__m64 zero;
+	__m64 blending256;
+	__m64 color1;
+	__m64 color2;
+
+	zero = _mm_setzero_si64();
+#ifndef __GNUC__
+	blending256.m64_i64 = 0x10001000100;	// 256 in each of the three low 16-bit lanes
+#else
+	blending256 = (__m64)0x10001000100ll;	// same constant, GCC spelling
+#endif
+
+	blendcolor = _mm_unpacklo_pi8(_m_from_int((r << 16) | (g << 8) | b), zero);	// 000000RR 00GG00BB
+	blendalpha = _mm_unpacklo_pi8(_m_from_int((a << 16) | (a << 8) | a), zero);	// 000000AA 00AA00AA
+
+	blendcolor = _mm_mullo_pi16(blendcolor, blendalpha);	// premultiply blend by alpha
+	blendalpha = _mm_subs_pu16(blending256, blendalpha);	// one minus alpha
+
+	// Do two colors per iteration: Count must be even
+	for (count >>= 1; count > 0; --count)
+	{
+		color1 = *(__m64 *)from;						// 00r2g2b2 00r1g1b1
+		from += 2;
+		color2 = _mm_unpackhi_pi8(color1, zero);		// 000000r2 00g200b2
+		color1 = _mm_unpacklo_pi8(color1, zero);		// 000000r1 00g100b1
+		color1 = _mm_mullo_pi16(blendalpha, color1);	// 0000r1rr g1ggb1bb
+		color2 = _mm_mullo_pi16(blendalpha, color2);	// 0000r2rr g2ggb2bb
+		color1 = _mm_adds_pu16(blendcolor, color1);		// add the premultiplied blend color
+		color2 = _mm_adds_pu16(blendcolor, color2);
+		color1 = _mm_srli_pi16(color1, 8);				// divide by 256
+		color2 = _mm_srli_pi16(color2, 8);
+		*(__m64 *)to = _mm_packs_pu16(color1, color2);	// 00r2g2b2 00r1g1b1
+		to += 2;
+	}
+	_mm_empty();	// EMMS: release the MMX registers for x87 use
+}
+#endif	// disabled DoBlending_MMX2
+
+#ifdef X86_ASM	// assembly MMX blend routine; no definition appears in this file
+extern "C" void STACK_ARGS DoBlending_MMX(const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a);
+#endif	// X86_ASM
+
+// Blends count palette entries from from[] toward the color (r, g, b) using the
+// weight a and stores them in to[]; per channel the result is
+// (src * (256 - a) + blend * a) >> 8. Entries (assumed 4 bytes each) are handled
+// four at a time, so the last (count & 3) entries are not written, and the top
+// byte of each written entry is 0 because its lane carries no blend or weight.
+void DoBlending_SSE2(const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a)
+{
+	__m128i blendcolor, blendalpha, blending256;
+	__m128i color1, color2, zero;
+	size_t unaligned;
+
+	// Nonzero if either buffer is not 16-byte aligned.
+	unaligned = ((size_t)from | (size_t)to) & 0xF;
+
+#ifdef X86_ASM
+	// For unaligned accesses, the assembly MMX version is slightly faster.
+	// Note that using unaligned SSE loads and stores is still faster than
+	// the compiler-generated MMX version.
+	if (unaligned)
+	{
+		DoBlending_MMX(from, to, count, r, g, b, a);
+		return;
+	}
+#endif
+
+	// Build the 16-bit lanes 0000 00RR 00GG 00BB (likewise for alpha and 256) in
+	// both 64-bit halves. _M_X64 is the macro MSVC defines for x64 targets.
+#if defined(__amd64__) || defined(_M_X64)
+	long long color;
+
+	blending256 = _mm_set_epi64x(0x10001000100ll, 0x10001000100ll);
+	color = ((long long)r << 32) | (g << 16) | b;
+	blendcolor = _mm_set_epi64x(color, color);
+	color = ((long long)a << 32) | (a << 16) | a;
+	blendalpha = _mm_set_epi64x(color, color);
+#else
+	int color;
+
+	blending256 = _mm_set_epi32(0x100, 0x1000100, 0x100, 0x1000100);
+	color = (g << 16) | b;
+	blendcolor = _mm_set_epi32(r, color, r, color);
+	color = (a << 16) | a;
+	blendalpha = _mm_set_epi32(a, color, a, color);
+#endif
+
+	blendcolor = _mm_mullo_epi16(blendcolor, blendalpha);	// premultiply blend by alpha
+	blendalpha = _mm_subs_epu16(blending256, blendalpha);	// one minus alpha
+
+	zero = _mm_setzero_si128();
+
+#ifndef X86_ASM
+	if (unaligned)
+	{
+		for (count >>= 2; count > 0; --count)
+		{
+			color1 = _mm_loadu_si128((const __m128i *)from);	// four entries, any alignment
+			from += 4;
+			color2 = _mm_unpackhi_epi8(color1, zero);		// upper two entries as 16-bit lanes
+			color1 = _mm_unpacklo_epi8(color1, zero);		// lower two entries as 16-bit lanes
+			color1 = _mm_mullo_epi16(blendalpha, color1);	// scale source by one minus alpha
+			color2 = _mm_mullo_epi16(blendalpha, color2);
+			color1 = _mm_adds_epu16(blendcolor, color1);	// add the premultiplied blend color
+			color2 = _mm_adds_epu16(blendcolor, color2);
+			color1 = _mm_srli_epi16(color1, 8);				// divide by 256
+			color2 = _mm_srli_epi16(color2, 8);
+			_mm_storeu_si128((__m128i *)to, _mm_packus_epi16(color1, color2));
+			to += 4;
+		}
+	}
+	else
+#endif
+	{
+		for (count >>= 2; count > 0; --count)
+		{
+			color1 = _mm_load_si128((const __m128i *)from);	// four entries, 16-byte aligned
+			from += 4;
+			color2 = _mm_unpackhi_epi8(color1, zero);		// upper two entries as 16-bit lanes
+			color1 = _mm_unpacklo_epi8(color1, zero);		// lower two entries as 16-bit lanes
+			color1 = _mm_mullo_epi16(blendalpha, color1);	// scale source by one minus alpha
+			color2 = _mm_mullo_epi16(blendalpha, color2);
+			color1 = _mm_adds_epu16(blendcolor, color1);	// add the premultiplied blend color
+			color2 = _mm_adds_epu16(blendcolor, color2);
+			color1 = _mm_srli_epi16(color1, 8);				// divide by 256
+			color2 = _mm_srli_epi16(color2, 8);
+			_mm_store_si128((__m128i *)to, _mm_packus_epi16(color1, color2));
+			to += 4;
+		}
+	}
+}