// Some benchmarking shows that on SSE systems it only harms performance, and
// compared to the intrinsics version the gains are too marginal for something
// called this infrequently. Doing 100000 calls of DoBlending is only 5 ms
// faster with assembly than with intrinsics on a 3.4 GHz Core i7, meaning that
// even on a computer that is 10x slower you can still do 1000 or so blends per
// frame without a speed hit.
#include "doomtype.h"
#include "doomdef.h"
#include "x86.h"
extern "C"
#if !defined(__amd64__) && !defined(__i386__) && !defined(_M_IX86) && !defined(_M_X64)
// Fallback CheckCPUID for builds where none of the x86 / x86-64 target macros
// tested by the preprocessor condition directly above is defined. No CPUID
// query is made: *cpu is cleared and only a default cache line size is set.
//
//   cpu  structure to fill in; every byte is zeroed first.
void CheckCPUID(CPUInfo *cpu)
// Start from an all-zero structure.
memset(cpu, 0, sizeof(*cpu));
cpu->DataL1LineSize = 32; // Assume a 32-byte cache line
// Fallback DumpCPUInfo for the same non-x86 build configuration.
// NOTE(review): no statements belonging to this function are visible here;
// it appears to be an empty stub -- confirm.
void DumpCPUInfo(const CPUInfo *cpu)
#ifdef _MSC_VER
#include <intrin.h>
#include <mmintrin.h>
#include <emmintrin.h>
#ifdef __GNUC__
// GCC-style definition of __cpuid(output, func): executes the CPUID
// instruction with EAX = func and stores the resulting EAX, EBX, ECX and EDX
// in output[0], output[1], output[2] and output[3] respectively.
#if defined(__i386__) && defined(__PIC__)
// %ebx may be the PIC register, so it is not named as an output operand.
// Instead it is exchanged with a compiler-chosen register before and after
// the CPUID instruction: %ebx ends up with its original value and CPUID's
// EBX result is delivered in output[1].
#define __cpuid(output, func) \
__asm__ __volatile__("xchgl\t%%ebx, %1\n\t" \
"cpuid\n\t" \
"xchgl\t%%ebx, %1\n\t" \
: "=a" ((output)[0]), "=r" ((output)[1]), "=c" ((output)[2]), "=d" ((output)[3]) \
: "a" (func));
// Variant that binds %ebx directly as an output ("=b"); presumably the
// alternative used when the PIC condition above does not hold.
#define __cpuid(output, func) __asm__ __volatile__("cpuid" : "=a" ((output)[0]),\
"=b" ((output)[1]), "=c" ((output)[2]), "=d" ((output)[3]) : "a" (func));
// Queries the processor with the CPUID instruction and records the results
// in *cpu.
//
// Steps, in the order the statements appear:
//   - clear *cpu and assume a 32-byte L1 data cache line;
//   - on 32-bit x86, test whether CPUID exists by trying to toggle bit 21 of
//     EFLAGS;
//   - leaf 0: vendor ID, and detection of the "AuthenticAMD" vendor string;
//   - leaf 1: feature flags, CLFLUSH line size, stepping/type/model/family;
//   - leaf 0x80000000: highest supported extended leaf;
//   - leaves 0x80000002..0x80000004: processor brand string;
//   - leaves 0x80000005 and 0x80000001: AMD L1 cache info, AMD
//     family/model/stepping and AMD feature flags.
//
//   cpu  structure to fill in; every byte is zeroed first.
void CheckCPUID(CPUInfo *cpu)
// Register values from the most recent __cpuid call:
// foo[0] = EAX, foo[1] = EBX, foo[2] = ECX, foo[3] = EDX.
int foo[4];
// Highest extended CPUID function number, as reported by leaf 0x80000000.
unsigned int maxext;
// Start from a zeroed structure with a default line size; the line size is
// replaced further down when the CLFLUSH feature bit is set.
memset(cpu, 0, sizeof(*cpu));
cpu->DataL1LineSize = 32; // Assume a 32-byte cache line
#if !defined(_M_IX86) && !defined(__i386__) && !defined(_M_X64) && !defined(__amd64__)
#if defined(_M_IX86) || defined(__i386__)
// Old 486s do not have CPUID, so we must test for its presence.
// This code is adapted from the samples in AMD's document
// entitled "AMD-K6 MMX Processor Multimedia Extensions."
#ifndef __GNUC__
pushfd // save EFLAGS
pop eax // store EFLAGS in EAX
mov ecx,eax // save in ECX for later testing
xor eax,0x00200000 // toggle bit 21
push eax // put to stack
popfd // save changed EAX to EFLAGS
pushfd // push EFLAGS to TOS
pop eax // store EFLAGS in EAX
cmp eax,ecx // see if bit 21 has changed
jne haveid // jump if it changed (CPUID present); no change means no CPUID
// GNU-style version of the same EFLAGS bit 21 test.
// NOTE(review): the sequence shown here pops into %0 without a visible pushf,
// and no popf is visible between the pushl and the final popl -- confirm
// against the complete file.
int oldfd, newfd;
__asm__ __volatile__("\t"
"popl %0\n\t"
"movl %0,%1\n\t"
"xorl $0x200000,%0\n\t"
"pushl %0\n\t"
"popl %0\n\t"
: "=r" (newfd), "=r" (oldfd));
// Equal values mean the toggled bit did not stick.
if (oldfd == newfd)
// Get vendor ID
__cpuid(foo, 0);
// The vendor ID words are stored in EBX, EDX, ECX order.
cpu->dwVendorID[0] = foo[1];
cpu->dwVendorID[1] = foo[3];
cpu->dwVendorID[2] = foo[2];
// "AuthenticAMD", compared four characters at a time.
if (foo[1] == MAKE_ID('A','u','t','h') &&
foo[3] == MAKE_ID('e','n','t','i') &&
foo[2] == MAKE_ID('c','A','M','D'))
cpu->bIsAMD = true;
// Get features flags and other info
__cpuid(foo, 1);
cpu->FeatureFlags[0] = foo[1]; // Store brand index and other stuff
cpu->FeatureFlags[1] = foo[2]; // Store extended feature flags
cpu->FeatureFlags[2] = foo[3]; // Store feature flags
// If CLFLUSH instruction is supported, get the real cache line size.
if (foo[3] & (1 << 19))
// Bits 15:8 of EBX scaled by 8: shifting right by (8 - 3) instead of 8
// extracts the byte and multiplies it by 8 in a single step.
cpu->DataL1LineSize = (foo[1] & 0xFF00) >> (8 - 3);
// Decode the EAX signature: stepping in bits 3:0, model in bits 7:4,
// family in bits 11:8, type in bits 13:12.
cpu->Stepping = foo[0] & 0x0F;
cpu->Type = (foo[0] & 0x3000) >> 12; // valid on Intel only
cpu->Model = (foo[0] & 0xF0) >> 4;
cpu->Family = (foo[0] & 0xF00) >> 8;
if (cpu->Family == 15)
{ // Add extended family.
cpu->Family += (foo[0] >> 20) & 0xFF;
// NOTE(review): this test appears to run after the extended family has
// already been added, so a base family of 15 with a non-zero extended family
// no longer equals 15 here and would not get the extended model bits --
// confirm whether that is intended.
if (cpu->Family == 6 || cpu->Family == 15)
{ // Add extended model ID.
// Bits 19:16 of EAX become bits 7:4 of the model number.
cpu->Model |= (foo[0] >> 12) & 0xF0;
// Check for extended functions.
__cpuid(foo, 0x80000000);
maxext = (unsigned int)foo[0];
if (maxext >= 0x80000004)
{ // Get processor brand string.
// Each leaf writes four consecutive elements of dwCPUString.
__cpuid((int *)&cpu->dwCPUString[0], 0x80000002);
__cpuid((int *)&cpu->dwCPUString[4], 0x80000003);
__cpuid((int *)&cpu->dwCPUString[8], 0x80000004);
if (cpu->bIsAMD)
if (maxext >= 0x80000005)
{ // Get data L1 cache info.
__cpuid(foo, 0x80000005);
cpu->AMD_DataL1Info = foo[2];
if (maxext >= 0x80000001)
{ // Get AMD-specific feature flags.
__cpuid(foo, 0x80000001);
// Same bit layout as the leaf 1 signature decoded above.
cpu->AMDStepping = foo[0] & 0x0F;
cpu->AMDModel = (foo[0] & 0xF0) >> 4;
cpu->AMDFamily = (foo[0] & 0xF00) >> 8;
if (cpu->AMDFamily == 15)
{ // Add extended model and family.
cpu->AMDFamily += (foo[0] >> 20) & 0xFF;
cpu->AMDModel |= (foo[0] >> 12) & 0xF0;
cpu->FeatureFlags[3] = foo[3]; // AMD feature flags
// Prints a description of the CPU recorded in *cpu through Printf: the vendor
// ID, the brand string with redundant spaces removed, the family / model /
// stepping numbers, and one tag for each instruction-set flag that is set.
//
//   cpu  previously filled-in CPU description; it is only read.
void DumpCPUInfo(const CPUInfo *cpu)
// Room for the brand string (4*4*3 = 48 bytes) plus a terminating NUL.
char cpustring[4*4*3+1];
// Why does Intel right-justify this string (on P4s)
// or add extra spaces (on Cores)?
// f reads from the brand string, t writes into cpustring.
const char *f = cpu->CPUString;
char *t;
// Skip extra whitespace at the beginning.
while (*f == ' ')
// Copy string to temp buffer, but condense consecutive
// spaces to a single space character.
for (t = cpustring; *f != '\0'; ++f)
// A space that directly follows another space is the one to drop.
if (*f == ' ' && *(f - 1) == ' ')
*t++ = *f;
*t = '\0';
// Requires a non-empty vendor ID and that batchrun is not set.
// NOTE(review): how much of the output below this condition guards is not
// visible here -- confirm.
if (cpu->VendorID[0] && !batchrun)
Printf("CPU Vendor ID: %s\n", cpu->VendorID);
// The name line is only printed when the condensed brand string is non-empty.
if (cpustring[0])
Printf(" Name: %s\n", cpustring);
// For AMD the AMD-specific family is shown in parentheses and the
// AMD-specific model and stepping replace the generic ones.
if (cpu->bIsAMD)
Printf(" Family %d (%d), Model %d, Stepping %d\n",
cpu->Family, cpu->AMDFamily, cpu->AMDModel, cpu->AMDStepping);
// Generic form; presumably the alternative to the AMD line above.
Printf(" Family %d, Model %d, Stepping %d\n",
cpu->Family, cpu->Model, cpu->Stepping);
// One tag per flag that is set, all on a single line.
Printf(" Features:");
if (cpu->bMMX) Printf(" MMX");
if (cpu->bMMXPlus) Printf(" MMX+");
if (cpu->bSSE) Printf(" SSE");
if (cpu->bSSE2) Printf(" SSE2");
if (cpu->bSSE3) Printf(" SSE3");
if (cpu->bSSSE3) Printf(" SSSE3");
if (cpu->bSSE41) Printf(" SSE4.1");
if (cpu->bSSE42) Printf(" SSE4.2");
if (cpu->b3DNow) Printf(" 3DNow!");
if (cpu->b3DNowPlus) Printf(" 3DNow!+");
Printf ("\n");
#if !defined(__amd64__) && !defined(_M_X64)
// Blends palette entries toward a single color using MMX intrinsics.
//
//   from     source entries
//   to       destination entries
//   count    number of entries; two are handled per iteration, so an odd
//            trailing entry is not processed
//   r, g, b  blend color
//   a        blend weight; the arithmetic treats 256 as full weight
//
// For each of the three color channels the stored value is
//   (blend * a + source * (256 - a)) >> 8
// and the fourth byte of every output entry is written as 0.
// NOTE(review): assumes PalEntry is 4 bytes with b in the lowest byte, and
// that r, g, b and a each fit in 8 bits -- confirm against the declaration
// and the callers.
// NOTE(review): no _mm_empty() call is visible in this function -- confirm
// whether the MMX state needs to be cleared before later floating-point code.
void DoBlending_MMX(const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a)
__m64 blendcolor;  // blend color; premultiplied by a once set up
__m64 blendalpha;  // a per channel at first, then (256 - a)
__m64 zero;        // all-zero operand used to widen bytes to 16-bit lanes
__m64 blending256; // 256 in each color lane, 0 in the fourth lane
__m64 color1;      // first entry of the current pair
__m64 color2;      // second entry of the current pair
zero = _mm_setzero_si64();
// Two compiler-specific spellings of the same 64-bit constant.
#ifndef __GNUC__
blending256.m64_i64 = 0x10001000100;
blending256 = (__m64)0x10001000100ll;
blendcolor = _mm_unpacklo_pi8(_m_from_int((r << 16) | (g << 8) | b), zero); // 000000RR 00GG00BB
blendalpha = _mm_unpacklo_pi8(_m_from_int((a << 16) | (a << 8) | a), zero); // 000000AA 00AA00AA
blendcolor = _mm_mullo_pi16(blendcolor, blendalpha); // premultiply blend by alpha
blendalpha = _mm_subs_pu16(blending256, blendalpha); // one minus alpha
// Do two colors per iteration: Count must be even
for (count >>= 1; count > 0; --count)
color1 = *(__m64 *)from; // 00r2g2b2 00r1g1b1
from += 2;
// Widen each byte to a 16-bit lane so the products below fit.
color2 = _mm_unpackhi_pi8(color1, zero); // 000000r2 00g200b2
color1 = _mm_unpacklo_pi8(color1, zero); // 000000r1 00g100b1
// source * (256 - a)
color1 = _mm_mullo_pi16(blendalpha, color1); // 0000r1rr g1ggb1bb
color2 = _mm_mullo_pi16(blendalpha, color2); // 0000r2rr g2ggb2bb
// + blend * a, with unsigned saturation
color1 = _mm_adds_pu16(blendcolor, color1);
color2 = _mm_adds_pu16(blendcolor, color2);
// Divide by 256 to return to the 8-bit range.
color1 = _mm_srli_pi16(color1, 8);
color2 = _mm_srli_pi16(color2, 8);
// Narrow back to bytes and store both entries at once.
*(__m64 *)to = _mm_packs_pu16(color1, color2); // 00r2g2b2 00r1g1b1
to += 2;
// Blends palette entries toward a single color using SSE2 intrinsics.
//
//   from     source entries
//   to       destination entries
//   count    number of entries; four are handled per iteration, so up to
//            three trailing entries are not processed
//   r, g, b  blend color
//   a        blend weight; the arithmetic treats 256 as full weight
//
// For each of the three color channels the stored value is
//   (blend * a + source * (256 - a)) >> 8
// and the fourth byte of every output entry is written as 0.
// Unaligned loads and stores are used unless both pointers are 16-byte
// aligned.
// NOTE(review): assumes PalEntry is 4 bytes with b in the lowest byte, and
// that r, g, b and a each fit in 8 bits -- confirm against the declaration
// and the callers.
void DoBlending_SSE2(const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a)
__m128i blendcolor;  // blend color; premultiplied by a once set up
__m128i blendalpha;  // a per channel at first, then (256 - a)
__m128i zero;        // all-zero operand used to widen bytes to 16-bit lanes
__m128i blending256; // 256 in each color lane, 0 in every fourth lane
__m128i color1;      // first two entries of the current group of four
__m128i color2;      // last two entries of the current group of four
size_t unaligned;
// Non-zero when either pointer is not a multiple of 16.
unaligned = ((size_t)from | (size_t)to) & 0xF;
// 64-bit build: assemble each constant as one 64-bit value (16-bit lanes
// b, g, r, 0) and duplicate it into both halves of the register.
#if defined(__amd64__) || defined(_M_X64)
long long color;
blending256 = _mm_set_epi64x(0x10001000100ll, 0x10001000100ll);
color = ((long long)r << 32) | (g << 16) | b;
blendcolor = _mm_set_epi64x(color, color);
color = ((long long)a << 32) | (a << 16) | a;
blendalpha = _mm_set_epi64x(color, color);
// Presumably the 32-bit alternative: the same lane layout built from 32-bit
// pieces.
int color;
blending256 = _mm_set_epi32(0x100, 0x1000100, 0x100, 0x1000100);
color = (g << 16) | b;
blendcolor = _mm_set_epi32(r, color, r, color);
color = (a << 16) | a;
blendalpha = _mm_set_epi32(a, color, a, color);
blendcolor = _mm_mullo_epi16(blendcolor, blendalpha); // premultiply blend by alpha
blendalpha = _mm_subs_epu16(blending256, blendalpha); // one minus alpha
zero = _mm_setzero_si128();
// Unaligned path: same arithmetic as the aligned loop further down, using
// loadu/storeu.
if (unaligned)
for (count >>= 2; count > 0; --count)
color1 = _mm_loadu_si128((__m128i *)from);
from += 4;
// Widen each byte to a 16-bit lane so the products below fit.
color2 = _mm_unpackhi_epi8(color1, zero);
color1 = _mm_unpacklo_epi8(color1, zero);
// source * (256 - a)
color1 = _mm_mullo_epi16(blendalpha, color1);
color2 = _mm_mullo_epi16(blendalpha, color2);
// + blend * a, with unsigned saturation
color1 = _mm_adds_epu16(blendcolor, color1);
color2 = _mm_adds_epu16(blendcolor, color2);
// Divide by 256 to return to the 8-bit range.
color1 = _mm_srli_epi16(color1, 8);
color2 = _mm_srli_epi16(color2, 8);
// Narrow back to bytes and store all four entries at once.
_mm_storeu_si128((__m128i *)to, _mm_packus_epi16(color1, color2));
to += 4;
// Aligned path: identical arithmetic with aligned load/store.
for (count >>= 2; count > 0; --count)
color1 = _mm_load_si128((__m128i *)from);
from += 4;
color2 = _mm_unpackhi_epi8(color1, zero);
color1 = _mm_unpacklo_epi8(color1, zero);
color1 = _mm_mullo_epi16(blendalpha, color1);
color2 = _mm_mullo_epi16(blendalpha, color2);
color1 = _mm_adds_epu16(blendcolor, color1);
color2 = _mm_adds_epu16(blendcolor, color2);
color1 = _mm_srli_epi16(color1, 8);
color2 = _mm_srli_epi16(color2, 8);
_mm_store_si128((__m128i *)to, _mm_packus_epi16(color1, color2));
to += 4;