diff --git a/src/x86.cpp b/src/x86.cpp new file mode 100644 index 0000000000..1bd6782d8e --- /dev/null +++ b/src/x86.cpp @@ -0,0 +1,276 @@ +#ifdef _MSC_VER +#include +#endif +#include +#include + +#include "doomtype.h" +#include "i_system.h" + +#ifdef __GNUC__ +#define __cpuid(output, func) __asm__ __volatile__("cpuid" : "=a" ((output)[0]),\ + "=b" ((output)[1]), "=c" ((output)[2]), "=d" ((output)[3]) : "a" (func)); +#endif + +void CheckCPUID(CPUInfo *cpu) +{ + int foo[4]; + unsigned int maxext; + + memset(cpu, 0, sizeof(*cpu)); + + cpu->DataL1LineSize = 32; // Assume a 32-byte cache line + +#if !defined(_M_IX86) && !defined(__i386__) && !defined(_M_X64) && !defined(__amd64__) + return; +#else + +#if defined(_M_IX86) || defined(__i386__) + // Old 486s do not have CPUID, so we must test for its presence. + // This code is adapted from the samples in AMD's document + // entitled "AMD-K6™ MMX Processor Multimedia Extensions." +#ifndef __GNUC__ + __asm + { + pushfd // save EFLAGS + pop eax // store EFLAGS in EAX + mov ecx,eax // save in ECX for later testing + xor eax,0x00200000 // toggle bit 21 + push eax // put to stack + popfd // save changed EAX to EFLAGS + pushfd // push EFLAGS to TOS + pop eax // store EFLAGS in EAX + cmp eax,ecx // see if bit 21 has changed + jne haveid // if no change, then no CPUID + } + return; +haveid: +#else + int oldfd, newfd; + + __asm__ __volatile__("\t" + "pushf\n\t" + "popl %0\n\t" + "movl %0,%1\n\t" + "xorl $0x200000,%0\n\t" + "pushl %0\n\t" + "popf\n\t" + "pushf\n\t" + "popl %0\n\t" + : "=r" (newfd), "=r" (oldfd)); + if (oldfd == newfd) + { + return; + } +#endif +#endif + + // Get vendor ID + __cpuid(foo, 0); + ((int *)cpu->VendorID)[0] = foo[1]; + ((int *)cpu->VendorID)[1] = foo[3]; + ((int *)cpu->VendorID)[2] = foo[2]; + if (foo[1] == MAKE_ID('A','u','t','h') && + foo[3] == MAKE_ID('e','n','t','i') && + foo[2] == MAKE_ID('c','A','M','D')) + { + cpu->bIsAMD = true; + } + + // Get features flags and other info + __cpuid(foo, 1); + ((int *)cpu)[17] = foo[1]; // Store brand index and other stuff + ((int *)cpu)[18] = foo[2]; // Store extended feature flags + ((int *)cpu)[19] = foo[3]; // Store feature flags + + // If CLFLUSH instruction is supported, get the real cache line size. + if (foo[3] & (1 << 19)) + { + cpu->DataL1LineSize = (foo[1] & 0xFF00) >> (8 - 3); + } + + cpu->Stepping = foo[0] & 0x0F; + cpu->Type = (foo[0] & 0x3000) >> 12; // valid on Intel only + cpu->Model = (foo[0] & 0xF0) >> 4; + cpu->Family = (foo[0] & 0xF00) >> 8; + + if (cpu->Family == 15) + { // Add extended model and family. + cpu->Family += (foo[0] >> 20) & 0xFF; + cpu->Model |= (foo[0] >> 12) & 0xF0; + } + + // Check for extended functions. + __cpuid(foo, 0x80000000); + maxext = (unsigned int)foo[0]; + + if (maxext >= 0x80000004) + { // Get processor brand string. + __cpuid((int *)&cpu->CPUString[0], 0x80000002); + __cpuid((int *)&cpu->CPUString[16], 0x80000003); + __cpuid((int *)&cpu->CPUString[32], 0x80000004); + } + + if (cpu->bIsAMD) + { + if (maxext >= 0x80000005) + { // Get data L1 cache info. + __cpuid(foo, 0x80000005); + *(int *)(&cpu->DataL1LineSize) = foo[2]; + } + if (maxext >= 0x80000001) + { // Get AMD-specific feature flags. + __cpuid(foo, 0x80000001); + cpu->AMDStepping = foo[0] & 0x0F; + cpu->AMDModel = (foo[0] & 0xF0) >> 4; + cpu->AMDFamily = (foo[0] & 0xF00) >> 8; + + if (cpu->AMDFamily == 15) + { // Add extended model and family. + cpu->AMDFamily += (foo[0] >> 20) & 0xFF; + cpu->AMDModel |= (foo[0] >> 12) & 0xF0; + } + } + } + +#endif +} + +#if 0 +// Compiler output for this function is crap compared to the assembly +// version, which is why it isn't used. +void DoBlending_MMX2(const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a) +{ + __m64 blendcolor; + __m64 blendalpha; + __m64 zero; + __m64 blending256; + __m64 color1; + __m64 color2; + + zero = _mm_setzero_si64(); +#ifndef __GNUC__ + blending256.m64_i64 = 0x10001000100; +#else + blending256 = (__m64)0x10001000100ll; +#endif + + blendcolor = _mm_unpacklo_pi8(_m_from_int((r << 16) | (g << 8) | b), zero); // 000000RR 00GG00BB + blendalpha = _mm_unpacklo_pi8(_m_from_int((a << 16) | (a << 8) | a), zero); // 000000AA 00AA00AA + + blendcolor = _mm_mullo_pi16(blendcolor, blendalpha); // premultiply blend by alpha + blendalpha = _mm_subs_pu16(blending256, blendalpha); // one minus alpha + + // Do two colors per iteration: Count must be even + for (count >>= 1; count > 0; --count) + { + color1 = *(__m64 *)from; // 00r2g2b2 00r1g1b1 + from += 2; + color2 = _mm_unpackhi_pi8(color1, zero); // 000000r2 00g200b2 + color1 = _mm_unpacklo_pi8(color1, zero); // 000000r1 00g100b1 + color1 = _mm_mullo_pi16(blendalpha, color1); // 0000r1rr g1ggb1bb + color2 = _mm_mullo_pi16(blendalpha, color2); // 0000r2rr g2ggb2bb + color1 = _mm_adds_pu16(blendcolor, color1); + color2 = _mm_adds_pu16(blendcolor, color2); + color1 = _mm_srli_pi16(color1, 8); + color2 = _mm_srli_pi16(color2, 8); + *(__m64 *)to = _mm_packs_pu16(color1, color2); // 00r2g2b2 00r1g1b1 + to += 2; + } + _mm_empty(); +} +#endif + +#ifdef X86_ASM +extern "C" void STACK_ARGS DoBlending_MMX(const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a); +#endif + +void DoBlending_SSE2(const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a) +{ + __m128i blendcolor; + __m128i blendalpha; + __m128i zero; + __m128i blending256; + __m128i color1; + __m128i color2; + size_t unaligned; + + unaligned = ((size_t)from | (size_t)to) & 0xF; + +#ifdef X86_ASM + // For unaligned accesses, the assembly MMX version is slightly faster. + // Note that using unaligned SSE loads and stores is still faster than + // the compiler-generated MMX version. + if (unaligned) + { + DoBlending_MMX(from, to, count, r, g, b, a); + return; + } +#endif + +#if defined(__amd64__) || defined(_M_IX64) + long long color; + + blending256 = _mm_set_epi64x(0x10001000100ll, 0x10001000100ll); + + color = ((long long)r << 32) | (g << 16) | b; + blendcolor = _mm_set_epi64x(color, color); + + color = ((long long)a << 32) | (a << 16) | a; + blendalpha = _mm_set_epi64x(color, color); +#else + int color; + + blending256 = _mm_set_epi32(0x100, 0x1000100, 0x100, 0x1000100); + + color = (g << 16) | b; + blendcolor = _mm_set_epi32(r, color, r, color); + + color = (a << 16) | a; + blendalpha = _mm_set_epi32(a, color, a, color); +#endif + + blendcolor = _mm_mullo_epi16(blendcolor, blendalpha); // premultiply blend by alpha + blendalpha = _mm_subs_epu16(blending256, blendalpha); // one minus alpha + + zero = _mm_setzero_si128(); + +#ifndef X86_ASM + if (unaligned) + { + for (count >>= 2; count > 0; --count) + { + color1 = _mm_loadu_si128((__m128i *)from); + from += 4; + color2 = _mm_unpackhi_epi8(color1, zero); + color1 = _mm_unpacklo_epi8(color1, zero); + color1 = _mm_mullo_epi16(blendalpha, color1); + color2 = _mm_mullo_epi16(blendalpha, color2); + color1 = _mm_adds_epu16(blendcolor, color1); + color2 = _mm_adds_epu16(blendcolor, color2); + color1 = _mm_srli_epi16(color1, 8); + color2 = _mm_srli_epi16(color2, 8); + _mm_storeu_si128((__m128i *)to, _mm_packus_epi16(color1, color2)); + to += 4; + } + } + else +#endif + { + for (count >>= 2; count > 0; --count) + { + color1 = _mm_load_si128((__m128i *)from); + from += 4; + color2 = _mm_unpackhi_epi8(color1, zero); + color1 = _mm_unpacklo_epi8(color1, zero); + color1 = _mm_mullo_epi16(blendalpha, color1); + color2 = _mm_mullo_epi16(blendalpha, color2); + color1 = _mm_adds_epu16(blendcolor, color1); + color2 = _mm_adds_epu16(blendcolor, color2); + color1 = _mm_srli_epi16(color1, 8); + color2 = _mm_srli_epi16(color2, 8); + _mm_store_si128((__m128i *)to, _mm_packus_epi16(color1, color2)); + to += 4; + } + } +}