#ifdef _MSC_VER #include #endif #include #include #include "doomtype.h" #include "i_system.h" #ifdef __GNUC__ #define __cpuid(output, func) __asm__ __volatile__("cpuid" : "=a" ((output)[0]),\ "=b" ((output)[1]), "=c" ((output)[2]), "=d" ((output)[3]) : "a" (func)); #endif void CheckCPUID(CPUInfo *cpu) { int foo[4]; unsigned int maxext; memset(cpu, 0, sizeof(*cpu)); cpu->DataL1LineSize = 32; // Assume a 32-byte cache line #if !defined(_M_IX86) && !defined(__i386__) && !defined(_M_X64) && !defined(__amd64__) return; #else #if defined(_M_IX86) || defined(__i386__) // Old 486s do not have CPUID, so we must test for its presence. // This code is adapted from the samples in AMD's document // entitled "AMD-K6™ MMX Processor Multimedia Extensions." #ifndef __GNUC__ __asm { pushfd // save EFLAGS pop eax // store EFLAGS in EAX mov ecx,eax // save in ECX for later testing xor eax,0x00200000 // toggle bit 21 push eax // put to stack popfd // save changed EAX to EFLAGS pushfd // push EFLAGS to TOS pop eax // store EFLAGS in EAX cmp eax,ecx // see if bit 21 has changed jne haveid // if no change, then no CPUID } return; haveid: #else int oldfd, newfd; __asm__ __volatile__("\t" "pushf\n\t" "popl %0\n\t" "movl %0,%1\n\t" "xorl $0x200000,%0\n\t" "pushl %0\n\t" "popf\n\t" "pushf\n\t" "popl %0\n\t" : "=r" (newfd), "=r" (oldfd)); if (oldfd == newfd) { return; } #endif #endif // Get vendor ID __cpuid(foo, 0); ((int *)cpu->VendorID)[0] = foo[1]; ((int *)cpu->VendorID)[1] = foo[3]; ((int *)cpu->VendorID)[2] = foo[2]; if (foo[1] == MAKE_ID('A','u','t','h') && foo[3] == MAKE_ID('e','n','t','i') && foo[2] == MAKE_ID('c','A','M','D')) { cpu->bIsAMD = true; } // Get features flags and other info __cpuid(foo, 1); ((int *)cpu)[17] = foo[1]; // Store brand index and other stuff ((int *)cpu)[18] = foo[2]; // Store extended feature flags ((int *)cpu)[19] = foo[3]; // Store feature flags // If CLFLUSH instruction is supported, get the real cache line size. if (foo[3] & (1 << 19)) { cpu->DataL1LineSize = (foo[1] & 0xFF00) >> (8 - 3); } cpu->Stepping = foo[0] & 0x0F; cpu->Type = (foo[0] & 0x3000) >> 12; // valid on Intel only cpu->Model = (foo[0] & 0xF0) >> 4; cpu->Family = (foo[0] & 0xF00) >> 8; if (cpu->Family == 15) { // Add extended model and family. cpu->Family += (foo[0] >> 20) & 0xFF; cpu->Model |= (foo[0] >> 12) & 0xF0; } // Check for extended functions. __cpuid(foo, 0x80000000); maxext = (unsigned int)foo[0]; if (maxext >= 0x80000004) { // Get processor brand string. __cpuid((int *)&cpu->CPUString[0], 0x80000002); __cpuid((int *)&cpu->CPUString[16], 0x80000003); __cpuid((int *)&cpu->CPUString[32], 0x80000004); } if (cpu->bIsAMD) { if (maxext >= 0x80000005) { // Get data L1 cache info. __cpuid(foo, 0x80000005); *(int *)(&cpu->DataL1LineSize) = foo[2]; } if (maxext >= 0x80000001) { // Get AMD-specific feature flags. __cpuid(foo, 0x80000001); cpu->AMDStepping = foo[0] & 0x0F; cpu->AMDModel = (foo[0] & 0xF0) >> 4; cpu->AMDFamily = (foo[0] & 0xF00) >> 8; if (cpu->AMDFamily == 15) { // Add extended model and family. cpu->AMDFamily += (foo[0] >> 20) & 0xFF; cpu->AMDModel |= (foo[0] >> 12) & 0xF0; } } } #endif } #if 0 // Compiler output for this function is crap compared to the assembly // version, which is why it isn't used. void DoBlending_MMX2(const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a) { __m64 blendcolor; __m64 blendalpha; __m64 zero; __m64 blending256; __m64 color1; __m64 color2; zero = _mm_setzero_si64(); #ifndef __GNUC__ blending256.m64_i64 = 0x10001000100; #else blending256 = (__m64)0x10001000100ll; #endif blendcolor = _mm_unpacklo_pi8(_m_from_int((r << 16) | (g << 8) | b), zero); // 000000RR 00GG00BB blendalpha = _mm_unpacklo_pi8(_m_from_int((a << 16) | (a << 8) | a), zero); // 000000AA 00AA00AA blendcolor = _mm_mullo_pi16(blendcolor, blendalpha); // premultiply blend by alpha blendalpha = _mm_subs_pu16(blending256, blendalpha); // one minus alpha // Do two colors per iteration: Count must be even for (count >>= 1; count > 0; --count) { color1 = *(__m64 *)from; // 00r2g2b2 00r1g1b1 from += 2; color2 = _mm_unpackhi_pi8(color1, zero); // 000000r2 00g200b2 color1 = _mm_unpacklo_pi8(color1, zero); // 000000r1 00g100b1 color1 = _mm_mullo_pi16(blendalpha, color1); // 0000r1rr g1ggb1bb color2 = _mm_mullo_pi16(blendalpha, color2); // 0000r2rr g2ggb2bb color1 = _mm_adds_pu16(blendcolor, color1); color2 = _mm_adds_pu16(blendcolor, color2); color1 = _mm_srli_pi16(color1, 8); color2 = _mm_srli_pi16(color2, 8); *(__m64 *)to = _mm_packs_pu16(color1, color2); // 00r2g2b2 00r1g1b1 to += 2; } _mm_empty(); } #endif #ifdef X86_ASM extern "C" void STACK_ARGS DoBlending_MMX(const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a); #endif void DoBlending_SSE2(const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a) { __m128i blendcolor; __m128i blendalpha; __m128i zero; __m128i blending256; __m128i color1; __m128i color2; size_t unaligned; unaligned = ((size_t)from | (size_t)to) & 0xF; #ifdef X86_ASM // For unaligned accesses, the assembly MMX version is slightly faster. // Note that using unaligned SSE loads and stores is still faster than // the compiler-generated MMX version. if (unaligned) { DoBlending_MMX(from, to, count, r, g, b, a); return; } #endif #if defined(__amd64__) || defined(_M_IX64) long long color; blending256 = _mm_set_epi64x(0x10001000100ll, 0x10001000100ll); color = ((long long)r << 32) | (g << 16) | b; blendcolor = _mm_set_epi64x(color, color); color = ((long long)a << 32) | (a << 16) | a; blendalpha = _mm_set_epi64x(color, color); #else int color; blending256 = _mm_set_epi32(0x100, 0x1000100, 0x100, 0x1000100); color = (g << 16) | b; blendcolor = _mm_set_epi32(r, color, r, color); color = (a << 16) | a; blendalpha = _mm_set_epi32(a, color, a, color); #endif blendcolor = _mm_mullo_epi16(blendcolor, blendalpha); // premultiply blend by alpha blendalpha = _mm_subs_epu16(blending256, blendalpha); // one minus alpha zero = _mm_setzero_si128(); #ifndef X86_ASM if (unaligned) { for (count >>= 2; count > 0; --count) { color1 = _mm_loadu_si128((__m128i *)from); from += 4; color2 = _mm_unpackhi_epi8(color1, zero); color1 = _mm_unpacklo_epi8(color1, zero); color1 = _mm_mullo_epi16(blendalpha, color1); color2 = _mm_mullo_epi16(blendalpha, color2); color1 = _mm_adds_epu16(blendcolor, color1); color2 = _mm_adds_epu16(blendcolor, color2); color1 = _mm_srli_epi16(color1, 8); color2 = _mm_srli_epi16(color2, 8); _mm_storeu_si128((__m128i *)to, _mm_packus_epi16(color1, color2)); to += 4; } } else #endif { for (count >>= 2; count > 0; --count) { color1 = _mm_load_si128((__m128i *)from); from += 4; color2 = _mm_unpackhi_epi8(color1, zero); color1 = _mm_unpacklo_epi8(color1, zero); color1 = _mm_mullo_epi16(blendalpha, color1); color2 = _mm_mullo_epi16(blendalpha, color2); color1 = _mm_adds_epu16(blendcolor, color1); color2 = _mm_adds_epu16(blendcolor, color2); color1 = _mm_srli_epi16(color1, 8); color2 = _mm_srli_epi16(color2, 8); _mm_store_si128((__m128i *)to, _mm_packus_epi16(color1, color2)); to += 4; } } }