#include "../precompiled.h"
#pragma hdrstop

#include "Simd_generic.h"
#include "Simd_MMX.h"

//===============================================================
//
//	MMX implementation of idSIMDProcessor
//
//===============================================================

/*
gcc inline assembly:
	The inline assembly for the MMX SIMD processor was written here mostly as an
	experiment. It does not increase performance on timedemos ( nor did I expect
	it to, libc-i686 does the job very well already ).
	Although newer gcc can read inline asm in intel syntax ( with minor
	reformatting and escaping of register names ), it is still a long way from
	easy compatibility with MSVC inline assembly, mostly because of the
	input/output registers, the clobber lists, and generally all the things gcc
	tries to be clever about when you hand it a piece of inline assembly
	( typically, compiling this at -O1 or better produces bad code, and some of
	it won't compile with -fPIC either ).
	At this point, writing everything in nasm from the ground up, or using
	intel's compiler to produce the Simd_*.o objects, is still the best
	alternative.
*/

#if defined( _WINDOWS ) || defined( __linux__ )

#ifdef _WINDOWS
#define EMMS_INSTRUCTION	__asm emms
#else
#define EMMS_INSTRUCTION	__asm__ __volatile__ ( "emms\n\t" );
#endif

/*
============
idSIMD_MMX::GetName
============
*/
const char * idSIMD_MMX::GetName( void ) const {
	return "MMX";
}

/*
================
MMX_Memcpy8B
================
*/
void MMX_Memcpy8B( void *dest, const void *src, const int count ) {
#ifdef _MSC_VER
	_asm {
		mov		esi, src
		mov		edi, dest
		mov		ecx, count
		shr		ecx, 3			// 8 bytes per iteration

loop1:
		movq	mm1, 0[ESI]		// Read in source data
		movntq	0[EDI], mm1		// Non-temporal stores

		add		esi, 8
		add		edi, 8
		dec		ecx
		jnz		loop1
	}
	EMMS_INSTRUCTION
#elif 0
	/*
	not using constraints, so no double escaping of register names
	not necessary to push edi/esi
	*/
	__asm__ __volatile__ (
//		"mov %edi, dest\n\t"
		"mov %edi, DWORD PTR [%ebp+8]\n\t"
//		"mov %esi, src\n\t"
		"mov %esi, DWORD PTR [%ebp+12]\n\t"
//		"mov %ecx, count\n\t"
		"mov %ecx, DWORD PTR [%ebp+16]\n\t"
		"shr %ecx, 3\n\t"			// 8 bytes per iteration
		"loop1_1:\n\t"
		"movq %mm1, 0[%ESI]\n\t"	// Read in source data
		"movntq 0[%EDI], %mm1\n\t"	// Non-temporal stores
		"add %esi, 8\n\t"
		"add %edi, 8\n\t"
		"dec %ecx\n\t"
		"jnz loop1_1\n\t"
		"emms\n\t"
		);
#elif 1
	__asm__ __volatile__ (
//		"mov %esi, src\n\t"
//		"mov %edi, dest\n\t"
//		"mov %ecx, count\n\t"
		"shr %%ecx, 3\n\t"				// 8 bytes per iteration
		"0:\n\t"
		"movq %%mm1, 0[%%esi]\n\t"		// Read in source data
		"movntq 0[%%edi], %%mm1\n\t"	// Non-temporal stores
		"add %%esi, 8\n\t"
		"add %%edi, 8\n\t"
		"dec %%ecx\n\t"
		"jnz 0b\n\t"
		"emms\n\t"
		: /* no outputs */
		: "S" (src), "D" (dest), "c" (count)
		);
#endif
}
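/*
Illustrative sketch ( not part of the original implementation ): the same
8 byte block copy can be written with compiler intrinsics, which both MSVC
and gcc accept and which sidesteps the inline assembly portability issues
described at the top of this file. MMX_Memcpy8B_Intrin is a hypothetical
name; _mm_stream_pi maps to movntq and _mm_empty to emms. Disabled, for
reference only.
*/
#if 0
#include <mmintrin.h>		// __m64, _mm_empty
#include <xmmintrin.h>		// _mm_stream_pi ( movntq, requires SSE )

static void MMX_Memcpy8B_Intrin( void *dest, const void *src, const int count ) {
	const __m64 *s = (const __m64 *)src;
	__m64 *d = (__m64 *)dest;
	for ( int i = count >> 3; i > 0; i-- ) {	// 8 bytes per iteration
		_mm_stream_pi( d, *s );					// Non-temporal store
		s++;
		d++;
	}
	_mm_empty();								// emms
}
#endif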
/*
================
MMX_Memcpy64B

  165MB/sec
================
*/
void MMX_Memcpy64B( void *dest, const void *src, const int count ) {
#ifdef _MSC_VER
	_asm {
		mov		esi, src
		mov		edi, dest
		mov		ecx, count
		shr		ecx, 6		// 64 bytes per iteration

loop1:
		prefetchnta 64[ESI]	// Prefetch next loop, non-temporal
		prefetchnta 96[ESI]

		movq	mm1,  0[ESI]	// Read in source data
		movq	mm2,  8[ESI]
		movq	mm3, 16[ESI]
		movq	mm4, 24[ESI]
		movq	mm5, 32[ESI]
		movq	mm6, 40[ESI]
		movq	mm7, 48[ESI]
		movq	mm0, 56[ESI]

		movntq	 0[EDI], mm1	// Non-temporal stores
		movntq	 8[EDI], mm2
		movntq	16[EDI], mm3
		movntq	24[EDI], mm4
		movntq	32[EDI], mm5
		movntq	40[EDI], mm6
		movntq	48[EDI], mm7
		movntq	56[EDI], mm0

		add		esi, 64
		add		edi, 64
		dec		ecx
		jnz		loop1
	}
	EMMS_INSTRUCTION
#else
	__asm__ __volatile__ (
//		"mov %%esi, src \n\t"
//		"mov %%edi, dest \n\t"
//		"mov %%ecx, count \n\t"
		"shr %%ecx, 6 \n\t"				// 64 bytes per iteration
		"\n\t"
		"1: \n\t"
		"prefetchnta 64[%%ESI] \n\t"	// Prefetch next loop, non-temporal
		"prefetchnta 96[%%ESI] \n\t"
		"\n\t"
		"movq %%mm1,  0[%%ESI] \n\t"	// Read in source data
		"movq %%mm2,  8[%%ESI] \n\t"
		"movq %%mm3, 16[%%ESI] \n\t"
		"movq %%mm4, 24[%%ESI] \n\t"
		"movq %%mm5, 32[%%ESI] \n\t"
		"movq %%mm6, 40[%%ESI] \n\t"
		"movq %%mm7, 48[%%ESI] \n\t"
		"movq %%mm0, 56[%%ESI] \n\t"
		"\n\t"
		"movntq  0[%%EDI], %%mm1 \n\t"	// Non-temporal stores
		"movntq  8[%%EDI], %%mm2 \n\t"
		"movntq 16[%%EDI], %%mm3 \n\t"
		"movntq 24[%%EDI], %%mm4 \n\t"
		"movntq 32[%%EDI], %%mm5 \n\t"
		"movntq 40[%%EDI], %%mm6 \n\t"
		"movntq 48[%%EDI], %%mm7 \n\t"
		"movntq 56[%%EDI], %%mm0 \n\t"
		"\n\t"
		"add %%esi, 64 \n\t"
		"add %%edi, 64 \n\t"
		"dec %%ecx \n\t"
		"jnz 1b \n\t"
		"emms \n\t"
		:
		: "S" (src), "D" (dest), "c" (count)
		);
#endif
}

/*
================
MMX_Memcpy2kB

  240MB/sec
================
*/
void MMX_Memcpy2kB( void *dest, const void *src, const int count ) {
	byte *tbuf = (byte *)_alloca16( 2048 );

#ifdef _MSC_VER
	__asm {
		push	ebx
		mov		esi, src
		mov		ebx, count
		shr		ebx, 11		// 2048 bytes at a time
		mov		edi, dest

loop2k:
		push	edi			// copy 2k into temporary buffer
		mov		edi, tbuf
		mov		ecx, 32

loopMemToL1:
		prefetchnta 64[ESI]	// Prefetch next loop, non-temporal
		prefetchnta 96[ESI]

		movq	mm1,  0[ESI]	// Read in source data
		movq	mm2,  8[ESI]
		movq	mm3, 16[ESI]
		movq	mm4, 24[ESI]
		movq	mm5, 32[ESI]
		movq	mm6, 40[ESI]
		movq	mm7, 48[ESI]
		movq	mm0, 56[ESI]

		movq	 0[EDI], mm1	// Store into L1
		movq	 8[EDI], mm2
		movq	16[EDI], mm3
		movq	24[EDI], mm4
		movq	32[EDI], mm5
		movq	40[EDI], mm6
		movq	48[EDI], mm7
		movq	56[EDI], mm0

		add		esi, 64
		add		edi, 64
		dec		ecx
		jnz		loopMemToL1

		pop		edi			// Now copy from L1 to system memory
		push	esi
		mov		esi, tbuf
		mov		ecx, 32

loopL1ToMem:
		movq	mm1,  0[ESI]	// Read in source data from L1
		movq	mm2,  8[ESI]
		movq	mm3, 16[ESI]
		movq	mm4, 24[ESI]
		movq	mm5, 32[ESI]
		movq	mm6, 40[ESI]
		movq	mm7, 48[ESI]
		movq	mm0, 56[ESI]

		movntq	 0[EDI], mm1	// Non-temporal stores
		movntq	 8[EDI], mm2
		movntq	16[EDI], mm3
		movntq	24[EDI], mm4
		movntq	32[EDI], mm5
		movntq	40[EDI], mm6
		movntq	48[EDI], mm7
		movntq	56[EDI], mm0

		add		esi, 64
		add		edi, 64
		dec		ecx
		jnz		loopL1ToMem

		pop		esi			// Do next 2k block
		dec		ebx
		jnz		loop2k

		pop		ebx
	}
	EMMS_INSTRUCTION
#else
#ifdef __PIC__
	memcpy( dest, src, count );
#else
	/*
	ebx problem:
	when not compiling with -fPIC, this compiles fine. There is no need to
	push/pop ebx: the constraints setup saves and restores it ( or so it seems
	with no optimizations ).
	when compiling with -fPIC:
	  if ebx is not in the clobber list: "can't find a register in class 'BREG' while reloading 'asm'"
	  if ebx is in the clobber list: "PIC register 'ebx' clobbered in 'asm'"
	but really, you don't want ebx in the clobber list, you want to push/pop it.
	Is the BREG error due to -masm=intel? ( doesn't sound likely - could test with the cpuid thing? )
	tbuf is constrained in memory since the loop loads it up in edi
	*/
	__asm__ __volatile__ (
		"push %%ebx \n\t"
//		"mov %%esi, src \n\t"
//		"mov %%ebx, count \n\t"
		"shr %%ebx, 11 \n\t"			// 2048 bytes at a time
//		"mov %%edi, dest \n\t"
		"\n\t"
		"loop2k: \n\t"
		"push %%edi \n\t"				// copy 2k into temporary buffer
//		"mov %%edi, tbuf \n\t"
		"mov %%edi, %0 \n\t"
		"mov %%ecx, 32 \n\t"
		"\n\t"
		"loopMemToL1: \n\t"
		"prefetchnta 64[%%ESI] \n\t"	// Prefetch next loop, non-temporal
		"prefetchnta 96[%%ESI] \n\t"
		"\n\t"
		"movq %%mm1,  0[%%ESI] \n\t"	// Read in source data
		"movq %%mm2,  8[%%ESI] \n\t"
		"movq %%mm3, 16[%%ESI] \n\t"
		"movq %%mm4, 24[%%ESI] \n\t"
		"movq %%mm5, 32[%%ESI] \n\t"
		"movq %%mm6, 40[%%ESI] \n\t"
		"movq %%mm7, 48[%%ESI] \n\t"
		"movq %%mm0, 56[%%ESI] \n\t"
		"\n\t"
		"movq  0[%%EDI], %%mm1 \n\t"	// Store into L1
		"movq  8[%%EDI], %%mm2 \n\t"
		"movq 16[%%EDI], %%mm3 \n\t"
		"movq 24[%%EDI], %%mm4 \n\t"
		"movq 32[%%EDI], %%mm5 \n\t"
		"movq 40[%%EDI], %%mm6 \n\t"
		"movq 48[%%EDI], %%mm7 \n\t"
		"movq 56[%%EDI], %%mm0 \n\t"
		"\n\t"
		"add %%esi, 64 \n\t"
		"add %%edi, 64 \n\t"
		"dec %%ecx \n\t"
		"jnz loopMemToL1 \n\t"
		"\n\t"
		"pop %%edi \n\t"				// Now copy from L1 to system memory
		"push %%esi \n\t"
//		"mov %%esi, tbuf \n\t"
		"mov %%esi, %0 \n\t"
		"mov %%ecx, 32 \n\t"
		"\n\t"
		"loopL1ToMem: \n\t"
		"movq %%mm1,  0[%%ESI] \n\t"	// Read in source data from L1
		"movq %%mm2,  8[%%ESI] \n\t"
		"movq %%mm3, 16[%%ESI] \n\t"
		"movq %%mm4, 24[%%ESI] \n\t"
		"movq %%mm5, 32[%%ESI] \n\t"
		"movq %%mm6, 40[%%ESI] \n\t"
		"movq %%mm7, 48[%%ESI] \n\t"
		"movq %%mm0, 56[%%ESI] \n\t"
		"\n\t"
		"movntq  0[%%EDI], %%mm1 \n\t"	// Non-temporal stores
		"movntq  8[%%EDI], %%mm2 \n\t"
		"movntq 16[%%EDI], %%mm3 \n\t"
		"movntq 24[%%EDI], %%mm4 \n\t"
		"movntq 32[%%EDI], %%mm5 \n\t"
		"movntq 40[%%EDI], %%mm6 \n\t"
		"movntq 48[%%EDI], %%mm7 \n\t"
		"movntq 56[%%EDI], %%mm0 \n\t"
		"\n\t"
		"add %%esi, 64 \n\t"
		"add %%edi, 64 \n\t"
		"dec %%ecx \n\t"
		"jnz loopL1ToMem \n\t"
		"\n\t"
		"pop %%esi \n\t"				// Do next 2k block
		"dec %%ebx \n\t"
		"jnz loop2k \n\t"
		"pop %%ebx \n\t"
		"emms \n\t"
		:
		: "m" (tbuf), "S" (src), "D" (dest), "b" (count)
//		: "ebx"
		);
#endif // __PIC__
#endif
}
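/*
Illustrative sketch ( not in the original code ), following up on the
"could test with the cpuid thing" note above: when ebx is gcc's PIC register,
the usual workaround is to save/restore it inside the asm block itself and
move the result out through another register instead of listing ebx as a
clobber. CPUID_EBX is a hypothetical helper, written in intel syntax to match
the rest of this file. Disabled, for reference only.
*/
#if 0
static unsigned int CPUID_EBX( unsigned int func ) {
	unsigned int regEBX;
	__asm__ __volatile__ (
		"push %%ebx \n\t"			// don't let gcc see its PIC register clobbered
		"cpuid \n\t"
		"mov %%esi, %%ebx \n\t"		// copy the ebx result into esi ( the "=S" output )
		"pop %%ebx \n\t"
		: "+a" (func), "=S" (regEBX)
		:
		: "ecx", "edx", "cc" );
	return regEBX;
}
#endif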
/*
================
idSIMD_MMX::Memcpy

  optimized memory copy routine that handles all alignment cases and block sizes efficiently
================
*/
void VPCALL idSIMD_MMX::Memcpy( void *dest0, const void *src0, const int count0 ) {
#ifndef _WIN32
	memcpy( dest0, src0, count0 );
#else
	// if copying more than 16 bytes and we can copy 8 byte aligned
	if ( count0 > 16 && !( ( (int)dest0 ^ (int)src0 ) & 7 ) ) {
		byte *dest = (byte *)dest0;
		byte *src = (byte *)src0;

		// copy up to the first 8 byte aligned boundary
		int count = ((int)dest) & 7;
		memcpy( dest, src, count );
		dest += count;
		src += count;
		count = count0 - count;

		// if there are multiple blocks of 2kB
		if ( count & ~4095 ) {
			MMX_Memcpy2kB( dest, src, count );
			src += (count & ~2047);
			dest += (count & ~2047);
			count &= 2047;
		}

		// if there are blocks of 64 bytes
		if ( count & ~63 ) {
			MMX_Memcpy64B( dest, src, count );
			src += (count & ~63);
			dest += (count & ~63);
			count &= 63;
		}

		// if there are blocks of 8 bytes
		if ( count & ~7 ) {
			MMX_Memcpy8B( dest, src, count );
			src += (count & ~7);
			dest += (count & ~7);
			count &= 7;
		}

		// copy any remaining bytes
		memcpy( dest, src, count );
	} else {
		// use the regular one if we cannot copy 8 byte aligned
		memcpy( dest0, src0, count0 );
	}
#endif // _WIN32
}
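/*
Worked example ( illustrative, not from the original source ): for an already
aligned copy of count = 5000 bytes, the cascade above does
	count & ~4095 != 0  ->  MMX_Memcpy2kB copies count & ~2047 = 4096 bytes ( two 2kB blocks ), 904 left
	904 & ~63 != 0      ->  MMX_Memcpy64B copies 896 bytes ( fourteen 64 byte blocks ), 8 left
	8 & ~7 != 0         ->  MMX_Memcpy8B copies the last 8 bytes, 0 left
	the trailing memcpy is then a no-op
*/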
/*
================
idSIMD_MMX::Memset
================
*/
void VPCALL idSIMD_MMX::Memset( void* dest0, const int val, const int count0 ) {
#ifndef _WIN32
	memset( dest0, val, count0 );
#else
	union {
		byte	bytes[8];
		word	words[4];
		dword	dwords[2];
	} dat;

	byte *dest = (byte *)dest0;
	int count = count0;

	while( count > 0 && (((int)dest) & 7) ) {
		*dest = val;
		dest++;
		count--;
	}
	if ( !count ) {
		return;
	}

	// replicate the low byte of val across all 8 bytes of the store pattern
	dat.bytes[0] = val;
	dat.bytes[1] = val;
	dat.words[1] = dat.words[0];
	dat.dwords[1] = dat.dwords[0];

	if ( count >= 64 ) {
#ifdef _MSC_VER
		__asm {
			mov		edi, dest
			mov		ecx, count
			shr		ecx, 6				// 64 bytes per iteration

			movq	mm1, dat			// Read in source data
			movq	mm2, mm1
			movq	mm3, mm1
			movq	mm4, mm1
			movq	mm5, mm1
			movq	mm6, mm1
			movq	mm7, mm1
			movq	mm0, mm1

loop1:
			movntq	 0[EDI], mm1		// Non-temporal stores
			movntq	 8[EDI], mm2
			movntq	16[EDI], mm3
			movntq	24[EDI], mm4
			movntq	32[EDI], mm5
			movntq	40[EDI], mm6
			movntq	48[EDI], mm7
			movntq	56[EDI], mm0

			add		edi, 64
			dec		ecx
			jnz		loop1
		}
#else
		/* dat is constrained in memory */
		__asm__ __volatile__ (
//			"mov %%edi, dest \n\t"
//			"mov %%ecx, count \n\t"
			"shr %%ecx, 6 \n\t"				// 64 bytes per iteration
//			"movq %%mm1, dat \n\t"			// Read in source data
			"movq %%mm1, %0 \n\t"
			"movq %%mm2, %%mm1 \n\t"
			"movq %%mm3, %%mm1 \n\t"
			"movq %%mm4, %%mm1 \n\t"
			"movq %%mm5, %%mm1 \n\t"
			"movq %%mm6, %%mm1 \n\t"
			"movq %%mm7, %%mm1 \n\t"
			"movq %%mm0, %%mm1 \n\t"
			"loop1_3: \n\t"
			"movntq  0[%%EDI], %%mm1 \n\t"	// Non-temporal stores
			"movntq  8[%%EDI], %%mm2 \n\t"
			"movntq 16[%%EDI], %%mm3 \n\t"
			"movntq 24[%%EDI], %%mm4 \n\t"
			"movntq 32[%%EDI], %%mm5 \n\t"
			"movntq 40[%%EDI], %%mm6 \n\t"
			"movntq 48[%%EDI], %%mm7 \n\t"
			"movntq 56[%%EDI], %%mm0 \n\t"
			"\n\t"
			"add %%edi, 64 \n\t"
			"dec %%ecx \n\t"
			"jnz loop1_3 \n\t"
			:
			: "m" (dat), "D" (dest), "c" (count)
			);
#endif
		dest += ( count & ~63 );
		count &= 63;
	}

	if ( count >= 8 ) {
#ifdef _MSC_VER
		__asm {
			mov		edi, dest
			mov		ecx, count
			shr		ecx, 3			// 8 bytes per iteration

			movq	mm1, dat		// Read in source data

loop2:
			movntq	0[EDI], mm1		// Non-temporal stores

			add		edi, 8
			dec		ecx
			jnz		loop2
		}
#else
		/* dat is constrained in memory */
		__asm__ __volatile__ (
//			"mov %%edi, dest \n\t"
//			"mov %%ecx, count \n\t"
			"shr %%ecx, 3 \n\t"				// 8 bytes per iteration
//			"movq %%mm1, dat \n\t"
			"movq %%mm1, %0 \n\t"			// Read in source data
			"loop2: \n\t"
			"movntq 0[%%EDI], %%mm1 \n\t"	// Non-temporal stores
			"\n\t"
			"add %%edi, 8 \n\t"
			"dec %%ecx \n\t"
			"jnz loop2 \n\t"
			:
			: "m" (dat), "D" (dest), "c" (count)
			);
#endif
		dest += (count & ~7);
		count &= 7;
	}

	while( count > 0 ) {
		*dest = val;
		dest++;
		count--;
	}

	EMMS_INSTRUCTION
#endif // _WIN32
}

#endif /* _WINDOWS || __linux__ */