quake4-sdk/source/idlib/math/Simd_MMX.cpp

584 lines
14 KiB
C++
Raw Permalink Normal View History

2007-06-15 00:00:00 +00:00
#include "../precompiled.h"
#pragma hdrstop
#include "Simd_generic.h"
#include "Simd_MMX.h"
//===============================================================
//
// MMX implementation of idSIMDProcessor
//
//===============================================================
/*
gcc inline assembly:
the inline assembly for the MMX SIMD processor was written here mostly as an experiment
it does not increase performance on timedemos ( nor did I expect it to, libc-i686 does the job very well already )
although newer gcc versions can read inline asm using the intel syntax ( with minor reformatting and escaping of register names ),
it is still a long way from providing easy compatibility with MSVC inline assembly,
mostly because of the input/output registers, the clobber lists,
and generally all the things gcc tries to be clever about when you give it a piece of inline assembly
( typically, compiling this at -O1 or better will produce bad code, and some of it won't compile with -fPIC either )
at this point, writing everything in nasm from the ground up, or using intel's compiler to produce the Simd_*.o objects,
is still the best alternative
*/
#if defined( _WINDOWS ) || defined( __linux__ )
#ifdef _WINDOWS
#define EMMS_INSTRUCTION __asm emms
#else
#define EMMS_INSTRUCTION __asm__ __volatile__ ( "emms\n\t" );
#endif
/*
============
idSIMD_MMX::GetName
============
*/
const char * idSIMD_MMX::GetName( void ) const {
	// human readable identifier of this SIMD processor implementation
	static const char * const processorName = "MMX";
	return processorName;
}
/*
================
MMX_Memcpy8B
================
*/
void MMX_Memcpy8B( void *dest, const void *src, const int count ) {
// Copies ( count >> 3 ) chunks of 8 bytes from src to dest, loading each chunk
// with movq and writing it with movntq. Any trailing ( count & 7 ) bytes are
// NOT copied here.
//
// The loop tests its counter only after the first iteration ( dec / jnz ), so
// the caller must pass count >= 8; with a zero chunk count the counter would
// wrap around instead of terminating immediately.
//
// The MMX state is cleared with emms before returning.
#ifdef _MSC_VER
// MSVC inline assembly variant
_asm {
mov esi, src
mov edi, dest
mov ecx, count
shr ecx, 3 // 8 bytes per iteration
loop1:
movq mm1, 0[ESI] // Read in source data
movntq 0[EDI], mm1 // Non-temporal stores
add esi, 8
add edi, 8
dec ecx
jnz loop1
}
EMMS_INSTRUCTION
#elif 0
// disabled experiment: reads the arguments straight off the stack frame instead of using constraints
/*
not using constraints, so no double escape of registers
not necessary to push edi/esi
*/
__asm__ __volatile__ (
// "mov %edi, dest\n\t"
"mov %edi, DWORD PTR [%ebp+8]\n\t"
// "mov %esi, src\n\t"
"mov %esi, DWORD PTR [%ebp+12]\n\t"
// "mov %ecx, count\n\t"
"mov %ecx, DWORD PTR [%ebp+16]\n\t"
"shr %ecx, 3\n\t" // 8 bytes per iteration
"loop1_1:\n\t"
"movq %mm1, 0[%ESI]\n\t" // Read in source data
"movntq 0[%EDI], %mm1\n\t" // Non-temporal stores
"add %esi, 8\n\t"
"add %edi, 8\n\t"
"dec %ecx \n\t"
"jnz loop1_1\n\t"
"emms\n\t"
);
#elif 1
// active gcc variant: src, dest and count are preloaded into esi, edi and ecx through the
// "S", "D" and "c" input constraints, so the explicit mov instructions are commented out.
// The operands are written destination first ( intel syntax ) - presumably this file is
// built with -masm=intel; TODO confirm against the build flags.
// NOTE(review): the asm modifies esi, edi, ecx, mm1 and memory, but they are only declared
// as inputs ( no outputs, no clobber list ) - likely related to the "bad code at -O1" remark
// in the notes at the top of this file; confirm before enabling optimizations.
__asm__ __volatile__ (
// "mov %esi, src\n\t"
// "mov %edi, dest\n\t"
// "mov %ecx, count\n\t"
"shr %%ecx, 3\n\t" // 8 bytes per iteration
"0:\n\t"
"movq %%mm1, 0[%%esi]\n\t" // Read in source data
"movntq 0[%%edi], %%mm1\n\t" // Non-temporal stores
"add %%esi, 8\n\t"
"add %%edi, 8\n\t"
"dec %%ecx \n\t"
"jnz 0b\n\t"
"emms\n\t"
: /* no outputs */
: "S" (src), "D" (dest), "c" (count)
);
#endif
}
/*
================
MMX_Memcpy64B
165MB/sec
================
*/
void MMX_Memcpy64B( void *dest, const void *src, const int count ) {
// Copies ( count >> 6 ) chunks of 64 bytes from src to dest. Each iteration
// prefetches the following source chunk, loads 64 bytes into mm0-mm7 and writes
// them back with movntq. Any trailing ( count & 63 ) bytes are NOT copied here.
//
// The loop tests its counter only after the first iteration ( dec / jnz ), so
// the caller must pass count >= 64.
//
// The MMX state is cleared with emms before returning.
#ifdef _MSC_VER
// MSVC inline assembly variant
_asm {
mov esi, src
mov edi, dest
mov ecx, count
shr ecx, 6 // 64 bytes per iteration
loop1:
prefetchnta 64[ESI] // Prefetch next loop, non-temporal
prefetchnta 96[ESI]
movq mm1, 0[ESI] // Read in source data
movq mm2, 8[ESI]
movq mm3, 16[ESI]
movq mm4, 24[ESI]
movq mm5, 32[ESI]
movq mm6, 40[ESI]
movq mm7, 48[ESI]
movq mm0, 56[ESI]
movntq 0[EDI], mm1 // Non-temporal stores
movntq 8[EDI], mm2
movntq 16[EDI], mm3
movntq 24[EDI], mm4
movntq 32[EDI], mm5
movntq 40[EDI], mm6
movntq 48[EDI], mm7
movntq 56[EDI], mm0
add esi, 64
add edi, 64
dec ecx
jnz loop1
}
EMMS_INSTRUCTION
#else
// gcc variant: src, dest and count are preloaded into esi, edi and ecx through the
// "S", "D" and "c" input constraints, so the explicit mov instructions are commented out.
// NOTE(review): the asm modifies esi, edi, ecx, mm0-mm7 and memory, but they are only
// declared as inputs ( no outputs, no clobber list ) - confirm this is safe for the
// optimization level in use.
__asm__ __volatile__ (
//"mov %%esi, src \n\t"
//"mov %%edi, dest \n\t"
//"mov %%ecx, count \n\t"
"shr %%ecx, 6 \n\t"// 64 bytes per iteration
"\n\t"
"1: \n\t"
"prefetchnta 64[%%ESI] \n\t"// Prefetch next loop, non-temporal
"prefetchnta 96[%%ESI] \n\t"
"\n\t"
"movq %%mm1, 0[%%ESI] \n\t"// Read in source data
"movq %%mm2, 8[%%ESI] \n\t"
"movq %%mm3, 16[%%ESI] \n\t"
"movq %%mm4, 24[%%ESI] \n\t"
"movq %%mm5, 32[%%ESI] \n\t"
"movq %%mm6, 40[%%ESI] \n\t"
"movq %%mm7, 48[%%ESI] \n\t"
"movq %%mm0, 56[%%ESI] \n\t"
"\n\t"
"movntq 0[%%EDI], %%mm1 \n\t"// Non-temporal stores
"movntq 8[%%EDI], %%mm2 \n\t"
"movntq 16[%%EDI], %%mm3 \n\t"
"movntq 24[%%EDI], %%mm4 \n\t"
"movntq 32[%%EDI], %%mm5 \n\t"
"movntq 40[%%EDI], %%mm6 \n\t"
"movntq 48[%%EDI], %%mm7 \n\t"
"movntq 56[%%EDI], %%mm0 \n\t"
"\n\t"
"add %%esi, 64 \n\t"
"add %%edi, 64 \n\t"
"dec %%ecx \n\t"
"jnz 1b \n\t"
"emms \n\t"
:
: "S" (src), "D" (dest), "c" (count)
);
#endif
}
/*
================
MMX_Memcpy2kB
240MB/sec
================
*/
void MMX_Memcpy2kB( void *dest, const void *src, const int count ) {
// Copies ( count >> 11 ) blocks of 2048 bytes from src to dest, staging every
// block through a 2048 byte temporary buffer:
//   1. 32 iterations of 64 bytes: src -> tbuf with regular movq stores
//   2. 32 iterations of 64 bytes: tbuf -> dest with movntq stores
// Any trailing ( count & 2047 ) bytes are NOT copied by the assembly paths.
//
// The block loop tests its counter only after the first iteration ( dec / jnz ),
// so the caller must pass count >= 2048.
//
// When built with gcc and __PIC__ defined, the assembly is skipped and the whole
// count ( including any tail beyond the last full 2 kB block ) goes through memcpy.

// staging buffer for one 2 kB block ( presumably a 16 byte aligned stack allocation,
// going by the name _alloca16 - TODO confirm )
byte *tbuf = (byte *)_alloca16(2048);
#ifdef _MSC_VER
// MSVC inline assembly variant; ebx holds the number of 2 kB blocks left and is
// saved / restored around the copy
__asm {
push ebx
mov esi, src
mov ebx, count
shr ebx, 11 // 2048 bytes at a time
mov edi, dest
loop2k:
push edi // copy 2k into temporary buffer
mov edi, tbuf
mov ecx, 32
loopMemToL1:
prefetchnta 64[ESI] // Prefetch next loop, non-temporal
prefetchnta 96[ESI]
movq mm1, 0[ESI] // Read in source data
movq mm2, 8[ESI]
movq mm3, 16[ESI]
movq mm4, 24[ESI]
movq mm5, 32[ESI]
movq mm6, 40[ESI]
movq mm7, 48[ESI]
movq mm0, 56[ESI]
movq 0[EDI], mm1 // Store into L1
movq 8[EDI], mm2
movq 16[EDI], mm3
movq 24[EDI], mm4
movq 32[EDI], mm5
movq 40[EDI], mm6
movq 48[EDI], mm7
movq 56[EDI], mm0
add esi, 64
add edi, 64
dec ecx
jnz loopMemToL1
pop edi // Now copy from L1 to system memory
push esi
mov esi, tbuf
mov ecx, 32
loopL1ToMem:
movq mm1, 0[ESI] // Read in source data from L1
movq mm2, 8[ESI]
movq mm3, 16[ESI]
movq mm4, 24[ESI]
movq mm5, 32[ESI]
movq mm6, 40[ESI]
movq mm7, 48[ESI]
movq mm0, 56[ESI]
movntq 0[EDI], mm1 // Non-temporal stores
movntq 8[EDI], mm2
movntq 16[EDI], mm3
movntq 24[EDI], mm4
movntq 32[EDI], mm5
movntq 40[EDI], mm6
movntq 48[EDI], mm7
movntq 56[EDI], mm0
add esi, 64
add edi, 64
dec ecx
jnz loopL1ToMem
pop esi // Do next 2k block
dec ebx
jnz loop2k
pop ebx
}
EMMS_INSTRUCTION
#else
#ifdef __PIC__
// the inline assembly below cannot be built as position independent code ( see the
// ebx notes in the other branch ), so fall back to the C library
memcpy( dest, src, count );
#else
/*
ebx problem:
when not compiling with -fPIC, compiles fine. No need to push/pop ebx, the constraints setup will save and restore ( or so it seems with no optimizations )
when compiling with -fPIC:
if not putting ebx in clobber list, "can't find a register in class 'BREG' while reloading 'asm'"
if putting ebx in clobber list, "PIC register 'ebx' clobbered in 'asm'"
but really, you don't want to put it in clobber list, you want to push/pop it
BREG error due to -masm=intel? ( doesn't sound likely - could test with the cpuid thing? )
tbuf constrained in memory since the loop loads it up in edi
*/
// gcc variant: src, dest and count are preloaded into esi, edi and ebx through the
// "S", "D" and "b" input constraints; %0 is the memory operand holding tbuf.
// NOTE(review): %0 is read after a push has moved esp - if the compiler addresses tbuf
// relative to esp the loaded pointer would be off; confirm the generated addressing mode.
// NOTE(review): loop2k / loopMemToL1 / loopL1ToMem are named labels rather than numeric
// local labels, so they may collide if this asm statement is ever emitted more than once.
// NOTE(review): the modified registers and memory are not declared as outputs or clobbers.
__asm__ __volatile__ (
"push %%ebx \n\t"
//"mov %%esi, src \n\t"
//"mov %%ebx, count \n\t"
"shr %%ebx, 11 \n\t"// 2048 bytes at a time
//"mov %%edi, dest \n\t"
"\n\t"
"loop2k: \n\t"
"push %%edi \n\t"// copy 2k into temporary buffer
//"mov %%edi, tbuf \n\t"
"mov %%edi, %0 \n\t"
"mov %%ecx, 32 \n\t"
"\n\t"
"loopMemToL1: \n\t"
"prefetchnta 64[%%ESI] \n\t"// Prefetch next loop, non-temporal
"prefetchnta 96[%%ESI] \n\t"
"\n\t"
"movq %%mm1, 0[%%ESI] \n\t"// Read in source data
"movq %%mm2, 8[%%ESI] \n\t"
"movq %%mm3, 16[%%ESI] \n\t"
"movq %%mm4, 24[%%ESI] \n\t"
"movq %%mm5, 32[%%ESI] \n\t"
"movq %%mm6, 40[%%ESI] \n\t"
"movq %%mm7, 48[%%ESI] \n\t"
"movq %%mm0, 56[%%ESI] \n\t"
"\n\t"
"movq 0[%%EDI], %%mm1 \n\t"// Store into L1
"movq 8[%%EDI], %%mm2 \n\t"
"movq 16[%%EDI], %%mm3 \n\t"
"movq 24[%%EDI], %%mm4 \n\t"
"movq 32[%%EDI], %%mm5 \n\t"
"movq 40[%%EDI], %%mm6 \n\t"
"movq 48[%%EDI], %%mm7 \n\t"
"movq 56[%%EDI], %%mm0 \n\t"
"add %%esi, 64 \n\t"
"add %%edi, 64 \n\t"
"dec %%ecx \n\t"
"jnz loopMemToL1 \n\t"
"\n\t"
"pop %%edi \n\t"// Now copy from L1 to system memory
"push %%esi \n\t"
//"mov %%esi, tbuf \n\t"
"mov %%esi, %0 \n\t"
"mov %%ecx, 32 \n\t"
"\n\t"
"loopL1ToMem: \n\t"
"movq %%mm1, 0[%%ESI] \n\t"// Read in source data from L1
"movq %%mm2, 8[%%ESI] \n\t"
"movq %%mm3, 16[%%ESI] \n\t"
"movq %%mm4, 24[%%ESI] \n\t"
"movq %%mm5, 32[%%ESI] \n\t"
"movq %%mm6, 40[%%ESI] \n\t"
"movq %%mm7, 48[%%ESI] \n\t"
"movq %%mm0, 56[%%ESI] \n\t"
"\n\t"
"movntq 0[%%EDI], %%mm1 \n\t"// Non-temporal stores
"movntq 8[%%EDI], %%mm2 \n\t"
"movntq 16[%%EDI], %%mm3 \n\t"
"movntq 24[%%EDI], %%mm4 \n\t"
"movntq 32[%%EDI], %%mm5 \n\t"
"movntq 40[%%EDI], %%mm6 \n\t"
"movntq 48[%%EDI], %%mm7 \n\t"
"movntq 56[%%EDI], %%mm0 \n\t"
"\n\t"
"add %%esi, 64 \n\t"
"add %%edi, 64 \n\t"
"dec %%ecx \n\t"
"jnz loopL1ToMem \n\t"
"\n\t"
"pop %%esi \n\t"// Do next 2k block
"dec %%ebx \n\t"
"jnz loop2k \n\t"
"pop %%ebx \n\t"
"emms \n\t"
:
: "m" (tbuf), "S" (src), "D" (dest), "b" (count)
//: "ebx"
);
#endif // !__PIC__
#endif
}
/*
================
idSIMD_MMX::Memcpy
optimized memory copy routine that handles all alignment cases and block sizes efficiently
================
*/
void VPCALL idSIMD_MMX::Memcpy( void *dest0, const void *src0, const int count0 ) {
#ifndef _WIN32
	// the MMX block copies are only used on Win32, everything else defers to the C library
	memcpy( dest0, src0, count0 );
#else
	// The MMX path is taken when more than 16 bytes are copied and dest0 / src0 share the
	// same offset within an 8 byte word, so that aligning one pointer aligns the other too.
	if ( count0 > 16 && !( ( (int)dest0 ^ (int)src0 ) & 7 ) ) {
		byte *dest = (byte *)dest0;
		byte *src = (byte *)src0;

		// copy up to the first 8 byte aligned boundary
		// the number of bytes needed to reach the boundary is ( 8 - offset ) & 7, not the
		// offset itself; using the offset only aligned the pointers when it was 0 or 4
		// ( count0 > 16 guarantees the prefix never exceeds the total size )
		int count = ( 8 - ( ((int)dest) & 7 ) ) & 7;
		memcpy( dest, src, count );
		dest += count;
		src += count;
		count = count0 - count;

		// if there are multiple blocks of 2kB ( i.e. at least 4096 bytes left )
		if ( count & ~4095 ) {
			MMX_Memcpy2kB( dest, src, count );
			src += (count & ~2047);
			dest += (count & ~2047);
			count &= 2047;
		}

		// if there are blocks of 64 bytes
		if ( count & ~63 ) {
			MMX_Memcpy64B( dest, src, count );
			src += (count & ~63);
			dest += (count & ~63);
			count &= 63;
		}

		// if there are blocks of 8 bytes
		if ( count & ~7 ) {
			MMX_Memcpy8B( dest, src, count );
			src += (count & ~7);
			dest += (count & ~7);
			count &= 7;
		}

		// copy any remaining bytes
		memcpy( dest, src, count );
	} else {
		// use the regular one if we cannot copy 8 byte aligned
		memcpy( dest0, src0, count0 );
	}
#endif // _WIN32
}
/*
================
idSIMD_MMX::Memset
================
*/
void VPCALL idSIMD_MMX::Memset( void* dest0, const int val, const int count0 ) {
#ifndef _WIN32
	// The MMX fill is only used on Win32. This branch used to be empty, which turned
	// Memset into a silent no-op on every other platform; defer to the C library instead,
	// the same way idSIMD_MMX::Memcpy falls back to memcpy.
	memset( dest0, val, count0 );
#else
	// 8 byte pattern that is loaded into the MMX registers
	union {
		byte bytes[8];
		word words[4];
		dword dwords[2];
	} dat;

	byte *dest = (byte *)dest0;
	int count = count0;

	// set single bytes until dest is 8 byte aligned
	while( count > 0 && (((int)dest) & 7) ) {
		*dest = val;
		dest++;
		count--;
	}
	if ( !count ) {
		return;
	}

	// replicate the low byte of val across all 8 bytes: 2 bytes -> 2 words -> 2 dwords
	dat.bytes[0] = val;
	dat.bytes[1] = val;
	dat.words[1] = dat.words[0];
	dat.dwords[1] = dat.dwords[0];

	// fill blocks of 64 bytes
	if ( count >= 64 ) {
#ifdef _MSC_VER
		__asm {
			mov edi, dest
			mov ecx, count
			shr ecx, 6 // 64 bytes per iteration
			movq mm1, dat // Read in source data
			movq mm2, mm1
			movq mm3, mm1
			movq mm4, mm1
			movq mm5, mm1
			movq mm6, mm1
			movq mm7, mm1
			movq mm0, mm1
		loop1:
			movntq 0[EDI], mm1 // Non-temporal stores
			movntq 8[EDI], mm2
			movntq 16[EDI], mm3
			movntq 24[EDI], mm4
			movntq 32[EDI], mm5
			movntq 40[EDI], mm6
			movntq 48[EDI], mm7
			movntq 56[EDI], mm0
			add edi, 64
			dec ecx
			jnz loop1
		}
#else
		/*
		dat constrained in memory
		*/
		// dest and count are preloaded into edi and ecx through the "D" and "c" constraints
		__asm__ __volatile__ (
			//"mov %%edi, dest \n\t"
			//"mov %%ecx, count \n\t"
			"shr %%ecx, 6 \n\t"// 64 bytes per iteration
			//"movq %%mm1, dat \n\t"// Read in source data
			"movq %%mm1, %0 \n\t"
			"movq %%mm2, %%mm1 \n\t"
			"movq %%mm3, %%mm1 \n\t"
			"movq %%mm4, %%mm1 \n\t"
			"movq %%mm5, %%mm1 \n\t"
			"movq %%mm6, %%mm1 \n\t"
			"movq %%mm7, %%mm1 \n\t"
			"movq %%mm0, %%mm1 \n\t"
			"loop1_3: \n\t"
			"movntq 0[%%EDI], %%mm1 \n\t"// Non-temporal stores
			"movntq 8[%%EDI], %%mm2 \n\t"
			"movntq 16[%%EDI], %%mm3 \n\t"
			"movntq 24[%%EDI], %%mm4 \n\t"
			"movntq 32[%%EDI], %%mm5 \n\t"
			"movntq 40[%%EDI], %%mm6 \n\t"
			"movntq 48[%%EDI], %%mm7 \n\t"
			"movntq 56[%%EDI], %%mm0 \n\t"
			"\n\t"
			"add %%edi, 64 \n\t"
			"dec %%ecx \n\t"
			"jnz loop1_3 \n\t"
			:
			: "m" (dat), "D" (dest), "c" (count)
		);
#endif
		// the assembly works on register copies, so advance the C variables here
		dest += ( count & ~63 );
		count &= 63;
	}

	// fill blocks of 8 bytes
	if ( count >= 8 ) {
#ifdef _MSC_VER
		__asm {
			mov edi, dest
			mov ecx, count
			shr ecx, 3 // 8 bytes per iteration
			movq mm1, dat // Read in source data
		loop2:
			movntq 0[EDI], mm1 // Non-temporal stores
			add edi, 8
			dec ecx
			jnz loop2
		}
#else
		/*
		dat constrained in memory
		*/
		// dest and count are preloaded into edi and ecx through the "D" and "c" constraints
		__asm__ __volatile__ (
			//"mov %%edi, dest \n\t"
			//"mov %%ecx, count \n\t"
			"shr %%ecx, 3 \n\t"// 8 bytes per iteration
			//"movq %%mm1, dat \n\t"// Read in source data
			"movq %%mm1, %0 \n\t"// Read in source data
			"loop2: \n\t"
			"movntq 0[%%EDI], %%mm1 \n\t"// Non-temporal stores
			"\n\t"
			"add %%edi, 8 \n\t"
			"dec %%ecx \n\t"
			"jnz loop2 \n\t"
			:
			: "m" (dat), "D" (dest), "c" (count)
		);
#endif
		dest += (count & ~7);
		count &= 7;
	}

	// set any remaining bytes
	while( count > 0 ) {
		*dest = val;
		dest++;
		count--;
	}

	// clear the MMX state once all MMX work is done
	EMMS_INSTRUCTION
#endif // _WIN32
}
#endif