// etqw-sdk/source/idlib/math/Simd_MMX.cpp
//
// 336 lines, 7 KiB, C++
// Raw Permalink Normal View History
//
// 2008-05-29 00:00:00 +00:00
// Copyright (C) 2007 Id Software, Inc.
//
#include "../precompiled.h"
#pragma hdrstop
#include "Simd_Generic.h"
#include "Simd_MMX.h"
#pragma warning( push )
#pragma warning( disable: 4311 )
//===============================================================
//
// MMX implementation of idSIMDProcessor
//
//===============================================================
#ifdef ID_WIN_X86_ASM
// Expands to a single inline assembly EMMS instruction. EMMS empties the MMX
// state; the MMX registers alias the x87 FPU registers, so it has to be
// executed after MMX code and before any x87 floating point code runs.
#define EMMS_INSTRUCTION __asm emms
/*
============
idSIMD_MMX::GetName

  Returns the constant, human readable name of this SIMD implementation.
============
*/
const char * idSIMD_MMX::GetName( void ) const {
	return "MMX";
}
/*
================
MMX_Memcpy8B

  Copies count / 8 blocks of 8 bytes from src to dest using MMX loads and
  non-temporal (MOVNTQ) stores. The trailing count % 8 bytes are not copied.
  Does nothing if count is less than 8.

  No store fence is issued here; the caller has to execute an SFENCE after
  the last copy because of the non-temporal stores.
================
*/
void MMX_Memcpy8B( void *dest, const void *src, const int count ) {
	// the count-down loop below always runs at least once, so a block count of
	// zero would wrap around to 2^32 iterations and run off the end of both buffers
	if ( count < 8 ) {
		return;
	}
	_asm {
		mov		esi, src
		mov		edi, dest
		mov		ecx, count
		shr		ecx, 3				// 8 bytes per iteration
loop1:
		movq	mm1, 0[ESI]			// Read in source data
		movntq	0[EDI], mm1			// Non-temporal stores
		add		esi, 8
		add		edi, 8
		dec		ecx
		jnz		loop1
	}
	EMMS_INSTRUCTION
}
/*
================
MMX_Memcpy64B

  165MB/sec

  Copies count / 64 blocks of 64 bytes from src to dest. Each iteration
  prefetches the following source block, loads 64 bytes into the eight MMX
  registers and writes them out with non-temporal (MOVNTQ) stores.
  The trailing count % 64 bytes are not copied.
  Does nothing if count is less than 64.

  No store fence is issued here; the caller has to execute an SFENCE after
  the last copy because of the non-temporal stores.
================
*/
void MMX_Memcpy64B( void *dest, const void *src, const int count ) {
	// the count-down loop below always runs at least once, so a block count of
	// zero would wrap around to 2^32 iterations and run off the end of both buffers
	if ( count < 64 ) {
		return;
	}
	_asm {
		mov		esi, src
		mov		edi, dest
		mov		ecx, count
		shr		ecx, 6				// 64 bytes per iteration
loop1:
		prefetchnta 64[ESI]			// Prefetch next loop, non-temporal
		prefetchnta 96[ESI]

		movq	mm1,  0[ESI]		// Read in source data
		movq	mm2,  8[ESI]
		movq	mm3, 16[ESI]
		movq	mm4, 24[ESI]
		movq	mm5, 32[ESI]
		movq	mm6, 40[ESI]
		movq	mm7, 48[ESI]
		movq	mm0, 56[ESI]

		movntq	 0[EDI], mm1		// Non-temporal stores
		movntq	 8[EDI], mm2
		movntq	16[EDI], mm3
		movntq	24[EDI], mm4
		movntq	32[EDI], mm5
		movntq	40[EDI], mm6
		movntq	48[EDI], mm7
		movntq	56[EDI], mm0

		add		esi, 64
		add		edi, 64
		dec		ecx
		jnz		loop1
	}
	EMMS_INSTRUCTION
}
/*
================
MMX_Memcpy2kB

  240MB/sec

  Copies count / 2048 blocks of 2048 bytes from src to dest. Every block is
  moved in two passes of 32 x 64 bytes: first from src into a 2 kB temporary
  buffer on the stack using regular stores, then from that buffer to dest
  using non-temporal (MOVNTQ) stores.
  The trailing count % 2048 bytes are not copied.
  Does nothing if count is less than 2048.

  No store fence is issued here; the caller has to execute an SFENCE after
  the last copy because of the non-temporal stores.
================
*/
void MMX_Memcpy2kB( void *dest, const void *src, const int count ) {
	// the outer count-down loop below always runs at least once, so a block count
	// of zero would wrap around to 2^32 iterations and run off the end of both buffers
	if ( count < 2048 ) {
		return;
	}

	// staging buffer for one block; presumably 16 byte aligned stack memory
	// (_alloca16 is a project macro that is not visible here)
	byte *tbuf = (byte *)_alloca16(2048);

	__asm {
		push	ebx					// ebx holds the 2k block counter, restored at the end
		mov		esi, src
		mov		ebx, count
		shr		ebx, 11				// 2048 bytes at a time
		mov		edi, dest

loop2k:
		push	edi					// copy 2k into temporary buffer
		mov		edi, tbuf			// (dest pointer is parked on the stack meanwhile)
		mov		ecx, 32				// 32 * 64 bytes = 2048 bytes

loopMemToL1:
		prefetchnta 64[ESI]			// Prefetch next loop, non-temporal
		prefetchnta 96[ESI]

		movq	mm1,  0[ESI]		// Read in source data
		movq	mm2,  8[ESI]
		movq	mm3, 16[ESI]
		movq	mm4, 24[ESI]
		movq	mm5, 32[ESI]
		movq	mm6, 40[ESI]
		movq	mm7, 48[ESI]
		movq	mm0, 56[ESI]

		movq	 0[EDI], mm1		// Store into L1
		movq	 8[EDI], mm2
		movq	16[EDI], mm3
		movq	24[EDI], mm4
		movq	32[EDI], mm5
		movq	40[EDI], mm6
		movq	48[EDI], mm7
		movq	56[EDI], mm0
		add		esi, 64
		add		edi, 64
		dec		ecx
		jnz		loopMemToL1

		pop		edi					// Now copy from L1 to system memory
		push	esi					// (source pointer is parked on the stack meanwhile)
		mov		esi, tbuf
		mov		ecx, 32				// 32 * 64 bytes = 2048 bytes

loopL1ToMem:
		movq	mm1,  0[ESI]		// Read in source data from L1
		movq	mm2,  8[ESI]
		movq	mm3, 16[ESI]
		movq	mm4, 24[ESI]
		movq	mm5, 32[ESI]
		movq	mm6, 40[ESI]
		movq	mm7, 48[ESI]
		movq	mm0, 56[ESI]

		movntq	 0[EDI], mm1		// Non-temporal stores
		movntq	 8[EDI], mm2
		movntq	16[EDI], mm3
		movntq	24[EDI], mm4
		movntq	32[EDI], mm5
		movntq	40[EDI], mm6
		movntq	48[EDI], mm7
		movntq	56[EDI], mm0

		add		esi, 64
		add		edi, 64
		dec		ecx
		jnz		loopL1ToMem

		pop		esi					// Do next 2k block
		dec		ebx
		jnz		loop2k
		pop		ebx
	}
	EMMS_INSTRUCTION
}
/*
================
idSIMD_MMX::Memcpy

  optimized memory copy routine that handles all alignment cases and block sizes efficiently

  Copies count0 bytes from src0 to dest0. The MMX path is taken when more
  than 16 bytes are copied and source and destination share the same offset
  within an 8 byte block, so both can be 8 byte aligned at the same time.
  In that case the copy is split into: an unaligned head, 2 kB blocks (only
  if at least 4 kB remain), 64 byte blocks, 8 byte blocks and a tail of less
  than 8 bytes. Everything else is handed to the regular memcpy.
================
*/
void VPCALL idSIMD_MMX::Memcpy( void *dest0, const void *src0, const int count0 ) {
	// if copying more than 16 bytes and we can copy 8 byte aligned
	if ( count0 > 16 && !( ( (int)dest0 ^ (int)src0 ) & 7 ) ) {
		byte *dest = (byte *)dest0;
		const byte *src = (const byte *)src0;

		// copy up to the first 8 byte aligned boundary;
		// the number of bytes needed to get there is ( 8 - offset ) mod 8, not the
		// offset itself (which only reaches a boundary for an offset of 0 or 4)
		int count = ( 8 - ( ((int)dest) & 7 ) ) & 7;
		memcpy( dest, src, count );
		dest += count;
		src += count;
		count = count0 - count;

		// if there are multiple blocks of 2kB
		if ( count & ~4095 ) {
			MMX_Memcpy2kB( dest, src, count );
			src += (count & ~2047);
			dest += (count & ~2047);
			count &= 2047;
		}

		// if there are blocks of 64 bytes
		if ( count & ~63 ) {
			MMX_Memcpy64B( dest, src, count );
			src += (count & ~63);
			dest += (count & ~63);
			count &= 63;
		}

		// if there are blocks of 8 bytes
		if ( count & ~7 ) {
			MMX_Memcpy8B( dest, src, count );
			src += (count & ~7);
			dest += (count & ~7);
			count &= 7;
		}

		// copy any remaining bytes
		memcpy( dest, src, count );
	} else {
		// use the regular one if we cannot copy 8 byte aligned
		memcpy( dest0, src0, count0 );
	}

	// the MMX_Memcpy* functions use MOVNTQ, issue a fence operation
	__asm {
		sfence
	}
}
/*
================
idSIMD_MMX::Memset

  Fills count0 bytes at dest0 with the low byte of val.

  Single bytes are written until the destination is 8 byte aligned, then
  64 byte blocks and 8 byte blocks are written with non-temporal (MOVNTQ)
  stores, and finally the tail of less than 8 bytes is written byte by byte.
================
*/
void VPCALL idSIMD_MMX::Memset( void* dest0, const int val, const int count0 ) {
	union {
		byte	bytes[8];
		word	words[4];
		dword	dwords[2];
	} fillPattern;

	byte *dst = (byte *)dest0;
	int remaining = count0;

	// write single bytes up to the first 8 byte aligned boundary
	for ( ; remaining > 0 && ( ((int)dst) & 7 ); remaining-- ) {
		*dst++ = val;
	}

	// everything was written before reaching alignment, no MMX code was run
	if ( remaining == 0 ) {
		return;
	}

	// replicate the byte value over all 8 bytes by doubling: 1 -> 2 -> 4 -> 8
	fillPattern.bytes[0] = fillPattern.bytes[1] = val;
	fillPattern.words[1] = fillPattern.words[0];
	fillPattern.dwords[1] = fillPattern.dwords[0];

	// blocks of 64 bytes
	if ( remaining >= 64 ) {
		__asm {
			mov		edi, dst
			mov		ecx, remaining
			shr		ecx, 6				// 64 bytes per iteration
			movq	mm1, fillPattern	// Read in source data
			movq	mm2, mm1
			movq	mm3, mm1
			movq	mm4, mm1
			movq	mm5, mm1
			movq	mm6, mm1
			movq	mm7, mm1
			movq	mm0, mm1
fill64:
			movntq	 0[EDI], mm1		// Non-temporal stores
			movntq	 8[EDI], mm2
			movntq	16[EDI], mm3
			movntq	24[EDI], mm4
			movntq	32[EDI], mm5
			movntq	40[EDI], mm6
			movntq	48[EDI], mm7
			movntq	56[EDI], mm0

			add		edi, 64
			dec		ecx
			jnz		fill64
		}
		dst += ( remaining & ~63 );
		remaining &= 63;
	}

	// blocks of 8 bytes
	if ( remaining >= 8 ) {
		__asm {
			mov		edi, dst
			mov		ecx, remaining
			shr		ecx, 3				// 8 bytes per iteration
			movq	mm1, fillPattern	// Read in source data
fill8:
			movntq	0[EDI], mm1			// Non-temporal stores

			add		edi, 8
			dec		ecx
			jnz		fill8
		}
		dst += ( remaining & ~7 );
		remaining &= 7;
	}

	// write the remaining tail bytes one at a time
	for ( ; remaining > 0; remaining-- ) {
		*dst++ = val;
	}

	EMMS_INSTRUCTION

	// the block fills above use MOVNTQ, issue a fence operation
	__asm {
		sfence
	}
}
#endif /* ID_WIN_X86_ASM */
#pragma warning( pop )