dhewm3/neo/idlib/math/Simd_MMX.cpp
dhewg 8023bc5d56 Always compile all SIMD code
Protect all SIMD implementations with the corresponding defines and
let the compiler decide if it supports the instructions.

Linux will still use Simd_Generic because CPU feature runtime
detection is missing.
2011-12-13 18:56:38 +01:00

367 lines
7.5 KiB
C++

/*
===========================================================================
Doom 3 GPL Source Code
Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.
This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").
Doom 3 Source Code is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Doom 3 Source Code is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Doom 3 Source Code. If not, see <http://www.gnu.org/licenses/>.
In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code. If not, please request a copy in writing from id Software at the address below.
If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
===========================================================================
*/
#include "../precompiled.h"
#pragma hdrstop
#include "Simd_Generic.h"
#include "Simd_MMX.h"
//===============================================================
//
// MMX implementation of idSIMDProcessor
//
//===============================================================
#if defined(__GNUC__) && defined(__MMX__)
/*
============
idSIMD_MMX::GetName
============
*/
const char * idSIMD_MMX::GetName( void ) const {
	// Label reported for this processor implementation. In this branch
	// (GCC with __MMX__ defined) GetName is the only member defined here.
	const char *processorName = "MMX";
	return processorName;
}
#elif defined(_WIN32)
#define EMMS_INSTRUCTION __asm emms
/*
============
idSIMD_MMX::GetName
============
*/
const char * idSIMD_MMX::GetName( void ) const {
	// Label reported for this processor implementation (Win32 inline-asm branch).
	const char *processorName = "MMX";
	return processorName;
}
/*
================
MMX_Memcpy8B
================
*/
// Copies count bytes, rounded down to a multiple of 8, from src to dest.
// Each 8 byte chunk is loaded with MOVQ and written with a non-temporal
// MOVNTQ store. The trailing ( count & 7 ) bytes are NOT copied; the caller
// has to handle them. No SFENCE is issued here: idSIMD_MMX::Memcpy issues
// one after it has called the MMX_Memcpy* helpers.
//
//   dest  - destination buffer
//   src   - source buffer
//   count - number of bytes available; only whole 8 byte chunks are copied
void MMX_Memcpy8B( void *dest, const void *src, const int count ) {
	// The loop below decrements ECX before testing it, so a chunk count of
	// zero would wrap around instead of terminating, and a negative count
	// turns into a huge chunk count after the SHR. Bail out when there is
	// not a single full chunk. No MMX register has been touched yet, so no
	// EMMS is needed on this path.
	if ( count < 8 ) {
		return;
	}
	_asm {
		mov		esi, src
		mov		edi, dest
		mov		ecx, count
		shr		ecx, 3			// 8 bytes per iteration
loop1:
		movq	mm1,  0[ESI]	// Read in source data
		movntq	0[EDI], mm1		// Non-temporal stores
		add		esi, 8
		add		edi, 8
		dec		ecx
		jnz		loop1
	}
	// leave MMX state before returning to code that may use the FPU
	EMMS_INSTRUCTION
}
/*
================
MMX_Memcpy64B
165MB/sec
================
*/
// Copies count bytes, rounded down to a multiple of 64, from src to dest.
// Every iteration prefetches ahead in the source (PREFETCHNTA), loads 64
// bytes into mm0-mm7 and writes them out with non-temporal MOVNTQ stores.
// The trailing ( count & 63 ) bytes are NOT copied; the caller has to
// handle them. No SFENCE is issued here: idSIMD_MMX::Memcpy issues one
// after it has called the MMX_Memcpy* helpers.
//
//   dest  - destination buffer
//   src   - source buffer
//   count - number of bytes available; only whole 64 byte chunks are copied
void MMX_Memcpy64B( void *dest, const void *src, const int count ) {
	// The loop below decrements ECX before testing it, so a chunk count of
	// zero would wrap around instead of terminating, and a negative count
	// turns into a huge chunk count after the SHR. Bail out when there is
	// not a single full chunk. No MMX register has been touched yet, so no
	// EMMS is needed on this path.
	if ( count < 64 ) {
		return;
	}
	_asm {
		mov		esi, src
		mov		edi, dest
		mov		ecx, count
		shr		ecx, 6			// 64 bytes per iteration
loop1:
		prefetchnta 64[ESI]		// Prefetch next loop, non-temporal
		prefetchnta 96[ESI]
		movq	mm1,  0[ESI]	// Read in source data
		movq	mm2,  8[ESI]
		movq	mm3, 16[ESI]
		movq	mm4, 24[ESI]
		movq	mm5, 32[ESI]
		movq	mm6, 40[ESI]
		movq	mm7, 48[ESI]
		movq	mm0, 56[ESI]
		movntq	 0[EDI], mm1	// Non-temporal stores
		movntq	 8[EDI], mm2
		movntq	16[EDI], mm3
		movntq	24[EDI], mm4
		movntq	32[EDI], mm5
		movntq	40[EDI], mm6
		movntq	48[EDI], mm7
		movntq	56[EDI], mm0
		add		esi, 64
		add		edi, 64
		dec		ecx
		jnz		loop1
	}
	// leave MMX state before returning to code that may use the FPU
	EMMS_INSTRUCTION
}
/*
================
MMX_Memcpy2kB
240MB/sec
================
*/
// Copies count bytes, rounded down to a multiple of 2048, from src to dest.
// Each 2048 byte block is first read into a 2048 byte temporary buffer on
// the stack (32 iterations of 64 bytes, with PREFETCHNTA on the source) and
// then written from that buffer to dest with non-temporal MOVNTQ stores.
// The trailing ( count & 2047 ) bytes are NOT copied; the caller has to
// handle them. No SFENCE is issued here: idSIMD_MMX::Memcpy issues one
// after it has called the MMX_Memcpy* helpers.
//
//   dest  - destination buffer
//   src   - source buffer
//   count - number of bytes available; only whole 2048 byte blocks are copied
void MMX_Memcpy2kB( void *dest, const void *src, const int count ) {
	// The outer loop decrements EBX before testing it, so a block count of
	// zero would wrap around instead of terminating, and a negative count
	// turns into a huge block count after the SHR. Bail out when there is
	// not a single full block. Nothing has been allocated and no MMX
	// register has been touched yet.
	if ( count < 2048 ) {
		return;
	}

	byte *tbuf = (byte *)_alloca16(2048);

	__asm {
		push	ebx
		mov		esi, src
		mov		ebx, count
		shr		ebx, 11			// 2048 bytes at a time
		mov		edi, dest

loop2k:
		push	edi				// copy 2k into temporary buffer
		mov		edi, tbuf
		mov		ecx, 32			// 32 * 64 bytes = 2048 bytes

loopMemToL1:
		prefetchnta 64[ESI]		// Prefetch next loop, non-temporal
		prefetchnta 96[ESI]
		movq	mm1,  0[ESI]	// Read in source data
		movq	mm2,  8[ESI]
		movq	mm3, 16[ESI]
		movq	mm4, 24[ESI]
		movq	mm5, 32[ESI]
		movq	mm6, 40[ESI]
		movq	mm7, 48[ESI]
		movq	mm0, 56[ESI]
		movq	 0[EDI], mm1	// Store into L1
		movq	 8[EDI], mm2
		movq	16[EDI], mm3
		movq	24[EDI], mm4
		movq	32[EDI], mm5
		movq	40[EDI], mm6
		movq	48[EDI], mm7
		movq	56[EDI], mm0
		add		esi, 64
		add		edi, 64
		dec		ecx
		jnz		loopMemToL1

		pop		edi				// Now copy from L1 to system memory
		push	esi				// keep the source position while ESI walks tbuf
		mov		esi, tbuf
		mov		ecx, 32			// 32 * 64 bytes = 2048 bytes

loopL1ToMem:
		movq	mm1,  0[ESI]	// Read in source data from L1
		movq	mm2,  8[ESI]
		movq	mm3, 16[ESI]
		movq	mm4, 24[ESI]
		movq	mm5, 32[ESI]
		movq	mm6, 40[ESI]
		movq	mm7, 48[ESI]
		movq	mm0, 56[ESI]
		movntq	 0[EDI], mm1	// Non-temporal stores
		movntq	 8[EDI], mm2
		movntq	16[EDI], mm3
		movntq	24[EDI], mm4
		movntq	32[EDI], mm5
		movntq	40[EDI], mm6
		movntq	48[EDI], mm7
		movntq	56[EDI], mm0
		add		esi, 64
		add		edi, 64
		dec		ecx
		jnz		loopL1ToMem

		pop		esi				// Do next 2k block
		dec		ebx
		jnz		loop2k
		pop		ebx
	}
	// leave MMX state before returning to code that may use the FPU
	EMMS_INSTRUCTION
}
/*
================
idSIMD_MMX::Memcpy
optimized memory copy routine that handles all alignment cases and block sizes efficiently
================
*/
// Copies count0 bytes from src0 to dest0.
//
// When more than 16 bytes are copied and dest0/src0 share the same offset
// within an 8 byte unit, the copy is split into:
//   1. a plain memcpy of the few bytes needed to bring dest (and therefore
//      src) onto an 8 byte boundary,
//   2. MMX_Memcpy2kB for whole 2048 byte blocks (only if at least 4096
//      bytes remain),
//   3. MMX_Memcpy64B for whole 64 byte blocks,
//   4. MMX_Memcpy8B for whole 8 byte blocks,
//   5. a plain memcpy of the remaining tail bytes.
// Otherwise the whole range is copied with memcpy.
//
//   dest0  - destination buffer
//   src0   - source buffer
//   count0 - number of bytes to copy
void VPCALL idSIMD_MMX::Memcpy( void *dest0, const void *src0, const int count0 ) {
	// if copying more than 16 bytes and we can copy 8 byte aligned
	if ( count0 > 16 && !( ( (int)dest0 ^ (int)src0 ) & 7 ) ) {
		byte *dest = (byte *)dest0;
		const byte *src = (const byte *)src0;

		// copy up to the first 8 byte aligned boundary:
		// the distance to the next boundary is ( 8 - offset ) & 7, not the
		// offset itself. count0 > 16 guarantees these (at most 7) bytes exist.
		int count = ( 8 - ( ((int)dest) & 7 ) ) & 7;
		memcpy( dest, src, count );
		dest += count;
		src += count;
		count = count0 - count;

		// if there are multiple blocks of 2kB
		if ( count & ~4095 ) {
			MMX_Memcpy2kB( dest, src, count );
			src += (count & ~2047);
			dest += (count & ~2047);
			count &= 2047;
		}

		// if there are blocks of 64 bytes
		if ( count & ~63 ) {
			MMX_Memcpy64B( dest, src, count );
			src += (count & ~63);
			dest += (count & ~63);
			count &= 63;
		}

		// if there are blocks of 8 bytes
		if ( count & ~7 ) {
			MMX_Memcpy8B( dest, src, count );
			src += (count & ~7);
			dest += (count & ~7);
			count &= 7;
		}

		// copy any remaining bytes
		memcpy( dest, src, count );
	} else {
		// use the regular one if we cannot copy 8 byte aligned
		memcpy( dest0, src0, count0 );
	}

	// the MMX_Memcpy* functions use MOVNTQ, issue a fence operation
	__asm {
		sfence
	}
}
/*
================
idSIMD_MMX::Memset
================
*/
// Fills count0 bytes starting at dest0 with the low byte of val.
//
// Leading bytes are written one at a time until dest is 8 byte aligned,
// then whole 64 byte and 8 byte chunks are written with non-temporal
// MOVNTQ stores from an 8 byte replicated pattern, and finally the
// remaining tail bytes are written one at a time.
//
//   dest0  - buffer to fill
//   val    - fill value; stored as a byte
//   count0 - number of bytes to fill
void VPCALL idSIMD_MMX::Memset( void* dest0, const int val, const int count0 ) {
	union {
		byte	bytes[8];
		word	words[4];
		dword	dwords[2];
	} dat;

	byte *dest = (byte *)dest0;
	int count = count0;

	// byte-wise head: advance until dest sits on an 8 byte boundary
	for ( ; count > 0 && ( ((int)dest) & 7 ) != 0; count-- ) {
		*dest++ = val;
	}

	// everything was covered by the head, no MMX work was done
	if ( count == 0 ) {
		return;
	}

	// replicate the fill byte across all 8 bytes of the MMX pattern
	for ( int i = 0; i < 8; i++ ) {
		dat.bytes[i] = val;
	}

	if ( count >= 64 ) {
		__asm {
			mov		edi, dest
			mov		ecx, count
			shr		ecx, 6			// 64 bytes per iteration
			movq	mm1, dat		// Read in source data
			movq	mm2, mm1
			movq	mm3, mm1
			movq	mm4, mm1
			movq	mm5, mm1
			movq	mm6, mm1
			movq	mm7, mm1
			movq	mm0, mm1
loopFill64:
			movntq	 0[EDI], mm1	// Non-temporal stores
			movntq	 8[EDI], mm2
			movntq	16[EDI], mm3
			movntq	24[EDI], mm4
			movntq	32[EDI], mm5
			movntq	40[EDI], mm6
			movntq	48[EDI], mm7
			movntq	56[EDI], mm0
			add		edi, 64
			dec		ecx
			jnz		loopFill64
		}
		// account for the whole 64 byte chunks written by the asm loop
		dest += ( count & ~63 );
		count &= 63;
	}

	if ( count >= 8 ) {
		__asm {
			mov		edi, dest
			mov		ecx, count
			shr		ecx, 3			// 8 bytes per iteration
			movq	mm1, dat		// Read in source data
loopFill8:
			movntq	0[EDI], mm1		// Non-temporal stores
			add		edi, 8
			dec		ecx
			jnz		loopFill8
		}
		// account for the whole 8 byte chunks written by the asm loop
		dest += ( count & ~7 );
		count &= 7;
	}

	// byte-wise tail
	for ( ; count > 0; count-- ) {
		*dest++ = val;
	}

	EMMS_INSTRUCTION

	// the fill loops above use MOVNTQ, issue a fence operation
	__asm {
		sfence
	}
}
#endif /* _WIN32 */