323 lines
7.5 KiB
C
323 lines
7.5 KiB
C
/**
|
|
* GAS syntax equivalents of the MSVC asm memory calls in common.c
|
|
*
|
|
* The following changes have been made to the asm:
|
|
* 1. Registers are loaded by the inline asm arguments when possible
|
|
* 2. Labels have been changed to local label format (0,1,etc.) to allow inlining
|
|
*
|
|
* HISTORY:
|
|
* AH - Created on 08 Dec 2000
|
|
*/
|
|
|
|
#include <unistd.h> // AH - for size_t
|
|
#include <string.h>
|
|
|
|
// bk001207 - we need something under Linux, too. Mac?
|
|
#if 1 // defined(C_ONLY) // bk010102 - dedicated?
|
|
void Com_Memcpy (void* dest, const void* src, const size_t count) {
|
|
memcpy(dest, src, count);
|
|
}
|
|
|
|
void Com_Memset (void* dest, const int val, const size_t count) {
|
|
memset(dest, val, count);
|
|
}
|
|
|
|
#else
|
|
|
|
typedef enum {
|
|
PRE_READ, // prefetch assuming that buffer is used for reading only
|
|
PRE_WRITE, // prefetch assuming that buffer is used for writing only
|
|
PRE_READ_WRITE // prefetch assuming that buffer is used for both reading and writing
|
|
} e_prefetch;
|
|
|
|
void Com_Prefetch (const void *s, const unsigned int bytes, e_prefetch type);
|
|
|
|
void _copyDWord (unsigned int* dest, const unsigned int constant, const unsigned int count) {
|
|
// MMX version not used on standard Pentium MMX
|
|
// because the dword version is faster (with
|
|
// proper destination prefetching)
|
|
__asm__ __volatile__ ("
|
|
//mov eax,constant // eax = val
|
|
//mov edx,dest // dest
|
|
//mov ecx,count
|
|
movd %%eax, %%mm0
|
|
punpckldq %%mm0, %%mm0
|
|
|
|
// ensure that destination is qword aligned
|
|
|
|
testl $7, %%edx // qword padding?
|
|
jz 0f
|
|
movl %%eax, (%%edx)
|
|
decl %%ecx
|
|
addl $4, %%edx
|
|
|
|
0: movl %%ecx, %%ebx
|
|
andl $0xfffffff0, %%ecx
|
|
jz 2f
|
|
jmp 1f
|
|
.align 16
|
|
|
|
// funny ordering here to avoid commands
|
|
// that cross 32-byte boundaries (the
|
|
// [edx+0] version has a special 3-byte opcode...
|
|
1: movq %%mm0, 8(%%edx)
|
|
movq %%mm0, 16(%%edx)
|
|
movq %%mm0, 24(%%edx)
|
|
movq %%mm0, 32(%%edx)
|
|
movq %%mm0, 40(%%edx)
|
|
movq %%mm0, 48(%%edx)
|
|
movq %%mm0, 56(%%edx)
|
|
movq %%mm0, (%%edx)
|
|
addl $64, %%edx
|
|
subl $16, %%ecx
|
|
jnz 1b
|
|
2:
|
|
movl %%ebx, %%ecx // ebx = cnt
|
|
andl $0xfffffff0, %%ecx // ecx = cnt&~15
|
|
subl %%ecx, %%ebx
|
|
jz 6f
|
|
cmpl $8, %%ebx
|
|
jl 3f
|
|
|
|
movq %%mm0, (%%edx)
|
|
movq %%mm0, 8(%%edx)
|
|
movq %%mm0, 16(%%edx)
|
|
movq %%mm0, 24(%%edx)
|
|
addl $32, %%edx
|
|
subl $8, %%ebx
|
|
jz 6f
|
|
|
|
3: cmpl $4, %%ebx
|
|
jl 4f
|
|
|
|
movq %%mm0, (%%edx)
|
|
movq %%mm0, 8(%%edx)
|
|
addl $16, %%edx
|
|
subl $4, %%ebx
|
|
|
|
4: cmpl $2, %%ebx
|
|
jl 5f
|
|
movq %%mm0, (%%edx)
|
|
addl $8, %%edx
|
|
subl $2, %%ebx
|
|
|
|
5: cmpl $1, %%ebx
|
|
jl 6f
|
|
movl %%eax, (%%edx)
|
|
6:
|
|
emms
|
|
"
|
|
: : "a" (constant), "c" (count), "d" (dest)
|
|
: "%ebx", "%edi", "%esi", "cc", "memory");
|
|
}
|
|
|
|
// optimized memory copy routine that handles all alignment
|
|
// cases and block sizes efficiently
|
|
void Com_Memcpy (void* dest, const void* src, const size_t count) {
|
|
Com_Prefetch (src, count, PRE_READ);
|
|
__asm__ __volatile__ ("
|
|
pushl %%edi
|
|
pushl %%esi
|
|
//mov ecx,count
|
|
cmpl $0, %%ecx // count = 0 check (just to be on the safe side)
|
|
je 6f
|
|
//mov edx,dest
|
|
movl %0, %%ebx
|
|
cmpl $32, %%ecx // padding only?
|
|
jl 1f
|
|
|
|
movl %%ecx, %%edi
|
|
andl $0xfffffe00, %%edi // edi = count&~31
|
|
subl $32, %%edi
|
|
|
|
.align 16
|
|
0:
|
|
movl (%%ebx, %%edi, 1), %%eax
|
|
movl 4(%%ebx, %%edi, 1), %%esi
|
|
movl %%eax, (%%edx, %%edi, 1)
|
|
movl %%esi, 4(%%edx, %%edi, 1)
|
|
movl 8(%%ebx, %%edi, 1), %%eax
|
|
movl 12(%%ebx, %%edi, 1), %%esi
|
|
movl %%eax, 8(%%edx, %%edi, 1)
|
|
movl %%esi, 12(%%edx, %%edi, 1)
|
|
movl 16(%%ebx, %%edi, 1), %%eax
|
|
movl 20(%%ebx, %%edi, 1), %%esi
|
|
movl %%eax, 16(%%edx, %%edi, 1)
|
|
movl %%esi, 20(%%edx, %%edi, 1)
|
|
movl 24(%%ebx, %%edi, 1), %%eax
|
|
movl 28(%%ebx, %%edi, 1), %%esi
|
|
movl %%eax, 24(%%edx, %%edi, 1)
|
|
movl %%esi, 28(%%edx, %%edi, 1)
|
|
subl $32, %%edi
|
|
jge 0b
|
|
|
|
movl %%ecx, %%edi
|
|
andl $0xfffffe00, %%edi
|
|
addl %%edi, %%ebx // increase src pointer
|
|
addl %%edi, %%edx // increase dst pointer
|
|
andl $31, %%ecx // new count
|
|
jz 6f // if count = 0, get outta here
|
|
|
|
1:
|
|
cmpl $16, %%ecx
|
|
jl 2f
|
|
movl (%%ebx), %%eax
|
|
movl %%eax, (%%edx)
|
|
movl 4(%%ebx), %%eax
|
|
movl %%eax, 4(%%edx)
|
|
movl 8(%%ebx), %%eax
|
|
movl %%eax, 8(%%edx)
|
|
movl 12(%%ebx), %%eax
|
|
movl %%eax, 12(%%edx)
|
|
subl $16, %%ecx
|
|
addl $16, %%ebx
|
|
addl $16, %%edx
|
|
2:
|
|
cmpl $8, %%ecx
|
|
jl 3f
|
|
movl (%%ebx), %%eax
|
|
movl %%eax, (%%edx)
|
|
movl 4(%%ebx), %%eax
|
|
subl $8, %%ecx
|
|
movl %%eax, 4(%%edx)
|
|
addl $8, %%ebx
|
|
addl $8, %%edx
|
|
3:
|
|
cmpl $4, %%ecx
|
|
jl 4f
|
|
movl (%%ebx), %%eax // here 4-7 bytes
|
|
addl $4, %%ebx
|
|
subl $4, %%ecx
|
|
movl %%eax, (%%edx)
|
|
addl $4, %%edx
|
|
4: // 0-3 remaining bytes
|
|
cmpl $2, %%ecx
|
|
jl 5f
|
|
movw (%%ebx), %%ax // two bytes
|
|
cmpl $3, %%ecx // less than 3?
|
|
movw %%ax, (%%edx)
|
|
jl 6f
|
|
movb 2(%%ebx), %%al // last byte
|
|
movb %%al, 2(%%edx)
|
|
jmp 6f
|
|
5:
|
|
cmpl $1, %%ecx
|
|
jl 6f
|
|
movb (%%ebx), %%al
|
|
movb %%al, (%%edx)
|
|
6:
|
|
popl %%esi
|
|
popl %%edi
|
|
"
|
|
: : "m" (src), "d" (dest), "c" (count)
|
|
: "%eax", "%ebx", "%edi", "%esi", "cc", "memory");
|
|
}
|
|
|
|
void Com_Memset (void* dest, const int val, const size_t count)
|
|
{
|
|
unsigned int fillval;
|
|
|
|
if (count < 8)
|
|
{
|
|
__asm__ __volatile__ ("
|
|
//mov edx,dest
|
|
//mov eax, val
|
|
movb %%al, %%ah
|
|
movl %%eax, %%ebx
|
|
andl $0xffff, %%ebx
|
|
shll $16, %%eax
|
|
addl %%ebx, %%eax // eax now contains pattern
|
|
//mov ecx,count
|
|
cmpl $4, %%ecx
|
|
jl 0f
|
|
movl %%eax, (%%edx) // copy first dword
|
|
addl $4, %%edx
|
|
subl $4, %%ecx
|
|
0: cmpl $2, %%ecx
|
|
jl 1f
|
|
movw %%ax, (%%edx) // copy 2 bytes
|
|
addl $2, %%edx
|
|
subl $2, %%ecx
|
|
1: cmpl $0, %%ecx
|
|
je 2f
|
|
movb %%al, (%%edx) // copy single byte
|
|
2:
|
|
"
|
|
: : "d" (dest), "a" (val), "c" (count)
|
|
: "%ebx", "%edi", "%esi", "cc", "memory");
|
|
|
|
return;
|
|
}
|
|
|
|
fillval = val;
|
|
|
|
fillval = fillval|(fillval<<8);
|
|
fillval = fillval|(fillval<<16); // fill dword with 8-bit pattern
|
|
|
|
_copyDWord ((unsigned int*)(dest),fillval, count/4);
|
|
|
|
__asm__ __volatile__ (" // padding of 0-3 bytes
|
|
//mov ecx,count
|
|
movl %%ecx, %%eax
|
|
andl $3, %%ecx
|
|
jz 1f
|
|
andl $0xffffff00, %%eax
|
|
//mov ebx,dest
|
|
addl %%eax, %%edx
|
|
movl %0, %%eax
|
|
cmpl $2, %%ecx
|
|
jl 0f
|
|
movw %%ax, (%%edx)
|
|
cmpl $2, %%ecx
|
|
je 1f
|
|
movb %%al, 2(%%edx)
|
|
jmp 1f
|
|
0:
|
|
cmpl $0, %%ecx
|
|
je 1f
|
|
movb %%al, (%%edx)
|
|
1:
|
|
"
|
|
: : "m" (fillval), "c" (count), "d" (dest)
|
|
: "%eax", "%ebx", "%edi", "%esi", "cc", "memory");
|
|
}
|
|
|
|
void Com_Prefetch (const void *s, const unsigned int bytes, e_prefetch type)
|
|
{
|
|
// write buffer prefetching is performed only if
|
|
// the processor benefits from it. Read and read/write
|
|
// prefetching is always performed.
|
|
|
|
switch (type)
|
|
{
|
|
case PRE_WRITE : break;
|
|
case PRE_READ:
|
|
case PRE_READ_WRITE:
|
|
|
|
__asm__ __volatile__ ("
|
|
//mov ebx,s
|
|
//mov ecx,bytes
|
|
cmpl $4096, %%ecx // clamp to 4kB
|
|
jle 0f
|
|
movl $4096, %%ecx
|
|
0:
|
|
addl $0x1f, %%ecx
|
|
shrl $5, %%ecx // number of cache lines
|
|
jz 2f
|
|
jmp 1f
|
|
|
|
.align 16
|
|
1: testb %%al, (%%edx)
|
|
addl $32, %%edx
|
|
decl %%ecx
|
|
jnz 1b
|
|
2:
|
|
"
|
|
: : "d" (s), "c" (bytes)
|
|
: "%eax", "%ebx", "%edi", "%esi", "memory", "cc");
|
|
|
|
break;
|
|
}
|
|
}
|
|
|
|
#endif
|