mirror of
https://github.com/UberGames/ioef.git
synced 2024-11-27 14:32:55 +00:00
* Fix the MinGW and (hopefully) OS X builds
* Remove custom memcpy/memset code
This commit is contained in:
parent
2d9d10772f
commit
6e24cfe7d3
5 changed files with 22 additions and 684 deletions
|
@ -2857,316 +2857,6 @@ void Com_Shutdown (void) {
|
|||
|
||||
}
|
||||
|
||||
#if I_WANT_A_CUSTOM_MEMCPY && !defined(_WIN32)
|
||||
// Com_Memcpy: copies count bytes from src to dest.
// Plain pass-through to the C library memcpy(), so the same rules apply:
// the source and destination regions must not overlap.
void Com_Memcpy (void* dest, const void* src, const size_t count)
{
	(void)memcpy(dest, src, count);
}
|
||||
|
||||
// Com_Memset: sets count bytes starting at dest to the byte value val.
// Plain pass-through to the C library memset(); only the low byte of val
// is stored, exactly as memset() specifies.
void Com_Memset (void* dest, const int val, const size_t count)
{
	(void)memset(dest, val, count);
}
|
||||
|
||||
#elif I_WANT_A_CUSTOM_MEMCPY && defined(_WIN32)
|
||||
|
||||
// Access-pattern hint handed to Com_Prefetch.
typedef enum
{
	PRE_READ = 0,		// buffer will only be read
	PRE_WRITE = 1,		// buffer will only be written
	PRE_READ_WRITE = 2	// buffer will be both read and written
} e_prefetch;

// Forward declaration; Com_Memcpy calls this before its definition appears.
void Com_Prefetch (const void *s, const unsigned int bytes, e_prefetch type);

// Expands to a single "emms" instruction in MSVC inline-assembly syntax.
#define EMMS_INSTRUCTION	__asm emms
|
||||
|
||||
// _copyDWord: stores `constant` into `count` consecutive unsigned int slots
// beginning at `dest`.
//
//   dest     - first slot to write. It does not have to be aligned for
//              unsigned int: every slot is stored with memcpy() rather than
//              through the typed pointer (Com_Memset passes a plain cast of
//              its byte pointer here).
//   constant - value written to every slot
//   count    - number of slots (not bytes); 0 writes nothing
//
// This is a portable C replacement for the former MSVC-only x86 __asm body,
// which could not be compiled by GCC/MinGW or for non-x86 targets. The old
// code wrote eight 4-byte words per loop iteration and then a 0-7 word tail;
// the result in memory is the same.
void _copyDWord (unsigned int* dest, const unsigned int constant, const unsigned int count) {
	unsigned char	*out = (unsigned char *)dest;
	unsigned int	i;

	for (i = 0; i < count; i++) {
		memcpy(out, &constant, sizeof(constant));
		out += sizeof(constant);
	}
}
|
||||
|
||||
// Com_Memcpy: copies count bytes from src to dest.
//
//   dest  - destination buffer, at least count bytes
//   src   - source buffer, at least count bytes
//   count - number of bytes; 0 is a no-op and the pointers are not touched
//
// The regions must not overlap (same contract as memcpy()).
//
// This replaces a hand-written MSVC x86 __asm copy loop. That code could not
// be built by GCC/MinGW or for x64, and it tested the unsigned count with
// signed jumps (jl), so a count of 2 GiB or more copied nothing at all.
// The C library routine handles every size and alignment.
void Com_Memcpy (void* dest, const void* src, const size_t count) {
	if (count == 0) {
		return;		// keeps the old explicit zero-length early-out
	}

	memcpy(dest, src, count);
}
|
||||
|
||||
// Com_Memset: sets count bytes starting at dest to the byte value val.
//
//   dest  - buffer to fill, at least count bytes
//   val   - fill value; only its low 8 bits are stored
//   count - number of bytes; 0 writes nothing
//
// This replaces an MSVC-only x86 __asm implementation. Besides not building
// with GCC/MinGW or for x64, its path for count >= 8 built the 32-bit fill
// pattern from the whole int (fillval | fillval<<8 ...), so a val outside
// 0..255 such as 0x100 or -2 produced bytes that differed both from memset()
// and from the function's own count < 8 path. memset() always uses the low
// byte.
void Com_Memset (void* dest, const int val, const size_t count)
{
	memset(dest, val, count);
}
|
||||
|
||||
qboolean Com_Memcmp (const void *src0, const void *src1, const unsigned int count)
|
||||
{
|
||||
unsigned int i;
|
||||
// MMX version anyone?
|
||||
|
||||
if (count >= 16)
|
||||
{
|
||||
unsigned int *dw = (unsigned int*)(src0);
|
||||
unsigned int *sw = (unsigned int*)(src1);
|
||||
|
||||
unsigned int nm2 = count/16;
|
||||
for (i = 0; i < nm2; i+=4)
|
||||
{
|
||||
unsigned int tmp = (dw[i+0]-sw[i+0])|(dw[i+1]-sw[i+1])|
|
||||
(dw[i+2]-sw[i+2])|(dw[i+3]-sw[i+3]);
|
||||
if (tmp)
|
||||
return qfalse;
|
||||
}
|
||||
}
|
||||
if (count & 15)
|
||||
{
|
||||
byte *d = (byte*)src0;
|
||||
byte *s = (byte*)src1;
|
||||
for (i = count & 0xfffffff0; i < count; i++)
|
||||
if (d[i]!=s[i])
|
||||
return qfalse;
|
||||
}
|
||||
|
||||
return qtrue;
|
||||
}
|
||||
|
||||
// Com_Prefetch: touches a buffer so that it is pulled into the CPU cache
// before it is used.
//
//   s     - start of the buffer
//   bytes - buffer size; at most the first 4096 bytes are touched
//   type  - intended access pattern. PRE_READ and PRE_READ_WRITE read one
//           byte from every 32-byte line; PRE_WRITE (and any unknown value)
//           does nothing.
//
// Portable C replacement for the former MSVC-only x86 __asm loop. The old
// code clamped the size with a signed comparison (jle), so a size of 2 GiB or
// more escaped the 4 kB clamp; the clamp here is unsigned. The volatile
// qualifier keeps the compiler from discarding the otherwise unused reads.
void Com_Prefetch (const void *s, const unsigned int bytes, e_prefetch type)
{
	enum { PREFETCH_MAX_BYTES = 4096, PREFETCH_LINE_BYTES = 32 };

	const volatile unsigned char	*p = (const volatile unsigned char *)s;
	unsigned int	limit;
	unsigned int	off;

	switch (type)
	{
	case PRE_READ:
	case PRE_READ_WRITE:
		limit = (bytes > PREFETCH_MAX_BYTES) ? PREFETCH_MAX_BYTES : bytes;

		// offsets 0, 32, 64, ... below limit: one touch per line
		for (off = 0; off < limit; off += PREFETCH_LINE_BYTES)
			(void)p[off];
		break;

	case PRE_WRITE:
	default:
		break;
	}
}
|
||||
#endif
|
||||
//------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
|
|
@ -38,13 +38,8 @@ void MD4Init (MD4_CTX *);
|
|||
void MD4Update (MD4_CTX *, const unsigned char *, unsigned int);
|
||||
void MD4Final (unsigned char [16], MD4_CTX *);
|
||||
|
||||
/*
 * Com_Memset / Com_Memcpy map straight onto the C library routines unless
 * the build sets I_WANT_A_CUSTOM_MEMCPY to a non-zero value; in that case
 * they are real functions and only their prototypes are given here.
 */
#if !I_WANT_A_CUSTOM_MEMCPY
#define Com_Memset	memset
#define Com_Memcpy	memcpy
#else
void Com_Memset (void* dest, const int val, const size_t count);
void Com_Memcpy (void* dest, const void* src, const size_t count);
#endif
|
||||
|
||||
/* MD4C.C - RSA Data Security, Inc., MD4 message-digest algorithm */
|
||||
/* Copyright (C) 1990-2, RSA Data Security, Inc. All rights reserved.
|
||||
|
|
|
@ -243,13 +243,8 @@ void Snd_Memset (void* dest, const int val, const size_t count);
|
|||
#define Snd_Memset Com_Memset
|
||||
#endif
|
||||
|
||||
// Com_Memset / Com_Memcpy are aliases for the C library memset / memcpy
// unless the build sets I_WANT_A_CUSTOM_MEMCPY to a non-zero value, in which
// case they are real functions and only their prototypes are given here.
#if !I_WANT_A_CUSTOM_MEMCPY
#define Com_Memset	memset
#define Com_Memcpy	memcpy
#else
void Com_Memset (void* dest, const int val, const size_t count);
void Com_Memcpy (void* dest, const void* src, const size_t count);
#endif
|
||||
|
||||
#define CIN_system 1
|
||||
#define CIN_loop 2
|
||||
|
|
|
@ -350,6 +350,9 @@ ifeq ($(PLATFORM),mingw32)
|
|||
LDFLAGS+=-m32
|
||||
endif
|
||||
|
||||
BUILD_SERVER = 0
|
||||
BUILD_CLIENT_SMP = 0
|
||||
|
||||
else # ifeq mingw32
|
||||
|
||||
#############################################################################
|
||||
|
@ -823,9 +826,19 @@ Q3OBJ = \
|
|||
|
||||
ifeq ($(ARCH),i386)
|
||||
Q3OBJ += $(B)/client/vm_x86.o
|
||||
Q3OBJ += \
|
||||
$(B)/client/snd_mixa.o \
|
||||
$(B)/client/matha.o \
|
||||
$(B)/client/ftola.o \
|
||||
$(B)/client/snapvectora.o
|
||||
endif
|
||||
ifeq ($(ARCH),x86)
|
||||
Q3OBJ += $(B)/client/vm_x86.o
|
||||
Q3OBJ += \
|
||||
$(B)/client/snd_mixa.o \
|
||||
$(B)/client/matha.o \
|
||||
$(B)/client/ftola.o \
|
||||
$(B)/client/snapvectora.o
|
||||
endif
|
||||
ifeq ($(ARCH),x86_64)
|
||||
Q3OBJ += $(B)/client/vm_x86_64.o
|
||||
|
@ -837,21 +850,6 @@ ifeq ($(ARCH),ppc)
|
|||
endif
|
||||
endif
|
||||
|
||||
Q3OBJ += \
|
||||
$(B)/client/linux_common.o \
|
||||
\
|
||||
$(B)/client/snd_mixa.o \
|
||||
$(B)/client/matha.o \
|
||||
$(B)/client/ftola.o \
|
||||
$(B)/client/snapvectora.o \
|
||||
\
|
||||
$(B)/client/unix_main.o \
|
||||
$(B)/client/unix_net.o \
|
||||
$(B)/client/unix_shared.o \
|
||||
$(B)/client/linux_signals.o \
|
||||
$(B)/client/linux_qgl.o \
|
||||
$(B)/client/linux_snd.o \
|
||||
$(B)/client/sdl_snd.o
|
||||
|
||||
ifeq ($(PLATFORM),mingw32)
|
||||
Q3OBJ += \
|
||||
|
@ -867,6 +865,15 @@ ifeq ($(PLATFORM),mingw32)
|
|||
$(B)/client/win_wndproc.o \
|
||||
$(B)/client/win_resource.o
|
||||
else
|
||||
Q3OBJ += \
|
||||
$(B)/client/unix_main.o \
|
||||
$(B)/client/unix_net.o \
|
||||
$(B)/client/unix_shared.o \
|
||||
$(B)/client/linux_signals.o \
|
||||
$(B)/client/linux_qgl.o \
|
||||
$(B)/client/linux_snd.o \
|
||||
$(B)/client/sdl_snd.o
|
||||
|
||||
ifeq ($(PLATFORM),linux)
|
||||
Q3OBJ += $(B)/client/linux_joystick.o
|
||||
endif
|
||||
|
@ -1051,7 +1058,6 @@ $(B)/client/irix_glimp_smp.o : $(UDIR)/irix_glimp.c; $(DO_SMP_CC)
|
|||
$(B)/client/irix_snd.o : $(UDIR)/irix_snd.c; $(DO_CC)
|
||||
$(B)/client/irix_input.o : $(UDIR)/irix_input.c; $(DO_CC)
|
||||
$(B)/client/linux_signals.o : $(UDIR)/linux_signals.c; $(DO_CC) $(GL_CFLAGS)
|
||||
$(B)/client/linux_common.o : $(UDIR)/linux_common.c; $(DO_CC)
|
||||
$(B)/client/linux_glimp.o : $(UDIR)/linux_glimp.c; $(DO_CC) $(GL_CFLAGS)
|
||||
$(B)/client/sdl_glimp.o : $(UDIR)/sdl_glimp.c; $(DO_CC) $(GL_CFLAGS)
|
||||
$(B)/client/linux_glimp_smp.o : $(UDIR)/linux_glimp.c; $(DO_SMP_CC) $(GL_CFLAGS)
|
||||
|
@ -1154,7 +1160,6 @@ Q3DOBJ = \
|
|||
$(B)/ded/l_struct.o \
|
||||
\
|
||||
$(B)/ded/linux_signals.o \
|
||||
$(B)/ded/linux_common.o \
|
||||
$(B)/ded/unix_main.o \
|
||||
$(B)/ded/unix_net.o \
|
||||
$(B)/ded/unix_shared.o \
|
||||
|
@ -1236,7 +1241,6 @@ $(B)/ded/l_script.o : $(BLIBDIR)/l_script.c; $(DO_BOT_CC)
|
|||
$(B)/ded/l_struct.o : $(BLIBDIR)/l_struct.c; $(DO_BOT_CC)
|
||||
|
||||
$(B)/ded/linux_signals.o : $(UDIR)/linux_signals.c; $(DO_DED_CC)
|
||||
$(B)/ded/linux_common.o : $(UDIR)/linux_common.c; $(DO_DED_CC)
|
||||
$(B)/ded/unix_main.o : $(UDIR)/unix_main.c; $(DO_DED_CC)
|
||||
$(B)/ded/unix_net.o : $(UDIR)/unix_net.c; $(DO_DED_CC)
|
||||
$(B)/ded/unix_shared.o : $(UDIR)/unix_shared.c; $(DO_DED_CC)
|
||||
|
|
|
@ -1,346 +0,0 @@
|
|||
#if 0 // not used anymore
|
||||
/*
|
||||
===========================================================================
|
||||
Copyright (C) 1999-2005 Id Software, Inc.
|
||||
|
||||
This file is part of Quake III Arena source code.
|
||||
|
||||
Quake III Arena source code is free software; you can redistribute it
|
||||
and/or modify it under the terms of the GNU General Public License as
|
||||
published by the Free Software Foundation; either version 2 of the License,
|
||||
or (at your option) any later version.
|
||||
|
||||
Quake III Arena source code is distributed in the hope that it will be
|
||||
useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with Quake III Arena source code; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
===========================================================================
|
||||
*/
|
||||
/**
|
||||
* GAS syntax equivalents of the MSVC asm memory calls in common.c
|
||||
*
|
||||
* The following changes have been made to the asm:
|
||||
* 1. Registers are loaded by the inline asm arguments when possible
|
||||
* 2. Labels have been changed to local label format (0,1,etc.) to allow inlining
|
||||
*
|
||||
* HISTORY:
|
||||
* AH - Created on 08 Dec 2000
|
||||
*/
|
||||
|
||||
#include <unistd.h> // AH - for size_t
|
||||
#include <string.h>
|
||||
|
||||
// bk001207 - we need something under Linux, too. Mac?
|
||||
#if 1 // defined(C_ONLY) // bk010102 - dedicated?
|
||||
// Com_Memcpy: copies count bytes from src to dest by handing the request
// directly to the C library memcpy(); the regions must not overlap.
void Com_Memcpy (void* dest, const void* src, const size_t count)
{
	(void)memcpy(dest, src, count);
}
|
||||
|
||||
// Com_Memset: fills count bytes at dest with the byte value val by handing
// the request directly to the C library memset().
void Com_Memset (void* dest, const int val, const size_t count)
{
	(void)memset(dest, val, count);
}
|
||||
|
||||
#else
|
||||
|
||||
// Access-pattern hint handed to Com_Prefetch.
typedef enum
{
	PRE_READ = 0,		// buffer will only be read
	PRE_WRITE = 1,		// buffer will only be written
	PRE_READ_WRITE = 2	// buffer will be both read and written
} e_prefetch;

// Forward declaration; Com_Memcpy calls this before its definition appears.
void Com_Prefetch (const void *s, const unsigned int bytes, e_prefetch type);
|
||||
|
||||
// _copyDWord: stores `constant` into `count` consecutive unsigned int slots
// beginning at `dest`.
//
//   dest     - first slot to write; need not be aligned for unsigned int,
//              because each slot is stored with memcpy()
//   constant - value written to every slot
//   count    - number of slots (not bytes); 0 writes nothing
//
// Portable C replacement for the former GCC inline-assembly (MMX) body.
// That version had several problems:
//   - the asm template was one string literal whose lines were joined with
//     backslash-newline, so it contained no instruction separators, and it
//     embedded // comments in the instruction text;
//   - it changed %ecx/%edx although they were declared as input-only
//     operands;
//   - its alignment prelude stored one word and decremented the count
//     whenever dest was not 8-byte aligned, even for count == 0, which made
//     the count wrap around.
void _copyDWord (unsigned int* dest, const unsigned int constant, const unsigned int count) {
	unsigned char	*out = (unsigned char *)dest;
	unsigned int	i;

	for (i = 0; i < count; i++) {
		memcpy(out, &constant, sizeof(constant));
		out += sizeof(constant);
	}
}
|
||||
|
||||
// Com_Memcpy: copies count bytes from src to dest.
//
//   dest  - destination buffer, at least count bytes
//   src   - source buffer, at least count bytes
//   count - number of bytes; 0 is a no-op and the pointers are not touched
//
// The regions must not overlap (same contract as memcpy()).
//
// This replaces a GCC inline-assembly port of the MSVC copy loop. The port
// masked the block length with 0xfffffe00 (count & ~511) where count & ~31
// was intended, so for counts from 32 to 511 the block offset became -32 and
// the loop read and wrote 32 bytes in front of both buffers. It also pushed
// and popped registers inside the asm and modified input-only operands.
// The C library routine has none of these problems.
void Com_Memcpy (void* dest, const void* src, const size_t count) {
	if (count == 0) {
		return;		// keeps the old explicit zero-length early-out
	}

	memcpy(dest, src, count);
}
|
||||
|
||||
// Com_Memset: sets count bytes starting at dest to the byte value val.
//
//   dest  - buffer to fill, at least count bytes
//   val   - fill value; only its low 8 bits are stored
//   count - number of bytes; 0 writes nothing
//
// This replaces a GCC inline-assembly port with two defects:
//   - the offset of the 0-3 trailing bytes was computed with the mask
//     0xffffff00 (count & ~255) instead of count & ~3, so for most sizes the
//     tail bytes were written at the wrong address and the real tail stayed
//     unfilled;
//   - for count >= 8 the 32-bit fill pattern was built from the whole int,
//     so a val outside 0..255 (e.g. 0x100 or -2) produced wrong bytes.
// memset() fills exactly count bytes with the low byte of val.
void Com_Memset (void* dest, const int val, const size_t count)
{
	memset(dest, val, count);
}
|
||||
|
||||
// Com_Prefetch: touches a buffer so that it is pulled into the CPU cache
// before it is used.
//
//   s     - start of the buffer
//   bytes - buffer size; at most the first 4096 bytes are touched
//   type  - intended access pattern. PRE_READ and PRE_READ_WRITE read one
//           byte from every 32-byte line; PRE_WRITE (and any unknown value)
//           does nothing.
//
// Portable C replacement for the former GCC inline-assembly loop. That asm
// template was a single string joined by backslash-newline with embedded //
// comments, modified its input-only operands, and clamped the size with a
// signed comparison (jle), so a size of 2 GiB or more escaped the 4 kB clamp.
// The volatile qualifier keeps the compiler from discarding the otherwise
// unused reads.
void Com_Prefetch (const void *s, const unsigned int bytes, e_prefetch type)
{
	enum { PREFETCH_MAX_BYTES = 4096, PREFETCH_LINE_BYTES = 32 };

	const volatile unsigned char	*p = (const volatile unsigned char *)s;
	unsigned int	limit;
	unsigned int	off;

	switch (type)
	{
	case PRE_READ:
	case PRE_READ_WRITE:
		limit = (bytes > PREFETCH_MAX_BYTES) ? PREFETCH_MAX_BYTES : bytes;

		// offsets 0, 32, 64, ... below limit: one touch per line
		for (off = 0; off < limit; off += PREFETCH_LINE_BYTES)
			(void)p[off];
		break;

	case PRE_WRITE:
	default:
		break;
	}
}
|
||||
|
||||
#endif
|
||||
#endif
|
Loading…
Reference in a new issue