* Fix the MinGW and (hopefully) OS X builds

* Remove custom memcpy/memset code
This commit is contained in:
Tim Angus 2006-01-04 03:40:49 +00:00
parent 2d9d10772f
commit 6e24cfe7d3
5 changed files with 22 additions and 684 deletions

View file

@ -2857,316 +2857,6 @@ void Com_Shutdown (void) {
}
#if I_WANT_A_CUSTOM_MEMCPY && !defined(_WIN32)
/*
 * Com_Memcpy — plain C fallback used when no hand-rolled copy routine
 * is wanted for this platform: defers straight to the libc memcpy.
 * Regions must not overlap (same contract as memcpy).
 */
void Com_Memcpy (void* dest, const void* src, const size_t count)
{
	memcpy (dest, src, count);
}
/*
 * Com_Memset — plain C fallback: defers straight to the libc memset.
 * Only the low byte of val is used for the fill (memset semantics).
 */
void Com_Memset (void* dest, const int val, const size_t count)
{
	memset (dest, val, count);
}
#elif I_WANT_A_CUSTOM_MEMCPY && defined(_WIN32)
// Hint passed to Com_Prefetch describing how the caller is about to use
// the buffer; see Com_Prefetch below for how each case is handled.
typedef enum
{
PRE_READ, // prefetch assuming that buffer is used for reading only
PRE_WRITE, // prefetch assuming that buffer is used for writing only
PRE_READ_WRITE // prefetch assuming that buffer is used for both reading and writing
} e_prefetch;
void Com_Prefetch (const void *s, const unsigned int bytes, e_prefetch type);
// MSVC inline-asm shorthand for the emms instruction (clears MMX state
// so the FPU can be used again after MMX code).
#define EMMS_INSTRUCTION __asm emms
/*
 * _copyDWord — fill `count` dwords at `dest` with the 32-bit pattern
 * `constant`. MSVC inline-assembly helper used by Com_Memset below for
 * the bulk of a large fill; the caller deals with the trailing 0-3
 * bytes. Main loop stores 8 dwords (32 bytes) per iteration; the
 * `padding` section finishes the remaining 0-7 dwords.
 */
void _copyDWord (unsigned int* dest, const unsigned int constant, const unsigned int count) {
__asm
{
mov edx,dest
mov eax,constant
mov ecx,count
and ecx,~7 // ecx = count rounded down to a multiple of 8 dwords
jz padding // fewer than 8 dwords: skip the unrolled loop
sub ecx,8
jmp loopu
align 16
loopu:
// NOTE(review): ebx is not initialised at this point; the test is only
// a dummy read to pull the destination cache line in, so the register
// value compared against should not matter — verify.
test [edx+ecx*4 + 28],ebx // fetch next block destination to L1 cache
mov [edx+ecx*4 + 0],eax
mov [edx+ecx*4 + 4],eax
mov [edx+ecx*4 + 8],eax
mov [edx+ecx*4 + 12],eax
mov [edx+ecx*4 + 16],eax
mov [edx+ecx*4 + 20],eax
mov [edx+ecx*4 + 24],eax
mov [edx+ecx*4 + 28],eax
sub ecx,8
jge loopu // loop runs top-down: ecx counts 8-dword blocks
// tail: handle the remaining count & 7 dwords in 4/2/1 steps
padding: mov ecx,count
mov ebx,ecx
and ecx,7 // ecx = leftover dword count (0-7)
jz outta
and ebx,~7
lea edx,[edx+ebx*4] // advance dest pointer
test [edx+0],eax // fetch destination to L1 cache
cmp ecx,4
jl skip4
mov [edx+0],eax
mov [edx+4],eax
mov [edx+8],eax
mov [edx+12],eax
add edx,16
sub ecx,4
skip4: cmp ecx,2
jl skip2
mov [edx+0],eax
mov [edx+4],eax
add edx,8
sub ecx,2
skip2: cmp ecx,1
jl outta
mov [edx+0],eax
outta:
}
}
// optimized memory copy routine that handles all alignment
// cases and block sizes efficiently
//
// Copies `count` bytes from src to dest (non-overlapping, like memcpy).
// The main loop moves 32 bytes per iteration via paired dword loads and
// stores; the `padding` section drains the last 0-31 bytes in
// 16/8/4/2/1-byte steps.
// NOTE(review): ebx is modified but, unlike edi/esi, is not pushed or
// popped here — MSVC's __asm normally tracks register usage, but worth
// confirming for the target compiler.
void Com_Memcpy (void* dest, const void* src, const size_t count) {
Com_Prefetch (src, count, PRE_READ); // warm the source into cache first
__asm
{
push edi
push esi
mov ecx,count
cmp ecx,0 // count = 0 check (just to be on the safe side)
je outta
mov edx,dest
mov ebx,src
cmp ecx,32 // padding only?
jl padding
mov edi,ecx
and edi,~31 // edi = count&~31
sub edi,32
align 16
loopMisAligned:
// 32 bytes per iteration, copied as 8 dword load/store pairs; edi
// counts down so the loop runs from the end of the 32-byte area back
// to its start
mov eax,[ebx + edi + 0 + 0*8]
mov esi,[ebx + edi + 4 + 0*8]
mov [edx+edi+0 + 0*8],eax
mov [edx+edi+4 + 0*8],esi
mov eax,[ebx + edi + 0 + 1*8]
mov esi,[ebx + edi + 4 + 1*8]
mov [edx+edi+0 + 1*8],eax
mov [edx+edi+4 + 1*8],esi
mov eax,[ebx + edi + 0 + 2*8]
mov esi,[ebx + edi + 4 + 2*8]
mov [edx+edi+0 + 2*8],eax
mov [edx+edi+4 + 2*8],esi
mov eax,[ebx + edi + 0 + 3*8]
mov esi,[ebx + edi + 4 + 3*8]
mov [edx+edi+0 + 3*8],eax
mov [edx+edi+4 + 3*8],esi
sub edi,32
jge loopMisAligned
mov edi,ecx
and edi,~31
add ebx,edi // increase src pointer
add edx,edi // increase dst pointer
and ecx,31 // new count
jz outta // if count = 0, get outta here
// drain the last 0-31 bytes in progressively smaller steps
padding:
cmp ecx,16
jl skip16
mov eax,dword ptr [ebx]
mov dword ptr [edx],eax
mov eax,dword ptr [ebx+4]
mov dword ptr [edx+4],eax
mov eax,dword ptr [ebx+8]
mov dword ptr [edx+8],eax
mov eax,dword ptr [ebx+12]
mov dword ptr [edx+12],eax
sub ecx,16
add ebx,16
add edx,16
skip16:
cmp ecx,8
jl skip8
mov eax,dword ptr [ebx]
mov dword ptr [edx],eax
mov eax,dword ptr [ebx+4]
sub ecx,8
mov dword ptr [edx+4],eax
add ebx,8
add edx,8
skip8:
cmp ecx,4
jl skip4
mov eax,dword ptr [ebx] // here 4-7 bytes
add ebx,4
sub ecx,4
mov dword ptr [edx],eax
add edx,4
skip4: // 0-3 remaining bytes
cmp ecx,2
jl skip2
mov ax,word ptr [ebx] // two bytes
cmp ecx,3 // less than 3?
mov word ptr [edx],ax
jl outta
mov al,byte ptr [ebx+2] // last byte
mov byte ptr [edx+2],al
jmp outta
skip2:
cmp ecx,1
jl outta
mov al,byte ptr [ebx]
mov byte ptr [edx],al
outta:
pop esi
pop edi
}
}
/*
 * Com_Memset — fill `count` bytes at `dest` with the low byte of `val`
 * (memset semantics). Small fills (< 8 bytes) are handled entirely in
 * one asm block; larger fills build a dword-wide pattern, delegate the
 * dword-aligned portion to _copyDWord, then pad the trailing 0-3 bytes.
 */
void Com_Memset (void* dest, const int val, const size_t count)
{
unsigned int fillval;
if (count < 8)
{
// small fill: replicate the byte into a dword pattern and store
// the pieces directly (4-, 2-, then 1-byte steps)
__asm
{
mov edx,dest
mov eax, val
mov ah,al
mov ebx,eax
and ebx, 0xffff
shl eax,16
add eax,ebx // eax now contains pattern
mov ecx,count
cmp ecx,4
jl skip4
mov [edx],eax // copy first dword
add edx,4
sub ecx,4
skip4: cmp ecx,2
jl skip2
mov word ptr [edx],ax // copy 2 bytes
add edx,2
sub ecx,2
skip2: cmp ecx,0
je skip1
mov byte ptr [edx],al // copy single byte
skip1:
}
return;
}
// large fill: replicate the byte across a dword and let _copyDWord
// store count/4 dwords; the asm below finishes the last count&3 bytes
fillval = val;
fillval = fillval|(fillval<<8);
fillval = fillval|(fillval<<16); // fill dword with 8-bit pattern
_copyDWord ((unsigned int*)(dest),fillval, count/4);
__asm // padding of 0-3 bytes
{
mov ecx,count
mov eax,ecx
and ecx,3 // ecx = trailing byte count (0-3)
jz skipA
and eax,~3 // eax = offset of the first trailing byte
mov ebx,dest
add ebx,eax
mov eax,fillval
cmp ecx,2
jl skipB
mov word ptr [ebx],ax
cmp ecx,2
je skipA
mov byte ptr [ebx+2],al // third trailing byte
jmp skipA
skipB:
cmp ecx,0
je skipA
mov byte ptr [ebx],al // single trailing byte
skipA:
}
}
/*
 * Com_Memcmp — equality test (NOT an ordering like memcmp): returns
 * qtrue when the first `count` bytes of src0 and src1 are identical,
 * qfalse otherwise.
 *
 * Fast path compares four dwords (one 16-byte chunk) per iteration;
 * the tail loop handles the remaining count & 15 bytes.
 *
 * Fix: the original loop bound was count/16 — the number of 16-byte
 * chunks — but `i` indexes dwords (4 per chunk), so only the first
 * quarter of the chunked region was compared and bytes in
 * [count/4, count & ~15) were never checked; differing buffers could
 * compare equal. The bound is now the dword count of the complete
 * chunks.
 *
 * NOTE(review): the dword path reads src0/src1 through unsigned int*,
 * which assumes the buffers tolerate 4-byte-wide (possibly unaligned)
 * accesses — same assumption as the original; verify on strict-
 * alignment targets.
 */
qboolean Com_Memcmp (const void *src0, const void *src1, const unsigned int count)
{
unsigned int i;
// MMX version anyone?
if (count >= 16)
{
const unsigned int *dw = (const unsigned int*)(src0);
const unsigned int *sw = (const unsigned int*)(src1);
// dwords covered by the complete 16-byte chunks: (count/16) chunks * 4
unsigned int ndw = (count/16)*4;
for (i = 0; i < ndw; i+=4)
{
// a-b is zero iff a==b, so OR-ing the differences detects any mismatch
unsigned int tmp = (dw[i+0]-sw[i+0])|(dw[i+1]-sw[i+1])|
(dw[i+2]-sw[i+2])|(dw[i+3]-sw[i+3]);
if (tmp)
return qfalse;
}
}
if (count & 15)
{
// byte-wise tail starting at the end of the last complete 16-byte chunk
const byte *d = (const byte*)src0;
const byte *s = (const byte*)src1;
for (i = count & 0xfffffff0; i < count; i++)
if (d[i]!=s[i])
return qfalse;
}
return qtrue;
}
/*
 * Com_Prefetch — touch one byte per 32-byte cache line of `s` (clamped
 * to 4 kB) so the data is resident before a copy. PRE_WRITE is a no-op
 * here; PRE_READ and PRE_READ_WRITE both take the read path.
 */
void Com_Prefetch (const void *s, const unsigned int bytes, e_prefetch type)
{
// write buffer prefetching is performed only if
// the processor benefits from it. Read and read/write
// prefetching is always performed.
switch (type)
{
case PRE_WRITE : break;
case PRE_READ:
case PRE_READ_WRITE:
__asm
{
mov ebx,s
mov ecx,bytes
cmp ecx,4096 // clamp to 4kB
jle skipClamp
mov ecx,4096
skipClamp:
add ecx,0x1f
shr ecx,5 // number of cache lines
jz skip
jmp loopie
align 16
// NOTE(review): the test is a dummy read to pull the line into cache;
// al is whatever the register happens to hold, which should not matter
// for a read — verify.
loopie: test byte ptr [ebx],al
add ebx,32
dec ecx
jnz loopie
skip:
}
break;
}
}
#endif
//------------------------------------------------------------------------

View file

@ -38,13 +38,8 @@ void MD4Init (MD4_CTX *);
void MD4Update (MD4_CTX *, const unsigned char *, unsigned int);
void MD4Final (unsigned char [16], MD4_CTX *);
#if I_WANT_A_CUSTOM_MEMCPY
void Com_Memset (void* dest, const int val, const size_t count);
void Com_Memcpy (void* dest, const void* src, const size_t count);
#else
#define Com_Memset memset
#define Com_Memcpy memcpy
#endif
/* MD4C.C - RSA Data Security, Inc., MD4 message-digest algorithm */
/* Copyright (C) 1990-2, RSA Data Security, Inc. All rights reserved.

View file

@ -243,13 +243,8 @@ void Snd_Memset (void* dest, const int val, const size_t count);
#define Snd_Memset Com_Memset
#endif
#if I_WANT_A_CUSTOM_MEMCPY
void Com_Memset (void* dest, const int val, const size_t count);
void Com_Memcpy (void* dest, const void* src, const size_t count);
#else
#define Com_Memset memset
#define Com_Memcpy memcpy
#endif
#define CIN_system 1
#define CIN_loop 2

View file

@ -350,6 +350,9 @@ ifeq ($(PLATFORM),mingw32)
LDFLAGS+=-m32
endif
BUILD_SERVER = 0
BUILD_CLIENT_SMP = 0
else # ifeq mingw32
#############################################################################
@ -823,9 +826,19 @@ Q3OBJ = \
ifeq ($(ARCH),i386)
Q3OBJ += $(B)/client/vm_x86.o
Q3OBJ += \
$(B)/client/snd_mixa.o \
$(B)/client/matha.o \
$(B)/client/ftola.o \
$(B)/client/snapvectora.o
endif
ifeq ($(ARCH),x86)
Q3OBJ += $(B)/client/vm_x86.o
Q3OBJ += \
$(B)/client/snd_mixa.o \
$(B)/client/matha.o \
$(B)/client/ftola.o \
$(B)/client/snapvectora.o
endif
ifeq ($(ARCH),x86_64)
Q3OBJ += $(B)/client/vm_x86_64.o
@ -837,21 +850,6 @@ ifeq ($(ARCH),ppc)
endif
endif
Q3OBJ += \
$(B)/client/linux_common.o \
\
$(B)/client/snd_mixa.o \
$(B)/client/matha.o \
$(B)/client/ftola.o \
$(B)/client/snapvectora.o \
\
$(B)/client/unix_main.o \
$(B)/client/unix_net.o \
$(B)/client/unix_shared.o \
$(B)/client/linux_signals.o \
$(B)/client/linux_qgl.o \
$(B)/client/linux_snd.o \
$(B)/client/sdl_snd.o
ifeq ($(PLATFORM),mingw32)
Q3OBJ += \
@ -867,6 +865,15 @@ ifeq ($(PLATFORM),mingw32)
$(B)/client/win_wndproc.o \
$(B)/client/win_resource.o
else
Q3OBJ += \
$(B)/client/unix_main.o \
$(B)/client/unix_net.o \
$(B)/client/unix_shared.o \
$(B)/client/linux_signals.o \
$(B)/client/linux_qgl.o \
$(B)/client/linux_snd.o \
$(B)/client/sdl_snd.o
ifeq ($(PLATFORM),linux)
Q3OBJ += $(B)/client/linux_joystick.o
endif
@ -1051,7 +1058,6 @@ $(B)/client/irix_glimp_smp.o : $(UDIR)/irix_glimp.c; $(DO_SMP_CC)
$(B)/client/irix_snd.o : $(UDIR)/irix_snd.c; $(DO_CC)
$(B)/client/irix_input.o : $(UDIR)/irix_input.c; $(DO_CC)
$(B)/client/linux_signals.o : $(UDIR)/linux_signals.c; $(DO_CC) $(GL_CFLAGS)
$(B)/client/linux_common.o : $(UDIR)/linux_common.c; $(DO_CC)
$(B)/client/linux_glimp.o : $(UDIR)/linux_glimp.c; $(DO_CC) $(GL_CFLAGS)
$(B)/client/sdl_glimp.o : $(UDIR)/sdl_glimp.c; $(DO_CC) $(GL_CFLAGS)
$(B)/client/linux_glimp_smp.o : $(UDIR)/linux_glimp.c; $(DO_SMP_CC) $(GL_CFLAGS)
@ -1154,7 +1160,6 @@ Q3DOBJ = \
$(B)/ded/l_struct.o \
\
$(B)/ded/linux_signals.o \
$(B)/ded/linux_common.o \
$(B)/ded/unix_main.o \
$(B)/ded/unix_net.o \
$(B)/ded/unix_shared.o \
@ -1236,7 +1241,6 @@ $(B)/ded/l_script.o : $(BLIBDIR)/l_script.c; $(DO_BOT_CC)
$(B)/ded/l_struct.o : $(BLIBDIR)/l_struct.c; $(DO_BOT_CC)
$(B)/ded/linux_signals.o : $(UDIR)/linux_signals.c; $(DO_DED_CC)
$(B)/ded/linux_common.o : $(UDIR)/linux_common.c; $(DO_DED_CC)
$(B)/ded/unix_main.o : $(UDIR)/unix_main.c; $(DO_DED_CC)
$(B)/ded/unix_net.o : $(UDIR)/unix_net.c; $(DO_DED_CC)
$(B)/ded/unix_shared.o : $(UDIR)/unix_shared.c; $(DO_DED_CC)

View file

@ -1,346 +0,0 @@
#if 0 // not used anymore
/*
===========================================================================
Copyright (C) 1999-2005 Id Software, Inc.
This file is part of Quake III Arena source code.
Quake III Arena source code is free software; you can redistribute it
and/or modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of the License,
or (at your option) any later version.
Quake III Arena source code is distributed in the hope that it will be
useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Quake III Arena source code; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
===========================================================================
*/
/**
* GAS syntax equivalents of the MSVC asm memory calls in common.c
*
* The following changes have been made to the asm:
* 1. Registers are loaded by the inline asm arguments when possible
* 2. Labels have been changed to local label format (0,1,etc.) to allow inlining
*
* HISTORY:
* AH - Created on 08 Dec 2000
*/
#include <unistd.h> // AH - for size_t
#include <string.h>
// bk001207 - we need something under Linux, too. Mac?
#if 1 // defined(C_ONLY) // bk010102 - dedicated?
/* C-only fallback (the asm path below is disabled): hand the copy
 * straight to the libc memcpy. Regions must not overlap. */
void Com_Memcpy (void* dest, const void* src, const size_t count) {
	memcpy (dest, src, count);
}
/* C-only fallback (the asm path below is disabled): hand the fill
 * straight to the libc memset; only the low byte of val is used. */
void Com_Memset (void* dest, const int val, const size_t count) {
	memset (dest, val, count);
}
#else
// Hint passed to Com_Prefetch describing how the caller is about to use
// the buffer; see Com_Prefetch at the end of this file.
typedef enum {
PRE_READ, // prefetch assuming that buffer is used for reading only
PRE_WRITE, // prefetch assuming that buffer is used for writing only
PRE_READ_WRITE // prefetch assuming that buffer is used for both reading and writing
} e_prefetch;
void Com_Prefetch (const void *s, const unsigned int bytes, e_prefetch type);
/*
 * _copyDWord — GAS translation of the MSVC helper: fill `count` dwords
 * at `dest` with the 32-bit pattern `constant`, using MMX qword stores
 * (16 dwords per main-loop iteration) and finishing with emms.
 * Operands: eax=constant, ecx=count, edx=dest (see the constraint list
 * after the asm string).
 *
 * NOTE(review): the backslash-newline continuations splice this entire
 * asm template into a single source line, so the `//` comments inside
 * the string would appear to swallow the code that follows them —
 * verify; this whole path is compiled out (`#if 1` above selects the
 * plain C wrappers, and this commit deletes the file).
 */
void _copyDWord (unsigned int* dest, const unsigned int constant, const unsigned int count) {
// MMX version not used on standard Pentium MMX
// because the dword version is faster (with
// proper destination prefetching)
__asm__ __volatile__ (" \
//mov eax,constant // eax = val \
//mov edx,dest // dest \
//mov ecx,count \
movd %%eax, %%mm0 \
punpckldq %%mm0, %%mm0 \
 \
// ensure that destination is qword aligned \
 \
testl $7, %%edx // qword padding?\
jz 0f \
movl %%eax, (%%edx) \
decl %%ecx \
addl $4, %%edx \
 \
0: movl %%ecx, %%ebx \
andl $0xfffffff0, %%ecx \
jz 2f \
jmp 1f \
.align 16 \
 \
// funny ordering here to avoid commands \
// that cross 32-byte boundaries (the \
// [edx+0] version has a special 3-byte opcode... \
1: movq %%mm0, 8(%%edx) \
movq %%mm0, 16(%%edx) \
movq %%mm0, 24(%%edx) \
movq %%mm0, 32(%%edx) \
movq %%mm0, 40(%%edx) \
movq %%mm0, 48(%%edx) \
movq %%mm0, 56(%%edx) \
movq %%mm0, (%%edx)\
addl $64, %%edx \
subl $16, %%ecx \
jnz 1b \
2: \
movl %%ebx, %%ecx // ebx = cnt \
andl $0xfffffff0, %%ecx // ecx = cnt&~15 \
subl %%ecx, %%ebx \
jz 6f \
cmpl $8, %%ebx \
jl 3f \
 \
movq %%mm0, (%%edx) \
movq %%mm0, 8(%%edx) \
movq %%mm0, 16(%%edx) \
movq %%mm0, 24(%%edx) \
addl $32, %%edx \
subl $8, %%ebx \
jz 6f \
 \
3: cmpl $4, %%ebx \
jl 4f \
 \
movq %%mm0, (%%edx) \
movq %%mm0, 8(%%edx) \
addl $16, %%edx \
subl $4, %%ebx \
 \
4: cmpl $2, %%ebx \
jl 5f \
movq %%mm0, (%%edx) \
addl $8, %%edx \
subl $2, %%ebx \
 \
5: cmpl $1, %%ebx \
jl 6f \
movl %%eax, (%%edx) \
6: \
emms \
"
: : "a" (constant), "c" (count), "d" (dest)
: "%ebx", "%edi", "%esi", "cc", "memory");
}
// optimized memory copy routine that handles all alignment
// cases and block sizes efficiently
//
// GAS translation of the MSVC Com_Memcpy: 32 bytes per main-loop
// iteration via paired dword moves, then a 16/8/4/2/1-byte tail.
// Operands: %0/"m"=src, edx=dest, ecx=count (constraint list after the
// asm string).
//
// NOTE(review): the mask $0xfffffe00 contradicts its own comment
// "edi = count&~31" — it clears the low 9 bits (count&~511), which for
// counts of 32-511 makes edi start at -32 and the first loop iteration
// access memory before the buffers. ~31 would be $0xffffffe0. Also the
// backslash continuations splice the template into one line, so the
// `//` comments would swallow following code. This path is compiled
// out (`#if 1` selects the C wrappers) and the file is deleted by this
// commit, so the defect is flagged rather than rewritten — verify
// before ever re-enabling.
void Com_Memcpy (void* dest, const void* src, const size_t count) {
Com_Prefetch (src, count, PRE_READ);
__asm__ __volatile__ (" \
pushl %%edi \
pushl %%esi \
//mov ecx,count \
cmpl $0, %%ecx // count = 0 check (just to be on the safe side) \
je 6f \
//mov edx,dest \
movl %0, %%ebx \
cmpl $32, %%ecx // padding only? \
jl 1f \
 \
movl %%ecx, %%edi \
andl $0xfffffe00, %%edi // edi = count&~31 \
subl $32, %%edi \
 \
.align 16 \
0: \
movl (%%ebx, %%edi, 1), %%eax \
movl 4(%%ebx, %%edi, 1), %%esi \
movl %%eax, (%%edx, %%edi, 1) \
movl %%esi, 4(%%edx, %%edi, 1) \
movl 8(%%ebx, %%edi, 1), %%eax \
movl 12(%%ebx, %%edi, 1), %%esi \
movl %%eax, 8(%%edx, %%edi, 1) \
movl %%esi, 12(%%edx, %%edi, 1) \
movl 16(%%ebx, %%edi, 1), %%eax \
movl 20(%%ebx, %%edi, 1), %%esi \
movl %%eax, 16(%%edx, %%edi, 1) \
movl %%esi, 20(%%edx, %%edi, 1) \
movl 24(%%ebx, %%edi, 1), %%eax \
movl 28(%%ebx, %%edi, 1), %%esi \
movl %%eax, 24(%%edx, %%edi, 1) \
movl %%esi, 28(%%edx, %%edi, 1) \
subl $32, %%edi \
jge 0b \
 \
movl %%ecx, %%edi \
andl $0xfffffe00, %%edi \
addl %%edi, %%ebx // increase src pointer \
addl %%edi, %%edx // increase dst pointer \
andl $31, %%ecx // new count \
jz 6f // if count = 0, get outta here \
 \
1: \
cmpl $16, %%ecx \
jl 2f \
movl (%%ebx), %%eax \
movl %%eax, (%%edx) \
movl 4(%%ebx), %%eax \
movl %%eax, 4(%%edx) \
movl 8(%%ebx), %%eax \
movl %%eax, 8(%%edx) \
movl 12(%%ebx), %%eax \
movl %%eax, 12(%%edx) \
subl $16, %%ecx \
addl $16, %%ebx \
addl $16, %%edx \
2: \
cmpl $8, %%ecx \
jl 3f \
movl (%%ebx), %%eax \
movl %%eax, (%%edx) \
movl 4(%%ebx), %%eax \
subl $8, %%ecx \
movl %%eax, 4(%%edx) \
addl $8, %%ebx \
addl $8, %%edx \
3: \
cmpl $4, %%ecx \
jl 4f \
movl (%%ebx), %%eax // here 4-7 bytes \
addl $4, %%ebx \
subl $4, %%ecx \
movl %%eax, (%%edx) \
addl $4, %%edx \
4: // 0-3 remaining bytes \
cmpl $2, %%ecx \
jl 5f \
movw (%%ebx), %%ax // two bytes \
cmpl $3, %%ecx // less than 3? \
movw %%ax, (%%edx) \
jl 6f \
movb 2(%%ebx), %%al // last byte \
movb %%al, 2(%%edx) \
jmp 6f \
5: \
cmpl $1, %%ecx \
jl 6f \
movb (%%ebx), %%al \
movb %%al, (%%edx) \
6: \
popl %%esi \
popl %%edi \
"
: : "m" (src), "d" (dest), "c" (count)
: "%eax", "%ebx", "%edi", "%esi", "cc", "memory");
}
/*
 * Com_Memset — GAS translation of the MSVC version: small fills
 * (< 8 bytes) in one asm block; larger fills build a dword pattern,
 * delegate the dword portion to _copyDWord, then pad 0-3 bytes.
 *
 * NOTE(review): in the padding block below, `andl $0xffffff00, %%eax`
 * clears the low 8 bits, but the MSVC original uses `and eax,~3` to
 * round the offset down to the last dword — $0xfffffffc looks like the
 * intended mask; verify. As with the rest of this file, the path is
 * compiled out and the file is deleted by this commit.
 */
void Com_Memset (void* dest, const int val, const size_t count)
{
unsigned int fillval;
if (count < 8)
{
// small fill: replicate the byte into a dword pattern in eax, then
// store in 4-, 2-, 1-byte steps
__asm__ __volatile__ (" \
//mov edx,dest \
//mov eax, val \
movb %%al, %%ah \
movl %%eax, %%ebx \
andl $0xffff, %%ebx \
shll $16, %%eax \
addl %%ebx, %%eax // eax now contains pattern \
//mov ecx,count \
cmpl $4, %%ecx \
jl 0f \
movl %%eax, (%%edx) // copy first dword \
addl $4, %%edx \
subl $4, %%ecx \
0: cmpl $2, %%ecx \
jl 1f \
movw %%ax, (%%edx) // copy 2 bytes \
addl $2, %%edx \
subl $2, %%ecx \
1: cmpl $0, %%ecx \
je 2f \
movb %%al, (%%edx) // copy single byte \
2: \
"
: : "d" (dest), "a" (val), "c" (count)
: "%ebx", "%edi", "%esi", "cc", "memory");
return;
}
// large fill: build the dword-wide pattern, bulk-fill count/4 dwords,
// then let the asm below finish the last count&3 bytes
fillval = val;
fillval = fillval|(fillval<<8);
fillval = fillval|(fillval<<16); // fill dword with 8-bit pattern
_copyDWord ((unsigned int*)(dest),fillval, count/4);
__asm__ __volatile__ (" // padding of 0-3 bytes \
//mov ecx,count \
movl %%ecx, %%eax \
andl $3, %%ecx \
jz 1f \
andl $0xffffff00, %%eax \
//mov ebx,dest \
addl %%eax, %%edx \
movl %0, %%eax \
cmpl $2, %%ecx \
jl 0f \
movw %%ax, (%%edx) \
cmpl $2, %%ecx \
je 1f \
movb %%al, 2(%%edx) \
jmp 1f \
0: \
cmpl $0, %%ecx\
je 1f\
movb %%al, (%%edx)\
1: \
"
: : "m" (fillval), "c" (count), "d" (dest)
: "%eax", "%ebx", "%edi", "%esi", "cc", "memory");
}
/*
 * Com_Prefetch — GAS translation: touch one byte per 32-byte cache line
 * of `s` (clamped to 4 kB) to pull the data into cache. PRE_WRITE is a
 * no-op; PRE_READ and PRE_READ_WRITE take the read path. Operands:
 * edx=s, ecx=bytes (constraint list after the asm string).
 */
void Com_Prefetch (const void *s, const unsigned int bytes, e_prefetch type)
{
// write buffer prefetching is performed only if
// the processor benefits from it. Read and read/write
// prefetching is always performed.
switch (type)
{
case PRE_WRITE : break;
case PRE_READ:
case PRE_READ_WRITE:
__asm__ __volatile__ ("\
//mov ebx,s\
//mov ecx,bytes\
cmpl $4096, %%ecx // clamp to 4kB\
jle 0f\
movl $4096, %%ecx\
0:\
addl $0x1f, %%ecx\
shrl $5, %%ecx // number of cache lines\
jz 2f\
jmp 1f\
\
.align 16\
1: testb %%al, (%%edx)\
addl $32, %%edx\
decl %%ecx\
jnz 1b\
2:\
"
: : "d" (s), "c" (bytes)
: "%eax", "%ebx", "%edi", "%esi", "memory", "cc");
break;
}
}
#endif
#endif