* Fix the MinGW and (hopefully) OS X builds

* Remove custom memcpy/memset code
2024-11-10 14:41:42 +00:00 · 2006-01-04 03:40:49 +00:00 · 2006-01-04 03:40:49 +00:00 · 6e24cfe7d3
commit 6e24cfe7d3
parent 2d9d10772f
5 changed files with 22 additions and 684 deletions
--- a/code/qcommon/common.c
+++ b/code/qcommon/common.c
@ -2857,316 +2857,6 @@ void Com_Shutdown (void) {
 }
 #if I_WANT_A_CUSTOM_MEMCPY && !defined(_WIN32)
 // Com_Memcpy - plain C build: hands the copy straight to the C library.
 //   dest   destination buffer, at least `count` bytes
 //   src    source buffer, at least `count` bytes
 //   count  number of bytes to copy
 void Com_Memcpy (void* dest, const void* src, const size_t count)
 {
 	(void) memcpy (dest, src, count);
 }
 // Com_Memset - plain C build: hands the fill straight to the C library.
 //   dest   buffer to fill, at least `count` bytes
 //   val    fill value, passed through to memset unchanged
 //   count  number of bytes to write
 void Com_Memset (void* dest, const int val, const size_t count)
 {
 	(void) memset (dest, val, count);
 }
 #elif I_WANT_A_CUSTOM_MEMCPY && defined(_WIN32)
 // Access-pattern hint passed as the last argument of Com_Prefetch.
 typedef enum
 {
 	PRE_READ,									// prefetch assuming that buffer is used for reading only
 	PRE_WRITE,									// prefetch assuming that buffer is used for writing only
 	PRE_READ_WRITE								// prefetch assuming that buffer is used for both reading and writing
 } e_prefetch;
 // Forward declaration; the definition appears further down in this #elif branch.
 void Com_Prefetch (const void *s, const unsigned int bytes, e_prefetch type);
 // MSVC inline-assembly statement that issues the x86 `emms` instruction.
 #define EMMS_INSTRUCTION	__asm emms
 // _copyDWord - store `constant` into `count` consecutive 32-bit words.
 //   dest      first word to write
 //   constant  32-bit pattern to store
 //   count     number of words (not bytes) to write; 0 writes nothing
 //
 // Written in portable C instead of an MSVC `__asm { }` block: this branch is
 // selected by _WIN32 alone, so it must also build with compilers that do not
 // understand MSVC inline assembly.
 void _copyDWord (unsigned int* dest, const unsigned int constant, const unsigned int count) {
 	unsigned char *out = (unsigned char *)dest;
 	unsigned int i;
 	// Com_Memset passes a cast byte pointer, so `dest` is not guaranteed to be
 	// word aligned; storing through memcpy keeps every write alignment-safe.
 	for (i = 0; i < count; i++, out += sizeof constant) {
 		memcpy (out, &constant, sizeof constant);
 	}
 }
 // Com_Memcpy - copy `count` bytes from `src` to `dest`.
 //   dest   destination buffer, at least `count` bytes
 //   src    source buffer, at least `count` bytes
 //   count  number of bytes to copy; 0 is a no-op and touches neither pointer
 //
 // The ranges must not overlap (memcpy contract).
 //
 // Delegates to the C library instead of MSVC-only inline assembly so this
 // _WIN32 branch also compiles with non-MSVC toolchains.  The earlier
 // read-prefetch pass is dropped: it only touched the source to warm the
 // cache and produced no observable result.
 void Com_Memcpy (void* dest, const void* src, const size_t count) {
 	// Keep the explicit zero-length guard so callers may pass any pointers
 	// together with count == 0.
 	if (count == 0) {
 		return;
 	}
 	memcpy (dest, src, count);
 }
 // Com_Memset - fill `count` bytes at `dest` with the low 8 bits of `val`.
 //   dest   buffer to fill, at least `count` bytes
 //   val    fill value; only its low byte is used, as memset specifies
 //   count  number of bytes to write; 0 is a no-op
 //
 // Delegates to the C library.  The previous hand-written version built its
 // 32-bit fill pattern for count >= 8 without masking `val` to one byte, so a
 // value such as 0x1AA produced the bytes AA AB AB AB instead of AA AA AA AA,
 // and it relied on MSVC-only inline assembly although this branch is chosen
 // by _WIN32 alone.
 void Com_Memset (void* dest, const int val, const size_t count)
 {
 	// Zero-length fills never touch `dest`.
 	if (count == 0) {
 		return;
 	}
 	memset (dest, val, count);
 }
 // Com_Memcmp - test two buffers for byte-wise equality.
 //   src0, src1  buffers to compare, each at least `count` bytes
 //   count       number of bytes to compare
 // Returns qtrue when all `count` bytes match (including count == 0),
 // qfalse otherwise.
 //
 // Uses the C library comparison.  The previous hand-rolled loop bounded its
 // dword index by count/16 while stepping four dwords at a time, so it only
 // examined roughly the first quarter of the 16-byte-aligned prefix and could
 // report buffers as equal when later bytes differed.
 qboolean Com_Memcmp (const void *src0, const void *src1, const unsigned int count)
 {
 	if (count == 0) {
 		return qtrue;
 	}
 	return (memcmp (src0, src1, count) == 0) ? qtrue : qfalse;
 }
 // Com_Prefetch - touch a buffer so its leading cache lines are loaded.
 //   s      start of the buffer
 //   bytes  buffer length; at most the first 4096 bytes are touched
 //   type   access hint; PRE_WRITE (and any unrecognised value) does nothing,
 //          PRE_READ and PRE_READ_WRITE read one byte per 32-byte line
 //
 // Portable C replacement for the MSVC-only inline assembly, which this
 // _WIN32-selected branch cannot assume is available.  The length clamp is
 // done on the unsigned value, so lengths of 2 GB and above are clamped too.
 void Com_Prefetch (const void *s, const unsigned int bytes, e_prefetch type)
 {
 	// volatile keeps the otherwise unused reads from being optimised away
 	const volatile unsigned char *line = (const volatile unsigned char *)s;
 	unsigned int lines;
 	switch (type)
 	{
 		case PRE_READ:
 		case PRE_READ_WRITE:
 			break;
 		default:
 			// write-only buffers are not prefetched
 			return;
 	}
 	lines = (bytes > 4096) ? 4096 : bytes;	// clamp to 4kB
 	lines = (lines + 0x1f) >> 5;			// number of 32-byte cache lines
 	for (; lines != 0; lines--, line += 32) {
 		(void)*line;
 	}
 }
 #endif
 //------------------------------------------------------------------------
--- a/code/qcommon/md4.c
+++ b/code/qcommon/md4.c
@ -38,13 +38,8 @@ void MD4Init (MD4_CTX *);
 void MD4Update (MD4_CTX *, const unsigned char *, unsigned int);
 void MD4Final (unsigned char [16], MD4_CTX *);
 // Memory primitive selection: when I_WANT_A_CUSTOM_MEMCPY is non-zero,
 // Com_Memset/Com_Memcpy are real functions declared here; otherwise the
 // names are macro aliases for the C library's memset/memcpy.
 #if I_WANT_A_CUSTOM_MEMCPY
 void Com_Memset (void* dest, const int val, const size_t count);
 void Com_Memcpy (void* dest, const void* src, const size_t count);
 #else
 #define Com_Memset memset
 #define Com_Memcpy memcpy
 #endif
 /* MD4C.C - RSA Data Security, Inc., MD4 message-digest algorithm */
 /* Copyright (C) 1990-2, RSA Data Security, Inc. All rights reserved.
--- a/code/qcommon/q_shared.h
+++ b/code/qcommon/q_shared.h
@ -243,13 +243,8 @@ void Snd_Memset (void* dest, const int val, const size_t count);
 #define Snd_Memset Com_Memset
 #endif
 // Memory primitive selection: when I_WANT_A_CUSTOM_MEMCPY is non-zero,
 // Com_Memset/Com_Memcpy are real functions declared here; otherwise the
 // names are macro aliases for the C library's memset/memcpy.
 #if I_WANT_A_CUSTOM_MEMCPY
 void Com_Memset (void* dest, const int val, const size_t count);
 void Com_Memcpy (void* dest, const void* src, const size_t count);
 #else
 #define Com_Memset memset
 #define Com_Memcpy memcpy
 #endif
 #define CIN_system	1
 #define CIN_loop	2
--- a/code/unix/Makefile
+++ b/code/unix/Makefile
@ -350,6 +350,9 @@ ifeq ($(PLATFORM),mingw32)
    LDFLAGS+=-m32
  endif
  BUILD_SERVER = 0
  BUILD_CLIENT_SMP = 0
 else # ifeq mingw32
 #############################################################################
@ -823,9 +826,19 @@ Q3OBJ = \
 # ARCH=i386 and ARCH=x86 receive the same additions to Q3OBJ: vm_x86.o plus
 # the snd_mixa, matha, ftola and snapvectora objects.  The two blocks below
 # are intentionally identical; keep them in sync when editing either one.
 ifeq ($(ARCH),i386)
  Q3OBJ += $(B)/client/vm_x86.o
  Q3OBJ += \
    $(B)/client/snd_mixa.o \
    $(B)/client/matha.o \
    $(B)/client/ftola.o \
    $(B)/client/snapvectora.o
 endif
 ifeq ($(ARCH),x86)
  Q3OBJ += $(B)/client/vm_x86.o
  Q3OBJ += \
    $(B)/client/snd_mixa.o \
    $(B)/client/matha.o \
    $(B)/client/ftola.o \
    $(B)/client/snapvectora.o
 endif
 ifeq ($(ARCH),x86_64)
  Q3OBJ += $(B)/client/vm_x86_64.o
@ -837,21 +850,6 @@ ifeq ($(ARCH),ppc)
  endif
 endif
 Q3OBJ += \
  $(B)/client/linux_common.o \
  \
  $(B)/client/snd_mixa.o \
  $(B)/client/matha.o \
  $(B)/client/ftola.o \
  $(B)/client/snapvectora.o \
  \
  $(B)/client/unix_main.o \
  $(B)/client/unix_net.o \
  $(B)/client/unix_shared.o \
  $(B)/client/linux_signals.o \
  $(B)/client/linux_qgl.o \
  $(B)/client/linux_snd.o \
  $(B)/client/sdl_snd.o
 ifeq ($(PLATFORM),mingw32)
  Q3OBJ += \
@ -867,6 +865,15 @@ ifeq ($(PLATFORM),mingw32)
    $(B)/client/win_wndproc.o \
    $(B)/client/win_resource.o
 else
  Q3OBJ += \
    $(B)/client/unix_main.o \
    $(B)/client/unix_net.o \
    $(B)/client/unix_shared.o \
    $(B)/client/linux_signals.o \
    $(B)/client/linux_qgl.o \
    $(B)/client/linux_snd.o \
    $(B)/client/sdl_snd.o
  ifeq ($(PLATFORM),linux)
    Q3OBJ += $(B)/client/linux_joystick.o
  endif
@ -1051,7 +1058,6 @@ $(B)/client/irix_glimp_smp.o : $(UDIR)/irix_glimp.c; $(DO_SMP_CC)
 $(B)/client/irix_snd.o : $(UDIR)/irix_snd.c; $(DO_CC)
 $(B)/client/irix_input.o : $(UDIR)/irix_input.c; $(DO_CC)
 $(B)/client/linux_signals.o : $(UDIR)/linux_signals.c; $(DO_CC) $(GL_CFLAGS)
 $(B)/client/linux_common.o : $(UDIR)/linux_common.c; $(DO_CC)
 $(B)/client/linux_glimp.o : $(UDIR)/linux_glimp.c; $(DO_CC)  $(GL_CFLAGS)
 $(B)/client/sdl_glimp.o : $(UDIR)/sdl_glimp.c; $(DO_CC)  $(GL_CFLAGS)
 $(B)/client/linux_glimp_smp.o : $(UDIR)/linux_glimp.c; $(DO_SMP_CC)  $(GL_CFLAGS)
@ -1154,7 +1160,6 @@ Q3DOBJ = \
  $(B)/ded/l_struct.o \
  \
  $(B)/ded/linux_signals.o \
  $(B)/ded/linux_common.o \
  $(B)/ded/unix_main.o \
  $(B)/ded/unix_net.o \
  $(B)/ded/unix_shared.o \
@ -1236,7 +1241,6 @@ $(B)/ded/l_script.o : $(BLIBDIR)/l_script.c; $(DO_BOT_CC)
 $(B)/ded/l_struct.o : $(BLIBDIR)/l_struct.c; $(DO_BOT_CC)
 $(B)/ded/linux_signals.o : $(UDIR)/linux_signals.c; $(DO_DED_CC)
 $(B)/ded/linux_common.o : $(UDIR)/linux_common.c; $(DO_DED_CC)
 $(B)/ded/unix_main.o : $(UDIR)/unix_main.c; $(DO_DED_CC)
 $(B)/ded/unix_net.o : $(UDIR)/unix_net.c; $(DO_DED_CC)
 $(B)/ded/unix_shared.o : $(UDIR)/unix_shared.c; $(DO_DED_CC)
--- a/code/unix/linux_common.c
+++ b/code/unix/linux_common.c
@ -1,346 +0,0 @@
 #if 0 // not used anymore
 /*
 ===========================================================================
 Copyright (C) 1999-2005 Id Software, Inc.
 This file is part of Quake III Arena source code.
 Quake III Arena source code is free software; you can redistribute it
 and/or modify it under the terms of the GNU General Public License as
 published by the Free Software Foundation; either version 2 of the License,
 or (at your option) any later version.
 Quake III Arena source code is distributed in the hope that it will be
 useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with Quake III Arena source code; if not, write to the Free Software
 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 ===========================================================================
 */
 /** 
 * GAS syntax equivalents of the MSVC asm memory calls in common.c
 *
 * The following changes have been made to the asm:
 * 1. Registers are loaded by the inline asm arguments when possible
 * 2. Labels have been changed to local label format (0,1,etc.) to allow inlining
 *
 * HISTORY:
 *	AH - Created on 08 Dec 2000
 */
 #include <unistd.h>   // AH - for size_t
 #include <string.h>
 // bk001207 - we need something under Linux, too. Mac?
 #if 1 // defined(C_ONLY) // bk010102 - dedicated?
 // Com_Memcpy - C-only variant: forwards the copy to the C library.
 //   dest   destination buffer, at least `count` bytes
 //   src    source buffer, at least `count` bytes
 //   count  number of bytes to copy
 void Com_Memcpy (void* dest, const void* src, const size_t count)
 {
 	(void) memcpy (dest, src, count);
 }
 // Com_Memset - C-only variant: forwards the fill to the C library.
 //   dest   buffer to fill, at least `count` bytes
 //   val    fill value, passed through to memset unchanged
 //   count  number of bytes to write
 void Com_Memset (void* dest, const int val, const size_t count)
 {
 	(void) memset (dest, val, count);
 }
 #else
 // Access-pattern hint passed as the last argument of Com_Prefetch.
 typedef enum {
  PRE_READ,         // prefetch assuming that buffer is used for reading only
  PRE_WRITE,        // prefetch assuming that buffer is used for writing only
  PRE_READ_WRITE    // prefetch assuming that buffer is used for both reading and writing
 } e_prefetch;
 // Forward declaration; the definition appears further down in this #else branch.
 void Com_Prefetch (const void *s, const unsigned int bytes, e_prefetch type);
 // _copyDWord - store `constant` into `count` consecutive 32-bit words.
 //   dest      first word to write
 //   constant  32-bit pattern to store
 //   count     number of words (not bytes) to write; 0 writes nothing
 //
 // Portable C replacement for the MMX inline assembly.  That template was a
 // single backslash-continued string literal, so after line splicing its
 // instructions and `//` comments ran together with no separators, and it
 // modified registers that were bound as input-only operands.
 void _copyDWord (unsigned int* dest, const unsigned int constant, const unsigned int count) {
 	unsigned char *out = (unsigned char *)dest;
 	unsigned int i;
 	// Com_Memset passes a cast byte pointer, so `dest` is not guaranteed to be
 	// word aligned; storing through memcpy keeps every write alignment-safe.
 	for (i = 0; i < count; i++, out += sizeof constant) {
 		memcpy (out, &constant, sizeof constant);
 	}
 }
 // Com_Memcpy - copy `count` bytes from `src` to `dest`.
 //   dest   destination buffer, at least `count` bytes
 //   src    source buffer, at least `count` bytes
 //   count  number of bytes to copy; 0 is a no-op and touches neither pointer
 //
 // The ranges must not overlap (memcpy contract).
 //
 // Delegates to the C library.  The inline-assembly version masked the length
 // with 0xfffffe00 where count & ~31 was intended, so for most lengths of 32
 // bytes or more its block loop started at the wrong (even negative) offset;
 // its template was also one spliced string without statement separators.
 // The earlier read-prefetch pass is dropped: it only touched the source to
 // warm the cache and produced no observable result.
 void Com_Memcpy (void* dest, const void* src, const size_t count) {
 	// Keep the explicit zero-length guard so callers may pass any pointers
 	// together with count == 0.
 	if (count == 0) {
 		return;
 	}
 	memcpy (dest, src, count);
 }
 // Com_Memset - fill `count` bytes at `dest` with the low 8 bits of `val`.
 //   dest   buffer to fill, at least `count` bytes
 //   val    fill value; only its low byte is used, as memset specifies
 //   count  number of bytes to write; 0 is a no-op
 //
 // Delegates to the C library.  The inline-assembly version located the 0-3
 // byte tail with the mask 0xffffff00 where count & ~3 was intended, and built
 // its 32-bit fill pattern for count >= 8 without masking `val` to one byte
 // (0x1AA produced AA AB AB AB instead of AA AA AA AA).
 void Com_Memset (void* dest, const int val, const size_t count)
 {
 	// Zero-length fills never touch `dest`.
 	if (count == 0) {
 		return;
 	}
 	memset (dest, val, count);
 }
 // Com_Prefetch - touch a buffer so its leading cache lines are loaded.
 //   s      start of the buffer
 //   bytes  buffer length; at most the first 4096 bytes are touched
 //   type   access hint; PRE_WRITE (and any unrecognised value) does nothing,
 //          PRE_READ and PRE_READ_WRITE read one byte per 32-byte line
 //
 // Portable C replacement for the inline assembly, whose template was one
 // backslash-spliced string without statement separators and which modified
 // registers bound as input-only operands.  The length clamp is done on the
 // unsigned value, so lengths of 2 GB and above are clamped too.
 void Com_Prefetch (const void *s, const unsigned int bytes, e_prefetch type)
 {
 	// volatile keeps the otherwise unused reads from being optimised away
 	const volatile unsigned char *line = (const volatile unsigned char *)s;
 	unsigned int lines;
 	switch (type)
 	{
 		case PRE_READ:
 		case PRE_READ_WRITE:
 			break;
 		default:
 			// write-only buffers are not prefetched
 			return;
 	}
 	lines = (bytes > 4096) ? 4096 : bytes;	// clamp to 4kB
 	lines = (lines + 0x1f) >> 5;			// number of 32-byte cache lines
 	for (; lines != 0; lines--, line += 32) {
 		(void)*line;
 	}
 }
 #endif
 #endif