using the roundss SSE4.1 instruction for inlining floor and ceil syscalls when possible

fixed sharedTraps_t listing syscalls that were not actually at the same index for all 3 VMs
This commit is contained in:
myT 2019-09-18 17:06:58 +02:00
parent 1115cb39b0
commit c5192d49b7
5 changed files with 63 additions and 21 deletions

View file

@ -89,6 +89,8 @@ chg: r_textureMode and r_measureOverdraw were removed
chg: r_speeds 1 now reports more precise timings, V-Sync status, GPU time and the selected back-end
chg: using the roundss SSE4.1 instruction for inlining floor and ceil syscalls when possible
chg: improved face and grid processing performance with SSE2
chg: r_lightmap is now latched again

View file

@ -227,6 +227,21 @@ typedef enum {
// 1.32
G_FS_SEEK,
G_MEMSET = 100,
G_MEMCPY,
G_STRNCPY,
G_SIN,
G_COS,
G_ATAN2,
G_SQRT,
G_MATRIXMULTIPLY,
G_ANGLEVECTORS,
G_PERPENDICULARVECTOR,
G_FLOOR,
G_CEIL,
G_TESTPRINTINT,
G_TESTPRINTFLOAT,
BOTLIB_SETUP = 200, // ( void );
BOTLIB_SHUTDOWN, // ( void );
BOTLIB_LIBVAR_SET,

View file

@ -270,15 +270,8 @@ typedef enum {
TRAP_SIN,
TRAP_COS,
TRAP_ATAN2,
TRAP_SQRT,
TRAP_MATRIXMULTIPLY,
TRAP_ANGLEVECTORS,
TRAP_PERPENDICULARVECTOR,
TRAP_FLOOR,
TRAP_CEIL,
TRAP_TESTPRINTINT,
TRAP_TESTPRINTFLOAT
TRAP_SQRT
// note that ceil/floor etc have different numbers across VMs
} sharedTraps_t;
typedef enum {

View file

@ -863,6 +863,24 @@ static void EmitDATAFunc(vm_t *vm)
}
// Reports whether 'trap' identifies the floor syscall of the given VM.
// The value is compared against the bitwise complement (~n) of the
// syscall number, and that number depends on vm->index:
//   VM_CGAME and VM_UI -> 107
//   any other index    -> 110 (VM_GAME, matching G_FLOOR)
static qbool IsFloorTrap(vm_t *vm, int trap)
{
	const int floorTrap = ( vm->index == VM_CGAME || vm->index == VM_UI ) ? ~107 : ~110;

	return trap == floorTrap;
}
// Reports whether 'trap' identifies the ceil syscall of the given VM.
// The value is compared against the bitwise complement (~n) of the
// syscall number, and that number depends on vm->index:
//   VM_CGAME and VM_UI -> 108
//   any other index    -> 111 (VM_GAME, matching G_CEIL)
static qbool IsCeilTrap(vm_t *vm, int trap)
{
	const int ceilTrap = ( vm->index == VM_CGAME || vm->index == VM_UI ) ? ~108 : ~111;

	return trap == ceilTrap;
}
/*
=================
ConstOptimize
@ -1114,6 +1132,20 @@ static qboolean ConstOptimize(vm_t *vm)
EmitCommand( LAST_COMMAND_STORE_FLOAT_EDI );
ip += 1;
return qtrue;
} else if ( IsFloorTrap( vm, v ) && ( cpu_features & CPU_SSE41 ) != 0 ) {
EmitString( "f3 0f 10 45 08" ); // movss xmm0, dword ptr [ebp + 8]
EmitAddEDI4( vm );
EmitString( "66 0f 3a 0a c0 01" ); // roundss xmm0, xmm0, 1 (exceptions not masked)
EmitCommand( LAST_COMMAND_STORE_FLOAT_EDI );
ip += 1;
return qtrue;
} else if ( IsCeilTrap( vm, v ) && ( cpu_features & CPU_SSE41 ) != 0 ) {
EmitString( "f3 0f 10 45 08" ); // movss xmm0, dword ptr [ebp + 8]
EmitAddEDI4( vm );
EmitString( "66 0f 3a 0a c0 02" ); // roundss xmm0, xmm0, 2 (exceptions not masked)
EmitCommand( LAST_COMMAND_STORE_FLOAT_EDI );
ip += 1;
return qtrue;
}
if ( v < 0 ) // syscall

View file

@ -764,46 +764,46 @@ static intptr_t SV_GameSystemCalls( intptr_t* args )
case BOTLIB_AI_GENETIC_PARENTS_AND_CHILD_SELECTION:
return botlib_export->ai.GeneticParentsAndChildSelection(args[1], VMA(2), VMA(3), VMA(4), VMA(5));
case TRAP_MEMSET:
case G_MEMSET:
Com_Memset( VMA(1), args[2], args[3] );
return 0;
case TRAP_MEMCPY:
case G_MEMCPY:
Com_Memcpy( VMA(1), VMA(2), args[3] );
return 0;
case TRAP_STRNCPY:
case G_STRNCPY:
strncpy( VMA(1), VMA(2), args[3] );
return args[1];
case TRAP_SIN:
case G_SIN:
return PASSFLOAT( sin( VMF(1) ) );
case TRAP_COS:
case G_COS:
return PASSFLOAT( cos( VMF(1) ) );
case TRAP_ATAN2:
case G_ATAN2:
return PASSFLOAT( atan2( VMF(1), VMF(2) ) );
case TRAP_SQRT:
case G_SQRT:
return PASSFLOAT( sqrt( VMF(1) ) );
case TRAP_MATRIXMULTIPLY:
case G_MATRIXMULTIPLY:
MatrixMultiply( VMA(1), VMA(2), VMA(3) );
return 0;
case TRAP_ANGLEVECTORS:
case G_ANGLEVECTORS:
AngleVectors( VMA(1), VMA(2), VMA(3), VMA(4) );
return 0;
case TRAP_PERPENDICULARVECTOR:
case G_PERPENDICULARVECTOR:
PerpendicularVector( VMA(1), VMA(2) );
return 0;
case TRAP_FLOOR:
case G_FLOOR:
return PASSFLOAT( floor( VMF(1) ) );
case TRAP_CEIL:
case G_CEIL:
return PASSFLOAT( ceil( VMF(1) ) );
// extensions