using the roundss SSE4.1 instruction for inlining floor and ceil syscalls when possible

fixed sharedTraps_t listing syscalls that were not actually at the same index for all 3 VMs
This commit is contained in:
myT 2019-09-18 17:06:58 +02:00
parent 1115cb39b0
commit c5192d49b7
5 changed files with 63 additions and 21 deletions

View file

@ -89,6 +89,8 @@ chg: r_textureMode and r_measureOverdraw were removed
chg: r_speeds 1 now reports more precise timings, V-Sync status, GPU time and the selected back-end chg: r_speeds 1 now reports more precise timings, V-Sync status, GPU time and the selected back-end
chg: using the roundss SSE4.1 instruction for inlining floor and ceil syscalls when possible
chg: improved face and grid processing performance with SSE2 chg: improved face and grid processing performance with SSE2
chg: r_lightmap is now latched again chg: r_lightmap is now latched again

View file

@ -227,6 +227,21 @@ typedef enum {
// 1.32 // 1.32
G_FS_SEEK, G_FS_SEEK,
G_MEMSET = 100,
G_MEMCPY,
G_STRNCPY,
G_SIN,
G_COS,
G_ATAN2,
G_SQRT,
G_MATRIXMULTIPLY,
G_ANGLEVECTORS,
G_PERPENDICULARVECTOR,
G_FLOOR,
G_CEIL,
G_TESTPRINTINT,
G_TESTPRINTFLOAT,
BOTLIB_SETUP = 200, // ( void ); BOTLIB_SETUP = 200, // ( void );
BOTLIB_SHUTDOWN, // ( void ); BOTLIB_SHUTDOWN, // ( void );
BOTLIB_LIBVAR_SET, BOTLIB_LIBVAR_SET,

View file

@ -270,15 +270,8 @@ typedef enum {
TRAP_SIN, TRAP_SIN,
TRAP_COS, TRAP_COS,
TRAP_ATAN2, TRAP_ATAN2,
TRAP_SQRT, TRAP_SQRT
TRAP_MATRIXMULTIPLY, // note that ceil/floor etc have different numbers across VMs
TRAP_ANGLEVECTORS,
TRAP_PERPENDICULARVECTOR,
TRAP_FLOOR,
TRAP_CEIL,
TRAP_TESTPRINTINT,
TRAP_TESTPRINTFLOAT
} sharedTraps_t; } sharedTraps_t;
typedef enum { typedef enum {

View file

@ -863,6 +863,24 @@ static void EmitDATAFunc(vm_t *vm)
} }
// Tells whether 'trap' is the floor syscall for the given VM.
// 'trap' is compared against the bitwise complement of the syscall index:
// ~110 for the game VM (G_FLOOR), ~107 for the cgame and ui VMs
// (presumably their respective floor indices -- confirm against their enums).
static qbool IsFloorTrap(vm_t *vm, int trap)
{
	int floorIndex = 110; // VM_GAME

	if ( vm->index == VM_CGAME || vm->index == VM_UI ) {
		floorIndex = 107;
	}

	return trap == ~floorIndex;
}
// Tells whether 'trap' is the ceil syscall for the given VM.
// 'trap' is compared against the bitwise complement of the syscall index:
// ~111 for the game VM (G_CEIL), ~108 for the cgame and ui VMs
// (presumably their respective ceil indices -- confirm against their enums).
static qbool IsCeilTrap(vm_t *vm, int trap)
{
	int ceilIndex = 111; // VM_GAME

	if ( vm->index == VM_CGAME || vm->index == VM_UI ) {
		ceilIndex = 108;
	}

	return trap == ~ceilIndex;
}
/* /*
================= =================
ConstOptimize ConstOptimize
@ -1114,6 +1132,20 @@ static qboolean ConstOptimize(vm_t *vm)
EmitCommand( LAST_COMMAND_STORE_FLOAT_EDI ); EmitCommand( LAST_COMMAND_STORE_FLOAT_EDI );
ip += 1; ip += 1;
return qtrue; return qtrue;
} else if ( IsFloorTrap( vm, v ) && ( cpu_features & CPU_SSE41 ) != 0 ) {
EmitString( "f3 0f 10 45 08" ); // movss xmm0, dword ptr [ebp + 8]
EmitAddEDI4( vm );
EmitString( "66 0f 3a 0a c0 01" ); // roundss xmm0, xmm0, 1 (exceptions not masked)
EmitCommand( LAST_COMMAND_STORE_FLOAT_EDI );
ip += 1;
return qtrue;
} else if ( IsCeilTrap( vm, v ) && ( cpu_features & CPU_SSE41 ) != 0 ) {
EmitString( "f3 0f 10 45 08" ); // movss xmm0, dword ptr [ebp + 8]
EmitAddEDI4( vm );
EmitString( "66 0f 3a 0a c0 02" ); // roundss xmm0, xmm0, 2 (exceptions not masked)
EmitCommand( LAST_COMMAND_STORE_FLOAT_EDI );
ip += 1;
return qtrue;
} }
if ( v < 0 ) // syscall if ( v < 0 ) // syscall

View file

@ -764,46 +764,46 @@ static intptr_t SV_GameSystemCalls( intptr_t* args )
case BOTLIB_AI_GENETIC_PARENTS_AND_CHILD_SELECTION: case BOTLIB_AI_GENETIC_PARENTS_AND_CHILD_SELECTION:
return botlib_export->ai.GeneticParentsAndChildSelection(args[1], VMA(2), VMA(3), VMA(4), VMA(5)); return botlib_export->ai.GeneticParentsAndChildSelection(args[1], VMA(2), VMA(3), VMA(4), VMA(5));
case TRAP_MEMSET: case G_MEMSET:
Com_Memset( VMA(1), args[2], args[3] ); Com_Memset( VMA(1), args[2], args[3] );
return 0; return 0;
case TRAP_MEMCPY: case G_MEMCPY:
Com_Memcpy( VMA(1), VMA(2), args[3] ); Com_Memcpy( VMA(1), VMA(2), args[3] );
return 0; return 0;
case TRAP_STRNCPY: case G_STRNCPY:
strncpy( VMA(1), VMA(2), args[3] ); strncpy( VMA(1), VMA(2), args[3] );
return args[1]; return args[1];
case TRAP_SIN: case G_SIN:
return PASSFLOAT( sin( VMF(1) ) ); return PASSFLOAT( sin( VMF(1) ) );
case TRAP_COS: case G_COS:
return PASSFLOAT( cos( VMF(1) ) ); return PASSFLOAT( cos( VMF(1) ) );
case TRAP_ATAN2: case G_ATAN2:
return PASSFLOAT( atan2( VMF(1), VMF(2) ) ); return PASSFLOAT( atan2( VMF(1), VMF(2) ) );
case TRAP_SQRT: case G_SQRT:
return PASSFLOAT( sqrt( VMF(1) ) ); return PASSFLOAT( sqrt( VMF(1) ) );
case TRAP_MATRIXMULTIPLY: case G_MATRIXMULTIPLY:
MatrixMultiply( VMA(1), VMA(2), VMA(3) ); MatrixMultiply( VMA(1), VMA(2), VMA(3) );
return 0; return 0;
case TRAP_ANGLEVECTORS: case G_ANGLEVECTORS:
AngleVectors( VMA(1), VMA(2), VMA(3), VMA(4) ); AngleVectors( VMA(1), VMA(2), VMA(3), VMA(4) );
return 0; return 0;
case TRAP_PERPENDICULARVECTOR: case G_PERPENDICULARVECTOR:
PerpendicularVector( VMA(1), VMA(2) ); PerpendicularVector( VMA(1), VMA(2) );
return 0; return 0;
case TRAP_FLOOR: case G_FLOOR:
return PASSFLOAT( floor( VMF(1) ) ); return PASSFLOAT( floor( VMF(1) ) );
case TRAP_CEIL: case G_CEIL:
return PASSFLOAT( ceil( VMF(1) ) ); return PASSFLOAT( ceil( VMF(1) ) );
// extensions // extensions