// rallyunlimited-engine/code/qcommon/vm_armv7l.c
/*
===========================================================================
Copyright (C) 2009 David S. Miller <davem@davemloft.net>
Copyright (C) 2013,2014 SUSE Linux Products GmbH
Copyright (C) 2020-2021 Quake3e project
This file is part of Quake III Arena source code.
Quake III Arena source code is free software; you can redistribute it
and/or modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of the License,
or (at your option) any later version.
Quake III Arena source code is distributed in the hope that it will be
useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Quake III Arena source code; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
===========================================================================
ARMv7l VM by Ludwig Nussel <ludwig.nussel@suse.de>
TODO: optimization
Docu:
http://www.coranac.com/tonc/text/asm.htm
http://www.heyrick.co.uk/armwiki/Category:Opcodes
ARMv7-A_ARMv7-R_DDI0406_2007.pdf
*/
#ifdef _WIN32
#include <windows.h>
#pragma warning( disable : 4245 ) // conversion from int to XXX, signed/unsigned mismatch
#pragma warning( disable : 4146 ) // unary minus operator applied to unsigned type, result still unsigned
#else
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>
#include <math.h>
#endif
#include "vm_local.h"
#define NUM_PASSES 1
// additional integrity checks
#define DEBUG_VM
// various definitions to enable/disable particular optimization
// use dynamic allocation of integer/scalar registers
#define DYN_ALLOC_RX
#define DYN_ALLOC_SX
// re-use constants previously stored in scratch registers
#define CONST_CACHE_RX
#define CONST_CACHE_SX
#define REGS_OPTIMIZE
#define FPU_OPTIMIZE
#define CONST_OPTIMIZE
#define ADDR_OPTIMIZE
#define LOAD_OPTIMIZE
// allow sharing both variables and constants in registers
#define REG_TYPE_MASK
// number of variables/memory mappings per register
#define REG_MAP_COUNT 4
#define FUNC_ALIGN 16
//#define DUMP_CODE
typedef enum
{
FUNC_ENTR,
FUNC_BCPY,
FUNC_CALL,
FUNC_SYSC,
FUNC_SYSF,
FUNC_PSOF,
FUNC_OSOF,
FUNC_BADJ,
FUNC_OUTJ,
FUNC_BADR,
FUNC_BADW,
OFFSET_T_LAST
} offset_t;
static uint32_t *code;
static uint32_t compiledOfs;
//static uint32_t *instructionOffsets;
//static intptr_t *instructionPointers;
static instruction_t *inst = NULL;
static instruction_t *ci;
static uint32_t ip;
static uint32_t pass;
static uint32_t savedOffset[ OFFSET_T_LAST ];
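// savedOffset[] records code offsets of the shared stubs emitted once per compile:
// FUNC_CALL/FUNC_SYSC/FUNC_SYSF dispatch OP_CALL and system calls, FUNC_BCPY is the
// block-copy helper, the remaining entries jump to the run-time error handlers
// (bad jumps, programStack/opStack overflow, out-of-segment reads/writes)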
#define R0 0 // scratch
#define R1 1 // scratch
#define R2 2 // scratch
#define R3 3 // scratch
#define R4 4 // * vmBase
#define R5 5 // * opStack
#define R6 6 // * opStackTop
#define R7 7 // * instructionPointers
#define R8 8 // * programStack
#define R9 9 // * dataBase
#define R10 10 // * dataMask
#define R11 11 // * procBase, frame pointer
#define R12 12 // scratch, inter-procedure call
#define SP 13 // * stack pointer
#define LR 14 // link register
#define PC 15 // program counter
#define FP R11
#define APSR_nzcv 15
#define S0 0
#define S1 1
#define S2 2
#define rVMBASE R4
#define rOPSTACK R5
#define rOPSTACKTOP R6
#define rINSPOINTERS R7
#define rPSTACK R8
#define rDATABASE R9
#define rDATAMASK R10
#define rPROCBASE FP
#define R4_R11 ((1<<R4)|(1<<R5)|(1<<R6)|(1<<R7)|(1<<R8)|(1<<R9)|(1<<R10)|(1<<R11))
/* arm eabi, builtin gcc functions */
int __aeabi_idiv(int, int);
unsigned __aeabi_uidiv(unsigned, unsigned);
void __aeabi_idivmod(void);
void __aeabi_uidivmod(void);
#ifdef _MSC_VER
#define DROP( reason, ... ) \
do { \
VM_FreeBuffers(); \
Com_Error( ERR_DROP, "%s: " reason, __func__, __VA_ARGS__ ); \
} while(0)
#else
#define DROP( reason, args... ) \
do { \
VM_FreeBuffers(); \
Com_Error( ERR_DROP, "%s: " reason, __func__, ##args ); \
} while(0)
#endif
static void VM_FreeBuffers( void )
{
// should be freed in reversed allocation order
//if ( instructionOffsets ) {
// Z_Free( instructionOffsets );
// instructionOffsets = NULL;
//}
if ( inst ) {
Z_Free( inst );
inst = NULL;
}
}
static void VM_Destroy_Compiled( vm_t *vm )
{
if ( vm->codeBase.ptr )
{
#ifdef _WIN32
VirtualFree( vm->codeBase.ptr, 0, MEM_RELEASE );
#else
if ( munmap( vm->codeBase.ptr, vm->codeLength ) )
Com_Printf( S_COLOR_RED "%s(): memory unmap failed, possible memory leak!\n", __func__ );
#endif
}
vm->codeBase.ptr = NULL;
}
static void __attribute__((__noreturn__)) OutJump( void )
{
Com_Error( ERR_NOTDROP, "program tried to execute code outside VM" );
}
static void __attribute__((__noreturn__)) BadJump( void )
{
Com_Error( ERR_NOTDROP, "program tried to execute code at bad location inside VM" );
}
static void __attribute__((__noreturn__)) ErrBadProgramStack( void )
{
Com_Error( ERR_NOTDROP, "program tried to overflow programStack" );
}
static void __attribute__((__noreturn__)) ErrBadOpStack( void )
{
Com_Error( ERR_NOTDROP, "program tried to overflow opStack" );
}
static void __attribute__( ( __noreturn__ ) ) ErrBadDataRead( void )
{
Com_Error( ERR_NOTDROP, "program tried to read out of data segment" );
}
static void __attribute__( ( __noreturn__ ) ) ErrBadDataWrite( void )
{
Com_Error( ERR_NOTDROP, "program tried to write out of data segment" );
}
static void emit( uint32_t isn )
{
if ( code )
{
code[ compiledOfs >> 2 ] = isn;
}
compiledOfs += 4;
}
static unsigned char off10_2( unsigned val )
{
if ( val & 3 )
DROP( "offset must be multiple of four" );
if ( val > 1020 )
DROP( "offset %i is too large", val );
return val >> 2;
}
// ARM is really crazy ...
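// a data-processing immediate is an 8-bit value rotated right by an even amount (0..30),
// encoded as rotate/2 in bits 11:8 and the byte in bits 7:0;
// e.g. 0xFF000000 encodes as 0xFF ROR 8, while 0x12345678 is not encodable
// and must be built with MOVW/MOVT instead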
static unsigned short rimm( uint32_t val )
{
unsigned shift = 0;
if (val < 256)
return val;
// rotate the value until it fits
while (shift < 16 && (val>255 || !(val&3))) {
val = (val&3)<<30 | val>>2;
++shift;
}
if (shift > 15 || val > 255) {
DROP( "immediate cannot be encoded (%d, %d)\n", shift, val );
}
return (16-shift)<<8 | val;
}
// same as rimm but doesn't die, returns 0 if not encodable
static unsigned short can_encode( uint32_t val )
{
unsigned shift = 0;
if (val == 0)
return 1; // to avoid redundant checks
//DIE("can_encode: invalid argument");
if (val < 256)
return val;
// rotate the value until it fits
while (shift < 16 && (val>255 || !(val&3))) {
val = (val&3)<<30 | val>>2;
++shift;
}
if (shift > 15 || val > 255) {
return 0;
}
return (16-shift)<<8 | val;
}
#define rASR(i, reg) (0b10<<5 | ((i&31)<<7) | reg)
#define rLSL(i, reg) (0b00<<5 | ((i&31)<<7) | reg)
#define rLSR(i, reg) (0b01<<5 | ((i&31)<<7) | reg)
#define rROR(i, reg) (0b11<<5 | ((i&31)<<7) | reg)
// conditions
#define EQ (0b0000<<28)
#define NE (0b0001<<28)
#define CS (0b0010<<28)
#define HS CS
#define CC (0b0011<<28)
#define LO CC
#define MI (0b0100<<28)
#define PL (0b0101<<28)
#define VS (0b0110<<28)
#define VC (0b0111<<28)
#define HI (0b1000<<28)
#define LS (0b1001<<28)
#define GE (0b1010<<28)
#define LT (0b1011<<28)
#define GT (0b1100<<28)
#define LE (0b1101<<28)
#define AL (0b1110<<28)
#define cond(what, op) (what | (op&~AL))
#define BKPT(v) (AL | 0b10010<<20 | ((v&0xFFF0)<<4) | 0b0111<<4 | (v&0xF))
#define NOP (AL | 0b110010<<20 | 0b1111<<12)
#define ANDi(dst, src, i) (AL | (0b001<<25) | (0b00000<<20) | (src<<16) | (dst<<12) | rimm(i))
#define EORi(dst, src, i) (AL | (0b001<<25) | (0b00010<<20) | (src<<16) | (dst<<12) | rimm(i))
#define SUBi(dst, src, i) (AL | (0b001<<25) | (0b00100<<20) | (src<<16) | (dst<<12) | rimm(i))
#define RSBi(dst, src, i) (AL | (0b001<<25) | (0b00110<<20) | (src<<16) | (dst<<12) | rimm(i))
#define ADDi(dst, src, i) (AL | (0b001<<25) | (0b01000<<20) | (src<<16) | (dst<<12) | rimm(i))
#define ADCi(dst, src, i) (AL | (0b001<<25) | (0b01010<<20) | (src<<16) | (dst<<12) | rimm(i))
#define SBCi(dst, src, i) (AL | (0b001<<25) | (0b01100<<20) | (src<<16) | (dst<<12) | rimm(i))
#define RSCi(dst, src, i) (AL | (0b001<<25) | (0b01110<<20) | (src<<16) | (dst<<12) | rimm(i))
#define ORRi(dst, src, i) (AL | (0b001<<25) | (0b11000<<20) | (src<<16) | (dst<<12) | rimm(i))
#define MOVi(dst, i) (AL | (0b001<<25) | (0b11010<<20) | (dst<<12) | rimm(i))
#define BICi(dst, src, i) (AL | (0b001<<25) | (0b11100<<20) | (src<<16) | (dst<<12) | rimm(i))
#define MVNi(dst, i) (AL | (0b001<<25) | (0b11110<<20) | (dst<<12) | rimm(i))
#define MOVW(dst, i) (AL | (0b11<<24) | ((((i)>>12)&0xF)<<16) | (dst<<12) | ((i)&((1<<12)-1)))
#define MOVT(dst, i) (AL | (0b11<<24) | (0b0100<<20) | ((((i)>>12)&0xF)<<16) | (dst<<12) | ((i)&((1<<12)-1)))
#define TSTi( src, i) (AL | (0b001<<25) | (0b10001<<20) | (src<<16) | rimm(i))
#define TEQi( src, i) (AL | (0b001<<25) | (0b10011<<20) | (src<<16) | rimm(i))
#define CMPi( src, i) (AL | (0b001<<25) | (0b10101<<20) | (src<<16) | rimm(i))
#define CMNi( src, i) (AL | (0b001<<25) | (0b10111<<20) | (src<<16) | rimm(i))
#define ANDSi(dst, src, i) (ANDi(dst, src, i) | (1<<20))
#define EORSi(dst, src, i) (EORi(dst, src, i) | (1<<20))
#define SUBSi(dst, src, i) (SUBi(dst, src, i) | (1<<20))
#define RSBSi(dst, src, i) (RSBi(dst, src, i) | (1<<20))
#define ADDSi(dst, src, i) (ADDi(dst, src, i) | (1<<20))
#define ADCSi(dst, src, i) (ADCi(dst, src, i) | (1<<20))
#define SBCSi(dst, src, i) (SBCi(dst, src, i) | (1<<20))
#define RSCSi(dst, src, i) (RSCi(dst, src, i) | (1<<20))
#define ORRSi(dst, src, i) (ORRi(dst, src, i) | (1<<20))
#define MOVSi(dst, i) (MOVi(dst, i) | (1<<20))
#define BICSi(dst, src, i) (BICi(dst, src, i) | (1<<20))
#define MVNSi(dst, i) (MVNi(dst, i) | (1<<20))
#define AND(dst, src, reg) (AL | (0b000<<25) | (0b00000<<20) | (src<<16) | (dst<<12) | reg)
#define EOR(dst, src, reg) (AL | (0b000<<25) | (0b00010<<20) | (src<<16) | (dst<<12) | reg)
#define SUB(dst, src, reg) (AL | (0b000<<25) | (0b00100<<20) | (src<<16) | (dst<<12) | reg)
#define RSB(dst, src, reg) (AL | (0b000<<25) | (0b00110<<20) | (src<<16) | (dst<<12) | reg)
#define ADD(dst, src, reg) (AL | (0b000<<25) | (0b01000<<20) | (src<<16) | (dst<<12) | reg)
#define ADC(dst, src, reg) (AL | (0b000<<25) | (0b01010<<20) | (src<<16) | (dst<<12) | reg)
#define SBC(dst, src, reg) (AL | (0b000<<25) | (0b01100<<20) | (src<<16) | (dst<<12) | reg)
#define RSC(dst, src, reg) (AL | (0b000<<25) | (0b01110<<20) | (src<<16) | (dst<<12) | reg)
#define ORR(dst, src, reg) (AL | (0b000<<25) | (0b11000<<20) | (src<<16) | (dst<<12) | reg)
#define MOV(dst, src) (AL | (0b000<<25) | (0b11010<<20) | (dst<<12) | src)
#define LSL(dst, src, reg) (AL | (0b000<<25) | (0b1101<<21) | (0<<20) | (dst<<12) | (reg<<8) | (0b0001<<4) | src)
#define LSR(dst, src, reg) (AL | (0b000<<25) | (0b1101<<21) | (0<<20) | (dst<<12) | (reg<<8) | (0b0011<<4) | src)
#define ASR(dst, src, reg) (AL | (0b000<<25) | (0b1101<<21) | (0<<20) | (dst<<12) | (reg<<8) | (0b0101<<4) | src)
#define ROR(dst, src, reg) (AL | (0b000<<25) | (0b1101<<21) | (0<<20) | (dst<<12) | (reg<<8) | (0b0111<<4) | src)
#define LSLi(dst, src, i) (AL | (0b000<<25) | (0b1101<<21) | (0<<20) | (dst<<12) | ((i&0x1F)<<7) | (0b000<<4) | src)
#define LSRi(dst, src, i) (AL | (0b000<<25) | (0b1101<<21) | (0<<20) | (dst<<12) | ((i&0x1F)<<7) | (0b010<<4) | src)
#define ASRi(dst, src, i) (AL | (0b000<<25) | (0b1101<<21) | (0<<20) | (dst<<12) | ((i&0x1F)<<7) | (0b100<<4) | src)
#define RORi(dst, src, i) (AL | (0b000<<25) | (0b1101<<21) | (0<<20) | (dst<<12) | ((i&0x1F)<<7) | (0b110<<4) | src)
#define RRX(dst, src) (AL | (0b000<<25) | (0b1101<<21) | (0<<20) | (dst<<12) | (0b110<<4) | src)
#define BIC(dst, src, reg) (AL | (0b000<<25) | (0b11100<<20) | (src<<16) | (dst<<12) | reg)
#define MVN(dst, reg) (AL | (0b000<<25) | (0b11110<<20) | (dst<<12) | reg)
#define TST( src, reg) (AL | (0b000<<25) | (0b10001<<20) | (src<<16) | reg)
#define TEQ( src, reg) (AL | (0b000<<25) | (0b10011<<20) | (src<<16) | reg)
#define CMP( src, reg) (AL | (0b000<<25) | (0b10101<<20) | (src<<16) | reg)
#define CMN( src, reg) (AL | (0b000<<25) | (0b10111<<20) | (src<<16) | reg)
// load word/byte with pre-increment
#define LDRa(dst, base, off) (AL | (0b011<<25) | (0b1100<<21) | (1<<20) | base<<16 | dst<<12 | off)
#define LDRai(dst, base, off12)(AL | (0b010<<25) | (0b1100<<21) | (1<<20) | base<<16 | dst<<12 | off12)
// load byte with 12-bit offset
#define LDRBai(dst, base, off12) (AL | (0b010<<25) | (0b1110<<21) | (1<<20) | base<<16 | dst<<12 | off12)
// load word with post-increment
#define LDRTaiw(dst, base, off) (AL | (0b010<<25) | (0b0101<<21) | (1<<20) | base<<16 | dst<<12 | off)
// load/store byte with post-increment
#define LDRBTaiw(dst, base, off)(AL | (0b010<<25) | (0b0111<<21) | (1<<20) | base<<16 | dst<<12 | off)
#define STRBTaiw(dst, base, off)(AL | (0b010<<25) | (0b0111<<21) | (0<<20) | base<<16 | dst<<12 | off)
// load byte with pre-increment
#define LDRBa(dst, base, off) (AL | (0b011<<25) | (0b1110<<21) | (1<<20) | base<<16 | dst<<12 | off)
#define LDRSBa(dst, base, off) (AL | (0b000<<25) | (0b1100<<21) | (1<<20) | base<<16 | dst<<12 | (0b0000<<8) | 0b1101<<4 | off)
// load signed byte with 8-bit offset
#define LDRSBai(dst, base, off8) (AL | (0b000<<25) | (0b1110<<21) | (1<<20) | base<<16 | dst<<12 | ((((off8)>>4)&0xF)<<8) | 0b1101<<4 | ((off8)&0xF))
// load half-word with pre-increment
#define LDRHa(dst, base, off) (AL | (0b000<<25) | (0b1100<<21) | (1<<20) | base<<16 | dst<<12 | (0b0000<<8) | (0b1011<<4) | off)
#define LDRSHa(dst, base, off) (AL | (0b000<<25) | (0b1100<<21) | (1<<20) | base<<16 | dst<<12 | (0b0000<<8) | (0b1111<<4) | off)
// load unsigned/signed half-word with 8-bit offset
#define LDRHai(dst, base, off8) (AL | (0b000<<25) | (0b1110<<21) | (1<<20) | base<<16 | dst<<12 | ((((off8)>>4)&0xF)<<8) | (0b1011)<<4 | ((off8)&0xF) )
#define LDRSHai(dst, base, off8) (AL | (0b000<<25) | (0b1110<<21) | (1<<20) | base<<16 | dst<<12 | ((((off8)>>4)&0xF)<<8) | (0b1111)<<4 | ((off8)&0xF) )
// store byte/half-word with pre-increment
#define STRBa(dst, base, off) (AL | (0b011<<25) | (0b1110<<21) | (0<<20) | base<<16 | dst<<12 | off)
#define STRHa(dst, base, off) (AL | (0b000<<25) | (0b1100<<21) | (0<<20) | base<<16 | dst<<12 | (0b1011<<4) | off)
// store word with pre-increment
#define STRa(dst, base, off) (AL | (0b011<<25) | (0b1100<<21) | (0<<20) | base<<16 | dst<<12 | off)
#define STRai(dst, base, off) (AL | (0b010<<25) | (0b1100<<21) | (0<<20) | base<<16 | dst<<12 | off)
// store byte with 12-bit offset
#define STRBai(dst, base, off) (AL | (0b010<<25) | (0b1110<<21) | (0<<20) | base<<16 | dst<<12 | off)
// store short with 8-bit offset
#define STRHai(dst, base, off8) (AL | (0b000<<25) | (0b1110<<21) | (0<<20) | base<<16 | dst<<12 | ((((off8)>>4)&0xF)<<8) | (0b1011)<<4 | ((off8)&0xF) )
// store word with post-increment
#define STRTaiw(dst, base, off) (AL | (0b010<<25) | (0b0101<<21) | (0<<20) | base<<16 | dst<<12 | off)
// sign-extend byte to word
#define SXTB(Rd, Rm) (AL | (0b01101<<23) | (0b010<<20) | (0b1111<<16) | (Rd<<12) | (0b00000111<<4) | Rm)
// zero-extend byte to word
#define UXTB(Rd, Rm) (AL | (0b01101<<23) | (0b110<<20) | (0b1111<<16) | (Rd<<12) | (0b00000111<<4) | Rm)
// sign-extend short to word
#define SXTH(Rd, Rm) (AL | (0b01101<<23) | (0b011<<20) | (0b1111<<16) | (Rd<<12) | (0b00000111<<4) | Rm)
// zero-extend short to word
#define UXTH(Rd, Rm) (AL | (0b01101<<23) | (0b111<<20) | (0b1111<<16) | (Rd<<12) | (0b00000111<<4) | Rm)
// branch to target address (for small jumps within +/-32M)
#define Bi(imm24) \
(AL | (0b101)<<25 | (0<<24) /*L*/ | (imm24))
// call subroutine
#define BLi(imm24) \
(AL | (0b101)<<25 | (1<<24) /*L*/ | (imm24))
// branch and exchange (register)
#define BX(reg) \
(AL | 0b00010010<<20 | 0b1111<<16 | 0b1111<<12 | 0b1111<<8| 0b0001<<4 | reg)
// call subroutine (register)
#define BLX(reg) \
(AL | 0b00010010<<20 | 0b1111<<16 | 0b1111<<12 | 0b1111<<8| 0b0011<<4 | reg)
#define PUSH(mask) (AL | (0b100100<<22) | (0b10<<20) | (0b1101<<16) | mask)
#define POP(mask) (0xe8bd0000|mask)
// note: Rd and Rm must not be the same
#define MUL(Rd, Rm, Rs) \
(AL | 0b0000000<<21 | (0<<20) /*S*/ | (Rd<<16) | (Rs<<8) | 0b1001<<4 | Rm)
// Rd = Rn / Rm
#define SDIV(Rd, Rn, Rm) (AL | (0b01110<<23) | (0b001<<20) | (Rd<<16) | (0b1111<<12) | (Rm<<8) | (0b0001 << 4) | Rn)
#define UDIV(Rd, Rn, Rm) (AL | (0b01110<<23) | (0b011<<20) | (Rd<<16) | (0b1111<<12) | (Rm<<8) | (0b0001 << 4) | Rn)
// Rd = Ra - Rn * Rm
#define MLS(Rd, Rn, Rm, Ra) (AL | (0b0110<<20) | (Rd<<16) | (Ra<<12) | (Rm<<8) | (0b1001<<4) | Rn)
// immediate -> single precision register
#define VMOVi(Vd, imm) (AL | (0b11101<<23)| ((Vd&1)<<22) | (0b11<<20)| (((imm&0xF0)>>4)<<16) | ((Vd>>1)<<12) | (0b10100000<<4) | (imm&0xF) )
// single precision register -> single precision register
#define VMOV(Vd, Vm) (AL | (0b11101<<23)| ((Vd&1)<<22) | (0b110000<<16) | ((Vd>>1)<<12) | (0b101001<<6) | ((Vm&1)<<5) | (0<<4) | (Vm>>1) )
// arm core register -> single precision register
#define VMOVass(Vn, Rt) (AL|(0b1110<<24)|(0b000<<21)|(0<<20)| ((Vn>>1)<<16) | (Rt<<12) | (0b1010<<8) | ((Vn&1)<<7) | (1<<4))
// single precision register -> arm core register
#define VMOVssa(Rt, Vn) (AL|(0b1110<<24)|(0b000<<21)|(1<<20)| ((Vn>>1)<<16) | (Rt<<12) | (0b1010<<8) | ((Vn&1)<<7) | (1<<4))
#define _VCVT_F(Vd, Vm, opc2, op) \
(AL|(0b11101<<23)|((Vd&1)<<22)|(0b111<<19)|(opc2<<16)|((Vd>>1)<<12)|(0b101<<9)|(0<<8)|(op<<7)|(1<<6)|((Vm&1)<<5)|(Vm>>1))
#define VCVT_F32_U32(Sd, Sm) _VCVT_F(Sd, Sm, 0b000, 0 /* unsigned */)
#define VCVT_U32_F32(Sd, Sm) _VCVT_F(Sd, Sm, 0b100, 1 /* round zero */)
#define VCVT_F32_S32(Sd, Sm) _VCVT_F(Sd, Sm, 0b000, 1 /* signed */)
#define VCVT_S32_F32(Sd, Sm) _VCVT_F(Sd, Sm, 0b101, 1 /* round zero */)
#define VLDRai(Vd, Rn, i) (AL|(0b1101<<24)|1<<23|((Vd&1)<<22)|1<<20|(Rn<<16)|((Vd>>1)<<12)|(0b1010<<8)|off10_2(i))
#define VSTRai(Vd, Rn, i) (AL|(0b1101<<24)|1<<23|((Vd&1)<<22)|0<<20|(Rn<<16)|((Vd>>1)<<12)|(0b1010<<8)|off10_2(i))
#define VNEG_F32(Vd, Vm) \
(AL|(0b11101<<23)|((Vd&1)<<22)|(0b11<<20)|(1<<16)|((Vd>>1)<<12)|(0b101<<9)|(0<<8)|(1<<6)|((Vm&1)<<5)|(Vm>>1))
#define VADD_F32(Vd, Vn, Vm) \
(AL|(0b11100<<23)|((Vd&1)<<22)|(0b11<<20)|((Vn>>1)<<16)|((Vd>>1)<<12)|(0b101<<9)|(0<<8)|((Vn&1)<<7)|(0<<6)|((Vm&1)<<5)|(Vm>>1))
#define VSUB_F32(Vd, Vn, Vm) \
(AL|(0b11100<<23)|((Vd&1)<<22)|(0b11<<20)|((Vn>>1)<<16)|((Vd>>1)<<12)|(0b101<<9)|(0<<8)|((Vn&1)<<7)|(1<<6)|((Vm&1)<<5)|(Vm>>1))
#define VMUL_F32(Vd, Vn, Vm) \
(AL|(0b11100<<23)|((Vd&1)<<22)|(0b10<<20)|((Vn>>1)<<16)|((Vd>>1)<<12)|(0b101)<<9|(0<<8)|((Vn&1)<<7)|(0<<6)|((Vm&1)<<5)|(Vm>>1))
#define VDIV_F32(Vd, Vn, Vm) \
(AL|(0b11101<<23)|((Vd&1)<<22)|(0b00<<20)|((Vn>>1)<<16)|((Vd>>1)<<12)|(0b101<<9)|(0<<8)|((Vn&1)<<7)|(0<<6)|((Vm&1)<<5)|(Vm>>1))
#define VSQRT_F32(Vd, Vm) \
(AL|(0b11101<<23)|((Vd&1)<<22)|(0b11<<20)|(0b0001<<16)|((Vd>>1)<<12)|(0b101<<9)|(0b011<<6)|((Vm&1)<<5)|(0<<4)|(Vm>>1))
#define _VCMP_F32(Vd, Vm, E) \
(AL|(0b11101<<23)|((Vd&1)<<22)|(0b11<<20)|((0b0100)<<16)|((Vd>>1)<<12)|(0b101<<9)|(0<<8)|(E<<7)|(1<<6)|((Vm&1)<<5)|(Vm>>1))
#define VCMP_F32(Vd, Vm) _VCMP_F32(Vd, Vm, 0)
#define VMRS(Rt) \
(AL|(0b11101111<<20)|(0b0001<<16)|(Rt<<12)|(0b1010<<8)|(1<<4))
// check if we can encode single-precision scalar immediate
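// VMOV.F32 immediates are limited to +/- (16..31)/16 * 2^r with r in -3..4
// (e.g. 0.5f, 1.0f, 2.0f, 10.0f): sign + 4 fraction bits + a small exponent;
// the exponent pattern is checked below and the low 19 mantissa bits must be zero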
static qboolean can_encode_f32_imm( const uint32_t v )
{
uint32_t exp3 = (v >> 25) & ((1<<6)-1);
if ( exp3 != 0x20 && exp3 != 0x1F )
return qfalse;
if ( v & ((1<<19)-1) )
return qfalse;
return qtrue;
}
static uint32_t encode_f32_imm( const uint32_t v )
{
return (((v >> 31) & 0x1) << 7) | (((v >> 23) & 0x7) << 4) | ((v >> 19) & 0xF);
}
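// materialize an arbitrary 32-bit constant: prefer a single MOVW, then a single
// MOV/MVN with a modified immediate, otherwise fall back to a MOVW/MOVT pair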
static void emit_MOVRxi( uint32_t reg, uint32_t imm )
{
if ( imm <= 0xFFFF ) {
emit( MOVW( reg, imm ) );
} else if ( can_encode( imm ) ) {
emit( MOVi( reg, imm ) );
} else if ( can_encode( ~imm ) ) {
emit( MVNi( reg, ~imm ) );
} else {
emit( MOVW( reg, (imm&0xFFFF) ) );
emit( MOVT( reg, (((imm>>16)&0xFFFF)) ) );
}
}
static uint32_t alloc_rx( uint32_t pref );
static qboolean find_rx_const( uint32_t imm );
static uint32_t alloc_rx_const( uint32_t pref, uint32_t imm );
static uint32_t alloc_rx_local( uint32_t pref, uint32_t imm );
static uint32_t alloc_sx( uint32_t pref );
// ---------------- register allocation --------------------
// register allocation preferences
#define FORCED 0x20 // load function must return specified register
#define TEMP 0x40 // hint: temporary allocation, will not be stored on opStack
#define RCONST 0x80 // hint: register value will not be modified
#define XMASK 0x100 // exclude masked registers
#define RMASK 0x0F
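// a preference combines a base register number (RMASK bits) with the hint flags above,
// e.g. alloc_rx( R2 | TEMP ) asks for any free register for a short-lived temporary
// while load_rx_opstack( R0 | FORCED ) demands exactly R0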
// array sizes for cached/meta registers
#define NUM_RX_REGS 13 // [R0..R12]
#define NUM_SX_REGS 8 // [S0..S7]
// general-purpose register list available for dynamic allocation
static const uint32_t rx_list_alloc[] = {
R0, R1, R2, R3, // R0-R3 are required minimum
R12
};
// FPU scalar register list available for dynamic allocation
static const uint32_t sx_list_alloc[] = {
S0, S1, 2, 3, 4, 5, 6, 7 // S0 and S1 are required minimum
};
#ifdef CONST_CACHE_RX
static const uint32_t rx_list_cache[] = {
R12, R3, R2, R1
};
#endif
#ifdef CONST_CACHE_SX
static const uint32_t sx_list_cache[] = {
S0, S1, 2, 3, 4, 5, 6, 7,
};
#endif
// types of items on the opStack
typedef enum {
TYPE_RAW, // stored value
TYPE_CONST, // constant
TYPE_LOCAL, // address of local variable
TYPE_RX, // volatile - general-purpose register
TYPE_SX, // volatile - FPU scalar register
} opstack_value_t;
typedef enum {
RTYPE_UNUSED = 0x0,
RTYPE_CONST = 0x1,
RTYPE_VAR = 0x2
} reg_value_t;
typedef struct opstack_s {
uint32_t value;
int offset;
opstack_value_t type;
int safe_arg;
} opstack_t;
typedef struct var_addr_s {
int32_t addr; // variable address/offset
uint8_t base; // procBase or dataBase register, ranges should NOT overlap
uint8_t size; // 1,2,4
} var_addr_t;
typedef enum {
Z_NONE,
Z_EXT8,
S_EXT8,
Z_EXT16,
S_EXT16,
} ext_t;
typedef struct reg_s {
int type_mask;
struct {
uint32_t value;
} cnst;
// register value can be mapped to many memory regions
struct {
var_addr_t map[REG_MAP_COUNT];
unsigned idx; // next allocation slot
} vars;
uint32_t ip; // ip of last reference
int refcnt; // reference counter
ext_t ext; // zero/sign-extension flags
} reg_t;
static int opstack;
static opstack_t opstackv[PROC_OPSTACK_SIZE + 1];
// cached register values
static reg_t rx_regs[NUM_RX_REGS];
static reg_t sx_regs[NUM_SX_REGS];
// masked register can't be allocated or flushed to opStack on register pressure
static int32_t rx_mask[NUM_RX_REGS];
static int32_t sx_mask[NUM_SX_REGS];
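// compile-time tracking state: opstackv[] mirrors the VM opStack and records where each
// slot's value currently lives (register, constant, local address or already in memory),
// rx_regs[]/sx_regs[] cache which constant/variable a scratch register holds,
// rx_mask[]/sx_mask[] pin registers that must not be reallocated or flushed
// while an emitter is still using them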
static qboolean find_free_rx( void ) {
uint32_t i, n;
for ( i = 0; i < ARRAY_LEN( rx_list_alloc ); i++ ) {
n = rx_list_alloc[i];
if ( rx_regs[n].type_mask == RTYPE_UNUSED ) {
return qtrue;
}
}
return qfalse;
}
static qboolean find_free_sx( void ) {
uint32_t i, n;
for ( i = 0; i < ARRAY_LEN( sx_list_alloc ); i++ ) {
n = sx_list_alloc[i];
if ( sx_regs[n].type_mask == RTYPE_UNUSED ) {
return qtrue;
}
}
return qfalse;
}
static void wipe_reg_range( reg_t *reg, const var_addr_t *v ) {
if ( reg->type_mask & RTYPE_VAR ) {
uint32_t c, n;
for ( c = 0, n = 0; n < ARRAY_LEN( reg->vars.map ); n++ ) {
var_addr_t *var = &reg->vars.map[n];
if ( var->size != 0 ) {
c++;
if ( var->base == v->base ) {
if ( v->addr < var->addr + var->size && v->addr + v->size > var->addr ) {
memset( var, 0, sizeof( *var ) );
//var->size = 0;
c--; continue;
}
}
}
}
if ( c == 0 ) {
reg->type_mask &= ~RTYPE_VAR;
reg->ext = Z_NONE;
} else {
//reg->type_mask |= RTYPE_VAR;
}
}
}
static void wipe_var_range( const var_addr_t *v )
{
#ifdef LOAD_OPTIMIZE
uint32_t i;
#ifdef DEBUG_VM
if ( v->size == 0 || v->base == 0 )
DROP( "incorrect variable setup" );
#endif
// wipe all types of overlapping variables
for ( i = 0; i < ARRAY_LEN( rx_regs ); i++ ) {
wipe_reg_range( &rx_regs[i], v );
}
for ( i = 0; i < ARRAY_LEN( sx_regs ); i++ ) {
wipe_reg_range( &sx_regs[i], v );
}
#endif
}
static void set_var_map( reg_t *r, const var_addr_t *v ) {
uint32_t n;
for ( n = 0; n < ARRAY_LEN( r->vars.map ); n++ ) {
if ( r->vars.map[n].size == 0 ) {
r->vars.map[n] = *v;
r->vars.idx = ( n + 1 ) % ARRAY_LEN( r->vars.map );
return;
}
}
r->vars.map[r->vars.idx] = *v;
r->vars.idx = ( r->vars.idx + 1 ) % ARRAY_LEN( r->vars.map );
}
static void set_rx_var( uint32_t reg, const var_addr_t *v ) {
#ifdef LOAD_OPTIMIZE
if ( reg < ARRAY_LEN( rx_regs ) ) {
reg_t *r = rx_regs + reg;
#ifdef REG_TYPE_MASK
r->type_mask |= RTYPE_VAR;
#else
r->type_mask = RTYPE_VAR;
#endif
set_var_map( r, v );
r->refcnt++; // = 1;
r->ip = ip;
}
#endif
}
static void set_rx_ext( uint32_t reg, ext_t ext ) {
#ifdef LOAD_OPTIMIZE
if ( reg >= ARRAY_LEN( rx_regs ) )
DROP( "register value %i s out of range", reg );
rx_regs[reg].ext = ext;
#endif
}
static void set_sx_var( uint32_t reg, const var_addr_t *v ) {
#ifdef LOAD_OPTIMIZE
if ( reg < ARRAY_LEN( sx_regs ) ) {
reg_t *r = sx_regs + reg;
#ifdef REG_TYPE_MASK
r->type_mask |= RTYPE_VAR;
#else
r->type_mask = RTYPE_VAR;
#endif
set_var_map( r, v );
r->refcnt++; // = 1;
r->ip = ip;
}
#endif
}
static reg_t *find_rx_var( uint32_t *reg, const var_addr_t *v ) {
#ifdef LOAD_OPTIMIZE
uint32_t i;
for ( i = 0; i < ARRAY_LEN( rx_regs ); i++ ) {
reg_t *r = &rx_regs[i];
if ( r->type_mask & RTYPE_VAR ) {
uint32_t n;
for ( n = 0; n < ARRAY_LEN( r->vars.map ); n++ ) {
if ( r->vars.map[n].size && r->vars.map[n].addr == v->addr && r->vars.map[n].size == v->size && r->vars.map[n].base == v->base ) {
r->refcnt++;
r->ip = ip;
*reg = i;
return r;
}
}
}
}
#endif
return NULL;
}
static qboolean find_sx_var( uint32_t *reg, const var_addr_t *v ) {
#ifdef LOAD_OPTIMIZE
uint32_t i;
for ( i = 0; i < ARRAY_LEN( sx_regs ); i++ ) {
reg_t *r = &sx_regs[i];
if ( r->type_mask & RTYPE_VAR ) {
uint32_t n;
for ( n = 0; n < ARRAY_LEN( r->vars.map ); n++ ) {
if ( r->vars.map[n].size && r->vars.map[n].addr == v->addr && r->vars.map[n].size == v->size && r->vars.map[n].base == v->base ) {
r->refcnt++;
r->ip = ip;
*reg = i;
return qtrue;
}
}
}
}
#endif // LOAD_OPTIMIZE
return qfalse;
}
static void reduce_map_size( reg_t *reg, uint32_t size ) {
int i;
for ( i = 0; i < ARRAY_LEN( reg->vars.map ); i++ ) {
if ( reg->vars.map[i].size > size ) {
reg->vars.map[i].size = size;
}
}
}
static reg_t *rx_on_top( void ) {
opstack_t *it = &opstackv[ opstack ];
if ( it->type == TYPE_RX ) {
return &rx_regs[ it->value ];
} else {
return NULL;
}
}
static void wipe_vars( void )
{
#ifdef LOAD_OPTIMIZE
uint32_t i;
reg_t *r;
for ( i = 0; i < ARRAY_LEN( rx_regs ); i++ ) {
r = &rx_regs[i];
memset( &r->vars, 0, sizeof( r->vars ) );
r->type_mask &= ~RTYPE_VAR;
r->ext = Z_NONE;
}
for ( i = 0; i < ARRAY_LEN( sx_regs ); i++ ) {
r = &sx_regs[i];
memset( &r->vars, 0, sizeof( r->vars ) );
r->type_mask &= ~RTYPE_VAR;
r->ext = Z_NONE;
}
#endif
}
static qboolean search_opstack( opstack_value_t type, uint32_t value ) {
int i;
for ( i = 1; i <= opstack; i++ ) {
if ( opstackv[i].type == type && opstackv[i].value == value ) {
return qtrue;
}
}
return qfalse;
}
static void wipe_rx_meta( uint32_t reg )
{
#ifdef DEBUG_VM
if ( reg >= ARRAY_LEN( rx_regs ) )
DROP( "incorrect register index %i", reg );
#endif
memset( &rx_regs[reg], 0, sizeof( rx_regs[0] ) );
//rx_regs[reg].type_mask = RTYPE_UNUSED;
}
static void wipe_sx_meta( uint32_t reg )
{
#ifdef DEBUG_VM
if ( reg >= ARRAY_LEN( sx_regs ) )
DROP( "incorrect register index %i", reg );
#endif
memset( &sx_regs[reg], 0, sizeof( sx_regs[0] ) );
//sx_regs[reg].type_mask = RTYPE_UNUSED;
}
static void mask_rx( uint32_t reg )
{
rx_mask[reg]++;
}
static void mask_sx( uint32_t reg )
{
sx_mask[reg]++;
}
static void unmask_rx( uint32_t reg )
{
#ifdef DEBUG_VM
if ( rx_mask[reg] <= 0 ) {
DROP( "register R%i is already unmasked", reg );
}
#endif
rx_mask[reg]--;
}
static void unmask_sx( uint32_t reg )
{
#ifdef DEBUG_VM
if ( sx_mask[reg] <= 0 ) {
DROP( "register S%i is already unmasked", reg );
}
#endif
sx_mask[reg]--;
}
static void emit_MOVSxi( uint32_t reg, uint32_t imm )
{
uint32_t rx;
//if ( imm == 0 ) {
// fmov.f32 d0, #0.0? it will wipe s1 too so we need to use s0, s2, s4, s6 etc.
// return;
//}
if ( can_encode_f32_imm( imm ) ) {
emit( VMOVi( reg, encode_f32_imm( imm ) ) );
return;
}
rx = alloc_rx_const( R2, imm ); // rx = imm
emit(VMOVass(reg, rx)); // arm core register -> single precision register
unmask_rx( rx );
}
static void set_local_address( uint32_t reg, uint32_t addr )
{
if ( can_encode( addr ) ) {
emit(ADDi(reg, rPSTACK, addr)); // r2 = pstack + addr
} else {
if ( find_rx_const( addr ) ) {
uint32_t rx = alloc_rx_const( R3, addr ); // rx = const
emit(ADD(reg, rPSTACK, rx)); // reg = pstack + rx
unmask_rx( rx );
} else {
emit_MOVRxi(reg, addr); // r2 = arg
emit(ADD(reg, rPSTACK, reg)); // ref = pstack + reg
}
}
}
static void flush_item( opstack_t *it )
{
uint32_t rx;
switch ( it->type ) {
case TYPE_RX:
if ( it->offset >= 0 )
emit(STRai(it->value, rOPSTACK, it->offset)); // *opstack = rX
break;
case TYPE_SX:
emit(VSTRai(it->value, rOPSTACK, it->offset)); // *opstack = sX
break;
case TYPE_CONST:
rx = alloc_rx_const( R2, it->value );
emit(STRai(rx, rOPSTACK, it->offset)); // *opstack = r2
unmask_rx( rx );
break;
case TYPE_LOCAL:
rx = alloc_rx_local( R2 | TEMP, it->value );
emit(STRai(rx, rOPSTACK, it->offset)); // *opstack = r2
unmask_rx( rx );
break;
default:
break;
}
it->type = TYPE_RAW;
it->safe_arg = 0;
}
static void flush_items( opstack_value_t type, uint32_t value ) {
int i;
for ( i = 0; i <= opstack; i++ ) {
opstack_t *it = opstackv + i;
if ( it->type == type && it->value == value ) {
flush_item( it );
}
}
}
static void init_opstack( void )
{
opstack = 0;
Com_Memset( &rx_mask[0], 0, sizeof( rx_mask ) );
Com_Memset( &sx_mask[0], 0, sizeof( sx_mask ) );
Com_Memset( &opstackv[0], 0, sizeof( opstackv ) );
Com_Memset( &rx_regs[0], 0, sizeof( rx_regs ) );
Com_Memset( &sx_regs[0], 0, sizeof( sx_regs ) );
}
static qboolean scalar_on_top( void )
{
#ifdef DEBUG_VM
if ( opstack >= PROC_OPSTACK_SIZE || opstack <= 0 )
DROP( "bad opstack %i", opstack * 4 );
#endif
#ifdef FPU_OPTIMIZE
if ( opstackv[ opstack ].type == TYPE_SX )
return qtrue;
#endif
return qfalse;
}
static qboolean addr_on_top( var_addr_t *addr )
{
#ifdef DEBUG_VM
if ( opstack >= PROC_OPSTACK_SIZE || opstack <= 0 )
DROP( "bad opstack %i", opstack * 4 );
#endif
#ifdef ADDR_OPTIMIZE
if ( opstackv[ opstack ].type == TYPE_CONST ) {
addr->addr = opstackv[opstack].value;
addr->base = rDATABASE;
addr->size = 0;
return qtrue;
}
if ( opstackv[ opstack ].type == TYPE_LOCAL ) {
addr->addr = opstackv[opstack].value;
addr->base = rPROCBASE;
addr->size = 0;
return qtrue;
}
#endif
return qfalse;
}
static void discard_top( void )
{
opstack_t *it = &opstackv[ opstack ];
it->type = TYPE_RAW;
it->safe_arg = 0;
}
#if 1
static int is_safe_arg( void )
{
#ifdef DEBUG_VM
if ( opstack >= PROC_OPSTACK_SIZE || opstack <= 0 )
DROP( "bad opstack %i", opstack * 4 );
#endif
return opstackv[ opstack ].safe_arg;
}
#endif
static void inc_opstack( void )
{
#ifdef DEBUG_VM
if ( opstack >= PROC_OPSTACK_SIZE )
DROP( "opstack overflow - %i", opstack * 4 );
#endif
opstack += 1;
#ifdef DEBUG_VM
if ( opstackv[ opstack ].type != TYPE_RAW )
DROP( "bad item type %i at opstack %i", opstackv[ opstack ].type, opstack * 4 );
#endif
}
static void dec_opstack( void )
{
#ifdef DEBUG_VM
opstack_t *it;
if ( opstack <= 0 )
DROP( "opstack underflow - %i", opstack * 4 );
it = &opstackv[ opstack ];
if ( it->type != TYPE_RAW )
DROP( "opstack[%i]: item type %i is not consumed", opstack * 4, it->type );
#endif
opstack -= 1;
}
static void dec_opstack_discard( void )
{
opstack_t *it;
it = &opstackv[ opstack ];
#ifdef DEBUG_VM
if ( opstack <= 0 )
DROP( "opstack underflow - %i", opstack * 4 );
if ( it->type != TYPE_RAW && ( it->type != TYPE_RX || it->offset >= 0 ) )
DROP( "opstack[%i]: item type %i is not consumed", opstack * 4, it->type );
#endif
it->type = TYPE_RAW; // discard value
it->safe_arg = 0;
opstack -= 1;
}
// returns bitmask of registers present on opstack
static uint32_t build_opstack_mask( opstack_value_t reg_type )
{
uint32_t mask = 0;
int i;
for ( i = 0; i <= opstack; i++ ) {
opstack_t *it = opstackv + i;
if ( it->type == reg_type ) {
mask |= ( 1 << it->value );
}
}
return mask;
}
static uint32_t build_rx_mask( void )
{
uint32_t i, mask = 0;
for ( i = 0; i < ARRAY_LEN( rx_mask ); i++ ) {
if ( rx_mask[i] ) {
mask |= 1 << i;
}
}
return mask;
}
static uint32_t build_sx_mask( void )
{
uint32_t i, mask = 0;
for ( i = 0; i < ARRAY_LEN( sx_mask ); i++ ) {
if ( sx_mask[i] ) {
mask |= 1 << i;
}
}
return mask;
}
// allocate register with local address value
static uint32_t alloc_rx_local( uint32_t pref, uint32_t imm )
{
uint32_t rx = alloc_rx( pref );
set_local_address( rx, imm );
return rx;
}
// returns qtrue if specified constant is found or there is a free register to store it
static qboolean find_rx_const( uint32_t imm )
{
#ifdef CONST_CACHE_RX
uint32_t mask = build_rx_mask() | build_opstack_mask( TYPE_RX );
int i;
for ( i = 0; i < ARRAY_LEN( rx_list_cache ); i++ ) {
reg_t *r;
uint32_t n = rx_list_cache[ i ];
if ( mask & ( 1 << n ) ) {
// target register must be unmasked
continue;
}
r = &rx_regs[ n ];
if ( r->type_mask & RTYPE_CONST && r->cnst.value == imm ) {
return qtrue;
}
if ( r->type_mask == RTYPE_UNUSED ) {
return qtrue;
}
}
#endif
return qfalse;
}
// allocate integer register with constant value
static uint32_t alloc_rx_const( uint32_t pref, uint32_t imm )
{
#ifdef CONST_CACHE_RX
reg_t *r;
#endif
uint32_t rx;
#ifdef CONST_CACHE_RX
#ifdef DYN_ALLOC_RX
if ( ( pref & FORCED ) == 0 ) {
// support only dynamic allocation mode
const uint32_t mask = build_rx_mask() | build_opstack_mask( TYPE_RX );
int min_ref = MAX_QINT;
int min_ip = MAX_QINT;
int idx = -1;
int i, n;
if ( ( pref & XMASK ) == 0 ) {
// we can select from already masked registers
for ( n = 0; n < ARRAY_LEN( rx_regs ); n++ ) {
r = &rx_regs[n];
if ( r->type_mask & RTYPE_CONST && r->cnst.value == imm ) {
r->refcnt++;
r->ip = ip;
mask_rx( n );
return n;
}
}
}
for ( i = 0; i < ARRAY_LEN( rx_list_cache ); i++ ) {
n = rx_list_cache[i];
if ( mask & ( 1 << n ) ) {
// target register must be unmasked and not present on the opStack
continue;
}
r = &rx_regs[n];
if ( r->type_mask & RTYPE_CONST && r->cnst.value == imm ) {
// exact match, re-use this register
r->refcnt++; // increase reference count
r->ip = ip; // update address too
mask_rx( n );
return n;
}
if ( r->type_mask == RTYPE_UNUSED ) {
idx = n;
break;
}
if ( ( r->refcnt < min_ref ) || ( r->refcnt == min_ref && r->ip < min_ip ) ) {
// update least referenced item index
min_ref = r->refcnt;
min_ip = r->ip;
idx = n;
continue;
}
}
if ( idx != -1 ) {
r = &rx_regs[ idx ];
memset( &r->vars, 0, sizeof( r->vars ) );
r->type_mask = RTYPE_CONST;
r->cnst.value = imm;
r->refcnt = 1;
r->ip = ip;
r->ext = Z_NONE;
emit_MOVRxi( idx, imm );
mask_rx( idx );
return idx;
}
// else go to usual allocation to handle register spilling
}
#endif // DYN_ALLOC_RX
#endif // CONST_CACHE_RX
rx = alloc_rx( pref );
emit_MOVRxi( rx, imm );
#ifdef CONST_CACHE_RX
r = &rx_regs[ rx ];
//memset( &r->vars, 0, sizeof( r->vars ) );
r->type_mask = RTYPE_CONST;
r->cnst.value = imm;
r->refcnt = 1;
r->ip = ip;
//r->ext = Z_NONE;
#endif
return rx;
}
// allocate scalar register with constant value
static uint32_t alloc_sx_const( uint32_t pref, uint32_t imm )
{
#ifdef CONST_CACHE_SX
reg_t *r;
#endif
uint32_t sx;
#ifdef CONST_CACHE_SX
#ifdef DYN_ALLOC_SX
if ( ( pref & FORCED ) == 0 ) {
// support only dynamic allocation mode
const uint32_t mask = build_sx_mask() | build_opstack_mask( TYPE_SX );
int min_ref = MAX_QINT;
int min_ip = MAX_QINT;
int idx = -1;
int i, n;
if ( ( pref & XMASK ) == 0 ) {
// we can select from already masked registers
for ( n = 0; n < ARRAY_LEN( sx_regs ); n++ ) {
r = &sx_regs[n];
if ( r->type_mask & RTYPE_CONST && r->cnst.value == imm ) {
r->refcnt++;
r->ip = ip;
mask_sx( n );
return n;
}
}
}
for ( i = 0; i < ARRAY_LEN( sx_list_cache ); i++ ) {
n = sx_list_cache[i];
if ( mask & ( 1 << n ) ) {
// target register must be unmasked and not present on the opStack
continue;
}
r = &sx_regs[n];
if ( r->type_mask & RTYPE_CONST && r->cnst.value == imm ) {
// exact match, re-use this register
r->refcnt++; // increase reference count
r->ip = ip; // update address too
mask_sx( n );
return n;
}
if ( r->type_mask == RTYPE_UNUSED ) {
idx = n;
break;
}
if ( ( r->refcnt < min_ref ) || ( r->refcnt == min_ref && r->ip < min_ip ) ) {
// update least referenced item index
min_ref = r->refcnt;
min_ip = r->ip;
idx = n;
continue;
}
}
if ( idx != -1 ) {
r = &sx_regs[ idx ];
memset( &r->vars, 0, sizeof( r->vars ) );
r->type_mask = RTYPE_CONST;
r->cnst.value = imm;
r->refcnt = 1;
r->ip = ip;
r->ext = Z_NONE;
emit_MOVSxi( idx, imm );
mask_sx( idx );
return idx;
}
// else go to usual allocation to handle register spilling
}
#endif // DYN_ALLOC_SX
#endif // CONST_CACHE_SX
sx = alloc_sx( pref );
emit_MOVSxi( sx, imm );
#ifdef CONST_CACHE_SX
r = &sx_regs[sx];
//memset( &r->vars, 0, sizeof( r->vars ) );
r->type_mask = RTYPE_CONST;
r->cnst.value = imm;
r->refcnt = 1;
r->ip = ip;
r->ext = Z_NONE;
#endif
return sx;
}
static uint32_t dyn_alloc_rx( uint32_t pref )
{
const uint32_t _rx_mask = build_rx_mask();
const uint32_t mask = _rx_mask | build_opstack_mask( TYPE_RX );
const reg_t *reg, *used = NULL;
uint32_t i, n;
// try to bypass registers with metadata
for ( i = 0; i < ARRAY_LEN( rx_list_alloc ); i++ ) {
n = rx_list_alloc[i];
if ( mask & ( 1 << n ) ) {
continue;
}
reg = &rx_regs[n];
if ( reg->type_mask != RTYPE_UNUSED ) {
// mark least used item
if ( !used || reg->refcnt < used->refcnt || ( reg->refcnt == used->refcnt && reg->ip < used->ip ) ) {
used = reg;
}
continue;
}
wipe_rx_meta( n );
mask_rx( n );
return n;
}
if ( used ) {
// no free slots but something occupied by metadata
uint32_t idx = used - rx_regs;
wipe_rx_meta( idx );
mask_rx( idx );
return idx;
}
// no free registers, flush bottom of the opStack
for ( i = 0; i <= opstack; i++ ) {
opstack_t *it = opstackv + i;
if ( it->type == TYPE_RX ) {
n = it->value;
// skip masked registers
if ( _rx_mask & ( 1 << n ) ) {
continue;
}
flush_item( it );
flush_items( TYPE_RX, n ); // flush cloned registers too
wipe_rx_meta( n );
mask_rx( n );
return n;
}
}
return ~0U;
}
// integer register allocation
static uint32_t alloc_rx( uint32_t pref )
{
uint32_t reg;
#ifdef DYN_ALLOC_RX
if ( ( pref & FORCED ) == 0 ) {
uint32_t v = dyn_alloc_rx( pref );
if ( v == ~0U ) {
DROP( "no free registers at ip %i, pref %x, opStack %i, mask %04x", ip, pref, opstack * 4, build_rx_mask() );
}
return v;
}
#endif
reg = pref & RMASK;
#ifdef DEBUG_VM
if ( reg >= ARRAY_LEN( rx_mask ) )
DROP( "forced register R%i index overflowed!", reg );
else if ( rx_mask[reg] )
DROP( "forced register R%i is already masked!", reg );
#endif
// FORCED option: find and flush target register
flush_items( TYPE_RX, reg );
wipe_rx_meta( reg );
mask_rx( reg );
return reg;
}
static uint32_t dyn_alloc_sx( uint32_t pref )
{
const uint32_t _sx_mask = build_sx_mask();
const uint32_t mask = _sx_mask | build_opstack_mask( TYPE_SX );
const reg_t *reg, *used = NULL;
uint32_t i, n;
// try to bypass registers with metadata
for ( i = 0; i < ARRAY_LEN( sx_list_alloc ); i++ ) {
n = sx_list_alloc[i];
if ( mask & ( 1 << n ) ) {
continue;
}
reg = &sx_regs[n];
if ( reg->type_mask != RTYPE_UNUSED ) {
// mark least used item
if ( !used || reg->refcnt < used->refcnt || ( reg->refcnt == used->refcnt && reg->ip < used->ip ) ) {
used = reg;
}
continue;
}
wipe_sx_meta( n );
mask_sx( n );
return n;
}
if ( used ) {
// no free slots but something occupied by metadata
uint32_t idx = used - sx_regs;
wipe_sx_meta( idx );
mask_sx( idx );
return idx;
}
// no free registers, flush bottom of the opStack
for ( i = 0; i <= opstack; i++ ) {
opstack_t *it = opstackv + i;
if ( it->type == TYPE_SX ) {
n = it->value;
// skip masked registers
if ( _sx_mask & ( 1 << n ) ) {
continue;
}
flush_item( it );
flush_items( TYPE_SX, n ); // flush cloned registers too
wipe_sx_meta( n );
mask_sx( n );
return n;
}
}
return ~0U;
}
// scalar register allocation
static uint32_t alloc_sx( uint32_t pref )
{
uint32_t reg;
#ifdef DYN_ALLOC_SX
if ( ( pref & FORCED ) == 0 ) {
uint32_t v = dyn_alloc_sx( pref );
if ( v == ~0U ) {
DROP( "no free registers at ip %i, pref %x, opStack %i, mask %04x", ip, pref, opstack * 4, build_sx_mask() );
}
return v;
}
#endif
reg = pref & RMASK;
#ifdef DEBUG_VM
if ( reg >= ARRAY_LEN( sx_mask ) )
DROP( "forced register S%i index overflowed!", reg );
else if ( sx_mask[reg] )
DROP( "forced register S%i is already masked!", reg );
#endif
// FORCED option: find and flush target register
flush_items( TYPE_SX, reg );
wipe_sx_meta( reg );
mask_sx( reg );
return reg;
}
/*
==============
flush_volatile
flush any cached register/address/constant to opstack and reset meta (constants mapping)
this MUST be called before any unconditional jump, return or function call
==============
*/
static void flush_volatile( void )
{
int i;
for ( i = 0; i <= opstack; i++ ) {
opstack_t *it = opstackv + i;
if ( it->type == TYPE_RX || it->type == TYPE_SX ) {
flush_item( it );
}
}
// wipe all constants metadata
Com_Memset( &rx_regs[0], 0, sizeof( rx_regs ) );
Com_Memset( &sx_regs[0], 0, sizeof( sx_regs ) );
}
static void flush_opstack( void )
{
int i;
for ( i = 0; i <= opstack; i++ ) {
opstack_t *it = opstackv + i;
flush_item( it );
}
// wipe all constants metadata
Com_Memset( &rx_regs[0], 0, sizeof( rx_regs ) );
Com_Memset( &sx_regs[0], 0, sizeof( sx_regs ) );
}
static void store_rx_opstack( uint32_t reg )
{
opstack_t *it = opstackv + opstack;
#ifdef DEBUG_VM
if ( opstack <= 0 )
DROP( "bad opstack %i", opstack * 4 );
if ( it->type != TYPE_RAW )
DROP( "bad type %i at opstack %i", it->type, opstack * 4 );
#endif
it->type = TYPE_RX;
it->offset = opstack * sizeof( int32_t );
it->value = reg;
it->safe_arg = 0;
unmask_rx( reg ); // so it can be flushed on demand
}
static void store_syscall_opstack( void )
{
opstack_t *it = opstackv + opstack;
#ifdef DEBUG_VM
if ( opstack <= 0 )
DROP( "bad opstack %i", opstack * 4 );
if ( it->type != TYPE_RAW )
DROP( "bad type %i at opstack %i", it->type, opstack * 4 );
#endif
it->type = TYPE_RX;
it->offset = -1; // opstack * sizeof( int32_t )
it->value = R0;
it->safe_arg = 0;
wipe_rx_meta( it->value );
unmask_rx( it->value ); // so it can be flushed on demand
}
static void store_sx_opstack( uint32_t reg )
{
opstack_t *it = opstackv + opstack;
#ifdef DEBUG_VM
if ( opstack <= 0 )
DROP( "bad opstack %i", opstack * 4 );
if ( it->type != TYPE_RAW )
DROP( "bad type %i at opstack %i", it->type, opstack * 4 );
#endif
it->type = TYPE_SX;
it->offset = opstack * sizeof( int32_t );
it->value = reg;
it->safe_arg = 0;
unmask_sx( reg ); // so it can be flushed on demand
}
static void store_item_opstack( instruction_t *ins )
{
opstack_t *it = opstackv + opstack;
#ifdef DEBUG_VM
if ( it->type != TYPE_RAW )
DROP( "bad type %i at opstack %i", it->type, opstack * 4 );
#endif
switch ( ins->op ) {
case OP_CONST: it->type = TYPE_CONST; break;
case OP_LOCAL: it->type = TYPE_LOCAL; break;
default: DROP( "incorrect opcode %i", ins->op );
}
it->offset = opstack * sizeof( int32_t );
it->value = ins->value;
it->safe_arg = ins->safe;
}
static uint32_t finish_rx( uint32_t pref, uint32_t reg ) {
if ( pref & RCONST ) {
// non-destructive operation
return reg;
}
if ( search_opstack( TYPE_RX, reg ) ) {
// another instance is present on opStack
if ( pref & FORCED ) {
// nothing should be left for a FORCED register
flush_items( TYPE_RX, reg );
} else {
// copy it
int rx = alloc_rx( R2 );
emit(MOV(rx, reg));
unmask_rx( reg );
return rx;
}
}
wipe_rx_meta( reg );
return reg;
}
/*
===========
load_rx_opstack
loads the current opStack top value into a register
returns a masked register number which must be unmasked manually if not stored back on the opStack
the output register is likely to be modified unless the RCONST preference is specified
===========
*/
static uint32_t load_rx_opstack( uint32_t pref )
{
opstack_t *it = opstackv + opstack;
uint32_t reg = pref & RMASK;
#ifdef DEBUG_VM
if ( opstack <= 0 )
DROP( "bad opstack %i", opstack*4 );
#endif
if ( it->type == TYPE_RX ) {
#ifdef DYN_ALLOC_RX
if ( !( pref & FORCED ) ) {
mask_rx( it->value );
it->type = TYPE_RAW;
return finish_rx( pref, it->value ); // return current register
}
#endif
// FORCED flag: return exact target register
if ( it->value == reg ) {
mask_rx( it->value );
it->type = TYPE_RAW;
return finish_rx( pref, reg );
} else {
// allocate target register
reg = alloc_rx( pref );
// copy source to target
emit(MOV(reg, it->value));
it->type = TYPE_RAW;
return reg;
}
} // it->type == TYPE_RX
// scalar register on the stack
if ( it->type == TYPE_SX ) {
// move from scalar to general-purpose register
reg = alloc_rx( pref );
emit(VMOVssa(reg, it->value)); // single precision register -> arm core register
it->type = TYPE_RAW;
return reg;
}
if ( ( pref & RCONST ) == 0 ) {
pref |= XMASK;
} // else we can search for constants in masked registers
if ( it->type == TYPE_CONST ) {
// move constant to general-purpose register
reg = alloc_rx_const( pref, it->value );
it->type = TYPE_RAW;
return finish_rx( pref, reg );
}
if ( it->type == TYPE_LOCAL ) {
reg = alloc_rx_local( pref, it->value );
it->type = TYPE_RAW;
return finish_rx( pref, reg );
}
// default raw type, explicit load from opStack
reg = alloc_rx( pref );
emit(LDRai(reg, rOPSTACK, opstack * sizeof(int32_t))); // rX = *opstack
it->type = TYPE_RAW;
return reg;
}
static uint32_t finish_sx( uint32_t pref, uint32_t reg ) {
if ( pref & RCONST ) {
// non-destructive operation
return reg;
}
if ( search_opstack( TYPE_SX, reg ) ) {
// another instance is present on opStack
if ( pref & FORCED ) {
// nothing should be left for a FORCED register
flush_items( TYPE_SX, reg );
} else {
// must be copied
int sx = alloc_sx( S2 );
emit(VMOV(sx, reg));
unmask_sx( reg );
return sx;
}
}
wipe_sx_meta( reg );
return reg;
}
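// load the opStack top as a source register and pick a (possibly different) target register:
// if the source is still referenced on the opStack or a free register exists, a new target
// is allocated so the cached source value survives, otherwise the source register is reused
// and its metadata is wiped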
static void load_rx_opstack2( uint32_t *dst, uint32_t dst_pref, uint32_t *src, uint32_t src_pref )
{
#if 0
*dst = *src = load_rx_opstack( src_pref &= ~RCONST ); // source, target = *opstack
#else
*dst = *src = load_rx_opstack( src_pref | RCONST ); // source, target = *opstack
if ( search_opstack( TYPE_RX, *src ) || find_free_rx() ) {
// *src is duplicated on opStack or there is a free register
*dst = alloc_rx( dst_pref & ~RCONST ); // allocate new register for the target
} else {
// will be overwritten, wipe metadata
wipe_rx_meta( *dst );
}
#endif
}
// we must unmask register manually after allocation/loading
static uint32_t load_sx_opstack( uint32_t pref )
{
opstack_t *it = opstackv + opstack;
uint32_t reg = pref & RMASK;
#ifdef DEBUG_VM
if ( opstack <= 0 )
DROP( "bad opstack %i", opstack*4 );
#endif
// scalar register on the stack
if ( it->type == TYPE_SX ) {
#ifdef DYN_ALLOC_SX
if ( !( pref & FORCED ) ) {
mask_sx( it->value );
it->type = TYPE_RAW;
return finish_sx( pref, it->value );
}
#endif
// FORCED flag: return exact target register
if ( it->value == reg ) {
mask_sx( it->value );
it->type = TYPE_RAW;
return finish_sx( pref, reg );
} else {
// allocate target register
reg = alloc_sx( pref );
// copy source to target
emit(VMOV(reg, it->value));
it->type = TYPE_RAW;
return reg;
}
}
// integer register on the stack
if ( it->type == TYPE_RX ) {
// move from general-purpose to scalar register
// should never happen with FPU type promotion, except syscalls
reg = alloc_sx( pref );
emit(VMOVass(reg, it->value)); // arm core register -> single precision register
it->type = TYPE_RAW;
return reg;
}
if ( ( pref & RCONST ) == 0 ) {
pref |= XMASK;
} // else we can search for constants in masked registers
if ( it->type == TYPE_CONST ) {
// move constant to scalar register
reg = alloc_sx_const( pref, it->value );
it->type = TYPE_RAW;
return finish_sx( pref, reg );
}
if ( it->type == TYPE_LOCAL ) {
uint32_t rx;
// bogus case: local address cast to float
reg = alloc_sx( pref );
rx = alloc_rx_local( R2 | RCONST, it->value );
emit(VMOVass(reg, rx)); // arm core register -> single precision register
unmask_rx( rx );
it->type = TYPE_RAW;
return reg;
}
// default raw type, explicit load from opStack
reg = alloc_sx( pref );
emit(VLDRai(reg, rOPSTACK, opstack * sizeof( int32_t ))); // sX = *opstack
it->type = TYPE_RAW;
return reg;
}
static void load_sx_opstack2( uint32_t *dst, uint32_t dst_pref, uint32_t *src, uint32_t src_pref )
{
#if 0
*dst = *src = load_sx_opstack( src_pref ); // source, target = *opstack
#else
*dst = *src = load_sx_opstack( src_pref | RCONST ); // source, target = *opstack
if ( search_opstack( TYPE_SX, *src ) || find_free_sx() ) {
// *src is duplicated on opStack or there is a free register
*dst = alloc_sx( dst_pref &= ~RCONST ); // allocate new register for the target
} else {
// will be overwritten, wipe metadata
wipe_sx_meta( *dst );
}
#endif
}
static uint32_t get_comp( int op )
{
switch ( op ) {
case OP_EQ: return EQ;
case OP_NE: return NE;
case OP_LTI: return LT;
case OP_LEI: return LE;
case OP_GTI: return GT;
case OP_GEI: return GE;
case OP_LTU: return LO;
case OP_LEU: return LS;
case OP_GTU: return HI;
case OP_GEU: return HS;
case OP_EQF: return EQ;
case OP_NEF: return NE;
case OP_LTF: return MI;
case OP_LEF: return LS;
case OP_GTF: return GT;
case OP_GEF: return GE;
default: DROP( "unexpected op %i", op );
}
return 0;
}
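// B/BL store a signed 24-bit word offset relative to the current instruction + 8
// (the ARM-state PC), giving a +/-32MB range - hence the (ofs - 8) >> 2 below
// and the check that the sign-extension byte is either all ones or all zeros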
static uint32_t encode_offset( uint32_t ofs )
{
const uint32_t x = (ofs - 8) >> 2;
const uint32_t t = x >> 24;
if ( t != 0x3F && t != 0x00 )
DROP( "%s: can't encode %i", __func__, ofs );
return x & 0x00FFFFFF;
}
static void emitAlign( const uint32_t align )
{
while ( compiledOfs & (align-1) )
emit(NOP);
}
static void emitFuncOffset( uint32_t comp, vm_t *vm, offset_t func )
{
uint32_t offset = savedOffset[ func ] - compiledOfs;
emit( cond( comp, BLi( encode_offset( offset ) ) ) );
}
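// data address check: with run-time checks disabled (or forceDataMask set) the address
// is simply wrapped with dataMask, otherwise it is compared against the mask and
// out-of-range accesses branch to the corresponding error stub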
static void emit_CheckReg( vm_t *vm, uint32_t reg, offset_t func )
{
if ( vm->forceDataMask || !( vm_rtChecks->integer & VM_RTCHECK_DATA ) ) {
emit(AND(reg, rDATAMASK, reg)); // rN = rN & rDATAMASK
return;
}
emit( CMP( reg, rDATAMASK ) );
emitFuncOffset( HI, vm, func );
}
static void emit_CheckJump( vm_t *vm, uint32_t reg, int proc_base, int proc_len )
{
if ( ( vm_rtChecks->integer & VM_RTCHECK_JUMP ) == 0 ) {
return;
}
if ( proc_base != -1 ) {
uint32_t rx[2];
// allow jump within local function scope only
// r2 = ip - proc_base
rx[0] = alloc_rx( R2 | TEMP );
if ( can_encode( proc_base ) )
emit(SUBi(rx[0], reg, proc_base)); // r2 = reg - procBase
else {
emit_MOVRxi(rx[0], proc_base); // r2 = procBase
emit(SUB(rx[0], reg, rx[0])); // r2 = reg - r2
}
// (ip > proc_len) ?
if ( can_encode( proc_len ) ) {
emit(CMPi(rx[0], proc_len));
} else {
rx[1] = alloc_rx_const( R1, proc_len ); // r1 = procLen
emit(CMP(rx[0], rx[1]));
unmask_rx( rx[1] );
}
emitFuncOffset( HI, vm, FUNC_OUTJ ); // error if unsigned higher
unmask_rx( rx[0] );
} else {
uint32_t rx = alloc_rx( R2 | TEMP );
// check if reg >= vm->instructionCount
emit_MOVRxi(rx, vm->instructionCount);
//emit(LDRai(rx, rVMBASE, offsetof(vm_t, instructionCount)));
emit(CMP(reg, rx));
emitFuncOffset( HS, vm, FUNC_OUTJ ); // error if unsigned higher or same
unmask_rx( rx );
}
}
static void emit_CheckProc( vm_t *vm, instruction_t *inst )
{
// programStack overflow check
if ( vm_rtChecks->integer & VM_RTCHECK_PSTACK ) {
// check if pStack < vm->stackBottom
uint32_t rx = alloc_rx( R2 | TEMP );
emit(LDRai(rx, rVMBASE, offsetof(vm_t, stackBottom))); // r1 = vm->stackBottom
emit(CMP(rPSTACK, rx));
emitFuncOffset( LT, vm, FUNC_PSOF );
unmask_rx( rx );
}
// opStack overflow check
if ( vm_rtChecks->integer & VM_RTCHECK_OPSTACK ) {
uint32_t n = inst->opStack; // proc->opStack carries max.used opStack value
uint32_t rx = alloc_rx( R2 | TEMP );
if ( can_encode( n ) ) {
emit(ADDi(rx, rOPSTACK, n)); // r2 = opstack + n;
} else {
emit_MOVRxi(rx, n); // r2 = n
emit(ADD(rx, rOPSTACK, rx)); // r2 = opstack + r2;
}
emit(CMP(rx, rOPSTACKTOP));
emitFuncOffset( HI, vm, FUNC_OSOF ); // error if unsigned higher
unmask_rx( rx );
}
}
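// OP_CALL dispatcher: R0 holds the callee instruction index, negative values denote
// system calls (index == ~syscall number); regular calls are bounds-checked and jump
// through instructionPointers[], the syscall path saves the volatile VM registers,
// calls vm->systemCall and stores the result on the opStack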
static void emitCallFunc( vm_t *vm )
{
static int bytes_to_skip = -1;
static unsigned start_block = -1;
init_opstack(); // to avoid any side-effects on emit_CheckJump()
savedOffset[ FUNC_CALL ] = compiledOfs; // to jump from OP_CALL
emit(CMPi(R0, 0)); // check if syscall
if (start_block == -1)
start_block = compiledOfs;
emit(cond(LT, Bi(encode_offset(bytes_to_skip))));
// check if R0 >= header->instructionCount
mask_rx( R0 );
emit_CheckJump( vm, R0, -1, 0 );
unmask_rx( R0 );
// local function call
emit(LDRa(R12, rINSPOINTERS, rLSL(2, R0))); // r12 = instructionPointers[r0]
emit(BX(R12)); // keep LR so OP_LEAVE will return directly to our caller
//emit(BKPT(0));
// syscall
if (bytes_to_skip == -1)
bytes_to_skip = compiledOfs - start_block;
savedOffset[ FUNC_SYSC ] = compiledOfs; // to jump from OP_CALL
emit(MVN(R0, R0)); // r0 = ~r0
savedOffset[ FUNC_SYSF ] = compiledOfs; // to jump from ConstOptimize()
// save LR because it will be clobbered by BLX instruction
emit(PUSH((1<<rOPSTACK)|(1<<rPSTACK)|(1<<rPROCBASE)|(1<<LR)));
// modify VM stack pointer for recursive VM entry
//currentVM->programStack = pstack - 8;
emit(SUBi(R1, rPSTACK, 8)); // r1 = pstack - 8
emit(STRai(R1, rVMBASE, offsetof(vm_t, programStack))); // vm->programStack = r1
//argPosition = (intptr_t *)((byte *)currentVM->dataBase + pstack + 4);
emit(ADDi(R2,rPROCBASE,4)); // r2 = rPROCBASE + 4
//argPosition[0] = call;
emit(STRai(R0, R2, 0)); // r2[0] = r0
emit(MOV(R0,R2));
//ret = currentVM->systemCall( argPosition );
emit(LDRai(R12, rVMBASE, offsetof(vm_t,systemCall))); // r12 = vm->systemCall
emit(BLX(R12)); // call [r12]( r0 )
emit(POP((1<<rOPSTACK)|(1<<rPSTACK)|(1<<rPROCBASE)|(1<<LR)));
// store return value
emit(STRai(R0, rOPSTACK, 4)); // *(opstack+4) = r0
// return to caller
emit(MOV(PC,LR));
}
// R0 - src, R1 - dst, R2 - count, R3 - scratch
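// the prologue masks both pointers with dataMask and recomputes the count from the
// masked end addresses so the copy cannot touch memory outside the VM data segment,
// then copies whole words and finishes with single bytes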
static void emitBlockCopyFunc( vm_t *vm )
{
// adjust R2 if needed
emit(AND(R0, R0, rDATAMASK)); // r0 &= dataMask
emit(AND(R1, R1, rDATAMASK)); // r1 &= dataMask
emit(ADD(R3, R0, R2)); // r3 = r0 + r2
emit(AND(R3, R3, rDATAMASK)); // r3 &= dataMask
emit(SUB(R2, R3, R0)); // r2 = r3 - r0
emit(ADD(R3, R1, R2)); // r3 = r1 + r2
emit(AND(R3, R3, rDATAMASK)); // r3 &= dataMask
emit(SUB(R2, R3, R1)); // r2 = r3 - r1
emit(ADD(R0, R0, rDATABASE)); // r0 += dataBase
emit(ADD(R1, R1, rDATABASE)); // r1 += dataBase
// word copy loop
emit(CMPi(R2, 4));
emit(cond(LT, Bi( encode_offset( +20 ))));
emit(LDRTaiw(R3, R0, 4)); // r3 = [r0]; r0 += 4
emit(STRTaiw(R3, R1, 4)); // [r1] = r3; r1 += 4
emit(SUBi(R2, R2, 4)); // r2 -= 4
emit(Bi(encode_offset( -20 )));
// byte copy loop
emit(CMPi(R2, 1));
emit(cond(LT, Bi(encode_offset( +20 ))));
emit(LDRBTaiw(R3, R0, 1)); // r3 = [r0]; r0 += 1
emit(STRBTaiw(R3, R1, 1)); // [r1] = r3; r1 += 1
emit(SUBi(R2, R2, 1)); // r2 -= 1
emit(Bi(encode_offset( -20 )));
emit(MOV(PC, LR));
}
static void emitBlockCopy( vm_t *vm, const uint32_t count )
{
if ( count <= 12 )
{
uint32_t rx[4];
int safe_arg[2];
int i;
rx[0] = load_rx_opstack( R0 ); // src: r0 = *opstack;
safe_arg[0] = is_safe_arg();
dec_opstack(); // opstack -= 4
rx[1] = load_rx_opstack( R1 ); // dst: r1 = *opstack
safe_arg[1] = is_safe_arg();
dec_opstack(); // opstack -= 4
if ( !safe_arg[0] ) {
emit(AND(rx[0], rx[0], rDATAMASK)); // r0 &= dataMask
}
if ( !safe_arg[1] ) {
emit(AND(rx[1], rx[1], rDATAMASK)); // r1 &= dataMask
}
rx[2] = alloc_rx( R2 | TEMP ); // scratch
emit(ADD(rx[0], rx[0], rDATABASE)); // r0 += dataBase
emit(ADD(rx[1], rx[1], rDATABASE)); // r1 += dataBase
i = 0;
// copy words
for ( ; i < (count & ~3); i += 4 ) {
emit(LDRai(rx[2], rx[0], i)); // r2 = [r0 + i]
emit(STRai(rx[2], rx[1], i)); // [r1 + i] = r2
}
// copy tail bytes
for ( ; i < count; i += 1 ) {
emit(LDRBai(rx[2], rx[0], i)); // r2 = [r0 + i]
emit(STRBai(rx[2], rx[1], i)); // [r1 + i] = r2
}
unmask_rx( rx[2] );
unmask_rx( rx[1] );
unmask_rx( rx[0] );
return;
}
load_rx_opstack( R0 | FORCED ); dec_opstack(); // src: r0 = *opstack; opstack -= 4
load_rx_opstack( R1 | FORCED ); dec_opstack(); // dst: r1 = *opstack; opstack -= 4
alloc_rx( R2 | FORCED ); // counter
alloc_rx( R3 | FORCED ); // scratch
emit_MOVRxi(R2, count);
emitFuncOffset(AL, vm, FUNC_BCPY);
unmask_rx( R3 );
unmask_rx( R2 );
unmask_rx( R1 );
unmask_rx( R0 );
}
#ifdef CONST_OPTIMIZE
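// peephole for OP_CONST: when the next instruction can consume the constant directly
// (arithmetic/logic with an encodable immediate, shifts, jumps, calls, compare-and-branch)
// emit the fused form and skip the separate opStack store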
static qboolean ConstOptimize( vm_t *vm, instruction_t *ci, instruction_t *ni )
{
uint32_t rx[2];
uint32_t sx[2];
switch ( ni->op ) {
case OP_ADD:
case OP_SUB:
case OP_BAND:
case OP_BOR:
case OP_BXOR:
if ( can_encode( ci->value ) ) {
//rx[1] = rx[0] = load_rx_opstack( R0 ); // r0 = *opstack
load_rx_opstack2( &rx[1], R1, &rx[0], R0 ); // r1 = r0 = *opstack
switch ( ni->op ) {
case OP_ADD: emit( ADDi( rx[1], rx[0], ci->value ) ); break; // r1 = r0 + x
case OP_SUB: emit( SUBi( rx[1], rx[0], ci->value ) ); break; // r1 = r0 - x
case OP_BAND: emit( ANDi( rx[1], rx[0], ci->value ) ); break; // r1 = r0 & x
case OP_BOR: emit( ORRi( rx[1], rx[0], ci->value ) ); break; // r1 = r0 | x
case OP_BXOR: emit( EORi( rx[1], rx[0], ci->value ) ); break; // r1 = r0 ^ x
}
if ( rx[0] != rx[1] ) {
unmask_rx( rx[0] );
}
store_rx_opstack( rx[1] ); // *opstack = r1
ip += 1; // OP_ADD
return qtrue;
}
break;
case OP_LSH:
case OP_RSHI:
case OP_RSHU:
if ( ci->value <= 0 || ci->value > 31 )
break;
//rx[1] = rx[0] = load_rx_opstack( R0 ); // r0 = *opstack
load_rx_opstack2( &rx[1], R1, &rx[0], R0 ); // r1 = r0 = *opstack
switch ( ni->op ) {
case OP_LSH: emit( LSLi( rx[1], rx[0], ci->value ) ); break; // r1 = r0 << x
case OP_RSHI: emit( ASRi( rx[1], rx[0], ci->value ) ); break; // r1 = r0 >> x
case OP_RSHU: emit( LSRi( rx[1], rx[0], ci->value ) ); break; // r1 = (unsigned)r0 >> x
}
if ( rx[0] != rx[1] ) {
unmask_rx( rx[0] );
}
store_rx_opstack( rx[1] ); // *opstack = r1
ip += 1;
return qtrue;
case OP_JUMP:
flush_volatile();
emit(Bi(encode_offset(vm->instructionPointers[ ci->value ] - compiledOfs)));
ip += 1; // OP_JUMP
return qtrue;
case OP_CALL:
inc_opstack(); // opstack += 4
if ( ci->value == ~TRAP_SQRT ) {
sx[0] = alloc_sx( S0 | TEMP );
emit(VLDRai(sx[0], rPROCBASE, 8)); // s0 = [procBase + 8]
emit(VSQRT_F32(sx[0], sx[0])); // s0 = sqrt(s0)
store_sx_opstack( sx[0] );
ip += 1;
return qtrue;
}
flush_volatile();
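// sin/cos traps are inlined as direct calls into the C library;
// argument and result passing depends on the float ABI the engine was built with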
if ( ci->value == ~TRAP_SIN || ci->value == ~TRAP_COS ) {
#if (__ARM_PCS_VFP)
// -mfloat-abi=hard
sx[0] = S0; mask_sx( sx[0] );
rx[0] = alloc_rx( R12 );
emit(VLDRai(sx[0], rPROCBASE, 8)); // s0 = [procBase + 8]
if ( ci->value == ~TRAP_SIN )
emit_MOVRxi(rx[0], (intptr_t)sinf);
else
emit_MOVRxi(rx[0], (intptr_t)cosf);
emit(BLX(rx[0]));
unmask_rx( rx[0] );
store_sx_opstack( sx[0] ); // *opstack = s0
#else
// -mfloat-abi=soft or softfp
rx[0] = R0; mask_rx( rx[0] );
rx[1] = R12; mask_rx( rx[1] );
emit(LDRai(rx[0], rPROCBASE, 8)); // r0 = [procBase + 8]
if ( ci->value == ~TRAP_SIN )
emit_MOVRxi(rx[1], (intptr_t)sinf);
else
emit_MOVRxi(rx[1], (intptr_t)cosf);
emit(BLX(rx[1]));
unmask_rx( rx[1] );
store_rx_opstack( rx[0] ); // *opstack = r0
#endif
ip += 1; // OP_CALL
return qtrue;
}
if ( ci->value < 0 ) { // syscall
mask_rx( R0 );
emit_MOVRxi(R0, ~ci->value); // r0 = syscall number
if ( opstack != 1 ) {
emit( ADDi( rOPSTACK, rOPSTACK, (opstack-1)*sizeof(int32_t) ) );
emitFuncOffset( AL, vm, FUNC_SYSF );
emit( SUBi( rOPSTACK, rOPSTACK, (opstack-1)*sizeof(int32_t) ) );
} else {
emitFuncOffset( AL, vm, FUNC_SYSF );
}
ip += 1; // OP_CALL;
store_syscall_opstack();
return qtrue;
}
if ( opstack != 1 ) {
emit( ADDi( rOPSTACK, rOPSTACK, (opstack-1)*sizeof(int32_t) ) );
emit(BLi(encode_offset(vm->instructionPointers[ ci->value ] - compiledOfs)));
emit( SUBi( rOPSTACK, rOPSTACK, (opstack-1)*sizeof(int32_t) ) );
} else {
emit(BLi(encode_offset(vm->instructionPointers[ ci->value ] - compiledOfs)));
}
ip += 1; // OP_CALL;
return qtrue;
case OP_EQ:
case OP_NE:
case OP_GEI:
case OP_GTI:
case OP_GTU:
case OP_GEU:
case OP_LTU:
case OP_LEU:
case OP_LEI:
case OP_LTI:
if ( can_encode( ci->value ) ) {
uint32_t comp = get_comp( ni->op );
rx[0] = load_rx_opstack( R0 | RCONST ); dec_opstack(); // r0 = *opstack; opstack -= 4
emit( CMPi( rx[0], ci->value ) );
emit( cond( comp, Bi( encode_offset( vm->instructionPointers[ni->value] - compiledOfs ) ) ) );
unmask_rx( rx[0] );
ip += 1; // OP_cond
return qtrue;
}
break;
default:
break;
}
return qfalse;
}
#endif // CONST_OPTIMIZE
#ifdef DUMP_CODE
static void dump_code( const char *vmname, uint32_t *code, int32_t code_len )
{
const char *filename = va( "vm-%s.hex", vmname );
fileHandle_t fh = FS_FOpenFileWrite( filename );
if ( fh != FS_INVALID_HANDLE ) {
uint32_t i;
for ( i = 0; i < code_len; i++ ) {
FS_Printf( fh, "%02x %02x %02x %02x\n", ( code[i] >> 0 ) & 0xFF, ( code[i] >> 8 ) & 0xFF, ( code[i] >> 16 ) & 0xFF, ( code[i] >> 24 ) & 0xFF );
}
FS_FCloseFile( fh );
}
}
#endif
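// translate the loaded bytecode image into native ARMv7 code;
// returns qfalse if the CPU lacks the required features or the image fails validation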
qboolean VM_Compile( vm_t *vm, vmHeader_t *header )
{
const char *errMsg;
var_addr_t var;
reg_t *reg;
int proc_base;
int proc_len;
uint32_t rx[3];
uint32_t sx[3];
opcode_t sign_extend;
int var_size;
int i;
if ( ( CPU_Flags & ( CPU_ARMv7 | CPU_VFPv3 ) ) != ( CPU_ARMv7 | CPU_VFPv3 ) ) {
// ARMv7+ is required for MOVW/MOVT/MLS
// VFPv3 is required for VMOVi
return qfalse;
}
inst = (instruction_t*)Z_Malloc( (header->instructionCount + 8 ) * sizeof( instruction_t ) );
//instructionOffsets = (uint32_t*)Z_Malloc( header->instructionCount * sizeof( uint32_t ) );
errMsg = VM_LoadInstructions( (byte *) header + header->codeOffset, header->codeLength, header->instructionCount, inst );
if ( !errMsg ) {
errMsg = VM_CheckInstructions( inst, vm->instructionCount, vm->jumpTableTargets, vm->numJumpTableTargets, vm->exactDataLength );
}
if ( errMsg ) {
VM_FreeBuffers();
Com_Printf( S_COLOR_YELLOW "%s(%s) error: %s\n", __func__, vm->name, errMsg );
return qfalse;
}
if ( !vm->instructionPointers ) {
vm->instructionPointers = Hunk_Alloc( header->instructionCount * sizeof(vm->instructionPointers[0]), h_high );
}
VM_ReplaceInstructions( vm, inst );
memset( savedOffset, 0, sizeof( savedOffset ) );
code = NULL;
vm->codeBase.ptr = NULL;
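// the initial pass runs with code == NULL to measure the required size (compiledOfs);
// executable memory is then allocated and __recompile re-emits everything into it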
for ( pass = 0; pass < NUM_PASSES; pass++ ) {
__recompile:
// translate all instructions
ip = 0;
compiledOfs = 0;
proc_base = -1;
proc_len = 0;
init_opstack();
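// prologue of the generated entry stub, executed when vm->codeBase.func() is called:
// save the callee-saved registers, load the VM state into the reserved registers,
// call vmMain(), then restore and return to the C caller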
emit(PUSH(R4_R11|(1<<LR))); // push R4-R11, LR
emit(SUBi(SP, SP, 12)); // align stack to 16 bytes
// these are constant at compile time
emit_MOVRxi(rVMBASE, (intptr_t)vm);
emit(LDRai(rINSPOINTERS, rVMBASE, offsetof(vm_t, instructionPointers)));
emit(LDRai(rDATABASE, rVMBASE, offsetof(vm_t, dataBase)));
emit(LDRai(rDATAMASK, rVMBASE, offsetof(vm_t, dataMask)));
// these are volatile variables
emit(LDRai(rPSTACK, rVMBASE, offsetof(vm_t, programStack)));
emit(LDRai(rOPSTACK, rVMBASE, offsetof(vm_t, opStack)));
emit(LDRai(rOPSTACKTOP, rVMBASE, offsetof(vm_t, opStackTop)));
emitFuncOffset( AL, vm, FUNC_ENTR ); // call vmMain()
#ifdef DEBUG_VM
emit(STRai(rPSTACK, rVMBASE, offsetof(vm_t, programStack))); // vm->programStack = rPSTACK;
#endif
emit(ADDi(SP, SP, 12)); // align stack to 16 bytes
emit(POP(R4_R11|(1<<PC))); // pop R4-R11, LR -> PC
//emit(BKPT(0));
#ifdef FUNC_ALIGN
emitAlign( FUNC_ALIGN );
#endif
savedOffset[ FUNC_ENTR ] = compiledOfs; // offset to vmMain() entry point
while ( ip < header->instructionCount ) {
ci = &inst[ ip + 0 ];
#ifdef REGS_OPTIMIZE
if ( ci->jused )
#endif
{
// register optimizations can only be carried across an instruction that is
// guaranteed not to be a jump target; otherwise flush the volatile registers
flush_volatile();
}
vm->instructionPointers[ ip++ ] = compiledOfs;
switch ( ci->op )
{
case OP_UNDEF:
emit(BKPT(1));
break;
case OP_IGNORE:
break;
case OP_BREAK:
emit(BKPT(3));
break;
case OP_ENTER:
#ifdef FUNC_ALIGN
emitAlign( FUNC_ALIGN );
#endif
vm->instructionPointers[ ip - 1 ] = compiledOfs;
proc_base = ip; // this points to the next instruction after OP_ENTER
// locate endproc
for ( proc_len = -1, i = ip; i < header->instructionCount; i++ ) {
if ( inst[ i ].op == OP_PUSH && inst[ i + 1 ].op == OP_LEAVE ) {
proc_len = i - proc_base;
break;
}
}
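// proc_base/proc_len describe the current procedure so that
// emit_CheckJump() can verify that computed jump targets stay inside it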
emit(PUSH((1<<rOPSTACK)|(1<<rPSTACK)|(1<<rPROCBASE)|(1<<LR)));
if ( can_encode( ci->value ) ) {
emit(SUBi(rPSTACK, rPSTACK, ci->value)); // pstack -= arg
} else {
rx[0] = alloc_rx_const( R2, ci->value ); // r2 = arg
emit(SUB(rPSTACK, rPSTACK, rx[0])); // pstack -= r2
unmask_rx( rx[0] );
}
emit_CheckProc( vm, ci );
emit(ADD(rPROCBASE, rPSTACK, rDATABASE));
break;
case OP_LEAVE:
flush_opstack();
dec_opstack(); // opstack -= 4
#ifdef DEBUG_VM
if ( opstack != 0 )
DROP( "opStack corrupted on OP_LEAVE" );
#endif
emit(POP((1<<rOPSTACK)|(1<<rPSTACK)|(1<<rPROCBASE)|(1<<PC)));
break;
case OP_CALL:
rx[0] = load_rx_opstack( R0 | FORCED ); // r0 = *opstack
flush_volatile();
if ( opstack != 1 ) {
emit( ADDi( rOPSTACK, rOPSTACK, (opstack-1)*sizeof(int32_t) ) );
emitFuncOffset(AL, vm, FUNC_CALL);
emit( SUBi( rOPSTACK, rOPSTACK, (opstack-1)*sizeof(int32_t) ) );
} else {
emitFuncOffset(AL, vm, FUNC_CALL);
}
unmask_rx( rx[0] );
break;
case OP_PUSH:
inc_opstack(); // opstack += 4
if ( (ci + 1)->op == OP_LEAVE ) {
proc_base = -1;
}
break;
case OP_POP:
dec_opstack_discard(); // opstack -= 4
break;
case OP_CONST:
#ifdef CONST_OPTIMIZE
if ( ConstOptimize( vm, ci + 0, ci + 1 ) )
break;
#endif
inc_opstack(); // opstack += 4
store_item_opstack( ci );
break;
case OP_LOCAL:
inc_opstack(); // opstack += 4
store_item_opstack( ci );
break;
case OP_JUMP:
rx[0] = load_rx_opstack( R0 | RCONST ); dec_opstack(); // r0 = *opstack; opstack -= 4
flush_volatile();
emit_CheckJump( vm, rx[0], proc_base, proc_len ); // check if r0 is within current proc
rx[1] = alloc_rx( R12 );
emit(LDRa(rx[1], rINSPOINTERS, rLSL(2, rx[0]))); // r12 = instructionPointers[ r0 ]
emit(BX(rx[1]));
unmask_rx( rx[1] );
unmask_rx( rx[0] );
break;
case OP_EQ:
case OP_NE:
case OP_LTI:
case OP_LEI:
case OP_GTI:
case OP_GEI:
case OP_LTU:
case OP_LEU:
case OP_GTU:
case OP_GEU: {
uint32_t comp = get_comp( ci->op );
rx[0] = load_rx_opstack( R0 | RCONST ); dec_opstack(); // r0 = *opstack; opstack -= 4
rx[1] = load_rx_opstack( R1 | RCONST ); dec_opstack(); // r1 = *opstack; opstack -= 4
unmask_rx( rx[0] );
unmask_rx( rx[1] );
emit(CMP(rx[1], rx[0]));
emit(cond(comp, Bi(encode_offset(vm->instructionPointers[ci->value] - compiledOfs))));
}
break;
case OP_EQF:
case OP_NEF:
case OP_LTF:
case OP_LEF:
case OP_GTF:
case OP_GEF: {
uint32_t comp = get_comp( ci->op );
sx[0] = load_sx_opstack( S0 | RCONST ); dec_opstack(); // s0 = *opstack; opstack -= 4
sx[1] = load_sx_opstack( S1 | RCONST ); dec_opstack(); // s1 = *opstack; opstack -= 4
unmask_sx( sx[0] );
unmask_sx( sx[1] );
emit(VCMP_F32(sx[1], sx[0]));
emit(VMRS(APSR_nzcv));
emit(cond(comp, Bi(encode_offset(vm->instructionPointers[ci->value] - compiledOfs))));
}
break;
case OP_LOAD1:
case OP_LOAD2:
case OP_LOAD4:
#ifdef FPU_OPTIMIZE
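// FPU path: an OP_LOAD4 whose result feeds float instructions (ci->fpu)
// is loaded straight into a VFP register, avoiding a GPR->VFP transfer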
if ( ci->op == OP_LOAD4 && ci->fpu ) {
if ( addr_on_top( &var ) ) {
// address specified by CONST/LOCAL
discard_top();
var.size = 4;
if ( find_sx_var( &sx[0], &var ) ) {
// already cached in some register
mask_sx( sx[0] );
} else {
// not cached, perform load
sx[0] = alloc_sx( S0 );
if ( var.addr < 1024 && ( var.addr & 3 ) == 0 ) {
// short offset
emit( VLDRai( sx[0], var.base, var.addr ) ); // s0 = var.base[var.addr]
} else {
// long offset
if ( can_encode( var.addr ) ) {
rx[1] = alloc_rx( R1 );
emit( ADDi( rx[1], var.base, var.addr ) ); // r1 = var.base + var.addr
emit( VLDRai( sx[0], rx[1], 0 ) ); // s0 = [r1]
unmask_rx( rx[1] );
} else {
rx[1] = alloc_rx_const( R1, var.addr ); // r1 = var.addr
rx[2] = alloc_rx( R2 );
emit( ADD( rx[2], rx[1], var.base ) ); // r2 = r1 + var.base
emit( VLDRai( sx[0], rx[2], 0 ) ); // s0 = [r2]
unmask_rx( rx[1] );
unmask_rx( rx[2] );
}
}
set_sx_var( sx[0], &var ); // update metadata, this may wipe constant
}
} else {
// address specified by register
rx[0] = load_rx_opstack( R0 ); // r0 = *opstack
emit_CheckReg( vm, rx[0], FUNC_BADR );
sx[0] = alloc_sx( S0 );
// no indexing register mode for VLDR...
emit( ADD( rx[0], rx[0], rDATABASE ) ); // r0 = r0 + database
emit( VLDRai( sx[0], rx[0], 0 ) ); // s0 = [r0]
unmask_rx( rx[0] );
}
store_sx_opstack( sx[0] ); // *opstack = s0
break;
}
#endif
switch ( ci->op ) {
case OP_LOAD1: var_size = 1; sign_extend = OP_SEX8; break;
case OP_LOAD2: var_size = 2; sign_extend = OP_SEX16; break;
default: var_size = 4; sign_extend = OP_UNDEF; break;
}
// integer path
if ( addr_on_top( &var ) ) {
// address specified by CONST/LOCAL
discard_top();
var.size = var_size;
if ( ( reg = find_rx_var( &rx[0], &var ) ) != NULL ) {
// already cached in some register
// do zero extension if needed
switch ( ci->op ) {
case OP_LOAD1:
if ( reg->ext != Z_EXT8 ) {
emit( UXTB( rx[0], rx[0] ) ); // r0 = (unsigned byte) r0
// invalidate any mappings that overlaps with high [8..31] bits
//var.addr += 1; var.size = 3;
//wipe_reg_range( rx_regs + rx[0], &var );
reduce_map_size( reg, 1 );
// modify constant
reg->cnst.value &= 0xFF;
reg->ext = Z_EXT8;
}
break;
case OP_LOAD2:
if ( reg->ext != Z_EXT16 ) {
emit( UXTH( rx[0], rx[0] ) ); // r0 = (unsigned short) r0
// invalidate any mappings that overlaps with high [16..31] bits
//var.addr += 2; var.size = 2;
//wipe_reg_range( rx_regs + rx[0], &var );
reduce_map_size( reg, 2 );
// modify constant
reg->cnst.value &= 0xFFFF;
reg->ext = Z_EXT16;
}
break;
case OP_LOAD4:
reg->ext = Z_NONE;
break;
}
mask_rx( rx[0] );
} else {
// not cached, perform load
int max_offset;
rx[0] = alloc_rx( R0 );
switch ( ci->op ) {
case OP_LOAD1: max_offset = 4096; break;
case OP_LOAD2: max_offset = 256; break;
default: max_offset = 4096; break;
}
if ( ( ci + 1 )->op == sign_extend && sign_extend != OP_UNDEF ) {
// load with sign-extension
if ( var.addr < 256 ) {
// short offset
switch ( ci->op ) {
case OP_LOAD1: emit( LDRSBai( rx[0], var.base, var.addr ) ); set_rx_ext( rx[0], S_EXT8 ); break; // r0 = (signed char)var.base[var.addr]
case OP_LOAD2: emit( LDRSHai( rx[0], var.base, var.addr ) ); set_rx_ext( rx[0], S_EXT16 ); break; // r0 = (signed short)var.base[var.addr]
}
} else {
// long offset
rx[1] = alloc_rx_const( R1, var.addr );
switch ( ci->op ) {
case OP_LOAD1: emit( LDRSBa( rx[0], var.base, rx[1] ) ); set_rx_ext( rx[0], S_EXT8 ); break; // r0 = (signed char)var.base[r1]
case OP_LOAD2: emit( LDRSHa( rx[0], var.base, rx[1] ) ); set_rx_ext( rx[0], S_EXT16 ); break; // r0 = (signed short)var.base[r1]
}
unmask_rx( rx[1] );
}
ip += 1; // OP_SEX8 | OP_SEX16
} else {
// load with zero-extension
if ( var.addr < max_offset ) {
// short offset
switch ( ci->op ) {
case OP_LOAD1: emit( LDRBai( rx[0], var.base, var.addr ) ); set_rx_ext( rx[0], Z_EXT8 ); break; // r0 = (unsigned char)var.base[var.addr]
case OP_LOAD2: emit( LDRHai( rx[0], var.base, var.addr ) ); set_rx_ext( rx[0], Z_EXT16 ); break; // r0 = (unsigned short)var.base[var.addr]
default: emit( LDRai( rx[0], var.base, var.addr ) ); set_rx_ext( rx[0], Z_NONE ); break; // r0 = var.base[var.addr]
}
} else {
// long offset
rx[1] = alloc_rx_const( R1, var.addr );
switch ( ci->op ) {
case OP_LOAD1: emit( LDRBa( rx[0], var.base, rx[1] ) ); set_rx_ext( rx[0], Z_EXT8 ); break; // r0 = (unsigned char)var.base[r1]
case OP_LOAD2: emit( LDRHa( rx[0], var.base, rx[1] ) ); set_rx_ext( rx[0], Z_EXT16 ); break; // r0 = (unsigned short)var.base[r1]
default: emit( LDRa( rx[0], var.base, rx[1] ) ); set_rx_ext( rx[0], Z_NONE ); break; // r0 = var.base[r1]
}
unmask_rx( rx[1] );
}
} // load with zero-extension
set_rx_var( rx[0], &var );
} // not cached, perform load
} else {
// address specified by register
rx[0] = load_rx_opstack( R0 ); // r0 = *opstack
emit_CheckReg( vm, rx[0], FUNC_BADR );
if ( (ci+1)->op == sign_extend && sign_extend != OP_UNDEF ) {
// merge with following sign-extension instruction
switch ( ci->op ) {
case OP_LOAD1: emit( LDRSBa( rx[0], rDATABASE, rx[0] ) ); set_rx_ext( rx[0], S_EXT8 ); break; // r0 = (signed char)dataBase[r0]
case OP_LOAD2: emit( LDRSHa( rx[0], rDATABASE, rx[0] ) ); set_rx_ext( rx[0], S_EXT16 ); break; // r0 = (signed short)dataBase[r0]
}
ip += 1; // OP_SEX8/OP_SEX16
} else {
// usual load with zero-extension
switch ( ci->op ) {
case OP_LOAD1: emit( LDRBa( rx[0], rDATABASE, rx[0] ) ); set_rx_ext( rx[0], Z_EXT8 ); break; // r0 = (unsigned char)dataBase[r0]
case OP_LOAD2: emit( LDRHa( rx[0], rDATABASE, rx[0] ) ); set_rx_ext( rx[0], Z_EXT16 ); break; // r0 = (unsigned short)dataBase[r0]
default: emit( LDRa( rx[0], rDATABASE, rx[0] ) ); set_rx_ext( rx[0], Z_NONE ); break; // r0 = dataBase[r0]
}
}
}
store_rx_opstack( rx[0] ); // *opstack = target
break;
case OP_STORE1:
case OP_STORE2:
case OP_STORE4:
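// scalar path: if the opstack top is already held in a VFP register,
// store it directly with VSTR instead of moving it through a GPR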
if ( scalar_on_top() && ci->op == OP_STORE4 ) {
sx[0] = load_sx_opstack( S0 | RCONST ); dec_opstack(); // s0 = *opstack; opstack -= 4
if ( addr_on_top( &var ) ) {
// address specified by CONST/LOCAL
discard_top(); dec_opstack();
var.size = 4;
if ( var.addr < 1024 && (var.addr & 3) == 0 ) {
// short offset
emit( VSTRai( sx[0], var.base, var.addr ) ); // var.base[var.addr] = s0
} else {
// long offset
if ( can_encode( var.addr ) ) {
rx[1] = alloc_rx( R1 );
emit( ADDi( rx[1], var.base, var.addr ) ); // r1 = var.base + var.addr
emit( VSTRai( sx[0], rx[1], 0 ) ); // [r1] = s0
unmask_rx( rx[1] );
} else {
rx[1] = alloc_rx_const( R1, var.addr ); // r1 = var.addr
rx[2] = alloc_rx( R2 );
emit( ADD( rx[2], rx[1], var.base ) ); // r2 = r1 + var.base
emit( VSTRai( sx[0], rx[2], 0 ) ); // [r2] = s0
unmask_rx( rx[1] );
unmask_rx( rx[2] );
}
}
wipe_var_range( &var );
set_sx_var( sx[0], &var ); // update metadata
} else {
// address specified by register
rx[1] = load_rx_opstack( R1 ); dec_opstack(); // r1 = *opstack; opstack -= 4
emit_CheckReg( vm, rx[1], FUNC_BADW );
emit( ADD( rx[1], rx[1], rDATABASE ) ); // r1 = r1 + dataBase
emit( VSTRai( sx[0], rx[1], 0 ) ); // [r1] = s0
unmask_rx( rx[1] );
wipe_vars(); // unknown/dynamic address, wipe all register mappings
}
unmask_sx( sx[0] );
} else {
// integer path
rx[0] = load_rx_opstack( R0 | RCONST ); dec_opstack(); // r0 = *opstack; opstack -= 4
if ( addr_on_top( &var ) ) {
// address specified by CONST/LOCAL
int max_offset;
discard_top(); dec_opstack();
switch ( ci->op ) {
case OP_STORE1: var.size = 1; max_offset = 4096; break;
case OP_STORE2: var.size = 2; max_offset = 256; break;
default: var.size = 4; max_offset = 4096; break;
}
if ( var.addr < max_offset ) {
// short offset
switch ( ci->op ) {
case OP_STORE1: emit( STRBai( rx[0], var.base, var.addr ) ); break; // (byte*)var.base[var.addr] = r0
case OP_STORE2: emit( STRHai( rx[0], var.base, var.addr ) ); break; // (short*)var.base[var.addr] = r0
default: emit( STRai( rx[0], var.base, var.addr ) ); break; // var.base[var.addr] = r0
}
} else {
// long offset
rx[1] = alloc_rx_const( R1, var.addr );
switch ( ci->op ) {
case OP_STORE1: emit( STRBa( rx[0], var.base, rx[1] ) ); break; // (byte*)var.base[r1] = r0
case OP_STORE2: emit( STRHa( rx[0], var.base, rx[1] ) ); break; // (short*)var.base[r1] = r0
default: emit( STRa( rx[0], var.base, rx[1] ) ); break; // var.base[r1] = r0
}
unmask_rx( rx[1] );
}
wipe_var_range( &var );
set_rx_var( rx[0], &var ); // update metadata
} else {
// address specified by register
rx[1] = load_rx_opstack( R1 | RCONST ); dec_opstack(); // r1 = *opstack; opstack -= 4
emit_CheckReg( vm, rx[1], FUNC_BADW );
switch ( ci->op ) {
case OP_STORE1: emit( STRBa( rx[0], rDATABASE, rx[1] ) ); break; // (byte*)database[r1] = r0
case OP_STORE2: emit( STRHa( rx[0], rDATABASE, rx[1] ) ); break; // (short*)database[r1] = r0
default: emit( STRa( rx[0], rDATABASE, rx[1] ) ); break; // database[r1] = r0
}
unmask_rx( rx[1] );
wipe_vars(); // unknown/dynamic address, wipe all register mappings
}
unmask_rx( rx[0] );
}
break;
case OP_ARG:
var.base = rPROCBASE;
var.addr = ci->value;
var.size = 4;
wipe_var_range( &var );
if ( scalar_on_top() ) {
sx[0] = load_sx_opstack( S0 | RCONST ); dec_opstack(); // s0 = *opstack; opstack -=4
// ci->value is in the range [8..252], so it fits into the VSTRai immediate encoding
emit(VSTRai(sx[0], var.base, var.addr)); // [procBase + v] = s0
unmask_sx( sx[0] );
} else {
rx[0] = load_rx_opstack( R0 | RCONST ); dec_opstack(); // r0 = *opstack; opstack -=4
emit(STRai(rx[0], var.base, var.addr)); // [procBase + v] = r0
unmask_rx( rx[0] );
}
break;
case OP_BLOCK_COPY:
emitBlockCopy( vm, ci->value );
wipe_vars();
break;
case OP_SEX8:
case OP_SEX16:
case OP_NEGI:
case OP_BCOM:
if ( ci->op == OP_SEX8 || ci->op == OP_SEX16 ) {
// skip sign-extension for `if ( var == 0 )` tests if we already zero-extended
reg = rx_on_top();
if ( reg && (ci+1)->op == OP_CONST && (ci+1)->value == 0 && ( (ci+2)->op == OP_EQ || (ci+2)->op == OP_NE ) ) {
if ( !(ci+1)->jused && !(ci+2)->jused ) {
if ( ci->op == OP_SEX8 && reg->ext == Z_EXT8 ) {
break;
}
if ( ci->op == OP_SEX16 && reg->ext == Z_EXT16 ) {
break;
}
}
}
}
//rx[1] = rx[0] = load_rx_opstack( R0 ); // r0 = *opstack
load_rx_opstack2( &rx[1], R0, &rx[0], R1 ); // rx1 = r0 = *opstack
switch ( ci->op ) {
case OP_SEX8: emit(SXTB(rx[1], rx[0])); break; // r1 = sign extend r0
case OP_SEX16: emit(SXTH(rx[1], rx[0])); break; // r1 = sign extend r0
case OP_NEGI: emit(RSBi(rx[1], rx[0], 0)); break; // r1 = -r0
case OP_BCOM: emit( MVN(rx[1], rx[0])); break; // r1 = ~r0
}
if ( rx[0] != rx[1] ) {
unmask_rx( rx[0] );
}
store_rx_opstack( rx[1] ); // *opstack = r1
break;
case OP_DIVI:
case OP_DIVU:
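// use hardware SDIV/UDIV when the CPU supports integer division,
// otherwise fall back to the EABI runtime helpers (__aeabi_idiv/__aeabi_uidiv)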
if ( CPU_Flags & CPU_IDIVA ) {
rx[0] = load_rx_opstack( R0 ); dec_opstack(); // r0 = *opstack
rx[1] = load_rx_opstack( R1 ); // opstack-=4; r1 = *opstack
if ( ci->op == OP_DIVI ) {
emit(SDIV(rx[0], rx[1], rx[0]));
} else {
emit(UDIV(rx[0], rx[1], rx[0]));
}
store_rx_opstack( rx[0] ); // *opstack = r0
unmask_rx( rx[1] );
} else {
rx[1] = load_rx_opstack( R1 | FORCED ); dec_opstack(); // r1 = *opstack
rx[0] = load_rx_opstack( R0 | FORCED ); // opstack-=4; r0 = *opstack
rx[2] = alloc_rx( R12 );
flush_volatile();
if ( ci->op == OP_DIVI )
emit_MOVRxi(rx[2], (intptr_t)__aeabi_idiv);
else
emit_MOVRxi(rx[2], (intptr_t)__aeabi_uidiv);
emit(BLX(rx[2]));
store_rx_opstack( rx[0] ); // *opstack = r0
unmask_rx( rx[1] );
unmask_rx( rx[2] );
}
break;
case OP_MODI:
case OP_MODU:
if ( CPU_Flags & CPU_IDIVA ) {
rx[0] = load_rx_opstack( R0 ); dec_opstack(); // r0 = *opstack
rx[1] = load_rx_opstack( R1 ); // opstack-=4; r1 = *opstack
rx[2] = alloc_rx( R2 | TEMP );
if ( ci->op == OP_MODI ) {
emit(SDIV(rx[2], rx[1], rx[0])); // r2 = r1 / r0
} else {
emit(UDIV(rx[2], rx[1], rx[0])); // r2 = (unsigned)r1 / r0
}
emit(MLS(rx[0], rx[0], rx[2], rx[1])); // r0 = r1 - r0 * r2
store_rx_opstack( rx[0] ); // *opstack = r0
unmask_rx( rx[1] );
unmask_rx( rx[2] );
} else {
rx[1] = load_rx_opstack( R1 | FORCED ); dec_opstack(); // r1 = *opstack
rx[0] = load_rx_opstack( R0 | FORCED ); // opstack-=4; r0 = *opstack
rx[2] = alloc_rx( R12 );
flush_volatile();
if ( ci->op == OP_MODI )
emit_MOVRxi(rx[2], (intptr_t)__aeabi_idivmod);
else
emit_MOVRxi(rx[2], (intptr_t)__aeabi_uidivmod);
emit(BLX(rx[2]));
store_rx_opstack( rx[1] ); // *opstack = r1
unmask_rx( rx[0] );
unmask_rx( rx[2] );
}
break;
case OP_ADD:
case OP_SUB:
case OP_MULI:
case OP_MULU:
case OP_BAND:
case OP_BOR:
case OP_BXOR:
case OP_LSH:
case OP_RSHI:
case OP_RSHU:
//rx[2] = rx[0] = load_rx_opstack( R0 ); dec_opstack(); // r0 = *opstack
load_rx_opstack2( &rx[2], R0, &rx[0], R2 ); dec_opstack(); // r2 = r0 = *opstack
rx[1] = load_rx_opstack( R1 | RCONST ); // opstack-=4; r1 = *opstack
switch ( ci->op ) {
case OP_ADD: emit(ADD(rx[2], rx[1], rx[0])); break; // r2 = r1 + r0
case OP_SUB: emit(SUB(rx[2], rx[1], rx[0])); break; // r2 = r1 - r0
case OP_MULI:
case OP_MULU: emit(MUL(rx[2], rx[1], rx[0])); break; // r2 = r1 * r0
case OP_BAND: emit(AND(rx[2], rx[1], rx[0])); break; // r2 = r1 & r0
case OP_BOR: emit(ORR(rx[2], rx[1], rx[0])); break; // r2 = r1 | r0
case OP_BXOR: emit(EOR(rx[2], rx[1], rx[0])); break; // r2 = r1 ^ r0
case OP_LSH: emit(LSL(rx[2], rx[1], rx[0])); break; // r2 = r1 << r0
case OP_RSHI: emit(ASR(rx[2], rx[1], rx[0])); break; // r2 = r1 >> r0
case OP_RSHU: emit(LSR(rx[2], rx[1], rx[0])); break; // r2 = (unsigned)r1 >> r0
}
if ( rx[0] != rx[2] ) {
unmask_rx( rx[0] );
}
unmask_rx( rx[1] );
store_rx_opstack( rx[2] ); // *opstack = r2
break;
case OP_ADDF:
case OP_SUBF:
case OP_MULF:
case OP_DIVF:
//sx[2] = sx[0] = load_sx_opstack( S0 ); dec_opstack(); // s0 = *opstack
load_sx_opstack2( &sx[2], S0, &sx[0], S2 ); dec_opstack(); // s2 = s0 = *opstack
sx[1] = load_sx_opstack( S1 | RCONST ); // opstack -= 4; s1 = *opstack
switch ( ci->op ) {
case OP_ADDF: emit(VADD_F32(sx[2], sx[1], sx[0])); break; // s2 = s1 + s0
case OP_SUBF: emit(VSUB_F32(sx[2], sx[1], sx[0])); break; // s2 = s1 - s0
case OP_MULF: emit(VMUL_F32(sx[2], sx[1], sx[0])); break; // s2 = s1 * s0
case OP_DIVF: emit(VDIV_F32(sx[2], sx[1], sx[0])); break; // s2 = s1 / s0
}
if ( sx[0] != sx[2] ) {
unmask_sx( sx[0] );
}
unmask_sx( sx[1] );
store_sx_opstack( sx[2] ); // *opstack = s2
break;
case OP_NEGF:
case OP_CVIF:
case OP_CVFI:
sx[0] = load_sx_opstack( S0 ); // s0 = *opstack
switch ( ci->op ) {
case OP_NEGF: emit( VNEG_F32( sx[0], sx[0] ) ); break; // s0 = -s0
case OP_CVIF: emit( VCVT_F32_S32( sx[0], sx[0] ) ); break; // s0 = (float)s0
case OP_CVFI: emit( VCVT_S32_F32( sx[0], sx[0] ) ); break; // s0 = (int)s0
}
store_sx_opstack( sx[0] ); // *opstack = s0
break;
} // switch op
} // ip
#ifdef FUNC_ALIGN
emitAlign( FUNC_ALIGN );
#endif
// it will set multiple offsets
emitCallFunc( vm );
#ifdef FUNC_ALIGN
emitAlign( FUNC_ALIGN );
#endif
savedOffset[ FUNC_BCPY ] = compiledOfs;
emitBlockCopyFunc( vm );
savedOffset[ FUNC_BADJ ] = compiledOfs;
emit_MOVRxi(R12, (intptr_t)BadJump);
emit(BLX(R12));
//emit(BKPT(0));
savedOffset[ FUNC_OUTJ ] = compiledOfs;
emit_MOVRxi(R12, (intptr_t)OutJump);
emit(BLX(R12));
//emit(BKPT(0));
savedOffset[ FUNC_OSOF ] = compiledOfs;
emit_MOVRxi(R12, (intptr_t)ErrBadOpStack);
emit(BLX(R12));
//emit(BKPT(0));
savedOffset[ FUNC_PSOF ] = compiledOfs;
emit_MOVRxi(R12, (intptr_t)ErrBadProgramStack);
emit(BLX(R12));
//emit(BKPT(0));
savedOffset[FUNC_BADR] = compiledOfs;
emit_MOVRxi( R12, (intptr_t) ErrBadDataRead );
emit( BLX( R12 ) );
savedOffset[FUNC_BADW] = compiledOfs;
emit_MOVRxi( R12, (intptr_t) ErrBadDataWrite );
emit( BLX( R12 ) );
//emit(BKPT(0));
} // pass
if ( vm->codeBase.ptr == NULL ) {
#ifdef _WIN32
vm->codeBase.ptr = VirtualAlloc( NULL, compiledOfs, MEM_COMMIT, PAGE_EXECUTE_READWRITE );
if ( !vm->codeBase.ptr ) {
VM_FreeBuffers();
Com_Printf( S_COLOR_YELLOW "%s(%s): VirtualAlloc failed\n", __func__, vm->name );
return qfalse;
}
#else
vm->codeBase.ptr = mmap( NULL, compiledOfs, PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0 );
if ( vm->codeBase.ptr == MAP_FAILED ) {
VM_FreeBuffers();
Com_Printf( S_COLOR_YELLOW "%s(%s): mmap failed\n", __func__, vm->name );
return qfalse;
}
#endif
vm->codeLength = compiledOfs;
vm->codeSize = compiledOfs;
code = (uint32_t*)vm->codeBase.ptr;
goto __recompile;
}
#ifdef DUMP_CODE
dump_code( vm->name, code, compiledOfs / 4 );
#endif
// offset all the instruction pointers for the new location
for ( i = 0; i < header->instructionCount; i++ ) {
if ( !inst[i].jused ) {
vm->instructionPointers[ i ] = (intptr_t)BadJump;
continue;
}
vm->instructionPointers[ i ] += (intptr_t)vm->codeBase.ptr;
}
VM_FreeBuffers();
#ifdef _WIN32
{
DWORD oldProtect = 0;
// remove write permissions
if ( !VirtualProtect( vm->codeBase.ptr, vm->codeLength, PAGE_EXECUTE_READ, &oldProtect ) ) {
VM_Destroy_Compiled( vm );
Com_Printf( S_COLOR_YELLOW "%s(%s): VirtualProtect failed\n", __func__, vm->name );
return qfalse;
}
}
#else
if ( mprotect( vm->codeBase.ptr, vm->codeLength, PROT_READ | PROT_EXEC ) ) {
VM_Destroy_Compiled( vm );
Com_Printf( S_COLOR_YELLOW "%s(%s): mprotect failed\n", __func__, vm->name );
return qfalse;
}
// clear icache, http://blogs.arm.com/software-enablement/141-caches-and-self-modifying-code/
__clear_cache( vm->codeBase.ptr, vm->codeBase.ptr + vm->codeLength );
#endif
vm->destroy = VM_Destroy_Compiled;
Com_Printf( "VM file %s compiled to %i bytes of code\n", vm->name, vm->codeLength );
return qtrue;
}
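// entry point used by the engine to call into the compiled VM image:
// reserves room for the vmMain arguments on the program stack, points
// vm->opStack at a local buffer and jumps into the generated code;
// the return value is left in opStack[1]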
int32_t VM_CallCompiled( vm_t *vm, int nargs, int32_t *args )
{
int32_t opStack[ MAX_OPSTACK_SIZE ];
int32_t stackOnEntry;
int32_t *image;
int i;
// we might be called recursively, so this might not be the very top
stackOnEntry = vm->programStack;
vm->programStack -= ( MAX_VMMAIN_CALL_ARGS + 2 ) * sizeof( int32_t );
// set up the stack frame
image = (int32_t*) ( vm->dataBase + vm->programStack );
for ( i = 0; i < nargs; i++ ) {
image[i + 2] = args[i];
}
// these only needed for interpreter:
// image[1] = 0; // return stack
// image[0] = -1; // will terminate loop on return
#ifdef DEBUG_VM
opStack[0] = 0xDEADC0DE;
#endif
opStack[1] = 0;
vm->opStack = opStack;
vm->opStackTop = opStack + ARRAY_LEN( opStack ) - 1;
vm->codeBase.func(); // go into generated code
#ifdef DEBUG_VM
if ( opStack[0] != 0xDEADC0DE ) {
Com_Error( ERR_DROP, "%s(%s): opStack corrupted in compiled code", __func__, vm->name );
}
if ( vm->programStack != stackOnEntry - ( MAX_VMMAIN_CALL_ARGS + 2 ) * sizeof( int32_t ) ) {
Com_Error( ERR_DROP, "%s(%s): programStack corrupted in compiled code", __func__, vm->name );
}
#endif
vm->programStack = stackOnEntry;
return opStack[1];
}