/* when I say JIT, I mean load time, not execution time. notes: qc jump offsets are all constants. we have no variable offset jumps (other than function calls/returns) field remapping... fields are in place, and cannot be adjusted. if a field is not set to 0, its assumed to be a constant. optimisations: none at the moment... instructions need to be chained. stuff that writes to C should be cacheable, etc. maybe we don't even need to do the write to C it should also be possible to fold in eq+ifnot, so none of this silly storeing of floats in equality tests this means that we need to track which vars are cached and in what form: fpreg, ireg+floatasint, ireg+float. certain qccx hacks can use fpu operations on ints, so do what the instruction says, rather than considering an add an add regardless of types. OP_AND_F, OP_OR_F etc will generally result in ints, and we should be able to keep them as ints if they combine with other ints. some instructions are jump sites. any cache must be flushed before the start of the instruction. some variables are locals, and will only ever be written by a single instruction, then read by the following instruction. such temps do not need to be written, or are overwritten later in the function anyway. such locals need to be calculated PER FUNCTION as (fte)qcc can overlap locals making multiple distinct locals on a single offset. store locals on a proper stack instead of the current absurd mechanism. eax - tmp ebx - prinst->edicttable ecx - tmp edx - tmp esi - debug opcode number edi - tmp (because its preserved by subfunctions ebp - to use gas to provide binary opcodes: vim -N blob.s && as blob.s && objdump.exe -d a.out notable mods to test: prydon gate, due to fpu mangling to carry values between maps */ #define PROGSUSED #include "progsint.h" #ifdef QCJIT #ifndef _WIN32 #include #endif static float ta, tb, nullfloat=0; struct jitstate { unsigned int *statementjumps; //[MAX_STATEMENTS*3] unsigned char **statementoffsets; //[MAX_STATEMENTS] unsigned int numjumps; unsigned char *code; unsigned int codesize; unsigned int jitstatements; }; static void EmitByte(struct jitstate *jit, unsigned char byte) { jit->code[jit->codesize++] = byte; } static void Emit4Byte(struct jitstate *jit, unsigned int value) { jit->code[jit->codesize++] = (value>> 0)&0xff; jit->code[jit->codesize++] = (value>> 8)&0xff; jit->code[jit->codesize++] = (value>>16)&0xff; jit->code[jit->codesize++] = (value>>24)&0xff; } static void EmitAdr(struct jitstate *jit, void *value) { Emit4Byte(jit, (unsigned int)value); } static void EmitFloat(struct jitstate *jit, float value) { union {float f; unsigned int i;} u; u.f = value; Emit4Byte(jit, u.i); } static void Emit2Byte(struct jitstate *jit, unsigned short value) { jit->code[jit->codesize++] = (value>> 0)&0xff; jit->code[jit->codesize++] = (value>> 8)&0xff; } static void EmitFOffset(struct jitstate *jit, void *func, int bias) { union {void *f; unsigned int i;} u; u.f = func; u.i -= (unsigned int)&jit->code[jit->codesize+bias]; Emit4Byte(jit, u.i); } static void Emit4ByteJump(struct jitstate *jit, int statementnum, int offset) { jit->statementjumps[jit->numjumps++] = jit->codesize; jit->statementjumps[jit->numjumps++] = statementnum; jit->statementjumps[jit->numjumps++] = offset; //the offset is filled in later jit->codesize += 4; } enum { REG_EAX, REG_ECX, REG_EDX, REG_EBX, REG_ESP, REG_EBP, REG_ESI, REG_EDI }; #define XOR(sr,dr) EmitByte(0x31);EmitByte(0xc0 | (sr<<3) | dr); #define CLEARREG(reg) XOR(reg,reg) #define LOADREG(addr, reg) if (reg == REG_EAX) {EmitByte(0xa1);} else {EmitByte(0x8b); EmitByte((reg<<3) | 0x05);} EmitAdr(addr); #define STOREREG(reg, addr) if (reg == REG_EAX) {EmitByte(0xa3);} else {EmitByte(0x89); EmitByte((reg<<3) | 0x05);} EmitAdr(addr); #define STOREF(f, addr) EmitByte(0xc7);EmitByte(0x05); EmitAdr(addr);EmitFloat(f); #define STOREI(i, addr) EmitByte(0xc7);EmitByte(0x05); EmitAdr(addr);Emit4Byte(i); #define SETREGI(val,reg) EmitByte(0xbe);Emit4Byte(val); static void *LocalLoc(struct jitstate *jit) { return &jit->code[jit->codesize]; } static void *LocalJmp(struct jitstate *jit, int cond) { /*floating point ops don't set the sign flag, thus we use the 'above/below' instructions instead of 'greater/less' instructions*/ if (cond == OP_GOTO) EmitByte(jit, 0xeb); //jmp else if (cond == OP_LE_F) EmitByte(jit, 0x76); //jbe else if (cond == OP_GE_F) EmitByte(jit, 0x73); //jae else if (cond == OP_LT_F) EmitByte(jit, 0x72); //jb else if (cond == OP_GT_F) EmitByte(jit, 0x77); //ja else if (cond == OP_LE_I) EmitByte(jit, 0x7e); //jle else if (cond == OP_LT_I) EmitByte(jit, 0x7c); //jl else if ((cond >= OP_NE_F && cond <= OP_NE_FNC) || cond == OP_NE_I) EmitByte(jit, 0x75); //jne else if ((cond >= OP_EQ_F && cond <= OP_EQ_FNC) || cond == OP_EQ_I) EmitByte(jit, 0x74); //je #if defined(DEBUG) && defined(_WIN32) else { OutputDebugString("oh noes!\n"); return NULL; } #endif EmitByte(jit, 0); return LocalLoc(jit); } static void LocalJmpLoc(void *jmp, void *loc) { int offs; unsigned char *a = jmp; offs = (char *)loc - (char *)jmp; #if defined(DEBUG) && defined(_WIN32) if (offs > 127 || offs <= -128) { OutputDebugStringA("bad jump\n"); a[-2] = 0xcd; a[-1] = 0xcc; return; } #endif a[-1] = offs; } static void FixupJumps(struct jitstate *jit) { unsigned int j; unsigned char *codesrc; unsigned char *codedst; unsigned int offset; unsigned int v; for (j = 0; j < jit->numjumps;) { v = jit->statementjumps[j++]; codesrc = &jit->code[v]; v = jit->statementjumps[j++]; codedst = jit->statementoffsets[v]; v = jit->statementjumps[j++]; offset = (int)(codedst - (codesrc-v)); //3rd term because the jump is relative to the instruction start, not the instruction's offset codesrc[0] = (offset>> 0)&0xff; codesrc[1] = (offset>> 8)&0xff; codesrc[2] = (offset>>16)&0xff; codesrc[3] = (offset>>24)&0xff; } } int ASMCALL PR_LeaveFunction (progfuncs_t *progfuncs); int ASMCALL PR_EnterFunction (progfuncs_t *progfuncs, dfunction_t *f, int progsnum); void PR_CloseJit(struct jitstate *jit) { if (jit) { free(jit->statementjumps); free(jit->statementoffsets); #ifndef _WIN32 munmap(jit->code, jit->jitstatements * 500); #else free(jit->code); #endif free(jit) } } #define EmitByte(v) EmitByte(jit, v) #define EmitAdr(v) EmitAdr(jit, v) #define EmitFOffset(a,b) EmitFOffset(jit, a, b) #define Emit4ByteJump(a,b) Emit4ByteJump(jit, a, b) #define Emit4Byte(v) Emit4Byte(jit, v) #define EmitFloat(v) EmitFloat(jit, v) #define LocalJmp(v) LocalJmp(jit, v) #define LocalLoc() LocalLoc(jit) struct jitstate *PR_GenerateJit(progfuncs_t *progfuncs) { struct jitstate *jit; void *j0, *l0; void *j1, *l1; void *j2, *l2; unsigned int i; dstatement16_t *op = (dstatement16_t*)current_progstate->statements; unsigned int numstatements = current_progstate->progs->numstatements; int *glob = (int*)current_progstate->globals; if (current_progstate->numbuiltins) return NULL; jit = malloc(sizeof(*jit)); jit->jitstatements = numstatements; jit->statementjumps = malloc(numstatements*12); jit->statementoffsets = malloc(numstatements*4); #ifndef _WIN32 jit->code = mmap(NULL, numstatements*500, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); #else jit->code = malloc(numstatements*500); #endif if (!jit->code) return NULL; jit->numjumps = 0; jit->codesize = 0; for (i = 0; i < numstatements; i++) { jit->statementoffsets[i] = &jit->code[jit->codesize]; /*DEBUG*/ SETREGI(op[i].op, REG_ESI); switch(op[i].op) { //jumps case OP_IF_I: //integer compare //if a, goto b //cmpl $0,glob[A] EmitByte(0x83);EmitByte(0x3d);EmitAdr(glob + op[i].a);EmitByte(0x0); //jne B EmitByte(0x0f);EmitByte(0x85);Emit4ByteJump(i + (signed short)op[i].b, -4); break; case OP_IFNOT_I: //integer compare //if !a, goto b //cmpl $0,glob[A] EmitByte(0x83);EmitByte(0x3d);EmitAdr(glob + op[i].a);EmitByte(0x0); //je B EmitByte(0x0f);EmitByte(0x84);Emit4ByteJump(i + (signed short)op[i].b, -4); break; case OP_GOTO: EmitByte(0xE9);Emit4ByteJump(i + (signed short)op[i].a, -4); break; //function returns case OP_DONE: case OP_RETURN: //done and return are the same //part 1: store A into OFS_RETURN if (!op[i].a) { //assumption: anything that returns address 0 is a void or zero return. //thus clear eax and copy that to the return vector. CLEARREG(REG_EAX); STOREREG(REG_EAX, glob + OFS_RETURN+0); STOREREG(REG_EAX, glob + OFS_RETURN+1); STOREREG(REG_EAX, glob + OFS_RETURN+2); } else { LOADREG(glob + op[i].a+0, REG_EAX); LOADREG(glob + op[i].a+1, REG_EDX); LOADREG(glob + op[i].a+2, REG_ECX); STOREREG(REG_EAX, glob + OFS_RETURN+0); STOREREG(REG_EDX, glob + OFS_RETURN+1); STOREREG(REG_ECX, glob + OFS_RETURN+2); } //call leavefunction to get the return address // pushl progfuncs EmitByte(0x68);EmitAdr(progfuncs); // call PR_LeaveFunction EmitByte(0xe8);EmitFOffset(PR_LeaveFunction, 4); // add $4,%esp EmitByte(0x83);EmitByte(0xc4);EmitByte(0x04); // movl pr_depth,%edx EmitByte(0x8b);EmitByte(0x15);EmitAdr(&pr_depth); // cmp prinst->exitdepth,%edx EmitByte(0x3b);EmitByte(0x15);EmitAdr(&prinst->exitdepth); // je returntoc j1 = LocalJmp(OP_EQ_E); // mov statementoffsets[%eax*4],%eax EmitByte(0x8b);EmitByte(0x04);EmitByte(0x85);EmitAdr(jit->statementoffsets+1); // jmp *eax EmitByte(0xff);EmitByte(0xe0); // returntoc: l1 = LocalLoc(); // ret EmitByte(0xc3); LocalJmpLoc(j1,l1); break; //function calls case OP_CALL0: case OP_CALL1: case OP_CALL2: case OP_CALL3: case OP_CALL4: case OP_CALL5: case OP_CALL6: case OP_CALL7: case OP_CALL8: //save the state in place the rest of the engine can cope with //movl $i, pr_xstatement EmitByte( 0xc7);EmitByte(0x05);EmitAdr(&pr_xstatement);Emit4Byte(i); //movl $(op[i].op-OP_CALL0), pr_argc EmitByte( 0xc7);EmitByte(0x05);EmitAdr(&pr_argc);Emit4Byte(op[i].op-OP_CALL0); //figure out who we're calling, and what that involves //%eax = glob[A] LOADREG(glob + op[i].a, REG_EAX); //eax is now the func num //mov %eax,%ecx EmitByte(0x89); EmitByte(0xc1); //shr $24,%ecx EmitByte(0xc1); EmitByte(0xe9); EmitByte(0x18); //ecx is now the progs num for the new func //cmp %ecx,pr_typecurrent EmitByte(0x39); EmitByte(0x0d); EmitAdr(&pr_typecurrent); //je sameprogs j1 = LocalJmp(OP_EQ_I); { //can't handle switching progs //FIXME: recurse though PR_ExecuteProgram //push eax //push progfuncs //call PR_ExecuteProgram //add $8,%esp //remember to change the je above //err... exit depth? no idea EmitByte(0xcd);EmitByte(op[i].op); //int $X //ret EmitByte(0xc3); } //sameprogs: l1 = LocalLoc(); LocalJmpLoc(j1,l1); //andl $0x00ffffff, %eax EmitByte(0x25);Emit4Byte(0x00ffffff); //mov $sizeof(dfunction_t),%edx EmitByte(0xba);Emit4Byte(sizeof(dfunction_t)); //mul %edx EmitByte(0xf7); EmitByte(0xe2); //add pr_functions,%eax EmitByte(0x05); EmitAdr(pr_functions); //eax is now the dfunction_t to be called //edx is clobbered. //mov (%eax),%edx EmitByte(0x8b);EmitByte(0x10); //edx is now the first statement number //cmp $0,%edx EmitByte(0x83);EmitByte(0xfa);EmitByte(0x00); //jl isabuiltin j1 = LocalJmp(OP_LT_I); { /* call the function*/ //push %ecx EmitByte(0x51); //push %eax EmitByte(0x50); //pushl progfuncs EmitByte(0x68);EmitAdr(progfuncs); //call PR_EnterFunction EmitByte(0xe8);EmitFOffset(PR_EnterFunction, 4); //sub $12,%esp EmitByte(0x83);EmitByte(0xc4);EmitByte(0xc); //eax is now the next statement number (first of the new function, usually equal to ecx, but not always) //jmp statementoffsets[%eax*4] EmitByte(0xff);EmitByte(0x24);EmitByte(0x85);EmitAdr(jit->statementoffsets+1); } /*its a builtin, figure out which, and call it*/ //isabuiltin: l1 = LocalLoc(); LocalJmpLoc(j1,l1); //push current_progstate->globals EmitByte(0x68);EmitAdr(current_progstate->globals); //push progfuncs EmitByte(0x68);EmitAdr(progfuncs); //neg %edx EmitByte(0xf7);EmitByte(0xda); //call externs->globalbuiltins[%edx,4] //FIXME: make sure this dereferences EmitByte(0xff);EmitByte(0x14);EmitByte(0x95);EmitAdr(externs->globalbuiltins); //add $8,%esp EmitByte(0x83);EmitByte(0xc4);EmitByte(0x8); //but that builtin might have been Abort() LOADREG(&prinst->continuestatement, REG_EAX); //cmp $-1,%eax EmitByte(0x83);EmitByte(0xf8);EmitByte(0xff); //je donebuiltincall j1 = LocalJmp(OP_EQ_I); { //mov $-1,prinst->continuestatement EmitByte(0xc7);EmitByte(0x05);EmitAdr(&prinst->continuestatement);Emit4Byte((unsigned int)-1); //jmp statementoffsets[%eax*4] EmitByte(0xff);EmitByte(0x24);EmitByte(0x85);EmitAdr(jit->statementoffsets); } //donebuiltincall: l1 = LocalLoc(); LocalJmpLoc(j1,l1); break; case OP_MUL_F: //flds glob[A] EmitByte(0xd9);EmitByte(0x05);EmitAdr(glob + op[i].a); //fmuls glob[B] EmitByte(0xd8);EmitByte(0x0d);EmitAdr(glob + op[i].b); //fstps glob[C] EmitByte(0xd9);EmitByte(0x1d);EmitAdr(glob + op[i].c); break; case OP_DIV_F: //flds glob[A] EmitByte(0xd9);EmitByte(0x05);EmitAdr(glob + op[i].a); //fdivs glob[B] EmitByte(0xd8);EmitByte(0x35);EmitAdr(glob + op[i].b); //fstps glob[C] EmitByte(0xd9);EmitByte(0x1d);EmitAdr(glob + op[i].c); break; case OP_ADD_F: //flds glob[A] EmitByte(0xd9);EmitByte(0x05);EmitAdr(glob + op[i].a); //fadds glob[B] EmitByte(0xd8);EmitByte(0x05);EmitAdr(glob + op[i].b); //fstps glob[C] EmitByte(0xd9);EmitByte(0x1d);EmitAdr(glob + op[i].c); break; case OP_SUB_F: //flds glob[A] EmitByte(0xd9);EmitByte(0x05);EmitAdr(glob + op[i].a); //fsubs glob[B] EmitByte(0xd8);EmitByte(0x25);EmitAdr(glob + op[i].b); //fstps glob[C] EmitByte(0xd9);EmitByte(0x1d);EmitAdr(glob + op[i].c); break; case OP_NOT_F: //fldz EmitByte(0xd9);EmitByte(0xee); //fcomps glob[A] EmitByte(0xd8); EmitByte(0x1d); EmitAdr(glob + op[i].a); //fnstsw %ax EmitByte(0xdf);EmitByte(0xe0); //testb 0x40,%ah EmitByte(0xf6);EmitByte(0xc4);EmitByte(0x40); j1 = LocalJmp(OP_NE_F); { STOREF(0.0f, glob + op[i].c); j2 = LocalJmp(OP_GOTO); } { //noteq: l1 = LocalLoc(); STOREF(1.0f, glob + op[i].c); } //end: l2 = LocalLoc(); LocalJmpLoc(j1,l1); LocalJmpLoc(j2,l2); break; case OP_STORE_F: case OP_STORE_S: case OP_STORE_ENT: case OP_STORE_FLD: case OP_STORE_FNC: LOADREG(glob + op[i].a, REG_EAX); STOREREG(REG_EAX, glob + op[i].b); break; case OP_STORE_V: LOADREG(glob + op[i].a+0, REG_EAX); LOADREG(glob + op[i].a+1, REG_EDX); LOADREG(glob + op[i].a+2, REG_ECX); STOREREG(REG_EAX, glob + op[i].b+0); STOREREG(REG_EDX, glob + op[i].b+1); STOREREG(REG_ECX, glob + op[i].b+2); break; case OP_LOAD_F: case OP_LOAD_S: case OP_LOAD_ENT: case OP_LOAD_FLD: case OP_LOAD_FNC: case OP_LOAD_V: //a is the ent number, b is the field //c is the dest LOADREG(glob + op[i].a, REG_EAX); LOADREG(glob + op[i].b, REG_ECX); //FIXME: bound eax (ent number) //FIXME: bound ecx (field index) //mov (ebx,eax,4).%eax EmitByte(0x8b); EmitByte(0x04); EmitByte(0x83); //eax is now an edictrun_t //mov fields(,%eax,4),%edx EmitByte(0x8b);EmitByte(0x50);EmitByte((int)&((edictrun_t*)NULL)->fields); //edx is now the field array for that ent //mov fieldajust(%edx,%ecx,4),%eax EmitByte(0x8b); EmitByte(0x84); EmitByte(0x8a); Emit4Byte(progfuncs->fieldadjust*4); STOREREG(REG_EAX, glob + op[i].c) if (op[i].op == OP_LOAD_V) { //mov fieldajust+4(%edx,%ecx,4),%eax EmitByte(0x8b); EmitByte(0x84); EmitByte(0x8a); Emit4Byte(4+progfuncs->fieldadjust*4); STOREREG(REG_EAX, glob + op[i].c+1) //mov fieldajust+8(%edx,%ecx,4),%eax EmitByte(0x8b); EmitByte(0x84); EmitByte(0x8a); Emit4Byte(8+progfuncs->fieldadjust*4); STOREREG(REG_EAX, glob + op[i].c+2) } break; case OP_ADDRESS: //a is the ent number, b is the field //c is the dest LOADREG(glob + op[i].a, REG_EAX); LOADREG(glob + op[i].b, REG_ECX); //FIXME: bound eax (ent number) //FIXME: bound ecx (field index) //mov (ebx,eax,4).%eax EmitByte(0x8b); EmitByte(0x04); EmitByte(0x83); //eax is now an edictrun_t //mov fields(,%eax,4),%edx EmitByte(0x8b);EmitByte(0x50);EmitByte((int)&((edictrun_t*)NULL)->fields); //edx is now the field array for that ent //mov fieldajust(%edx,%ecx,4),%eax //offset = progfuncs->fieldadjust //EmitByte(0x8d); EmitByte(0x84); EmitByte(0x8a); EmitByte(progfuncs->fieldadjust*4); EmitByte(0x8d); EmitByte(0x84); EmitByte(0x8a); Emit4Byte(progfuncs->fieldadjust*4); STOREREG(REG_EAX, glob + op[i].c); break; case OP_STOREP_F: case OP_STOREP_S: case OP_STOREP_ENT: case OP_STOREP_FLD: case OP_STOREP_FNC: LOADREG(glob + op[i].a, REG_EAX); LOADREG(glob + op[i].b, REG_ECX); //mov %eax,(%ecx) EmitByte(0x89);EmitByte(0x01); break; case OP_STOREP_V: LOADREG(glob + op[i].b, REG_ECX); LOADREG(glob + op[i].a+0, REG_EAX); //mov %eax,0(%ecx) EmitByte(0x89);EmitByte(0x01); LOADREG(glob + op[i].a+1, REG_EAX); //mov %eax,4(%ecx) EmitByte(0x89);EmitByte(0x41);EmitByte(0x04); LOADREG(glob + op[i].a+2, REG_EAX); //mov %eax,8(%ecx) EmitByte(0x89);EmitByte(0x41);EmitByte(0x08); break; case OP_NE_I: case OP_NE_E: case OP_NE_FNC: case OP_EQ_I: case OP_EQ_E: case OP_EQ_FNC: //integer equality LOADREG(glob + op[i].a, REG_EAX); //cmp glob[B],%eax EmitByte(0x3b); EmitByte(0x04); EmitByte(0x25); EmitAdr(glob + op[i].b); j1 = LocalJmp(op[i].op); { STOREF(0.0f, glob + op[i].c); j2 = LocalJmp(OP_GOTO); } { l1 = LocalLoc(); STOREF(1.0f, glob + op[i].c); } l2 = LocalLoc(); LocalJmpLoc(j1,l1); LocalJmpLoc(j2,l2); break; case OP_NOT_I: case OP_NOT_ENT: case OP_NOT_FNC: //cmp glob[B],$0 EmitByte(0x83); EmitByte(0x3d); EmitAdr(glob + op[i].a); EmitByte(0x00); j1 = LocalJmp(OP_NE_I); { STOREF(1.0f, glob + op[i].c); j2 = LocalJmp(OP_GOTO); } { l1 = LocalLoc(); STOREF(0.0f, glob + op[i].c); } l2 = LocalLoc(); LocalJmpLoc(j1,l1); LocalJmpLoc(j2,l2); break; case OP_BITOR_F: //floats... //flds glob[A] EmitByte(0xd9); EmitByte(0x05);EmitAdr(glob + op[i].a); //flds glob[B] EmitByte(0xd9); EmitByte(0x05);EmitAdr(glob + op[i].b); //fistp tb EmitByte(0xdb); EmitByte(0x1d);EmitAdr(&tb); //fistp ta EmitByte(0xdb); EmitByte(0x1d);EmitAdr(&ta); LOADREG(&ta, REG_EAX) //or %eax,tb EmitByte(0x09); EmitByte(0x05);EmitAdr(&tb); //fild tb EmitByte(0xdb); EmitByte(0x05);EmitAdr(&tb); //fstps glob[C] EmitByte(0xd9); EmitByte(0x1d);EmitAdr(glob + op[i].c); break; case OP_BITAND_F: //flds glob[A] EmitByte(0xd9); EmitByte(0x05);EmitAdr(glob + op[i].a); //flds glob[B] EmitByte(0xd9); EmitByte(0x05);EmitAdr(glob + op[i].b); //fistp tb EmitByte(0xdb); EmitByte(0x1d);EmitAdr(&tb); //fistp ta EmitByte(0xdb); EmitByte(0x1d);EmitAdr(&ta); /*two args are now at ta and tb*/ LOADREG(&ta, REG_EAX) //and tb,%eax EmitByte(0x21); EmitByte(0x05);EmitAdr(&tb); /*we just wrote the int value to tb, convert that to a float and store it at c*/ //fild tb EmitByte(0xdb); EmitByte(0x05);EmitAdr(&tb); //fstps glob[C] EmitByte(0xd9); EmitByte(0x1d);EmitAdr(glob + op[i].c); break; case OP_AND_F: //test floats properly, so we don't get confused with -0.0 //flds glob[A] EmitByte(0xd9); EmitByte(0x05); EmitAdr(glob + op[i].a); //fcomps nullfloat EmitByte(0xd8); EmitByte(0x1d); EmitAdr(&nullfloat); //fnstsw %ax EmitByte(0xdf); EmitByte(0xe0); //test $0x40,%ah EmitByte(0xf6); EmitByte(0xc4);EmitByte(0x40); //jz onefalse EmitByte(0x75); EmitByte(0x1f); //flds glob[B] EmitByte(0xd9); EmitByte(0x05); EmitAdr(glob + op[i].b); //fcomps nullfloat EmitByte(0xd8); EmitByte(0x1d); EmitAdr(&nullfloat); //fnstsw %ax EmitByte(0xdf); EmitByte(0xe0); //test $0x40,%ah EmitByte(0xf6); EmitByte(0xc4);EmitByte(0x40); //jnz onefalse EmitByte(0x75); EmitByte(0x0c); //mov float0,glob[C] EmitByte(0xc7); EmitByte(0x05); EmitAdr(glob + op[i].c); EmitFloat(1.0f); //jmp done EmitByte(0xeb); EmitByte(0x0a); //onefalse: //mov float1,glob[C] EmitByte(0xc7); EmitByte(0x05); EmitAdr(glob + op[i].c); EmitFloat(0.0f); //done: break; case OP_OR_F: //test floats properly, so we don't get confused with -0.0 //flds glob[A] EmitByte(0xd9); EmitByte(0x05); EmitAdr(glob + op[i].a); //fcomps nullfloat EmitByte(0xd8); EmitByte(0x1d); EmitAdr(&nullfloat); //fnstsw %ax EmitByte(0xdf); EmitByte(0xe0); //test $0x40,%ah EmitByte(0xf6); EmitByte(0xc4);EmitByte(0x40); //je onetrue EmitByte(0x74); EmitByte(0x1f); //flds glob[B] EmitByte(0xd9); EmitByte(0x05); EmitAdr(glob + op[i].b); //fcomps nullfloat EmitByte(0xd8); EmitByte(0x1d); EmitAdr(&nullfloat); //fnstsw %ax EmitByte(0xdf); EmitByte(0xe0); //test $0x40,%ah EmitByte(0xf6); EmitByte(0xc4);EmitByte(0x40); //je onetrue EmitByte(0x74); EmitByte(0x0c); //mov float0,glob[C] EmitByte(0xc7); EmitByte(0x05); EmitAdr(glob + op[i].c); EmitFloat(0.0f); //jmp done EmitByte(0xeb); EmitByte(0x0a); //onetrue: //mov float1,glob[C] EmitByte(0xc7); EmitByte(0x05); EmitAdr(glob + op[i].c); EmitFloat(1.0f); //done: break; case OP_EQ_S: case OP_NE_S: { //put a in ecx LOADREG(glob + op[i].a, REG_ECX); //put b in edi LOADREG(glob + op[i].b, REG_EDI); /* //early out if they're equal //cmp %ecx,%edi EmitByte(0x39); EmitByte(0xc0 | (REG_EDI<<3) | REG_ECX); j1c = LocalJmp(OP_EQ_S); //if a is 0, check if b is "" //jecxz ais0 EmitByte(0xe3); EmitByte(0x1a); //if b is 0, check if a is "" //cmp $0,%edi EmitByte(0x83); EmitByte(0xff); EmitByte(0x00); //jne bnot0 EmitByte(0x75); EmitByte(0x2a); { //push a EmitByte(0x51); //push progfuncs EmitByte(0x68); EmitAdr(progfuncs); //call PR_StringToNative EmitByte(0xe8); EmitFOffset(PR_StringToNative,4); //add $8,%esp EmitByte(0x83); EmitByte(0xc4); EmitByte(0x08); //cmpb $0,(%eax) EmitByte(0x80); EmitByte(0x38); EmitByte(0x00); j1b = LocalJmp(OP_EQ_S); j0b = LocalJmp(OP_GOTO); } //ais0: { //push edi EmitByte(0x57); //push progfuncs EmitByte(0x68); EmitAdr(progfuncs); //call PR_StringToNative EmitByte(0xe8); EmitFOffset(PR_StringToNative,4); //add $8,%esp EmitByte(0x83); EmitByte(0xc4); EmitByte(0x08); //cmpb $0,(%eax) EmitByte(0x80); EmitByte(0x38); EmitByte(0x00); //je _true EmitByte(0x74); EmitByte(0x36); //jmp _false EmitByte(0xeb); EmitByte(0x28); } //bnot0: */ LOADREG(glob + op[i].a, REG_ECX); //push ecx EmitByte(0x51); //push progfuncs EmitByte(0x68); EmitAdr(progfuncs); //call PR_StringToNative EmitByte(0xe8); EmitFOffset(PR_StringToNative,4); //push %eax EmitByte(0x50); LOADREG(glob + op[i].b, REG_EDI); //push %edi EmitByte(0x57); //push progfuncs EmitByte(0x68); EmitAdr(progfuncs); //call PR_StringToNative EmitByte(0xe8); EmitFOffset(PR_StringToNative,4); //add $8,%esp EmitByte(0x83); EmitByte(0xc4); EmitByte(0x08); //push %eax EmitByte(0x50); //call strcmp EmitByte(0xe8); EmitFOffset(strcmp,4); //add $16,%esp EmitByte(0x83); EmitByte(0xc4); EmitByte(0x10); //cmp $0,%eax EmitByte(0x83); EmitByte(0xf8); EmitByte(0x00); j1 = LocalJmp(OP_EQ_S); { l0 = LocalLoc(); STOREF((op[i].op == OP_NE_S)?1.0f:0.0f, glob + op[i].c); j2 = LocalJmp(OP_GOTO); } { l1 = LocalLoc(); STOREF((op[i].op == OP_NE_S)?0.0f:1.0f, glob + op[i].c); } l2 = LocalLoc(); // LocalJmpLoc(j0b, l0); LocalJmpLoc(j1, l1); // LocalJmpLoc(j1b, l1); LocalJmpLoc(j2, l2); } break; case OP_NOT_S: LOADREG(glob + op[i].a, REG_EAX) //cmp $0,%eax EmitByte(0x83); EmitByte(0xf8); EmitByte(0x00); j2 = LocalJmp(OP_EQ_S); //push %eax EmitByte(0x50); //push progfuncs EmitByte(0x68); EmitAdr(progfuncs); //call PR_StringToNative EmitByte(0xe8); EmitFOffset(PR_StringToNative,4); //add $8,%esp EmitByte(0x83); EmitByte(0xc4); EmitByte(0x08); //cmpb $0,(%eax) EmitByte(0x80); EmitByte(0x38); EmitByte(0x00); j1 = LocalJmp(OP_EQ_S); { STOREF(0.0f, glob + op[i].c); j0 = LocalJmp(OP_GOTO); } { l1 = LocalLoc(); STOREF(1.0f, glob + op[i].c); } l2 = LocalLoc(); LocalJmpLoc(j2, l1); LocalJmpLoc(j1, l1); LocalJmpLoc(j0, l2); break; case OP_ADD_V: //flds glob[A] EmitByte(0xd9);EmitByte(0x05);EmitAdr(glob + op[i].a+0); //fadds glob[B] EmitByte(0xd8);EmitByte(0x05);EmitAdr(glob + op[i].b+0); //fstps glob[C] EmitByte(0xd9);EmitByte(0x1d);EmitAdr(glob + op[i].c+0); //flds glob[A] EmitByte(0xd9);EmitByte(0x05);EmitAdr(glob + op[i].a+1); //fadds glob[B] EmitByte(0xd8);EmitByte(0x05);EmitAdr(glob + op[i].b+1); //fstps glob[C] EmitByte(0xd9);EmitByte(0x1d);EmitAdr(glob + op[i].c+1); //flds glob[A] EmitByte(0xd9);EmitByte(0x05);EmitAdr(glob + op[i].a+2); //fadds glob[B] EmitByte(0xd8);EmitByte(0x05);EmitAdr(glob + op[i].b+2); //fstps glob[C] EmitByte(0xd9);EmitByte(0x1d);EmitAdr(glob + op[i].c+2); break; case OP_SUB_V: //flds glob[A] EmitByte(0xd9);EmitByte(0x05);EmitAdr(glob + op[i].a+0); //fsubs glob[B] EmitByte(0xd8);EmitByte(0x25);EmitAdr(glob + op[i].b+0); //fstps glob[C] EmitByte(0xd9);EmitByte(0x1d);EmitAdr(glob + op[i].c+0); //flds glob[A] EmitByte(0xd9);EmitByte(0x05);EmitAdr(glob + op[i].a+1); //fsubs glob[B] EmitByte(0xd8);EmitByte(0x25);EmitAdr(glob + op[i].b+1); //fstps glob[C] EmitByte(0xd9);EmitByte(0x1d);EmitAdr(glob + op[i].c+1); //flds glob[A] EmitByte(0xd9);EmitByte(0x05);EmitAdr(glob + op[i].a+2); //fsubs glob[B] EmitByte(0xd8);EmitByte(0x25);EmitAdr(glob + op[i].b+2); //fstps glob[C] EmitByte(0xd9);EmitByte(0x1d);EmitAdr(glob + op[i].c+2); break; case OP_MUL_V: //this is actually a dotproduct //flds glob[A] EmitByte(0xd9);EmitByte(0x05);EmitAdr(glob + op[i].a+0); //fmuls glob[B] EmitByte(0xd8);EmitByte(0x0d);EmitAdr(glob + op[i].b+0); //flds glob[A] EmitByte(0xd9);EmitByte(0x05);EmitAdr(glob + op[i].a+1); //fmuls glob[B] EmitByte(0xd8);EmitByte(0x0d);EmitAdr(glob + op[i].b+1); //flds glob[A] EmitByte(0xd9);EmitByte(0x05);EmitAdr(glob + op[i].a+2); //fmuls glob[B] EmitByte(0xd8);EmitByte(0x0d);EmitAdr(glob + op[i].b+2); //faddp EmitByte(0xde);EmitByte(0xc1); //faddp EmitByte(0xde);EmitByte(0xc1); //fstps glob[C] EmitByte(0xd9);EmitByte(0x1d);EmitAdr(glob + op[i].c); break; case OP_EQ_F: case OP_NE_F: case OP_LE_F: case OP_GE_F: case OP_LT_F: case OP_GT_F: //flds glob[A] EmitByte(0xd9);EmitByte(0x05);EmitAdr(glob + op[i].b); //flds glob[B] EmitByte(0xd9);EmitByte(0x05);EmitAdr(glob + op[i].a); //fcomip %st(1),%st EmitByte(0xdf);EmitByte(0xe9); //fstp %st(0) (aka: pop) EmitByte(0xdd);EmitByte(0xd8); j1 = LocalJmp(op[i].op); { STOREF(0.0f, glob + op[i].c); j2 = LocalJmp(OP_GOTO); } { l1 = LocalLoc(); STOREF(1.0f, glob + op[i].c); } l2 = LocalLoc(); LocalJmpLoc(j1,l1); LocalJmpLoc(j2,l2); break; case OP_MUL_FV: case OP_MUL_VF: // { int v; int f; if (op[i].op == OP_MUL_FV) { f = op[i].a; v = op[i].b; } else { v = op[i].a; f = op[i].b; } //flds glob[F] EmitByte(0xd9);EmitByte(0x05);EmitAdr(glob + f); //flds glob[V0] EmitByte(0xd9);EmitByte(0x05);EmitAdr(glob + v+0); //fmul st(1) EmitByte(0xd8);EmitByte(0xc9); //fstps glob[C] EmitByte(0xd9);EmitByte(0x1d);EmitAdr(glob + op[i].c+0); //flds glob[V0] EmitByte(0xd9);EmitByte(0x05);EmitAdr(glob + v+1); //fmul st(1) EmitByte(0xd8);EmitByte(0xc9); //fstps glob[C] EmitByte(0xd9);EmitByte(0x1d);EmitAdr(glob + op[i].c+1); //flds glob[V0] EmitByte(0xd9);EmitByte(0x05);EmitAdr(glob + v+2); //fmul st(1) EmitByte(0xd8);EmitByte(0xc9); //fstps glob[C] EmitByte(0xd9);EmitByte(0x1d);EmitAdr(glob + op[i].c+2); //fstp %st(0) (aka: pop) EmitByte(0xdd);EmitByte(0xd8); } break; case OP_STATE: //externs->stateop(progfuncs, OPA->_float, OPB->function); //push b EmitByte(0xff);EmitByte(0x35);EmitAdr(glob + op[i].b); //push a EmitByte(0xff);EmitByte(0x35);EmitAdr(glob + op[i].a); //push $progfuncs EmitByte(0x68); EmitAdr(progfuncs); //call externs->stateop EmitByte(0xe8); EmitFOffset(externs->stateop, 4); //add $12,%esp EmitByte(0x83); EmitByte(0xc4); EmitByte(0x0c); break; #if 1 /* case OP_NOT_V: //flds 0 //flds glob[A+0] //fcomip %st(1),%st //jne _true //flds glob[A+1] //fcomip %st(1),%st //jne _true //flds glob[A+1] //fcomip %st(1),%st //jne _true //mov 1,C //jmp done //_true: //mov 0,C //done: break; */ case OP_NOT_V: EmitByte(0xcd);EmitByte(op[i].op); printf("QCJIT: instruction %i is not implemented\n", op[i].op); break; #endif case OP_NE_V: case OP_EQ_V: { void *f0, *f1, *f2, *floc; //compare v[0] //flds glob[A] EmitByte(0xd9);EmitByte(0x05);EmitAdr(glob + op[i].a+0); //flds glob[B] EmitByte(0xd9);EmitByte(0x05);EmitAdr(glob + op[i].b+0); //fcomip %st(1),%st EmitByte(0xdf);EmitByte(0xe9); //fstp %st(0) (aka: pop) EmitByte(0xdd);EmitByte(0xd8); /*if the condition is true, don't fail*/ j1 = LocalJmp(op[i].op); { STOREF(0.0f, glob + op[i].c); f0 = LocalJmp(OP_GOTO); } l1 = LocalLoc(); LocalJmpLoc(j1,l1); //compare v[1] //flds glob[A] EmitByte(0xd9);EmitByte(0x05);EmitAdr(glob + op[i].a+1); //flds glob[B] EmitByte(0xd9);EmitByte(0x05);EmitAdr(glob + op[i].b+1); //fcomip %st(1),%st EmitByte(0xdf);EmitByte(0xe9); //fstp %st(0) (aka: pop) EmitByte(0xdd);EmitByte(0xd8); /*if the condition is true, don't fail*/ j1 = LocalJmp(op[i].op); { STOREF(0.0f, glob + op[i].c); f1 = LocalJmp(OP_GOTO); } l1 = LocalLoc(); LocalJmpLoc(j1,l1); //compare v[2] //flds glob[A] EmitByte(0xd9);EmitByte(0x05);EmitAdr(glob + op[i].a+2); //flds glob[B] EmitByte(0xd9);EmitByte(0x05);EmitAdr(glob + op[i].b+2); //fcomip %st(1),%st EmitByte(0xdf);EmitByte(0xe9); //fstp %st(0) (aka: pop) EmitByte(0xdd);EmitByte(0xd8); /*if the condition is true, don't fail*/ j1 = LocalJmp(op[i].op); { STOREF(0.0f, glob + op[i].c); f2 = LocalJmp(OP_GOTO); } l1 = LocalLoc(); LocalJmpLoc(j1,l1); //success! STOREF(1.0f, glob + op[i].c); floc = LocalLoc(); LocalJmpLoc(f0,floc); LocalJmpLoc(f1,floc); LocalJmpLoc(f2,floc); break; } /*fteqcc generates these from reading 'fast arrays', and are part of hexenc extras*/ case OP_FETCH_GBL_F: case OP_FETCH_GBL_S: case OP_FETCH_GBL_E: case OP_FETCH_GBL_FNC: case OP_FETCH_GBL_V: { unsigned int max = ((unsigned int*)glob)[op[i].a-1]; unsigned int base = op[i].a; //flds glob[B] EmitByte(0xd9); EmitByte(0x05);EmitAdr(glob + op[i].b); //fistp ta EmitByte(0xdb); EmitByte(0x1d);EmitAdr(&ta); LOADREG(&ta, REG_EAX) //FIXME: if eax >= $max, abort if (op[i].op == OP_FETCH_GBL_V) { /*scale the index by 3*/ SETREGI(3, REG_EDX) //mul %edx EmitByte(0xf7); EmitByte(0xe2); } //lookup global //mov &glob[base](,%eax,4),%edx EmitByte(0x8b);EmitByte(0x14);EmitByte(0x85);Emit4Byte((unsigned int)(glob + base+0)); STOREREG(REG_EDX, glob + op[i].c+0) if (op[i].op == OP_FETCH_GBL_V) { //mov &glob[base+1](,%eax,4),%edx EmitByte(0x8b);EmitByte(0x14);EmitByte(0x85);Emit4Byte((unsigned int)(glob + base+1)); STOREREG(REG_EDX, glob + op[i].c+1) //mov &glob[base+2](,%eax,4),%edx EmitByte(0x8b);EmitByte(0x14);EmitByte(0x85);Emit4Byte((unsigned int)(glob + base+2)); STOREREG(REG_EDX, glob + op[i].c+2) } break; } /*fteqcc generates these from writing 'fast arrays'*/ case OP_GLOBALADDRESS: LOADREG(glob + op[i].b, REG_EAX); //lea &glob[A](, %eax, 4),%eax EmitByte(0x8d);EmitByte(0x04);EmitByte(0x85);EmitAdr(glob + op[i].b+2); STOREREG(REG_EAX, glob + op[i].c); break; // case OP_BOUNDCHECK: //FIXME: assert b <= a < c break; case OP_CONV_FTOI: //flds glob[A] EmitByte(0xd9); EmitByte(0x05);EmitAdr(glob + op[i].a); //fistp glob[C] EmitByte(0xdb); EmitByte(0x1d);EmitAdr(glob + op[i].c); break; case OP_MUL_I: LOADREG(glob + op[i].a, REG_EAX); //mull glob[C] (arg*eax => edx:eax) EmitByte(0xfc); EmitByte(0x25);EmitAdr(glob + op[i].b); STOREREG(REG_EAX, glob + op[i].c); break; /*other extended opcodes*/ case OP_BITOR_I: LOADREG(glob + op[i].a, REG_EAX) //or %eax,tb EmitByte(0x0b); EmitByte(0x05);EmitAdr(glob + op[i].b); STOREREG(REG_EAX, glob + op[i].c); break; default: { enum qcop_e e = op[i].op; printf("QCJIT: Extended instruction set %i is not supported, not using jit.\n", e); } free(jit->statementjumps); //[MAX_STATEMENTS] free(jit->statementoffsets); //[MAX_STATEMENTS] free(jit->code); free(jit); return NULL; } } FixupJumps(jit); /* most likely want executable memory calls somewhere else more common */ #ifdef _WIN32 { DWORD old; //this memory is on the heap. //this means that we must maintain read/write protection, or libc will crash us VirtualProtect(jit->code, jit->codesize, PAGE_EXECUTE_READWRITE, &old); } #else mprotect(jit->code, jit->codesize, PROT_READ|PROT_EXEC); #endif // externs->WriteFile("jit.x86", jit->code, jit->codesize); return jit; } float foo(float arg) { float f; if (!arg) f = 1; else f = 0; return f; } void PR_EnterJIT(progfuncs_t *progfuncs, struct jitstate *jit, int statement) { #ifdef __GNUC__ //call, it clobbers pretty much everything. asm("call *%0" :: "r"(jit->statementoffsets[statement+1]),"b"(prinst->edicttable):"cc","memory","eax","ecx","edx"); #elif defined(_MSC_VER) void *entry = jit->statementoffsets[statement+1]; void *edicttable = prinst->edicttable; __asm { pushad mov eax,entry mov ebx,edicttable call eax popad } #else #error "Sorry, no idea how to enter assembler safely for your compiler" #endif } #endif