mirror of
https://github.com/ZDoom/Raze.git
synced 2025-01-20 15:40:56 +00:00
c405a13bbd
For a 1680x1050 scene standing directly in front of a mirror, this increases FPS from ~90 to ~95 for me. git-svn-id: https://svn.eduke32.com/eduke32@4363 1a8010ca-5511-0410-912e-c29ae57300e0
447 lines
10 KiB
C
447 lines
10 KiB
C
// Function-wrapped Watcom pragmas
|
|
// by Jonathon Fowler (jf@jonof.id.au)
|
|
//
|
|
// These functions represent some of the more longer-winded pragmas
|
|
// from the original pragmas.h wrapped into functions for easier
|
|
// use since many jumps and whatnot make it harder to write macro-
|
|
// inline versions. I'll eventually convert these to macro-inline
|
|
// equivalents. --Jonathon
|
|
|
|
//#include "pragmas.h"
|
|
#include "compat.h"
|
|
|
|
int32_t dmval;
|
|
|
|
#if defined(__GNUC__) && defined(GEKKO)
|
|
|
|
// naked function (no prolog/epilog)
|
|
// FIXME: this function produces unused parameter warnings and a missing return warning
|
|
int32_t scale(int32_t a, int32_t d, int32_t c)
|
|
{
|
|
// return ((int64_t)a * d) / c;
|
|
|
|
__asm__ __volatile__ (
|
|
" mullw 6, 3, 4\n"
|
|
" mulhw 4, 3, 4\n"
|
|
" mr 3, 6\n"
|
|
|
|
" srawi. 0, 5, 31\n"
|
|
" cmpwi cr1, 4, 0\n"
|
|
" crxor 7, 0, 4\n"
|
|
|
|
" xor 5, 0, 5\n"
|
|
" subf. 5, 0, 5\n"
|
|
|
|
" beq DivByZero\n"
|
|
" bge cr1, Div64Common\n"
|
|
|
|
" subfic 3, 3, 0\n"
|
|
" subfze 4, 4\n"
|
|
|
|
"Div64Common:\n"
|
|
" cmplw 4, 5\n"
|
|
|
|
" cntlzw 6, 5\n"
|
|
" xor 4, 4, 3\n"
|
|
" slw 5, 5, 6\n"
|
|
" rotlw 4, 4, 6\n"
|
|
" slw 3, 3, 6\n"
|
|
" li 7, 2\n"
|
|
" xor 4, 4, 3\n"
|
|
|
|
" bge DivOverflow\n"
|
|
" mtctr 7\n"
|
|
|
|
"Div64Compute:\n"
|
|
" srwi 6, 5, 16\n"
|
|
" divwu 7, 4, 6\n"
|
|
" mullw 6, 7, 6\n"
|
|
" subf 4, 6, 4\n"
|
|
" slwi 4, 4, 16\n"
|
|
" inslwi 4, 3, 16, 16\n"
|
|
" slwi 3, 3, 16\n"
|
|
" clrlwi 6, 5, 16\n"
|
|
" mullw 6, 7, 6\n"
|
|
" subfc 4, 6, 4\n"
|
|
" subfe. 6, 6, 6\n"
|
|
" add 3, 3, 7\n"
|
|
" bge Div64Done\n"
|
|
"Div64Correct:\n"
|
|
" addc 4, 4, 5\n"
|
|
" addze. 6, 6\n"
|
|
" subi 3, 3, 1\n"
|
|
" blt Div64Correct\n"
|
|
|
|
"Div64Done:\n"
|
|
" bdnz Div64Compute\n"
|
|
|
|
" cmpwi 3, 0\n"
|
|
" bso cr1, Div64QuotientNeg\n"
|
|
|
|
" blt DivOverflow\n"
|
|
" blr\n"
|
|
|
|
"Div64QuotientNeg:\n"
|
|
" neg. 3, 3\n"
|
|
" blelr\n"
|
|
|
|
"DivOverflow:\n"
|
|
" cror 4, 7, 7\n"
|
|
|
|
"DivByZero:\n"
|
|
" lis 3, 0x8000\n"
|
|
" bltlr cr1\n"
|
|
" subi 3, 3, 1\n"
|
|
" blr\n"
|
|
);
|
|
}
|
|
|
|
void clearbufbyte(void *d, int32_t c, int32_t a)
|
|
{
|
|
if (a==0) {
|
|
uint8_t *dd = (uint8_t*)d;
|
|
int32_t align = (32 - (int32_t)d) & 31;
|
|
|
|
if (align && c >= align) {
|
|
uint32_t izero = 0;
|
|
double fzero = 0;
|
|
c -= align;
|
|
|
|
if (align&1) {
|
|
*dd = izero;
|
|
dd += 1;
|
|
}
|
|
if (align&2) {
|
|
*(uint16_t*)dd = izero;
|
|
dd += 2;
|
|
}
|
|
if (align&4) {
|
|
*(uint32_t*)dd = izero;
|
|
dd += 4;
|
|
}
|
|
if (align&8) {
|
|
*(double*)dd = fzero;
|
|
dd += 8;
|
|
}
|
|
if (align&16) {
|
|
*(double*)dd = fzero;
|
|
*(double*)(dd+8) = fzero;
|
|
dd += 16;
|
|
}
|
|
}
|
|
align = c >> 5;
|
|
while (align) {
|
|
__asm__ (
|
|
" dcbz 0, %0\n"
|
|
" addi %0, %0, 32\n"
|
|
: "+r"(dd)
|
|
:
|
|
: "memory"
|
|
);
|
|
align--;
|
|
}
|
|
if ((c &= 31)) {
|
|
while (c--) {
|
|
*dd++ = 0;
|
|
}
|
|
}
|
|
return;
|
|
}
|
|
__asm__ __volatile__ (
|
|
" add %1, %1, %2\n"
|
|
" neg. %2, %2\n"
|
|
" beq 2f\n"
|
|
"1:\n"
|
|
" stbx %0, %1, %2\n"
|
|
" addic. %2, %2, 1\n"
|
|
" rotrwi %0, %0, 8\n"
|
|
" bne 1b\n"
|
|
"2:\n"
|
|
: "+r"(a), "+b"(d), "+r"(c)
|
|
:
|
|
: "cc", "xer", "memory"
|
|
);
|
|
}
|
|
|
|
#elif defined(__GNUC__) && defined(__i386__) && !defined(NOASM) // NOASM
|
|
|
|
//
|
|
// GCC Inline Assembler version
|
|
//
|
|
|
|
#define ASM __asm__ __volatile__
|
|
|
|
|
|
int32_t boundmulscale(int32_t a, int32_t b, int32_t c)
|
|
{
|
|
ASM(
|
|
"imull %%ebx\n\t"
|
|
"movl %%edx, %%ebx\n\t" // mov ebx, edx
|
|
"shrdl %%cl, %%edx, %%eax\n\t" // mov eax, edx, cl
|
|
"sarl %%cl, %%edx\n\t" // sar edx, cl
|
|
"xorl %%eax, %%edx\n\t" // xor edx, eax
|
|
"js 0f\n\t" // js checkit
|
|
"xorl %%eax, %%edx\n\t" // xor edx, eax
|
|
"jz 1f\n\t" // js skipboundit
|
|
"cmpl $0xffffffff, %%edx\n\t" // cmp edx, 0xffffffff
|
|
"je 1f\n\t" // je skipboundit
|
|
"0:\n\t" // checkit:
|
|
"movl %%ebx, %%eax\n\t" // mov eax, ebx
|
|
"sarl $31, %%eax\n\t" // sar eax, 31
|
|
"xorl $0x7fffffff, %%eax\n\t" // xor eax, 0x7fffffff
|
|
"1:" // skipboundit:
|
|
: "+a"(a), "+b"(b), "+c"(c) // input eax ebx ecx
|
|
:
|
|
: "edx", "cc"
|
|
);
|
|
return a;
|
|
}
|
|
|
|
|
|
void clearbufbyte(void *D, int32_t c, int32_t a)
|
|
{
|
|
ASM(
|
|
"cmpl $4, %%ecx\n\t"
|
|
"jae 1f\n\t"
|
|
"testb $1, %%cl\n\t"
|
|
"jz 0f\n\t" // jz preskip
|
|
"stosb\n\t"
|
|
"0:\n\t" // preskip:
|
|
"shrl $1, %%ecx\n\t"
|
|
"rep\n\t"
|
|
"stosw\n\t"
|
|
"jmp 5f\n\t" // jmp endit
|
|
"1:\n\t" // intcopy:
|
|
"testl $1, %%edi\n\t"
|
|
"jz 2f\n\t" // jz skip1
|
|
"stosb\n\t"
|
|
"decl %%ecx\n\t"
|
|
"2:\n\t" // skip1:
|
|
"testl $2, %%edi\n\t"
|
|
"jz 3f\n\t" // jz skip2
|
|
"stosw\n\t"
|
|
"subl $2, %%ecx\n\t"
|
|
"3:\n\t" // skip2:
|
|
"movl %%ecx, %%ebx\n\t"
|
|
"shrl $2, %%ecx\n\t"
|
|
"rep\n\t"
|
|
"stosl\n\t"
|
|
"testb $2, %%bl\n\t"
|
|
"jz 4f\n\t" // jz skip3
|
|
"stosw\n\t"
|
|
"4:\n\t" // skip3:
|
|
"testb $1, %%bl\n\t"
|
|
"jz 5f\n\t" // jz endit
|
|
"stosb\n\t"
|
|
"5:" // endit
|
|
: "+D"(D), "+c"(c), "+a"(a) :
|
|
: "ebx", "memory", "cc"
|
|
);
|
|
}
|
|
|
|
void copybufbyte(const void *S, void *D, int32_t c)
|
|
{
|
|
ASM(
|
|
"cmpl $4, %%ecx\n\t" // cmp ecx, 4
|
|
"jae 1f\n\t"
|
|
"testb $1, %%cl\n\t" // test cl, 1
|
|
"jz 0f\n\t"
|
|
"movsb\n\t"
|
|
"0:\n\t" // preskip:
|
|
"shrl $1, %%ecx\n\t" // shr ecx, 1
|
|
"rep\n\t"
|
|
"movsw\n\t"
|
|
"jmp 5f\n\t"
|
|
"1:\n\t" // intcopy:
|
|
"testl $1, %%edi\n\t" // test edi, 1
|
|
"jz 2f\n\t"
|
|
"movsb\n\t"
|
|
"decl %%ecx\n\t"
|
|
"2:\n\t" // skip1:
|
|
"testl $2, %%edi\n\t" // test edi, 2
|
|
"jz 3f\n\t"
|
|
"movsw\n\t"
|
|
"subl $2, %%ecx\n\t" // sub ecx, 2
|
|
"3:\n\t" // skip2:
|
|
"movl %%ecx, %%ebx\n\t" // mov ebx, ecx
|
|
"shrl $2, %%ecx\n\t" // shr ecx ,2
|
|
"rep\n\t"
|
|
"movsl\n\t"
|
|
"testb $2, %%bl\n\t" // test bl, 2
|
|
"jz 4f\n\t"
|
|
"movsw\n\t"
|
|
"4:\n\t" // skip3:
|
|
"testb $1, %%bl\n\t" // test bl, 1
|
|
"jz 5f\n\t"
|
|
"movsb\n\t"
|
|
"5:" // endit:
|
|
: "+c"(c), "+S"(S), "+D"(D) :
|
|
: "ebx", "memory", "cc"
|
|
);
|
|
}
|
|
|
|
void copybufreverse(const void *S, void *D, int32_t c)
|
|
{
|
|
ASM(
|
|
"shrl $1, %%ecx\n\t"
|
|
"jnc 0f\n\t" // jnc skipit1
|
|
"movb (%%esi), %%al\n\t"
|
|
"decl %%esi\n\t"
|
|
"movb %%al, (%%edi)\n\t"
|
|
"incl %%edi\n\t"
|
|
"0:\n\t" // skipit1:
|
|
"shrl $1, %%ecx\n\t"
|
|
"jnc 1f\n\t" // jnc skipit2
|
|
"movw -1(%%esi), %%ax\n\t"
|
|
"subl $2, %%esi\n\t"
|
|
"rorw $8, %%ax\n\t"
|
|
"movw %%ax, (%%edi)\n\t"
|
|
"addl $2, %%edi\n\t"
|
|
"1:\n\t" // skipit2
|
|
"testl %%ecx, %%ecx\n\t"
|
|
"jz 3f\n\t" // jz endloop
|
|
"2:\n\t" // begloop
|
|
"movl -3(%%esi), %%eax\n\t"
|
|
"subl $4, %%esi\n\t"
|
|
"bswapl %%eax\n\t"
|
|
"movl %%eax, (%%edi)\n\t"
|
|
"addl $4, %%edi\n\t"
|
|
"decl %%ecx\n\t"
|
|
"jnz 2b\n\t" // jnz begloop
|
|
"3:"
|
|
: "+S"(S), "+D"(D), "+c"(c) :
|
|
: "eax", "memory", "cc"
|
|
);
|
|
}
|
|
|
|
#elif defined(_MSC_VER) && !defined(NOASM) // __GNUC__ && __i386__
|
|
|
|
//
|
|
// Microsoft C Inline Assembler version
|
|
//
|
|
|
|
#else // _MSC_VER
|
|
|
|
//
|
|
// Generic C version
|
|
//
|
|
|
|
void qinterpolatedown16(intptr_t bufptr, int32_t num, int32_t val, int32_t add)
|
|
{
|
|
// gee, I wonder who could have provided this...
|
|
int32_t i, *lptr = (int32_t *)bufptr;
|
|
for (i=0; i<num; i++) { lptr[i] = (val>>16); val += add; }
|
|
}
|
|
|
|
void qinterpolatedown16short(intptr_t bufptr, int32_t num, int32_t val, int32_t add)
|
|
{
|
|
// ...maybe the same person who provided this too?
|
|
int32_t i; int16_t *sptr = (int16_t *)bufptr;
|
|
for (i=0; i<num; i++) { sptr[i] = (int16_t)(val>>16); val += add; }
|
|
}
|
|
|
|
void clearbuf(void *d, int32_t c, int32_t a)
|
|
{
|
|
int32_t *p = (int32_t *)d;
|
|
|
|
while ((c--) > 0) *(p++) = a;
|
|
}
|
|
|
|
void copybuf(const void *s, void *d, int32_t c)
|
|
{
|
|
const int32_t *p = (const int32_t *)s;
|
|
int32_t *q = (int32_t *)d;
|
|
|
|
while ((c--) > 0) *(q++) = *(p++);
|
|
}
|
|
|
|
void swapbuf4(void *a, void *b, int32_t c)
|
|
{
|
|
int32_t *p = (int32_t *)a, *q = (int32_t *)b;
|
|
int32_t x, y;
|
|
while ((c--) > 0)
|
|
{
|
|
x = *q;
|
|
y = *p;
|
|
*(q++) = y;
|
|
*(p++) = x;
|
|
}
|
|
}
|
|
|
|
void clearbufbyte(void *D, int32_t c, int32_t a)
|
|
{
|
|
// Cringe City
|
|
char *p = (char *)D;
|
|
int32_t m[4] = { 0xffl,0xff00l,0xff0000l,(int32_t)0xff000000l };
|
|
int32_t n[4] = { 0,8,16,24 };
|
|
int32_t z=0;
|
|
while ((c--) > 0)
|
|
{
|
|
*(p++) = (uint8_t)((a & m[z])>>n[z]);
|
|
z=(z+1)&3;
|
|
}
|
|
}
|
|
|
|
void copybufbyte(const void *S, void *D, int32_t c)
|
|
{
|
|
const char *p = (const char *)S;
|
|
char *q = (char *)D;
|
|
|
|
while ((c--) > 0) *(q++) = *(p++);
|
|
}
|
|
|
|
|
|
// copybufreverse() is a special case: use the assembly version for GCC on x86
|
|
// *and* x86_64, and the C version otherwise.
|
|
// XXX: we don't honor NOASM in the x86_64 case.
|
|
|
|
#if defined(__GNUC__) && defined(__x86_64__)
|
|
// NOTE: Almost CODEDUP from x86 GCC assembly version, except that
|
|
// - %%esi -> %%rsi
|
|
// - %%edi -> %%rdi
|
|
// - (dec,inc,sub,add)l suffix removed where necessary
|
|
void copybufreverse(const void *S, void *D, int32_t c)
|
|
{
|
|
__asm__ __volatile__(
|
|
"shrl $1, %%ecx\n\t"
|
|
"jnc 0f\n\t" // jnc skipit1
|
|
"movb (%%rsi), %%al\n\t"
|
|
"dec %%rsi\n\t"
|
|
"movb %%al, (%%rdi)\n\t"
|
|
"inc %%rdi\n\t"
|
|
"0:\n\t" // skipit1:
|
|
"shrl $1, %%ecx\n\t"
|
|
"jnc 1f\n\t" // jnc skipit2
|
|
"movw -1(%%rsi), %%ax\n\t"
|
|
"sub $2, %%rsi\n\t"
|
|
"rorw $8, %%ax\n\t"
|
|
"movw %%ax, (%%rdi)\n\t"
|
|
"add $2, %%rdi\n\t"
|
|
"1:\n\t" // skipit2
|
|
"testl %%ecx, %%ecx\n\t"
|
|
"jz 3f\n\t" // jz endloop
|
|
"2:\n\t" // begloop
|
|
"movl -3(%%rsi), %%eax\n\t"
|
|
"sub $4, %%rsi\n\t"
|
|
"bswapl %%eax\n\t"
|
|
"movl %%eax, (%%rdi)\n\t"
|
|
"add $4, %%rdi\n\t"
|
|
"decl %%ecx\n\t"
|
|
"jnz 2b\n\t" // jnz begloop
|
|
"3:"
|
|
: "+S"(S), "+D"(D), "+c"(c) :
|
|
: "eax", "memory", "cc"
|
|
);
|
|
}
|
|
#else
|
|
void copybufreverse(const void *S, void *D, int32_t c)
|
|
{
|
|
const char *p = (const char *)S;
|
|
char *q = (char *)D;
|
|
|
|
while ((c--) > 0) *(q++) = *(p--);
|
|
}
|
|
#endif
|
|
|
|
|
|
#endif
|