From 133acaecaeccdb03292391c87307327d8479d9d8 Mon Sep 17 00:00:00 2001
From: helixhorned
Date: Tue, 1 May 2012 12:37:32 +0000
Subject: [PATCH] Patch adding Wii support by tueidj, part 1: assembly pragmas

The original patch was communicated to me by Hendricks, but since it didn't
apply cleanly (it's based on r2182) I took the liberty of slightly messing
with it for inclusion into EDuke32.

Info: http://wiibrew.org/wiki/User:Tueidj/Duke3D

This first part (which wasn't changed from the original patch) implements
scaling arithmetic and miscellaneous pragmas, some in PPC assembly and a
part of them in C. Of some interest is the fact that the Wii processor
apparently lacks support for 64-bit integers, so divscale() uses
floating-point math.

git-svn-id: https://svn.eduke32.com/eduke32@2621 1a8010ca-5511-0410-912e-c29ae57300e0
---
 polymer/eduke32/build/include/pragmas.h | 404 +++++++++++++++++++++++-
 polymer/eduke32/build/src/a-c.c         |   8 +-
 polymer/eduke32/build/src/pragmas.c     | 152 ++++++++-
 3 files changed, 556 insertions(+), 8 deletions(-)

diff --git a/polymer/eduke32/build/include/pragmas.h b/polymer/eduke32/build/include/pragmas.h
index de3d01246..badf91155 100644
--- a/polymer/eduke32/build/include/pragmas.h
+++ b/polymer/eduke32/build/include/pragmas.h
@@ -20,12 +20,32 @@ extern int32_t dmval;
 #define dw(x) ((int32_t)(x))  // doubleword cast
 #define by(x) ((uint8_t)(x))  // byte cast
 
-#define _scaler(a) \
+#ifdef GEKKO
+#include <math.h>
+static inline int32_t divscale(int32_t eax, int32_t ebx, int32_t ecx)
+{
+    // XXX: potential loss of precision? double has only 52 bits in the
+    // significand, after all...
+    return ldexp(eax, ecx) / ebx;
+}
+
+# define _scaler(a) \
+static inline int32_t divscale##a(int32_t eax, int32_t ebx) \
+{ \
+    return divscale(eax, ebx, a); \
+} \
+
+#else
+static inline int32_t divscale(int32_t eax, int32_t ebx, int32_t ecx) { return dw((qw(eax) << by(ecx)) / qw(ebx)); }
+
+# define _scaler(a) \
 static inline int32_t divscale##a(int32_t eax, int32_t ebx) \
 { \
     return dw((qw(eax) << a) / qw(ebx)); \
 } \
 
+#endif
+
 _scaler(1)  _scaler(2)  _scaler(3)  _scaler(4)
 _scaler(5)  _scaler(6)  _scaler(7)  _scaler(8)
 _scaler(9)  _scaler(10) _scaler(11) _scaler(12)
@@ -35,17 +55,391 @@ _scaler(21) _scaler(22) _scaler(23) _scaler(24)
 _scaler(25) _scaler(26) _scaler(27) _scaler(28)
 _scaler(29) _scaler(30) _scaler(31) _scaler(32)
 
-static inline int32_t divscale(int32_t eax, int32_t ebx, int32_t ecx) { return dw((qw(eax) << by(ecx)) / qw(ebx)); }
-
 #undef qw
 #undef dw
 #undef by
 #undef _scaler
 
-#if defined(__GNUC__) && defined(__i386__) && !defined(NOASM)
+#if defined(__GNUC__) && defined(GEKKO)
+
+// GCC Inline Assembler version (PowerPC)
+
+#define sqr(a) ((a)*(a))
+
+int32_t scale(int32_t a, int32_t d, int32_t c);
+
+static inline int32_t mulscale(int32_t a, int32_t d, int32_t c)
+{
+    int32_t mullo, mulhi;
+    __asm__ (
+    " mullw  %0, %2, %3\n"
+    " mulhw  %1, %2, %3\n"
+    " srw    %0, %0, %4\n"
+    " slw    %1, %1, %5\n"
+    " or     %0, %0, %1\n"
+    : "=&r"(mullo), "=&r"(mulhi)
+    : "r"(a), "r"(d), "r"(c), "r"(32-c)
+    : "xer"
+    );
+    return mullo;
+}
+
+#define MULSCALE(x) \
+static inline int32_t mulscale##x(int32_t a, int32_t d) \
+{ \
+    int32_t mullo, mulhi; \
+    __asm__ ( \
+    " mullw   %0, %2, %3\n" \
+    " mulhw   %1, %2, %3\n" \
+    " srwi    %0, %0, %4\n" \
+    " insrwi  %0, %1, %4, 0\n" \
+    : "=&r"(mullo), "=r"(mulhi) \
+    : "r"(a), "r"(d), "i"(x) \
+    ); \
+    return mullo; \
+}
+
+MULSCALE(1)  MULSCALE(2)  MULSCALE(3)  MULSCALE(4)
+MULSCALE(5)  MULSCALE(6)  MULSCALE(7)  MULSCALE(8)
+MULSCALE(9)  MULSCALE(10) MULSCALE(11) MULSCALE(12)
+MULSCALE(13) MULSCALE(14) MULSCALE(15) MULSCALE(16)
+MULSCALE(17) MULSCALE(18) MULSCALE(19) MULSCALE(20)
+MULSCALE(21) MULSCALE(22) MULSCALE(23) MULSCALE(24)
+MULSCALE(25) MULSCALE(26) MULSCALE(27) MULSCALE(28)
+MULSCALE(29) MULSCALE(30) MULSCALE(31)
+#undef MULSCALE
+
+static inline int32_t mulscale32(int32_t a, int32_t d)
+{
+    int32_t mulhi;
+    __asm__ (
+    " mulhw  %0, %1, %2\n"
+    : "=r"(mulhi)
+    : "r"(a), "r"(d)
+    );
+    return mulhi;
+}
+
+static inline int32_t dmulscale(int32_t a, int32_t d, int32_t S, int32_t D, int32_t c)
+{
+    int32_t mulhi, mullo, sumhi, sumlo;
+    __asm__ (
+    " mullw  %0, %4, %5\n"
+    " mulhw  %1, %4, %5\n"
+    " mullw  %2, %6, %7\n"
+    " mulhw  %3, %6, %7\n"
+    " addc   %0, %0, %2\n"
+    " adde   %1, %1, %3\n"
+    " srw    %0, %0, %8\n"
+    " slw    %1, %1, %9\n"
+    " or     %0, %0, %1\n"
+    : "=&r"(sumlo), "=&r"(sumhi), "=&r"(mullo), "=&r"(mulhi)
+    : "r"(a), "r"(d), "r"(S), "r"(D), "r"(c), "r"(32-c)
+    : "xer"
+    );
+    return sumlo;
+}
+
+#define DMULSCALE(x) \
+static inline int32_t dmulscale##x(int32_t a, int32_t d, int32_t S, int32_t D) \
+{ \
+    int32_t mulhi, mullo, sumhi, sumlo; \
+    __asm__ ( \
+    " mullw   %0, %4, %5\n" \
+    " mulhw   %1, %4, %5\n" \
+    " mullw   %2, %6, %7\n" \
+    " mulhw   %3, %6, %7\n" \
+    " addc    %0, %0, %2\n" \
+    " adde    %1, %1, %3\n" \
+    " srwi    %0, %0, %8\n" \
+    " insrwi  %0, %1, %8, 0\n" \
+    : "=&r"(sumlo), "=&r"(sumhi), "=&r"(mullo), "=r"(mulhi) \
+    : "r"(a), "r"(d), "r"(S), "r"(D), "i"(x) \
+    : "xer" \
+    ); \
+    return sumlo; \
+}
+
+DMULSCALE(1)  DMULSCALE(2)  DMULSCALE(3)  DMULSCALE(4)
+DMULSCALE(5)  DMULSCALE(6)  DMULSCALE(7)  DMULSCALE(8)
+DMULSCALE(9)  DMULSCALE(10) DMULSCALE(11) DMULSCALE(12)
+DMULSCALE(13) DMULSCALE(14) DMULSCALE(15) DMULSCALE(16)
+DMULSCALE(17) DMULSCALE(18) DMULSCALE(19) DMULSCALE(20)
+DMULSCALE(21) DMULSCALE(22) DMULSCALE(23) DMULSCALE(24)
+DMULSCALE(25) DMULSCALE(26) DMULSCALE(27) DMULSCALE(28)
+DMULSCALE(29) DMULSCALE(30) DMULSCALE(31)
+#undef DMULSCALE
+
+static inline int32_t dmulscale32(int32_t a, int32_t d, int32_t S, int32_t D)
+{
+    int32_t mulhi, mullo, sumhi, sumlo;
+    __asm__ ( \
+    " mullw  %0, %4, %5\n" \
+    " mulhw  %1, %4, %5\n" \
+    " mullw  %2, %6, %7\n" \
+    " mulhw  %3, %6, %7\n" \
+    " addc   %0, %0, %2\n" \
+    " adde   %1, %1, %3\n" \
+    : "=&r"(sumlo), "=&r"(sumhi), "=&r"(mullo), "=r"(mulhi)
+    : "r"(a), "r"(d), "r"(S), "r"(D)
+    : "xer"
+    );
+    return sumhi;
+}
+
+// tmulscale only seems to be used in one place...
+static inline int32_t tmulscale11(int32_t a, int32_t d, int32_t b, int32_t c, int32_t S, int32_t D)
+{
+    int32_t mulhi, mullo, sumhi, sumlo;
+    __asm__ (
+    " mullw   %0, %4, %5\n" \
+    " mulhw   %1, %4, %5\n" \
+    " mullw   %2, %6, %7\n" \
+    " mulhw   %3, %6, %7\n" \
+    " addc    %0, %0, %2\n" \
+    " adde    %1, %1, %3\n" \
+    " mullw   %2, %8, %9\n" \
+    " mulhw   %3, %8, %9\n" \
+    " addc    %0, %0, %2\n" \
+    " adde    %1, %1, %3\n" \
+    " srwi    %0, %0, 11\n" \
+    " insrwi  %0, %1, 11, 0\n" \
+    : "=&r"(sumlo), "=&r"(sumhi), "=&r"(mullo), "=&r"(mulhi)
+    : "r"(a), "r"(d), "r"(b), "r"(c), "r"(S), "r"(D)
+    : "xer"
+    );
+    return sumlo;
+}
+
+static inline int32_t boundmulscale(int32_t a, int32_t b, int32_t c)
+{
+    int32_t mulhi, mullo, mask;
+    __asm__ (
+    " mulhw  %1, %3, %4\n"
+    " mullw  %0, %3, %4\n"
+    " sraw.  %2, %1, %5\n"
%2, %1, %5\n" + " beq 1f\n" + " cmpwi %2, -1\n" + " beq+ 1f\n" + " lis %0, 0x7FFF\n" + " srawi %2, %2, 31\n" + " xor %0, %0, %2\n" + " subf %0, %6, %0\n" + " b 2f\n" + "1: \n" + " srw %0, %0, %5\n" + " slw %1, %1, %6\n" + " or %0, %0, %1\n" + "2: \n" + : "=&r"(mullo), "=&r"(mulhi), "=&r"(mask) + : "r"(a), "r"(b), "r"(c), "r"(32-c) + : "cc", "xer" + ); + return mullo; +} + +static inline char readpixel(void *d) +{ + return *(char*)d; +} + +static inline void drawpixel(void *d, char a) +{ + *(char*)d = a; +} + +static inline void drawpixels(void *d, int16_t a) +{ + __asm__ ( + " sthbrx %0, 0, %1\n" + : + : "r"(&a), "r"(d) + : "memory" + ); +} + +static inline void drawpixelses(void *d, int32_t a) +{ + __asm__ ( + " stwbrx %0, 0, %1\n" + : + : "r"(&a), "r"(d) + : "memory" + ); +} + +void clearbufbyte(void *d, int32_t c, int32_t a); + +static inline void clearbuf(void *d, int32_t c, int32_t a) +{ + int32_t *p = (int32_t*)d; + if (a==0) { + clearbufbyte(d, c<<2, 0); + return; + } + while (c--) { + *p++ = a; + } +} + +static inline void copybuf(void *s, void *d, int32_t c) +{ + int32_t *p = (int32_t*)s, *q = (int32_t*)d; + while (c--) { + *q++ = *p++; + } +} + +static inline void copybufbyte(void *s, void *d, int32_t c) +{ + uint8_t *src = (uint8_t*)s, *dst = (uint8_t*)d; + while (c--) { + *dst++ = *src++; + } +} + +static inline void copybufreverse(void *s, void *d, int32_t c) +{ + uint8_t *src = (uint8_t*)s, *dst = (uint8_t*)d; + while (c--) { + *dst++ = *src--; + } +} + +static inline void qinterpolatedown16(intptr_t bufptr, int32_t num, int32_t val, int32_t add) +{ + int i; + int32_t *lptr = (int32_t *)bufptr; + for (i=0; i>16); + val += add; + } +} + +static inline void qinterpolatedown16short(intptr_t bufptr, int32_t num, int32_t val, int32_t add) +{ + int i; + int16_t *sptr = (int16_t *)bufptr; + for (i=0; i>16); + val += add; + } +} + +static inline int32_t mul3(int32_t a) +{ + return (a<<1)+a; +} + +static inline int32_t mul5(int32_t a) +{ + return (a<<2)+a; +} + +static inline int32_t mul9(int32_t a) +{ + return (a<<3)+a; +} + +static inline int32_t klabs(int32_t a) +{ + int32_t mask; + __asm__ ( + " srawi %0, %1, 31\n" + " xor %1, %0, %1\n" + " subf %1, %0, %1\n" + : "=&r"(mask), "+r"(a) + : + : "xer" + ); + return a; +} + +static inline int32_t ksgn(int32_t a) +{ + int32_t s, t; + __asm__ ( + " neg %1, %2\n" + " srawi %0, %2, 31\n" + " srwi %1, %1, 31\n" + " or %1, %1, %0\n" + : "=r"(t), "=&r"(s) + : "r"(a) + : "xer" + ); + return s; +} + +static inline void swapchar(void *a, void *b) +{ + char t = *(char*)a; + *(char*)a = *(char*)b; + *(char*)b = t; +} + +static inline void swapchar2(void *a, void *b, int32_t s) +{ + swapchar(a, b); + swapchar((char*)a+1, (char*)b+s); +} + +static inline void swapshort(void *a, void *b) +{ + int16_t t = *(int16_t*)a; + *(int16_t*)a = *(int16_t*)b; + *(int16_t*)b = t; +} + +static inline void swaplong(void *a, void *b) +{ + int32_t t = *(int32_t*)a; + *(int32_t*)a = *(int32_t*)b; + *(int32_t*)b = t; +} + +static inline void swap64bit(void *a, void *b) +{ + double t = *(double*)a; + *(double*)a = *(double*)b; + *(double*)b = t; +} + +static inline int32_t divmod(int32_t a, int32_t b) +{ + int32_t div; + __asm__ ( + " divwu %0, %2, %3\n" + " mullw %1, %0, %3\n" + " subf %1, %1, %2\n" + : "=&r"(div), "=&r"(dmval) + : "r"(a), "r"(b) + ); + return div; +} + +static inline int32_t moddiv(int32_t a, int32_t b) +{ + int32_t mod; + __asm__ ( + " divwu %0, %2, %3\n" + " mullw %1, %0, %3\n" + " subf %1, %1, %2\n" + : "=&r"(dmval), "=&r"(mod) + : "r"(a), "r"(b) 
+    );
+    return mod;
+}
+
+static inline int32_t umin(int32_t a, int32_t b) { if ((uint32_t)a < (uint32_t)b) return a; return b; }
+static inline int32_t umax(int32_t a, int32_t b) { if ((uint32_t)a < (uint32_t)b) return b; return a; }
+static inline int32_t kmin(int32_t a, int32_t b) { if ((int32_t)a < (int32_t)b) return a; return b; }
+static inline int32_t kmax(int32_t a, int32_t b) { if ((int32_t)a < (int32_t)b) return b; return a; }
+
+
+#elif defined(__GNUC__) && defined(__i386__) && !defined(NOASM)
 
 //
-// GCC Inline Assembler version
+// GCC Inline Assembler version (x86)
 //
 
 //{{{
diff --git a/polymer/eduke32/build/src/a-c.c b/polymer/eduke32/build/src/a-c.c
index a55b014a1..6c2ac2a69 100644
--- a/polymer/eduke32/build/src/a-c.c
+++ b/polymer/eduke32/build/src/a-c.c
@@ -466,8 +466,12 @@ void drawslab(int32_t dx, int32_t v, int32_t dy, int32_t vi, intptr_t vptr, intp
 
     while (dy > 0)
     {
-        for (x=0; x<dx; x++) ((char *)p)[x] = gpal[(int32_t)(*(char *)((v>>16)+vptr))];
-        p += bpl; v += vi; dy--;
+        char c = gpal[(int32_t)(*(char *)((v>>16)+vptr))];
+        for (x=0; x < dx; x++)
+            ((char*)p)[x] = c;
+        p += bpl;
+        v += vi;
+        dy--;
     }
 }
 
diff --git a/polymer/eduke32/build/src/pragmas.c b/polymer/eduke32/build/src/pragmas.c
index 114b724fd..f88d2f175 100644
--- a/polymer/eduke32/build/src/pragmas.c
+++ b/polymer/eduke32/build/src/pragmas.c
@@ -12,7 +12,157 @@
 
 int32_t dmval;
 
-#if defined(__GNUC__) && defined(__i386__) && !defined(NOASM)  // NOASM
+#if defined(__GNUC__) && defined(GEKKO)
+
+// naked function (no prolog/epilog)
+int32_t scale(int32_t a, int32_t d, int32_t c)
+{
+//  return ((int64_t)a * d) / c;
+
+    __asm__ __volatile__ (
+    " mullw   6, 3, 4\n"
+    " mulhw   4, 3, 4\n"
+    " mr      3, 6\n"
+
+    " srawi.  0, 5, 31\n"
+    " cmpwi   cr1, 4, 0\n"
+    " crxor   7, 0, 4\n"
+
+    " xor     5, 0, 5\n"
+    " subf.   5, 0, 5\n"
+
+    " beq     DivByZero\n"
+    " bge     cr1, Div64Common\n"
+
+    " subfic  3, 3, 0\n"
+    " subfze  4, 4\n"
+
+    "Div64Common:\n"
+    " cmplw   4, 5\n"
+
+    " cntlzw  6, 5\n"
+    " xor     4, 4, 3\n"
+    " slw     5, 5, 6\n"
+    " rotlw   4, 4, 6\n"
+    " slw     3, 3, 6\n"
+    " li      7, 2\n"
+    " xor     4, 4, 3\n"
+
+    " bge     DivOverflow\n"
+    " mtctr   7\n"
+
+    "Div64Compute:\n"
+    " srwi    6, 5, 16\n"
+    " divwu   7, 4, 6\n"
+    " mullw   6, 7, 6\n"
+    " subf    4, 6, 4\n"
+    " slwi    4, 4, 16\n"
+    " inslwi  4, 3, 16, 16\n"
+    " slwi    3, 3, 16\n"
+    " clrlwi  6, 5, 16\n"
+    " mullw   6, 7, 6\n"
+    " subfc   4, 6, 4\n"
+    " subfe.  6, 6, 6\n"
+    " add     3, 3, 7\n"
+    " bge     Div64Done\n"
+    "Div64Correct:\n"
+    " addc    4, 4, 5\n"
+    " addze.  6, 6\n"
+    " subi    3, 3, 1\n"
+    " blt     Div64Correct\n"
+
+    "Div64Done:\n"
+    " bdnz    Div64Compute\n"
+
+    " cmpwi   3, 0\n"
+    " bso     cr1, Div64QuotientNeg\n"
+
+    " blt     DivOverflow\n"
+    " blr\n"
+
+    "Div64QuotientNeg:\n"
+    " neg.    3, 3\n"
3, 3\n" + " blelr\n" + + "DivOverflow:\n" + " cror 4, 7, 7\n" + + "DivByZero:\n" + " lis 3, 0x8000\n" + " bltlr cr1\n" + " subi 3, 3, 1\n" + " blr\n" + ); +} + +void clearbufbyte(void *d, int32_t c, int32_t a) +{ + if (a==0) { + uint8_t *dd = (uint8_t*)d; + int32_t align = (32 - (int32_t)d) & 31; + + if (align && c >= align) { + uint32_t izero = 0; + double fzero = 0; + c -= align; + + if (align&1) { + *dd = izero; + dd += 1; + } + if (align&2) { + *(uint16_t*)dd = izero; + dd += 2; + } + if (align&4) { + *(uint32_t*)dd = izero; + dd += 4; + } + if (align&8) { + *(double*)dd = fzero; + dd += 8; + } + if (align&16) { + *(double*)dd = fzero; + *(double*)(dd+8) = fzero; + dd += 16; + } + } + align = c >> 5; + while (align) { + __asm__ ( + " dcbz 0, %0\n" + " addi %0, %0, 32\n" + : "+r"(dd) + : + : "memory" + ); + align--; + } + if ((c &= 31)) { + while (c--) { + *dd++ = 0; + } + } + return; + } + __asm__ __volatile__ ( + " add %1, %1, %2\n" + " neg. %2, %2\n" + " beq 2f\n" + "1:\n" + " stbx %0, %1, %2\n" + " addic. %2, %2, 1\n" + " rotrwi %0, %0, 8\n" + " bne 1b\n" + "2:\n" + : "+r"(a), "+b"(d), "+r"(c) + : + : "cc", "xer", "memory" + ); +} + +#elif defined(__GNUC__) && defined(__i386__) && !defined(NOASM) // NOASM // // GCC Inline Assembler version