Patch adding Wii support by tueidj, part 1: assembly pragmas

The original patch was communicated to me by Hendricks, but since it didn't
apply cleanly (it's based on r2182) I took the liberty of slightly messing
with it for inclusion into EDuke32.

Info: http://wiibrew.org/wiki/User:Tueidj/Duke3D

This first part (unchanged from the original patch) implements the scaling
arithmetic and miscellaneous pragmas, some of them in PPC assembly and the
rest in C.  Of some interest is the fact that the Wii processor apparently
lacks support for 64-bit integers, so divscale() uses floating-point math.

git-svn-id: https://svn.eduke32.com/eduke32@2621 1a8010ca-5511-0410-912e-c29ae57300e0
This commit is contained in:
helixhorned 2012-05-01 12:37:32 +00:00
parent 226f04ddae
commit 133acaecae
3 changed files with 556 additions and 8 deletions

View file

@ -20,12 +20,32 @@ extern int32_t dmval;
#define dw(x) ((int32_t)(x)) // doubleword cast #define dw(x) ((int32_t)(x)) // doubleword cast
#define by(x) ((uint8_t)(x)) // byte cast #define by(x) ((uint8_t)(x)) // byte cast
#define _scaler(a) \ #ifdef GEKKO
#include <math.h>
// Floating-point stand-in for (eax << ecx) / ebx.
//
// eax is scaled by 2^ecx with ldexp(), divided by ebx in double precision
// and the quotient is converted back to int32_t on return.
// XXX: the quotient is rounded to double precision (a 52-bit stored
// significand) before the conversion, so a loss of precision relative to an
// exact 64-bit integer division is possible.
static inline int32_t divscale(int32_t eax, int32_t ebx, int32_t ecx)
{
    const double scaled = ldexp(eax, ecx);

    return (int32_t)(scaled / ebx);
}
// Generates divscale<a>(eax, ebx): divscale() with the shift count fixed to
// the literal a.
// The closing brace deliberately carries no line-continuation backslash, so
// the macro definition ends here and cannot absorb whatever line follows it.
# define _scaler(a) \
static inline int32_t divscale##a(int32_t eax, int32_t ebx) \
{ \
    return divscale(eax, ebx, a); \
}
#else
// Integer divscale: shifts eax left by ecx and divides the result by ebx.
// The shift count goes through the by() byte cast, the operands through qw()
// and the quotient through the dw() 32-bit cast.
static inline int32_t divscale(int32_t eax, int32_t ebx, int32_t ecx)
{
    const uint8_t shift = by(ecx);

    return dw((qw(eax) << shift) / qw(ebx));
}
# define _scaler(a) \
static inline int32_t divscale##a(int32_t eax, int32_t ebx) \ static inline int32_t divscale##a(int32_t eax, int32_t ebx) \
{ \ { \
return dw((qw(eax) << a) / qw(ebx)); \ return dw((qw(eax) << a) / qw(ebx)); \
} \ } \
#endif
_scaler(1) _scaler(2) _scaler(3) _scaler(4) _scaler(1) _scaler(2) _scaler(3) _scaler(4)
_scaler(5) _scaler(6) _scaler(7) _scaler(8) _scaler(5) _scaler(6) _scaler(7) _scaler(8)
_scaler(9) _scaler(10) _scaler(11) _scaler(12) _scaler(9) _scaler(10) _scaler(11) _scaler(12)
@ -35,17 +55,391 @@ _scaler(21) _scaler(22) _scaler(23) _scaler(24)
_scaler(25) _scaler(26) _scaler(27) _scaler(28) _scaler(25) _scaler(26) _scaler(27) _scaler(28)
_scaler(29) _scaler(30) _scaler(31) _scaler(32) _scaler(29) _scaler(30) _scaler(31) _scaler(32)
static inline int32_t divscale(int32_t eax, int32_t ebx, int32_t ecx) { return dw((qw(eax) << by(ecx)) / qw(ebx)); }
#undef qw #undef qw
#undef dw #undef dw
#undef by #undef by
#undef _scaler #undef _scaler
#if defined(__GNUC__) && defined(__i386__) && !defined(NOASM) #if defined(__GNUC__) && defined(GEKKO)
// GCC Inline Assembler version (PowerPC)
// Squares its argument.  This is a function-like macro, so the argument
// expression is evaluated twice; pass expressions without side effects.
#define sqr(a) ((a)*(a))
int32_t scale(int32_t a, int32_t d, int32_t c);
// Multiplies a by d as a signed 64-bit product and returns the 32-bit window
// of that product that starts c bits above bit 0, i.e. the low word of
// (a*d) >> c.
// The two shift counts passed to the asm are c and 32-c.
// NOTE(review): c is presumably expected to lie in 0..32 -- confirm with callers.
static inline int32_t mulscale(int32_t a, int32_t d, int32_t c)
{
int32_t mullo, mulhi;
__asm__ (
" mullw %0, %2, %3\n"  // mullo = low 32 bits of a*d
" mulhw %1, %2, %3\n"  // mulhi = high 32 bits of the signed product
" srw %0, %0, %4\n"    // mullo >>= c (logical)
" slw %1, %1, %5\n"    // mulhi <<= (32-c)
" or %0, %0, %1\n"     // merge both halves into the result
: "=&r"(mullo), "=&r"(mulhi)
: "r"(a), "r"(d), "r"(c), "r"(32-c)
: "xer"
);
return mullo;
}
// Generates mulscale<x>(a, d): like mulscale() but with the shift count given
// as the compile-time constant x (passed to the asm as an immediate operand).
// The low product word is shifted right by x, then insrwi copies the low x
// bits of the high product word into the top x bits of the result.
#define MULSCALE(x) \
static inline int32_t mulscale##x(int32_t a, int32_t d) \
{ \
int32_t mullo, mulhi; \
__asm__ ( \
" mullw %0, %2, %3\n" /* mullo = low 32 bits of a*d */ \
" mulhw %1, %2, %3\n" /* mulhi = high 32 bits of a*d */ \
" srwi %0, %0, %4\n" /* mullo >>= x (logical) */ \
" insrwi %0, %1, %4, 0\n" /* top x bits of mullo = low x bits of mulhi */ \
: "=&r"(mullo), "=r"(mulhi) \
: "r"(a), "r"(d), "i"(x) \
); \
return mullo; \
}
MULSCALE(1) MULSCALE(2) MULSCALE(3) MULSCALE(4)
MULSCALE(5) MULSCALE(6) MULSCALE(7) MULSCALE(8)
MULSCALE(9) MULSCALE(10) MULSCALE(11) MULSCALE(12)
MULSCALE(13) MULSCALE(14) MULSCALE(15) MULSCALE(16)
MULSCALE(17) MULSCALE(18) MULSCALE(19) MULSCALE(20)
MULSCALE(21) MULSCALE(22) MULSCALE(23) MULSCALE(24)
MULSCALE(25) MULSCALE(26) MULSCALE(27) MULSCALE(28)
MULSCALE(29) MULSCALE(30) MULSCALE(31)
#undef MULSCALE
// Returns the high 32 bits of the signed 64-bit product a*d, i.e.
// (a*d) >> 32.  A single mulhw does the whole job.
static inline int32_t mulscale32(int32_t a, int32_t d)
{
int32_t mulhi;
__asm__ (
" mulhw %0, %1, %2\n"  // mulhi = high word of a*d
: "=r"(mulhi)
: "r"(a), "r"(d)
);
return mulhi;
}
// Computes the 64-bit sum a*d + S*D and returns the 32-bit window of it that
// starts c bits above bit 0, i.e. the low word of (a*d + S*D) >> c.
// The shift counts passed to the asm are c and 32-c.
// NOTE(review): c is presumably expected to lie in 0..32 -- confirm with callers.
static inline int32_t dmulscale(int32_t a, int32_t d, int32_t S, int32_t D, int32_t c)
{
int32_t mulhi, mullo, sumhi, sumlo;
__asm__ (
" mullw %0, %4, %5\n"  // sumlo = low word of a*d
" mulhw %1, %4, %5\n"  // sumhi = high word of a*d
" mullw %2, %6, %7\n"  // mullo = low word of S*D
" mulhw %3, %6, %7\n"  // mulhi = high word of S*D
" addc %0, %0, %2\n"   // add the low words, recording the carry
" adde %1, %1, %3\n"   // add the high words plus that carry
" srw %0, %0, %8\n"    // sumlo >>= c (logical)
" slw %1, %1, %9\n"    // sumhi <<= (32-c)
" or %0, %0, %1\n"     // merge both halves into the result
: "=&r"(sumlo), "=&r"(sumhi), "=&r"(mullo), "=&r"(mulhi)
: "r"(a), "r"(d), "r"(S), "r"(D), "r"(c), "r"(32-c)
: "xer"
);
return sumlo;
}
// Generates dmulscale<x>(a, d, S, D): like dmulscale() but with the shift
// count given as the compile-time constant x (an immediate asm operand).
// The 64-bit sum a*d + S*D is formed with addc/adde; the low word is shifted
// right by x and insrwi copies the low x bits of the high word into the top
// x bits of the result.
#define DMULSCALE(x) \
static inline int32_t dmulscale##x(int32_t a, int32_t d, int32_t S, int32_t D) \
{ \
int32_t mulhi, mullo, sumhi, sumlo; \
__asm__ ( \
" mullw %0, %4, %5\n" \
" mulhw %1, %4, %5\n" \
" mullw %2, %6, %7\n" \
" mulhw %3, %6, %7\n" \
" addc %0, %0, %2\n" \
" adde %1, %1, %3\n" \
" srwi %0, %0, %8\n" \
" insrwi %0, %1, %8, 0\n" \
: "=&r"(sumlo), "=&r"(sumhi), "=&r"(mullo), "=r"(mulhi) \
: "r"(a), "r"(d), "r"(S), "r"(D), "i"(x) \
: "xer" \
); \
return sumlo; \
}
DMULSCALE(1) DMULSCALE(2) DMULSCALE(3) DMULSCALE(4)
DMULSCALE(5) DMULSCALE(6) DMULSCALE(7) DMULSCALE(8)
DMULSCALE(9) DMULSCALE(10) DMULSCALE(11) DMULSCALE(12)
DMULSCALE(13) DMULSCALE(14) DMULSCALE(15) DMULSCALE(16)
DMULSCALE(17) DMULSCALE(18) DMULSCALE(19) DMULSCALE(20)
DMULSCALE(21) DMULSCALE(22) DMULSCALE(23) DMULSCALE(24)
DMULSCALE(25) DMULSCALE(26) DMULSCALE(27) DMULSCALE(28)
DMULSCALE(29) DMULSCALE(30) DMULSCALE(31)
#undef DMULSCALE
// Returns the high 32 bits of the 64-bit sum a*d + S*D, i.e.
// (a*d + S*D) >> 32.  The low words are added with addc and the high words
// with adde so the carry propagates; only the high word (sumhi) is returned.
// The trailing backslashes inside the asm statement are plain line
// continuations left over from the macro variants.
static inline int32_t dmulscale32(int32_t a, int32_t d, int32_t S, int32_t D)
{
int32_t mulhi, mullo, sumhi, sumlo;
__asm__ ( \
" mullw %0, %4, %5\n" \
" mulhw %1, %4, %5\n" \
" mullw %2, %6, %7\n" \
" mulhw %3, %6, %7\n" \
" addc %0, %0, %2\n" \
" adde %1, %1, %3\n" \
: "=&r"(sumlo), "=&r"(sumhi), "=&r"(mullo), "=r"(mulhi)
: "r"(a), "r"(d), "r"(S), "r"(D)
: "xer"
);
return sumhi;
}
// tmulscale only seems to be used in one place...
// Three-term variant with a fixed shift of 11: forms the 64-bit sum
// a*d + b*c + S*D (each product added with addc/adde so carries propagate)
// and returns the low word of that sum shifted right by 11.
// The trailing backslashes inside the asm statement are plain line
// continuations left over from the macro variants.
static inline int32_t tmulscale11(int32_t a, int32_t d, int32_t b, int32_t c, int32_t S, int32_t D)
{
int32_t mulhi, mullo, sumhi, sumlo;
__asm__ (
" mullw %0, %4, %5\n" \
" mulhw %1, %4, %5\n" \
" mullw %2, %6, %7\n" \
" mulhw %3, %6, %7\n" \
" addc %0, %0, %2\n" \
" adde %1, %1, %3\n" \
" mullw %2, %8, %9\n" \
" mulhw %3, %8, %9\n" \
" addc %0, %0, %2\n" \
" adde %1, %1, %3\n" \
" srwi %0, %0, 11\n" \
" insrwi %0, %1, 11, 0\n" \
: "=&r"(sumlo), "=&r"(sumhi), "=&r"(mullo), "=&r"(mulhi)
: "r"(a), "r"(d), "r"(b), "r"(c), "r"(S), "r"(D)
: "xer"
);
return sumlo;
}
// Saturating mulscale: returns (a*b) >> c, clamped to the int32_t range.
//
//   a, b  factors; the product is formed in 64 bits, so it cannot overflow
//   c     right-shift count; must be in 0..63 for the shift to be defined
//         (NOTE(review): callers presumably pass 0..32)
//
// Returns 0x7FFFFFFF when the shifted product is too large, 0x80000000 when
// it is too small, and the exact shifted product otherwise.
//
// This replaces an inline-assembly version whose overflow path produced
// 0x7FFF0000 (or 0x8000FFFF for negative products) minus (32-c) instead of
// the int32_t limits, and which only tested whether the high product word
// shifted by c was 0 or -1, so results whose sign bit disagreed with the
// product's sign were returned unclamped.
static inline int32_t boundmulscale(int32_t a, int32_t b, int32_t c)
{
    // Right-shifting a negative value is an arithmetic shift with GCC, which
    // this section of the file already requires.
    const int64_t scaled = ((int64_t)a * b) >> c;

    if (scaled > 0x7FFFFFFF)
        return 0x7FFFFFFF;
    if (scaled < -0x7FFFFFFF - 1)
        return -0x7FFFFFFF - 1;
    return (int32_t)scaled;
}
// Reads and returns the single byte stored at address d.
static inline char readpixel(void *d)
{
    const char *pixel = (const char *)d;

    return pixel[0];
}
// Writes the single byte a to address d.
static inline void drawpixel(void *d, char a)
{
    char *pixel = (char *)d;

    pixel[0] = a;
}
// Stores the 16-bit value a at address d in little-endian byte order (low
// byte first), independent of the host's byte order.
//
// The former inline assembly used sthbrx (store halfword byte-reversed) but
// passed &a as its source operand.  sthbrx stores the contents of the source
// register, so part of a's address was written instead of a's value.  Two
// byte stores give the intended memory layout and place no alignment
// requirement on d.
static inline void drawpixels(void *d, int16_t a)
{
    uint8_t *dst = (uint8_t *)d;
    const uint16_t value = (uint16_t)a;

    dst[0] = (uint8_t)(value & 0xFFu);
    dst[1] = (uint8_t)(value >> 8);
}
// Stores the 32-bit value a at address d in little-endian byte order (low
// byte first), independent of the host's byte order.
//
// The former inline assembly used stwbrx (store word byte-reversed) but
// passed &a as its source operand.  stwbrx stores the contents of the source
// register, so a's address was written instead of a's value.  Four byte
// stores give the intended memory layout and place no alignment requirement
// on d.
static inline void drawpixelses(void *d, int32_t a)
{
    uint8_t *dst = (uint8_t *)d;
    const uint32_t value = (uint32_t)a;

    dst[0] = (uint8_t)(value & 0xFFu);
    dst[1] = (uint8_t)((value >> 8) & 0xFFu);
    dst[2] = (uint8_t)((value >> 16) & 0xFFu);
    dst[3] = (uint8_t)(value >> 24);
}
void clearbufbyte(void *d, int32_t c, int32_t a);
// Fills c 32-bit words starting at d with the value a.
// A zero fill is handed to clearbufbyte() with the length converted to
// bytes (c<<2); any other value is written word by word.
static inline void clearbuf(void *d, int32_t c, int32_t a)
{
    if (a == 0) {
        clearbufbyte(d, c<<2, 0);
    } else {
        int32_t *word = (int32_t *)d;

        for (; c != 0; c--)
            *word++ = a;
    }
}
// Copies c 32-bit words from s to d, walking both buffers upwards.
static inline void copybuf(void *s, void *d, int32_t c)
{
    int32_t *from = (int32_t *)s;
    int32_t *to = (int32_t *)d;

    for (; c != 0; c--) {
        *to = *from;
        to++;
        from++;
    }
}
// Copies c bytes from s to d, walking both buffers upwards.
static inline void copybufbyte(void *s, void *d, int32_t c)
{
    uint8_t *from = (uint8_t *)s;
    uint8_t *to = (uint8_t *)d;

    for (; c != 0; c--) {
        *to = *from;
        to++;
        from++;
    }
}
// Copies c bytes in reverse order: the source is read downwards starting at
// s while the destination is written upwards starting at d.
static inline void copybufreverse(void *s, void *d, int32_t c)
{
    uint8_t *from = (uint8_t *)s;
    uint8_t *to = (uint8_t *)d;

    for (; c != 0; c--) {
        *to = *from;
        to++;
        from--;
    }
}
// Writes num 32-bit entries to the buffer at address bufptr.  Entry i holds
// (val + i*add) >> 16: val is stepped by add after every store.
// Nothing is written when num is zero or negative.
static inline void qinterpolatedown16(intptr_t bufptr, int32_t num, int32_t val, int32_t add)
{
    int32_t *out = (int32_t *)bufptr;
    int32_t remaining = num;

    while (remaining > 0) {
        *out++ = (val>>16);
        val += add;
        remaining--;
    }
}
// Writes num 16-bit entries to the buffer at address bufptr.  Entry i holds
// (val + i*add) >> 16 converted to int16_t: val is stepped by add after
// every store.  Nothing is written when num is zero or negative.
static inline void qinterpolatedown16short(intptr_t bufptr, int32_t num, int32_t val, int32_t add)
{
    int16_t *out = (int16_t *)bufptr;
    int32_t remaining = num;

    while (remaining > 0) {
        *out++ = (val>>16);
        val += add;
        remaining--;
    }
}
// Returns a*3.
// The multiplication is done on uint32_t: left-shifting a negative signed
// value (as the former (a<<1)+a did) is undefined behaviour, whereas unsigned
// arithmetic wraps modulo 2^32 and gives the same bit pattern.
static inline int32_t mul3(int32_t a)
{
    return (int32_t)((uint32_t)a * 3u);
}
// Returns a*5.
// The multiplication is done on uint32_t: left-shifting a negative signed
// value (as the former (a<<2)+a did) is undefined behaviour, whereas unsigned
// arithmetic wraps modulo 2^32 and gives the same bit pattern.
static inline int32_t mul5(int32_t a)
{
    return (int32_t)((uint32_t)a * 5u);
}
// Returns a*9.
// The multiplication is done on uint32_t: left-shifting a negative signed
// value (as the former (a<<3)+a did) is undefined behaviour, whereas unsigned
// arithmetic wraps modulo 2^32 and gives the same bit pattern.
static inline int32_t mul9(int32_t a)
{
    return (int32_t)((uint32_t)a * 9u);
}
// Branch-free absolute value: returns (a ^ mask) - mask, where mask is a's
// sign replicated across all 32 bits (0 for a >= 0, -1 for a < 0).
// For a == INT32_MIN the subtraction wraps and INT32_MIN comes back.
static inline int32_t klabs(int32_t a)
{
int32_t mask;
__asm__ (
" srawi %0, %1, 31\n"  // mask = a >> 31 (arithmetic)
" xor %1, %0, %1\n"    // a ^= mask
" subf %1, %0, %1\n"   // a -= mask
: "=&r"(mask), "+r"(a)
:
: "xer"
);
return a;
}
// Branch-free sign function: returns -1 for a < 0, 0 for a == 0 and 1 for
// a > 0.  The result is ((uint32_t)(-a) >> 31) | (a >> 31): the first term
// is 1 when -a is negative, the second is -1 when a is negative.
static inline int32_t ksgn(int32_t a)
{
int32_t s, t;
__asm__ (
" neg %1, %2\n"        // s = -a
" srawi %0, %2, 31\n"  // t = a >> 31 (arithmetic): 0 or -1
" srwi %1, %1, 31\n"   // s = sign bit of -a: 0 or 1
" or %1, %1, %0\n"     // s |= t
: "=r"(t), "=&r"(s)
: "r"(a)
: "xer"
);
return s;
}
// Exchanges the single bytes stored at a and b.
static inline void swapchar(void *a, void *b)
{
    char *lhs = (char *)a;
    char *rhs = (char *)b;
    const char saved = *lhs;

    *lhs = *rhs;
    *rhs = saved;
}
// Performs two byte swaps: a[0] with b[0], then a[1] with b[s].
static inline void swapchar2(void *a, void *b, int32_t s)
{
    char *first = (char *)a;
    char *second = (char *)b;

    swapchar(first, second);
    swapchar(first + 1, second + s);
}
// Exchanges the 16-bit values stored at a and b.
static inline void swapshort(void *a, void *b)
{
    int16_t *lhs = (int16_t *)a;
    int16_t *rhs = (int16_t *)b;
    const int16_t saved = *lhs;

    *lhs = *rhs;
    *rhs = saved;
}
// Exchanges the 32-bit values stored at a and b.
static inline void swaplong(void *a, void *b)
{
    int32_t *lhs = (int32_t *)a;
    int32_t *rhs = (int32_t *)b;
    const int32_t saved = *lhs;

    *lhs = *rhs;
    *rhs = saved;
}
// Exchanges the 64-bit objects stored at a and b; each one is moved as a
// single double.
static inline void swap64bit(void *a, void *b)
{
    double *lhs = (double *)a;
    double *rhs = (double *)b;
    const double saved = *lhs;

    *lhs = *rhs;
    *rhs = saved;
}
// Unsigned division (divwu) of a by b: returns the quotient and stores the
// remainder, computed as a - quotient*b, in the global dmval.
// NOTE(review): b == 0 is not checked here -- confirm what callers guarantee.
static inline int32_t divmod(int32_t a, int32_t b)
{
int32_t div;
__asm__ (
" divwu %0, %2, %3\n"  // div = a / b (unsigned)
" mullw %1, %0, %3\n"  // dmval = div * b
" subf %1, %1, %2\n"   // dmval = a - dmval, i.e. the remainder
: "=&r"(div), "=&r"(dmval)
: "r"(a), "r"(b)
);
return div;
}
// Counterpart of divmod(): unsigned division (divwu) of a by b that returns
// the remainder, computed as a - quotient*b, and stores the quotient in the
// global dmval.
// NOTE(review): b == 0 is not checked here -- confirm what callers guarantee.
static inline int32_t moddiv(int32_t a, int32_t b)
{
int32_t mod;
__asm__ (
" divwu %0, %2, %3\n"  // dmval = a / b (unsigned)
" mullw %1, %0, %3\n"  // mod = dmval * b
" subf %1, %1, %2\n"   // mod = a - mod, i.e. the remainder
: "=&r"(dmval), "=&r"(mod)
: "r"(a), "r"(b)
);
return mod;
}
// Returns whichever of a and b is smaller when both are compared as
// unsigned 32-bit values; b is returned when they are equal.
static inline int32_t umin(int32_t a, int32_t b)
{
    return ((uint32_t)a < (uint32_t)b) ? a : b;
}
// Returns whichever of a and b is larger when both are compared as
// unsigned 32-bit values; a is returned when they are equal.
static inline int32_t umax(int32_t a, int32_t b)
{
    return ((uint32_t)a < (uint32_t)b) ? b : a;
}
// Returns the smaller of a and b under signed comparison; b is returned
// when they are equal.
static inline int32_t kmin(int32_t a, int32_t b)
{
    return (a < b) ? a : b;
}
// Returns the larger of a and b under signed comparison; a is returned
// when they are equal.
static inline int32_t kmax(int32_t a, int32_t b)
{
    return (a < b) ? b : a;
}
#elif defined(__GNUC__) && defined(__i386__) && !defined(NOASM)
// //
// GCC Inline Assembler version // GCC Inline Assembler version (x86)
// //
//{{{ //{{{

View file

@ -466,8 +466,12 @@ void drawslab(int32_t dx, int32_t v, int32_t dy, int32_t vi, intptr_t vptr, intp
while (dy > 0) while (dy > 0)
{ {
for (x=0; x<dx; x++) *(char *)(p+x) = gpal[(int32_t)(*(char *)((v>>16)+vptr))]; char c = gpal[(int32_t)(*(char *)((v>>16)+vptr))];
p += bpl; v += vi; dy--; for (x=0; x < dx; x++)
((char*)p)[x] = c;
p += bpl;
v += vi;
dy--;
} }
} }

View file

@ -12,7 +12,157 @@
int32_t dmval; int32_t dmval;
#if defined(__GNUC__) && defined(__i386__) && !defined(NOASM) // NOASM #if defined(__GNUC__) && defined(GEKKO)
// scale(a, d, c): 64-bit product a*d divided by c, written entirely in
// assembly ("naked" style: the asm assumes no prolog/epilog around it).
//
// The asm body addresses registers by number: it reads r3, r4 and r5,
// leaves its result in r3 and returns with blr from inside the asm.
// On division by zero or when the quotient does not fit, it returns
// 0x80000000 if cr1's "lt" bit is set at that point and 0x7FFFFFFF otherwise.
//
// NOTE(review): the arguments are presumably expected in r3/r4/r5 per the
// PowerPC calling convention -- confirm for the target ABI.
// NOTE(review): the function carries no "naked" attribute, has no C return
// statement, declares no clobbers (it writes r0, r3-r7, ctr, cr0 and cr1)
// and defines global labels, so it depends on the compiler emitting no
// frame setup and never duplicating or inlining this body -- confirm.
int32_t scale(int32_t a, int32_t d, int32_t c)
{
// return ((int64_t)a * d) / c;
__asm__ __volatile__ (
" mullw 6, 3, 4\n"          // r6 = low word of r3*r4
" mulhw 4, 3, 4\n"          // r4 = high word of the signed product
" mr 3, 6\n"                // r3 = low word
" srawi. 0, 5, 31\n"        // r0 = sign mask of r5; cr0 reflects its sign
" cmpwi cr1, 4, 0\n"        // cr1 reflects the sign of the product
" crxor 7, 0, 4\n"          // CR bit 7 = cr0.lt ^ cr1.lt: quotient is negative
" xor 5, 0, 5\n"
" subf. 5, 0, 5\n"          // r5 = |r5|; cr0 tests it against zero
" beq DivByZero\n"
" bge cr1, Div64Common\n"   // product already non-negative
" subfic 3, 3, 0\n"         // negate the 64-bit product r4:r3 ...
" subfze 4, 4\n"            // ... carrying into the high word
"Div64Common:\n"
" cmplw 4, 5\n"             // unsigned compare: high word vs. divisor
" cntlzw 6, 5\n"            // r6 = leading zeros of the divisor
" xor 4, 4, 3\n"            // this xor/rotate/shift/xor group appears to
" slw 5, 5, 6\n"            // shift divisor and 64-bit dividend left by r6
" rotlw 4, 4, 6\n"
" slw 3, 3, 6\n"
" li 7, 2\n"                // loop count: two passes
" xor 4, 4, 3\n"
" bge DivOverflow\n"        // high word >= divisor: quotient cannot fit
" mtctr 7\n"
"Div64Compute:\n"           // each pass appears to produce 16 quotient bits
" srwi 6, 5, 16\n"          // r6 = upper half of the divisor
" divwu 7, 4, 6\n"          // r7 = estimated quotient digit
" mullw 6, 7, 6\n"
" subf 4, 6, 4\n"
" slwi 4, 4, 16\n"
" inslwi 4, 3, 16, 16\n"    // bring the next 16 dividend bits into r4
" slwi 3, 3, 16\n"
" clrlwi 6, 5, 16\n"        // r6 = lower half of the divisor
" mullw 6, 7, 6\n"
" subfc 4, 6, 4\n"
" subfe. 6, 6, 6\n"         // r6 = 0 or -1 depending on the borrow
" add 3, 3, 7\n"            // accumulate the digit into r3
" bge Div64Done\n"
"Div64Correct:\n"           // digit was too large: add divisor back, digit-1
" addc 4, 4, 5\n"
" addze. 6, 6\n"
" subi 3, 3, 1\n"
" blt Div64Correct\n"
"Div64Done:\n"
" bdnz Div64Compute\n"
" cmpwi 3, 0\n"
" bso cr1, Div64QuotientNeg\n"  // CR bit 7 set: result must be negated
" blt DivOverflow\n"        // positive result with the sign bit set
" blr\n"
"Div64QuotientNeg:\n"
" neg. 3, 3\n"
" blelr\n"                  // return if the negated result is <= 0
"DivOverflow:\n"
" cror 4, 7, 7\n"           // cr1.lt = CR bit 7 (the quotient's sign)
"DivByZero:\n"
" lis 3, 0x8000\n"          // r3 = 0x80000000
" bltlr cr1\n"              // return it when cr1.lt is set
" subi 3, 3, 1\n"           // otherwise r3 = 0x7FFFFFFF
" blr\n"
);
}
// Fills c bytes starting at d from the 32-bit pattern a.
//
// a == 0: zero fill in three phases -- (1) ordinary stores up to the next
// 32-byte boundary, (2) one dcbz per whole 32-byte block, (3) a byte loop
// for whatever is left.
// a != 0: an asm byte loop stores the low byte of a and rotates a right by
// 8 bits after each store, so the four bytes of a repeat through the
// buffer, least significant byte first.
void clearbufbyte(void *d, int32_t c, int32_t a)
{
if (a==0) {
uint8_t *dd = (uint8_t*)d;
// Number of bytes from d up to the next 32-byte boundary (0 if aligned).
// NOTE(review): the pointer is cast to int32_t, which presumes 32-bit
// pointers -- confirm for the target.
int32_t align = (32 - (int32_t)d) & 31;
// When fewer than `align` bytes are requested this step is skipped; c is
// then below 32, so no dcbz is issued and the byte loop does all the work.
if (align && c >= align) {
uint32_t izero = 0;
double fzero = 0;
c -= align;
// One store per set bit of align, narrowest first; each store leaves dd
// aligned for the next wider one.
if (align&1) {
*dd = izero;
dd += 1;
}
if (align&2) {
*(uint16_t*)dd = izero;
dd += 2;
}
if (align&4) {
*(uint32_t*)dd = izero;
dd += 4;
}
// 8-byte zero stores are done through a double.
if (align&8) {
*(double*)dd = fzero;
dd += 8;
}
if (align&16) {
*(double*)dd = fzero;
*(double*)(dd+8) = fzero;
dd += 16;
}
}
// From here on `align` is reused as the number of whole 32-byte blocks.
align = c >> 5;
while (align) {
// dcbz zeroes the data cache block at dd; the code steps 32 bytes per
// dcbz, i.e. it assumes a 32-byte cache block.
__asm__ (
" dcbz 0, %0\n"
" addi %0, %0, 32\n"
: "+r"(dd)
:
: "memory"
);
align--;
}
// Remaining 0..31 bytes.
if ((c &= 31)) {
while (c--) {
*dd++ = 0;
}
}
return;
}
// Non-zero fill: d is moved to the end of the buffer and c is negated, so
// the index counts up from -c to 0.
__asm__ __volatile__ (
" add %1, %1, %2\n"     // d += c (one past the end)
" neg. %2, %2\n"        // c = -c; cr0 tests for zero
" beq 2f\n"             // nothing to do for c == 0
"1:\n"
" stbx %0, %1, %2\n"    // store the low byte of a at d + c
" addic. %2, %2, 1\n"   // c++, cr0 tests for zero
" rotrwi %0, %0, 8\n"   // rotate a right by 8: next byte of the pattern
" bne 1b\n"
"2:\n"
: "+r"(a), "+b"(d), "+r"(c)
:
: "cc", "xer", "memory"
);
}
#elif defined(__GNUC__) && defined(__i386__) && !defined(NOASM) // NOASM
// //
// GCC Inline Assembler version // GCC Inline Assembler version