diff --git a/polymer/eduke32/build/include/pragmas.h b/polymer/eduke32/build/include/pragmas.h index 455ded179..e54e2f6c7 100644 --- a/polymer/eduke32/build/include/pragmas.h +++ b/polymer/eduke32/build/include/pragmas.h @@ -11,6 +11,15 @@ extern "C" { #include +#define PRAGMA_FUNCS _scaler(1) _scaler(2) _scaler(3) _scaler(4)\ +_scaler(5) _scaler(6) _scaler(7) _scaler(8)\ +_scaler(9) _scaler(10) _scaler(11) _scaler(12)\ +_scaler(13) _scaler(14) _scaler(15) _scaler(16)\ +_scaler(17) _scaler(18) _scaler(19) _scaler(20)\ +_scaler(21) _scaler(22) _scaler(23) _scaler(24)\ +_scaler(25) _scaler(26) _scaler(27) _scaler(28)\ +_scaler(29) _scaler(30) _scaler(31) + extern int32_t dmval; // break the C version of divscale out from the others @@ -18,6 +27,7 @@ extern int32_t dmval; #define qw(x) ((int64_t)(x)) // quadword cast #define dw(x) ((int32_t)(x)) // doubleword cast +#define wo(x) ((int16_t)(x)) // word cast #define by(x) ((uint8_t)(x)) // byte cast // XXX: Only for testing on x86. Don't use from outside; it doesn't account for @@ -39,2209 +49,66 @@ static inline int32_t divscale(int32_t eax, int32_t ebx, int32_t ecx) } \ #else -static inline int32_t divscale(int32_t eax, int32_t ebx, int32_t ecx) { return dw((qw(eax) << by(ecx)) / qw(ebx)); } +static inline int32_t divscale(int32_t eax, int32_t ebx, int32_t ecx) { return dw((qw(eax) << by(ecx)) / ebx); } # define _scaler(a) \ static inline int32_t divscale##a(int32_t eax, int32_t ebx) \ { \ - return dw((qw(eax) << a) / qw(ebx)); \ + return dw((qw(eax) << by(a)) / ebx); \ } \ #endif -_scaler(1) _scaler(2) _scaler(3) _scaler(4) -_scaler(5) _scaler(6) _scaler(7) _scaler(8) -_scaler(9) _scaler(10) _scaler(11) _scaler(12) -_scaler(13) _scaler(14) _scaler(15) _scaler(16) -_scaler(17) _scaler(18) _scaler(19) _scaler(20) -_scaler(21) _scaler(22) _scaler(23) _scaler(24) -_scaler(25) _scaler(26) _scaler(27) _scaler(28) -_scaler(29) _scaler(30) _scaler(31) _scaler(32) +PRAGMA_FUNCS _scaler(32) -#undef qw -#undef dw -#undef by 
#undef _scaler #endif // !defined USE_ASM_DIVSCALE #if defined(__GNUC__) && defined(GEKKO) // GCC Inline Assembler version (PowerPC) - -#define sqr(a) ((a)*(a)) - -int32_t scale(int32_t a, int32_t d, int32_t c); - -static inline int32_t mulscale(int32_t a, int32_t d, int32_t c) -{ - int32_t mullo, mulhi; - __asm__ ( - " mullw %0, %2, %3\n" - " mulhw %1, %2, %3\n" - " srw %0, %0, %4\n" - " slw %1, %1, %5\n" - " or %0, %0, %1\n" - : "=&r"(mullo), "=&r"(mulhi) - : "r"(a), "r"(d), "r"(c), "r"(32-c) - : "xer" - ); - return mullo; -} - -#define MULSCALE(x) \ -static inline int32_t mulscale##x(int32_t a, int32_t d) \ -{ \ - int32_t mullo, mulhi; \ - __asm__ ( \ - " mullw %0, %2, %3\n" \ - " mulhw %1, %2, %3\n" \ - " srwi %0, %0, %4\n" \ - " insrwi %0, %1, %4, 0\n" \ - : "=&r"(mullo), "=r"(mulhi) \ - : "r"(a), "r"(d), "i"(x) \ - ); \ - return mullo; \ -} - -MULSCALE(1) MULSCALE(2) MULSCALE(3) MULSCALE(4) -MULSCALE(5) MULSCALE(6) MULSCALE(7) MULSCALE(8) -MULSCALE(9) MULSCALE(10) MULSCALE(11) MULSCALE(12) -MULSCALE(13) MULSCALE(14) MULSCALE(15) MULSCALE(16) -MULSCALE(17) MULSCALE(18) MULSCALE(19) MULSCALE(20) -MULSCALE(21) MULSCALE(22) MULSCALE(23) MULSCALE(24) -MULSCALE(25) MULSCALE(26) MULSCALE(27) MULSCALE(28) -MULSCALE(29) MULSCALE(30) MULSCALE(31) -#undef MULSCALE - -static inline int32_t mulscale32(int32_t a, int32_t d) -{ - int32_t mulhi; - __asm__ ( - " mulhw %0, %1, %2\n" - : "=r"(mulhi) - : "r"(a), "r"(d) - ); - return mulhi; -} - -static inline int32_t dmulscale(int32_t a, int32_t d, int32_t S, int32_t D, int32_t c) -{ - int32_t mulhi, mullo, sumhi, sumlo; - __asm__ ( - " mullw %0, %4, %5\n" - " mulhw %1, %4, %5\n" - " mullw %2, %6, %7\n" - " mulhw %3, %6, %7\n" - " addc %0, %0, %2\n" - " adde %1, %1, %3\n" - " srw %0, %0, %8\n" - " slw %1, %1, %9\n" - " or %0, %0, %1\n" - : "=&r"(sumlo), "=&r"(sumhi), "=&r"(mullo), "=&r"(mulhi) - : "r"(a), "r"(d), "r"(S), "r"(D), "r"(c), "r"(32-c) - : "xer" - ); - return sumlo; -} - -#define DMULSCALE(x) \ -static inline int32_t 
dmulscale##x(int32_t a, int32_t d, int32_t S, int32_t D) \ -{ \ - int32_t mulhi, mullo, sumhi, sumlo; \ - __asm__ ( \ - " mullw %0, %4, %5\n" \ - " mulhw %1, %4, %5\n" \ - " mullw %2, %6, %7\n" \ - " mulhw %3, %6, %7\n" \ - " addc %0, %0, %2\n" \ - " adde %1, %1, %3\n" \ - " srwi %0, %0, %8\n" \ - " insrwi %0, %1, %8, 0\n" \ - : "=&r"(sumlo), "=&r"(sumhi), "=&r"(mullo), "=r"(mulhi) \ - : "r"(a), "r"(d), "r"(S), "r"(D), "i"(x) \ - : "xer" \ - ); \ - return sumlo; \ -} - -DMULSCALE(1) DMULSCALE(2) DMULSCALE(3) DMULSCALE(4) -DMULSCALE(5) DMULSCALE(6) DMULSCALE(7) DMULSCALE(8) -DMULSCALE(9) DMULSCALE(10) DMULSCALE(11) DMULSCALE(12) -DMULSCALE(13) DMULSCALE(14) DMULSCALE(15) DMULSCALE(16) -DMULSCALE(17) DMULSCALE(18) DMULSCALE(19) DMULSCALE(20) -DMULSCALE(21) DMULSCALE(22) DMULSCALE(23) DMULSCALE(24) -DMULSCALE(25) DMULSCALE(26) DMULSCALE(27) DMULSCALE(28) -DMULSCALE(29) DMULSCALE(30) DMULSCALE(31) -#undef DMULSCALE - -static inline int32_t dmulscale32(int32_t a, int32_t d, int32_t S, int32_t D) -{ - int32_t mulhi, mullo, sumhi, sumlo; - __asm__ ( \ - " mullw %0, %4, %5\n" \ - " mulhw %1, %4, %5\n" \ - " mullw %2, %6, %7\n" \ - " mulhw %3, %6, %7\n" \ - " addc %0, %0, %2\n" \ - " adde %1, %1, %3\n" \ - : "=&r"(sumlo), "=&r"(sumhi), "=&r"(mullo), "=r"(mulhi) - : "r"(a), "r"(d), "r"(S), "r"(D) - : "xer" - ); - return sumhi; -} - -// tmulscale only seems to be used in one place... 
-static inline int32_t tmulscale11(int32_t a, int32_t d, int32_t b, int32_t c, int32_t S, int32_t D) -{ - int32_t mulhi, mullo, sumhi, sumlo; - __asm__ ( - " mullw %0, %4, %5\n" \ - " mulhw %1, %4, %5\n" \ - " mullw %2, %6, %7\n" \ - " mulhw %3, %6, %7\n" \ - " addc %0, %0, %2\n" \ - " adde %1, %1, %3\n" \ - " mullw %2, %8, %9\n" \ - " mulhw %3, %8, %9\n" \ - " addc %0, %0, %2\n" \ - " adde %1, %1, %3\n" \ - " srwi %0, %0, 11\n" \ - " insrwi %0, %1, 11, 0\n" \ - : "=&r"(sumlo), "=&r"(sumhi), "=&r"(mullo), "=&r"(mulhi) - : "r"(a), "r"(d), "r"(b), "r"(c), "r"(S), "r"(D) - : "xer" - ); - return sumlo; -} - -static inline int32_t boundmulscale(int32_t a, int32_t b, int32_t c) -{ - int32_t mulhi, mullo, mask; - __asm__ ( - " mulhw %1, %3, %4\n" - " mullw %0, %3, %4\n" - " sraw. %2, %1, %5\n" - " beq 1f\n" - " cmpwi %2, -1\n" - " beq+ 1f\n" - " lis %0, 0x7FFF\n" - " srawi %2, %2, 31\n" - " xor %0, %0, %2\n" - " subf %0, %6, %0\n" - " b 2f\n" - "1: \n" - " srw %0, %0, %5\n" - " slw %1, %1, %6\n" - " or %0, %0, %1\n" - "2: \n" - : "=&r"(mullo), "=&r"(mulhi), "=&r"(mask) - : "r"(a), "r"(b), "r"(c), "r"(32-c) - : "cc", "xer" - ); - return mullo; -} - -static inline char readpixel(void *d) -{ - return *(char*)d; -} - -static inline void drawpixel(void *d, char a) -{ - *(char*)d = a; -} - -static inline void drawpixels(void *d, int16_t a) -{ - __asm__ ( - " sthbrx %0, 0, %1\n" - : - : "r"(&a), "r"(d) - : "memory" - ); -} - -static inline void drawpixelses(void *d, int32_t a) -{ - __asm__ ( - " stwbrx %0, 0, %1\n" - : - : "r"(&a), "r"(d) - : "memory" - ); -} - -void clearbufbyte(void *d, int32_t c, int32_t a); - -static inline void clearbuf(void *d, int32_t c, int32_t a) -{ - int32_t *p = (int32_t*)d; - if (a==0) { - clearbufbyte(d, c<<2, 0); - return; - } - while (c--) { - *p++ = a; - } -} - -static inline void copybuf(void *s, void *d, int32_t c) -{ - int32_t *p = (int32_t*)s, *q = (int32_t*)d; - while (c--) { - *q++ = *p++; - } -} - -static inline void copybufbyte(void *s, 
void *d, int32_t c) -{ - uint8_t *src = (uint8_t*)s, *dst = (uint8_t*)d; - while (c--) { - *dst++ = *src++; - } -} - -static inline void copybufreverse(void *s, void *d, int32_t c) -{ - uint8_t *src = (uint8_t*)s, *dst = (uint8_t*)d; - while (c--) { - *dst++ = *src--; - } -} - -static inline void qinterpolatedown16(intptr_t bufptr, int32_t num, int32_t val, int32_t add) -{ - int i; - int32_t *lptr = (int32_t *)bufptr; - for (i=0; i>16); - val += add; - } -} - -static inline void qinterpolatedown16short(intptr_t bufptr, int32_t num, int32_t val, int32_t add) -{ - int i; - int16_t *sptr = (int16_t *)bufptr; - for (i=0; i>16); - val += add; - } -} - -static inline int32_t mul3(int32_t a) -{ - return (a<<1)+a; -} - -static inline int32_t mul5(int32_t a) -{ - return (a<<2)+a; -} - -static inline int32_t mul9(int32_t a) -{ - return (a<<3)+a; -} - -static inline int32_t klabs(int32_t a) -{ - int32_t mask; - __asm__ ( - " srawi %0, %1, 31\n" - " xor %1, %0, %1\n" - " subf %1, %0, %1\n" - : "=&r"(mask), "+r"(a) - : - : "xer" - ); - return a; -} - -static inline int32_t ksgn(int32_t a) -{ - int32_t s, t; - __asm__ ( - " neg %1, %2\n" - " srawi %0, %2, 31\n" - " srwi %1, %1, 31\n" - " or %1, %1, %0\n" - : "=r"(t), "=&r"(s) - : "r"(a) - : "xer" - ); - return s; -} - -static inline void swapchar(void *a, void *b) -{ - char t = *(char*)a; - *(char*)a = *(char*)b; - *(char*)b = t; -} - -static inline void swapchar2(void *a, void *b, int32_t s) -{ - swapchar(a, b); - swapchar((char*)a+1, (char*)b+s); -} - -static inline void swapshort(void *a, void *b) -{ - int16_t t = *(int16_t*)a; - *(int16_t*)a = *(int16_t*)b; - *(int16_t*)b = t; -} - -static inline void swaplong(void *a, void *b) -{ - int32_t t = *(int32_t*)a; - *(int32_t*)a = *(int32_t*)b; - *(int32_t*)b = t; -} - -static inline void swap64bit(void *a, void *b) -{ - double t = *(double*)a; - *(double*)a = *(double*)b; - *(double*)b = t; -} - -static inline int32_t divmod(int32_t a, int32_t b) -{ - int32_t div; - __asm__ ( - " 
divwu %0, %2, %3\n" - " mullw %1, %0, %3\n" - " subf %1, %1, %2\n" - : "=&r"(div), "=&r"(dmval) - : "r"(a), "r"(b) - ); - return div; -} - -static inline int32_t moddiv(int32_t a, int32_t b) -{ - int32_t mod; - __asm__ ( - " divwu %0, %2, %3\n" - " mullw %1, %0, %3\n" - " subf %1, %1, %2\n" - : "=&r"(dmval), "=&r"(mod) - : "r"(a), "r"(b) - ); - return mod; -} - -static inline int32_t umin(int32_t a, int32_t b) { if ((uint32_t)a < (uint32_t)b) return a; return b; } -static inline int32_t umax(int32_t a, int32_t b) { if ((uint32_t)a < (uint32_t)b) return b; return a; } -static inline int32_t kmin(int32_t a, int32_t b) { if ((int32_t)a < (int32_t)b) return a; return b; } -static inline int32_t kmax(int32_t a, int32_t b) { if ((int32_t)a < (int32_t)b) return b; return a; } - +#include "pragmas_ppc.h" #elif defined(__GNUC__) && defined(__i386__) && !defined(NOASM) -// // GCC Inline Assembler version (x86) -// +#include "pragmas_x86_gcc.h" -//{{{ - -#ifndef UNDERSCORES -#define _DMVAL "dmval" -#else -#define _DMVAL "_dmval" -#endif - - -// maybe one day I'll make these into macros -int32_t boundmulscale(int32_t a, int32_t b, int32_t c); -void clearbufbyte(void *D, int32_t c, int32_t a); -void copybufbyte(const void *S, void *D, int32_t c); -void copybufreverse(const void *S, void *D, int32_t c); - - -#ifdef NO_GCC_BUILTINS -#define sqr(a) \ - ({ int32_t __a=(a); \ - __asm__ __volatile__ ("imull %0, %0" \ - : "=q" (__a) \ - : "0" (__a) \ - : "cc"); \ - __a; }) -#else -#define sqr(a) __builtin_sqr(a) -#endif - -#define scale(a,d,c) \ - ({ int32_t __a=(a), __d=(d), __c=(c); \ - __asm__ __volatile__ ("imull %%edx; idivl %%ecx" \ - : "=a" (__a), "=d" (__d) \ - : "0" (__a), "1" (__d), "c" (__c) : "cc"); \ - __a; }) - -#define mulscale(a,d,c) \ - ({ int32_t __a=(a), __d=(d), __c=(c); \ - __asm__ __volatile__ ("imull %%edx; shrdl %%cl, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d) \ - : "a" (__a), "d" (__d), "c" (__c) : "cc"); \ - __a; }) -#define mulscale1(a,d) \ - ({ int32_t 
__a=(a), __d=(d); \ - __asm__ __volatile__ ("imull %%edx; shrdl $1, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d) \ - : "a" (__a), "d" (__d) : "cc"); \ - __a; }) -#define mulscale2(a,d) \ - ({ int32_t __a=(a), __d=(d); \ - __asm__ __volatile__ ("imull %%edx; shrdl $2, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d) \ - : "a" (__a), "d" (__d) : "cc"); \ - __a; }) -#define mulscale3(a,d) \ - ({ int32_t __a=(a), __d=(d); \ - __asm__ __volatile__ ("imull %%edx; shrdl $3, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d) \ - : "a" (__a), "d" (__d) : "cc"); \ - __a; }) -#define mulscale4(a,d) \ - ({ int32_t __a=(a), __d=(d); \ - __asm__ __volatile__ ("imull %%edx; shrdl $4, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d) \ - : "a" (__a), "d" (__d) : "cc"); \ - __a; }) -#define mulscale5(a,d) \ - ({ int32_t __a=(a), __d=(d); \ - __asm__ __volatile__ ("imull %%edx; shrdl $5, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d) \ - : "a" (__a), "d" (__d) : "cc"); \ - __a; }) -#define mulscale6(a,d) \ - ({ int32_t __a=(a), __d=(d); \ - __asm__ __volatile__ ("imull %%edx; shrdl $6, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d) \ - : "a" (__a), "d" (__d) : "cc"); \ - __a; }) -#define mulscale7(a,d) \ - ({ int32_t __a=(a), __d=(d); \ - __asm__ __volatile__ ("imull %%edx; shrdl $7, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d) \ - : "a" (__a), "d" (__d) : "cc"); \ - __a; }) -#define mulscale8(a,d) \ - ({ int32_t __a=(a), __d=(d); \ - __asm__ __volatile__ ("imull %%edx; shrdl $8, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d) \ - : "a" (__a), "d" (__d) : "cc"); \ - __a; }) -#define mulscale9(a,d) \ - ({ int32_t __a=(a), __d=(d); \ - __asm__ __volatile__ ("imull %%edx; shrdl $9, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d) \ - : "a" (__a), "d" (__d) : "cc"); \ - __a; }) -#define mulscale10(a,d) \ - ({ int32_t __a=(a), __d=(d); \ - __asm__ __volatile__ ("imull %%edx; shrdl $10, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d) \ - : "a" (__a), "d" (__d) : "cc"); \ - __a; }) -#define mulscale11(a,d) \ - ({ int32_t __a=(a), 
__d=(d); \ - __asm__ __volatile__ ("imull %%edx; shrdl $11, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d) \ - : "a" (__a), "d" (__d) : "cc"); \ - __a; }) -#define mulscale12(a,d) \ - ({ int32_t __a=(a), __d=(d); \ - __asm__ __volatile__ ("imull %%edx; shrdl $12, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d) \ - : "a" (__a), "d" (__d) : "cc"); \ - __a; }) -#define mulscale13(a,d) \ - ({ int32_t __a=(a), __d=(d); \ - __asm__ __volatile__ ("imull %%edx; shrdl $13, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d) \ - : "a" (__a), "d" (__d) : "cc"); \ - __a; }) -#define mulscale14(a,d) \ - ({ int32_t __a=(a), __d=(d); \ - __asm__ __volatile__ ("imull %%edx; shrdl $14, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d) \ - : "a" (__a), "d" (__d) : "cc"); \ - __a; }) -#define mulscale15(a,d) \ - ({ int32_t __a=(a), __d=(d); \ - __asm__ __volatile__ ("imull %%edx; shrdl $15, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d) \ - : "a" (__a), "d" (__d) : "cc"); \ - __a; }) -#define mulscale16(a,d) \ - ({ int32_t __a=(a), __d=(d); \ - __asm__ __volatile__ ("imull %%edx; shrdl $16, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d) \ - : "a" (__a), "d" (__d) : "cc"); \ - __a; }) -#define mulscale17(a,d) \ - ({ int32_t __a=(a), __d=(d); \ - __asm__ __volatile__ ("imull %%edx; shrdl $17, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d) \ - : "a" (__a), "d" (__d) : "cc"); \ - __a; }) -#define mulscale18(a,d) \ - ({ int32_t __a=(a), __d=(d); \ - __asm__ __volatile__ ("imull %%edx; shrdl $18, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d) \ - : "a" (__a), "d" (__d) : "cc"); \ - __a; }) -#define mulscale19(a,d) \ - ({ int32_t __a=(a), __d=(d); \ - __asm__ __volatile__ ("imull %%edx; shrdl $19, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d) \ - : "a" (__a), "d" (__d) : "cc"); \ - __a; }) -#define mulscale20(a,d) \ - ({ int32_t __a=(a), __d=(d); \ - __asm__ __volatile__ ("imull %%edx; shrdl $20, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d) \ - : "a" (__a), "d" (__d) : "cc"); \ - __a; }) -#define mulscale21(a,d) \ - ({ int32_t __a=(a), 
__d=(d); \ - __asm__ __volatile__ ("imull %%edx; shrdl $21, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d) \ - : "a" (__a), "d" (__d) : "cc"); \ - __a; }) -#define mulscale22(a,d) \ - ({ int32_t __a=(a), __d=(d); \ - __asm__ __volatile__ ("imull %%edx; shrdl $22, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d) \ - : "a" (__a), "d" (__d) : "cc"); \ - __a; }) -#define mulscale23(a,d) \ - ({ int32_t __a=(a), __d=(d); \ - __asm__ __volatile__ ("imull %%edx; shrdl $23, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d) \ - : "a" (__a), "d" (__d) : "cc"); \ - __a; }) -#define mulscale24(a,d) \ - ({ int32_t __a=(a), __d=(d); \ - __asm__ __volatile__ ("imull %%edx; shrdl $24, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d) \ - : "a" (__a), "d" (__d) : "cc"); \ - __a; }) -#define mulscale25(a,d) \ - ({ int32_t __a=(a), __d=(d); \ - __asm__ __volatile__ ("imull %%edx; shrdl $25, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d) \ - : "a" (__a), "d" (__d) : "cc"); \ - __a; }) -#define mulscale26(a,d) \ - ({ int32_t __a=(a), __d=(d); \ - __asm__ __volatile__ ("imull %%edx; shrdl $26, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d) \ - : "a" (__a), "d" (__d) : "cc"); \ - __a; }) -#define mulscale27(a,d) \ - ({ int32_t __a=(a), __d=(d); \ - __asm__ __volatile__ ("imull %%edx; shrdl $27, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d) \ - : "a" (__a), "d" (__d) : "cc"); \ - __a; }) -#define mulscale28(a,d) \ - ({ int32_t __a=(a), __d=(d); \ - __asm__ __volatile__ ("imull %%edx; shrdl $28, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d) \ - : "a" (__a), "d" (__d) : "cc"); \ - __a; }) -#define mulscale29(a,d) \ - ({ int32_t __a=(a), __d=(d); \ - __asm__ __volatile__ ("imull %%edx; shrdl $29, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d) \ - : "a" (__a), "d" (__d) : "cc"); \ - __a; }) -#define mulscale30(a,d) \ - ({ int32_t __a=(a), __d=(d); \ - __asm__ __volatile__ ("imull %%edx; shrdl $30, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d) \ - : "a" (__a), "d" (__d) : "cc"); \ - __a; }) -#define mulscale31(a,d) \ - ({ int32_t __a=(a), 
__d=(d); \ - __asm__ __volatile__ ("imull %%edx; shrdl $31, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d) \ - : "a" (__a), "d" (__d) : "cc"); \ - __a; }) -#define mulscale32(a,d) \ - ({ int32_t __a=(a), __d=(d); \ - __asm__ __volatile__ ("imull %%edx" \ - : "=a" (__a), "=d" (__d) \ - : "a" (__a), "d" (__d) : "cc"); \ - __d; }) - -#define dmulscale(a,d,S,D,c) \ - ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D), __c=(c); \ - __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl %%cl, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=S" (__S) \ - : "a" (__a), "d" (__d), "S" (__S), "D" (__D), "c" (__c) : "ebx", "cc"); \ - __a; }) -#define dmulscale1(a,d,S,D) \ - ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $1, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=S" (__S) \ - : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ - __a; }) -#define dmulscale2(a,d,S,D) \ - ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $2, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=S" (__S) \ - : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ - __a; }) -#define dmulscale3(a,d,S,D) \ - ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $3, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=S" (__S) \ - : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ - __a; }) -#define dmulscale4(a,d,S,D) \ - ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; 
movl %%esi, %%eax; movl %%edx, %%esi; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $4, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=S" (__S) \ - : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ - __a; }) -#define dmulscale5(a,d,S,D) \ - ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $5, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=S" (__S) \ - : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ - __a; }) -#define dmulscale6(a,d,S,D) \ - ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $6, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=S" (__S) \ - : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ - __a; }) -#define dmulscale7(a,d,S,D) \ - ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $7, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=S" (__S) \ - : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ - __a; }) -#define dmulscale8(a,d,S,D) \ - ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $8, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=S" (__S) \ - : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ - __a; }) -#define dmulscale9(a,d,S,D) \ - ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $9, %%edx, 
%%eax" \ - : "=a" (__a), "=d" (__d), "=S" (__S) \ - : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ - __a; }) -#define dmulscale10(a,d,S,D) \ - ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $10, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=S" (__S) \ - : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ - __a; }) -#define dmulscale11(a,d,S,D) \ - ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $11, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=S" (__S) \ - : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ - __a; }) -#define dmulscale12(a,d,S,D) \ - ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $12, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=S" (__S) \ - : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ - __a; }) -#define dmulscale13(a,d,S,D) \ - ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $13, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=S" (__S) \ - : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ - __a; }) -#define dmulscale14(a,d,S,D) \ - ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $14, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=S" (__S) \ - : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : 
"ebx", "cc"); \ - __a; }) -#define dmulscale15(a,d,S,D) \ - ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $15, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=S" (__S) \ - : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ - __a; }) -#define dmulscale16(a,d,S,D) \ - ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $16, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=S" (__S) \ - : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ - __a; }) -#define dmulscale17(a,d,S,D) \ - ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $17, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=S" (__S) \ - : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ - __a; }) -#define dmulscale18(a,d,S,D) \ - ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $18, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=S" (__S) \ - : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ - __a; }) -#define dmulscale19(a,d,S,D) \ - ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $19, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=S" (__S) \ - : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ - __a; }) -#define dmulscale20(a,d,S,D) \ - ({ int32_t __a=(a), __d=(d), __S=(S), 
__D=(D); \ - __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $20, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=S" (__S) \ - : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ - __a; }) -#define dmulscale21(a,d,S,D) \ - ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $21, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=S" (__S) \ - : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ - __a; }) -#define dmulscale22(a,d,S,D) \ - ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $22, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=S" (__S) \ - : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ - __a; }) -#define dmulscale23(a,d,S,D) \ - ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $23, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=S" (__S) \ - : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ - __a; }) -#define dmulscale24(a,d,S,D) \ - ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $24, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=S" (__S) \ - : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ - __a; }) -#define dmulscale25(a,d,S,D) \ - ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, 
%%esi; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $25, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=S" (__S) \ - : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ - __a; }) -#define dmulscale26(a,d,S,D) \ - ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $26, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=S" (__S) \ - : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ - __a; }) -#define dmulscale27(a,d,S,D) \ - ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $27, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=S" (__S) \ - : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ - __a; }) -#define dmulscale28(a,d,S,D) \ - ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $28, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=S" (__S) \ - : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ - __a; }) -#define dmulscale29(a,d,S,D) \ - ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $29, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=S" (__S) \ - : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ - __a; }) -#define dmulscale30(a,d,S,D) \ - ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $30, %%edx, %%eax" \ - : "=a" 
(__a), "=d" (__d), "=S" (__S) \ - : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ - __a; }) -#define dmulscale31(a,d,S,D) \ - ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $31, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=S" (__S) \ - : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ - __a; }) -#define dmulscale32(a,d,S,D) \ - ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx" \ - : "=a" (__a), "=d" (__d), "=S" (__S) \ - : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ - __d; }) - -#define tmulscale1(a,d,b,c,S,D) \ - ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ - "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $1, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ - : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ - __a; }) -#define tmulscale2(a,d,b,c,S,D) \ - ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ - "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $2, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ - : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ - __a; }) -#define tmulscale3(a,d,b,c,S,D) \ - ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ 
- "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $3, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ - : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ - __a; }) -#define tmulscale4(a,d,b,c,S,D) \ - ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ - "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $4, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ - : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ - __a; }) -#define tmulscale5(a,d,b,c,S,D) \ - ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ - "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $5, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ - : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ - __a; }) -#define tmulscale6(a,d,b,c,S,D) \ - ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ - "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $6, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ - : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ - __a; }) -#define tmulscale7(a,d,b,c,S,D) \ - ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ - "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; 
movl %%esi, %%eax; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $7, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ - : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ - __a; }) -#define tmulscale8(a,d,b,c,S,D) \ - ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ - "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $8, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ - : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ - __a; }) -#define tmulscale9(a,d,b,c,S,D) \ - ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ - "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $9, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ - : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ - __a; }) -#define tmulscale10(a,d,b,c,S,D) \ - ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ - "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $10, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ - : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ - __a; }) -#define tmulscale11(a,d,b,c,S,D) \ - ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ - "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ - "imull %%edi; addl %%ebx, 
%%eax; adcl %%ecx, %%edx; shrdl $11, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ - : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ - __a; }) -#define tmulscale12(a,d,b,c,S,D) \ - ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ - "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $12, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ - : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ - __a; }) -#define tmulscale13(a,d,b,c,S,D) \ - ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ - "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $13, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ - : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ - __a; }) -#define tmulscale14(a,d,b,c,S,D) \ - ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ - "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $14, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ - : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ - __a; }) -#define tmulscale15(a,d,b,c,S,D) \ - ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ - "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $15, %%edx, 
%%eax" \ - : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ - : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ - __a; }) -#define tmulscale16(a,d,b,c,S,D) \ - ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ - "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $16, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ - : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ - __a; }) -#define tmulscale17(a,d,b,c,S,D) \ - ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ - "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $17, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ - : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ - __a; }) -#define tmulscale18(a,d,b,c,S,D) \ - ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ - "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $18, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ - : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ - __a; }) -#define tmulscale19(a,d,b,c,S,D) \ - ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ - "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $19, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=b" 
(__b), "=c" (__c) \ - : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ - __a; }) -#define tmulscale20(a,d,b,c,S,D) \ - ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ - "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $20, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ - : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ - __a; }) -#define tmulscale21(a,d,b,c,S,D) \ - ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ - "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $21, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ - : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ - __a; }) -#define tmulscale22(a,d,b,c,S,D) \ - ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ - "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $22, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ - : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ - __a; }) -#define tmulscale23(a,d,b,c,S,D) \ - ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ - "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $23, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ - : "a" (__a), "d" (__d), 
"b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ - __a; }) -#define tmulscale24(a,d,b,c,S,D) \ - ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ - "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $24, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ - : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ - __a; }) -#define tmulscale25(a,d,b,c,S,D) \ - ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ - "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $25, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ - : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ - __a; }) -#define tmulscale26(a,d,b,c,S,D) \ - ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ - "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $26, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ - : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ - __a; }) -#define tmulscale27(a,d,b,c,S,D) \ - ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ - "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $27, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ - : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : 
"cc"); \ - __a; }) -#define tmulscale28(a,d,b,c,S,D) \ - ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ - "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $28, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ - : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ - __a; }) -#define tmulscale29(a,d,b,c,S,D) \ - ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ - "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $29, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ - : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ - __a; }) -#define tmulscale30(a,d,b,c,S,D) \ - ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ - "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $30, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ - : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ - __a; }) -#define tmulscale31(a,d,b,c,S,D) \ - ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ - "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $31, %%edx, %%eax" \ - : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ - : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ - __a; }) -#define 
tmulscale32(a,d,b,c,S,D) \ - ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ - __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ - "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ - "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx" \ - : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ - : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ - __d; }) - -#ifdef USE_ASM_DIVSCALE -#define divscale(a,b,c) \ - ({ int32_t __a=(a), __b=(b), __c=(c); \ - __asm__ __volatile__ ("movl %%eax, %%edx; shll %%cl, %%eax; negb %%cl; sarl %%cl, %%edx; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "c" (__c), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale1(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("addl %%eax, %%eax; sbbl %%edx, %%edx; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale2(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $30, %%edx; leal (,%%eax,4), %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale3(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $29, %%edx; leal (,%%eax,8), %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale4(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $28, %%edx; shll $4, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale5(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $27, %%edx; shll $5, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale6(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $26, %%edx; shll $6, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), 
"b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale7(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $25, %%edx; shll $7, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale8(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $24, %%edx; shll $8, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale9(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $23, %%edx; shll $9, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale10(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $22, %%edx; shll $10, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale11(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $21, %%edx; shll $11, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale12(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $20, %%edx; shll $12, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale13(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $19, %%edx; shll $13, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale14(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $18, %%edx; shll $14, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale15(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $17, %%edx; shll $15, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; 
}) -#define divscale16(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $16, %%edx; shll $16, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale17(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $15, %%edx; shll $17, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale18(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $14, %%edx; shll $18, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale19(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $13, %%edx; shll $19, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale20(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $12, %%edx; shll $20, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale21(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $11, %%edx; shll $21, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale22(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $10, %%edx; shll $22, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale23(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $9, %%edx; shll $23, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale24(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $8, %%edx; shll $24, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale25(a,b) \ 
- ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $7, %%edx; shll $25, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale26(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $6, %%edx; shll $26, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale27(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $5, %%edx; shll $27, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale28(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $4, %%edx; shll $28, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale29(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $3, %%edx; shll $29, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale30(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $2, %%edx; shll $30, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale31(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $1, %%edx; shll $31, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale32(d,b) \ - ({ int32_t __d=(d), __b=(b), __r; \ - __asm__ __volatile__ ("xorl %%eax, %%eax; idivl %%ebx" \ - : "=a" (__r), "=d" (__d) : "d" (__d), "b" (__b) : "cc"); \ - __r; }) -#endif // defined USE_ASM_DIVSCALE - -#define readpixel(D) \ - ({ void *__D=(D); int32_t __a; \ - __asm__ __volatile__ ("movb (%%edi), %%al" \ - : "=a" (__a): "D" (__D) : "cc"); \ - __a; }) -#define drawpixel(D,a) \ - ({ void *__D=(D); int32_t __a=(a); \ - __asm__ __volatile__ ("movb %%al, (%%edi)" \ - 
: : "D" (__D), "a" (__a) : "memory", "cc"); \ - 0; }) -#define drawpixels(D,a) \ - ({ void *__D=(D); int32_t __a=(a); \ - __asm__ __volatile__ ("movw %%ax, (%%edi)" \ - : : "D" (__D), "a" (__a) : "memory", "cc"); \ - 0; }) -#define drawpixelses(D,a) \ - ({ void *__D=(D); int32_t __a=(a); \ - __asm__ __volatile__ ("movl %%eax, (%%edi)" \ - : : "D" (__D), "a" (__a) : "memory", "cc"); \ - 0; }) -#define clearbuf(D,c,a) \ - ({ void *__D=(D); int32_t __c=(c), __a=(a); \ - __asm__ __volatile__ ("rep; stosl" \ - : "=&D" (__D), "=&c" (__c) : "0" (__D), "1" (__c), "a" (__a) : "memory", "cc"); \ - 0; }) -#define copybuf(S,D,c) \ - ({ const void *__S=(S), *__D=(D); int32_t __c=(c); \ - __asm__ __volatile__ ("rep; movsl" \ - : "=&S" (__S), "=&D" (__D), "=&c" (__c) : "0" (__S), "1" (__D), "2" (__c) : "memory", "cc"); \ - 0; }) - -#define mul3(a) \ - ({ int32_t __a=(a), __r; \ - __asm__ __volatile__ ("lea (%1,%1,2), %0" \ - : "=r" (__r) : "0" (__a) : "cc"); \ - __r; }) -#define mul5(a) \ - ({ int32_t __a=(a), __r; \ - __asm__ __volatile__ ("lea (%1,%1,4), %0" \ - : "=r" (__r) : "0" (__a) : "cc"); \ - __r; }) -#define mul9(a) \ - ({ int32_t __a=(a), __r; \ - __asm__ __volatile__ ("lea (%1,%1,8), %0" \ - : "=r" (__r) : "0" (__a) : "cc"); \ - __r; }) - -//returns eax/ebx, dmval = eax%edx; -#define divmod(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("xorl %%edx, %%edx; divl %%ebx; movl %%edx, "_DMVAL \ - : "+a" (__a) : "b" (__b) : "edx", "memory", "cc"); \ - __a; }) -//returns eax%ebx, dmval = eax/edx; -#define moddiv(a,b) \ - ({ int32_t __a=(a), __b=(b), __d; \ - __asm__ __volatile__ ("xorl %%edx, %%edx; divl %%ebx; movl %%eax, "_DMVAL \ - : "=d" (__d) : "a" (__a), "b" (__b) : "eax", "memory", "cc"); \ - __d; }) - -#define klabs(a) \ - ({ int32_t __a=(a); \ - __asm__ __volatile__ ("testl %%eax, %%eax; jns 0f; negl %%eax; 0:" \ - : "=a" (__a) : "a" (__a) : "cc"); \ - __a; }) -#define ksgn(b) \ - ({ int32_t __b=(b), __r; \ - __asm__ __volatile__ ("addl %%ebx, 
%%ebx; sbbl %%eax, %%eax; cmpl %%ebx, %%eax; adcb $0, %%al" \ - : "=a" (__r) : "b" (__b) : "cc"); \ - __r; }) - -#define umin(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("subl %%ebx, %%eax; sbbl %%ecx, %%ecx; andl %%ecx, %%eax; addl %%ebx, %%eax" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "ecx", "cc"); \ - __a; }) -#define umax(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("subl %%ebx, %%eax; sbbl %%ecx, %%ecx; xorl $0xffffffff, %%ecx; andl %%ecx, %%eax; addl %%ebx, %%eax" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "ecx", "cc"); \ - __a; }) - -#define kmin(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("cmpl %%ebx, %%eax; jl 0f; movl %%ebx, %%eax; 0:" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "cc"); \ - __a; }) -#define kmax(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("cmpl %%ebx, %%eax; jg 0f; movl %%ebx, %%eax; 0:" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "cc"); \ - __a; }) - -#define swapchar(a,b) \ - ({ void *__a=(a), *__b=(b); \ - __asm__ __volatile__ ("movb (%%eax), %%cl; movb (%%ebx), %%ch; movb %%cl, (%%ebx); movb %%ch, (%%eax)" \ - : : "a" (__a), "b" (__b) : "ecx", "memory", "cc"); \ - 0; }) -#define swapshort(a,b) \ - ({ void *__a=(a), *__b=(b); \ - __asm__ __volatile__ ("movw (%%eax), %%cx; movw (%%ebx), %%dx; movw %%cx, (%%ebx); movw %%dx, (%%eax)" \ - : : "a" (__a), "b" (__b) : "ecx", "edx", "memory", "cc"); \ - 0; }) -#define swaplong(a,b) \ - ({ void *__a=(a), *__b=(b); \ - __asm__ __volatile__ ("movl (%%eax), %%ecx; movl (%%ebx), %%edx; movl %%ecx, (%%ebx); movl %%edx, (%%eax)" \ - : : "a" (__a), "b" (__b) : "ecx", "edx", "memory", "cc"); \ - 0; }) -#define swapbuf4(a,b,c) \ - ({ void *__a=(a), *__b=(b); int32_t __c=(c); \ - __asm__ __volatile__ ("0: movl (%%eax), %%esi; movl (%%ebx), %%edi; movl %%esi, (%%ebx); " \ - "movl %%edi, (%%eax); addl $4, %%eax; addl $4, %%ebx; decl %%ecx; jnz 0b" \ - : : "a" (__a), "b" (__b), "c" (__c) : "esi", "edi", "memory", "cc"); \ - 0; }) 
-#define swap64bit(a,b) \ - ({ void *__a=(a), *__b=(b); \ - __asm__ __volatile__ ("movl (%%eax), %%ecx; movl (%%ebx), %%edx; movl %%ecx, (%%ebx); " \ - "movl 4(%%eax), %%ecx; movl %%edx, (%%eax); movl 4(%%ebx), %%edx; " \ - "movl %%ecx, 4(%%ebx); movl %%edx, 4(%%eax)" \ - : : "a" (__a), "b" (__b) : "ecx", "edx", "memory", "cc"); \ - 0; }) - -//swapchar2(ptr1,ptr2,xsiz); is the same as: -//swapchar(ptr1,ptr2); swapchar(ptr1+1,ptr2+xsiz); -#define swapchar2(a,b,S) \ - ({ void *__a=(a), *__b=(b); int32_t __S=(S); \ - __asm__ __volatile__ ("addl %%ebx, %%esi; movw (%%eax), %%cx; movb (%%ebx), %%dl; " \ - "movb %%cl, (%%ebx); movb (%%esi), %%dh; movb %%ch, (%%esi); " \ - "movw %%dx, (%%eax)" \ - : "=S" (__S) : "a" (__a), "b" (__b), "S" (__S) : "ecx", "edx", "memory", "cc"); \ - 0; }) - - -#define qinterpolatedown16(a,c,d,S) \ - ({ void *__a=(void*)(a); int32_t __c=(c), __d=(d), __S=(S); \ - __asm__ __volatile__ ("movl %%ecx, %%ebx; shrl $1, %%ecx; jz 1f; " \ - "0: leal (%%edx,%%esi,), %%edi; sarl $16, %%edx; movl %%edx, (%%eax); " \ - "leal (%%edi,%%esi,), %%edx; sarl $16, %%edi; movl %%edi, 4(%%eax); " \ - "addl $8, %%eax; decl %%ecx; jnz 0b; testl $1, %%ebx; jz 2f; " \ - "1: sarl $16, %%edx; movl %%edx, (%%eax); 2:" \ - : "=a" (__a), "=c" (__c), "=d" (__d) : "a" (__a), "c" (__c), "d" (__d), "S" (__S) \ - : "ebx", "edi", "memory", "cc"); \ - 0; }) - -#define qinterpolatedown16short(a,c,d,S) \ - ({ void *__a=(void*)(a); int32_t __c=(c), __d=(d), __S=(S); \ - __asm__ __volatile__ ("testl %%ecx, %%ecx; jz 3f; testb $2, %%al; jz 0f; movl %%edx, %%ebx; " \ - "sarl $16, %%ebx; movw %%bx, (%%eax); addl %%esi, %%edx; addl $2, %%eax; " \ - "decl %%ecx; jz 3f; " \ - "0: subl $2, %%ecx; jc 2f; " \ - "1: movl %%edx, %%ebx; addl %%esi, %%edx; sarl $16, %%ebx; movl %%edx, %%edi; " \ - "andl $0xffff0000, %%edi; addl %%esi, %%edx; addl %%edi, %%ebx; " \ - "movl %%ebx, (%%eax); addl $4, %%eax; subl $2, %%ecx; jnc 1b; testb $1, %%cl; " \ - "jz 3f; " \ - "2: movl %%edx, %%ebx; sarl $16, 
%%ebx; movw %%bx, (%%eax); 3:" \ - : "=a" (__a), "=c" (__c), "=d" (__d) : "a" (__a), "c" (__c), "d" (__d), "S" (__S) \ - : "ebx", "edi", "memory", "cc"); \ - 0; }) - - -//}}} - #elif defined(_MSC_VER) && !defined(NOASM) // __GNUC__ -// // Microsoft C inline assembler -// +#include "pragmas_x86_msvc.h" -//{{{ -static __inline int32_t sqr(int32_t a) -{ - _asm { - mov eax, a - imul eax, eax - } -} +#elif defined(__arm__) // _MSC_VER -static __inline int32_t scale(int32_t a, int32_t d, int32_t c) -{ - _asm { - mov eax, a - imul d - idiv c - } -} +// GCC Inline Assembler version (ARM) +#include "pragmas_arm.h" -static __inline int32_t mulscale(int32_t a, int32_t d, int32_t c) -{ - _asm { - mov ecx, c - mov eax, a - imul d - shrd eax, edx, cl - } -} - -#define MULSCALE(x) \ -static __inline int32_t mulscale##x (int32_t a, int32_t d) \ -{ \ - _asm mov eax, a \ - _asm imul d \ - _asm shrd eax, edx, x \ -} - -MULSCALE(1) MULSCALE(2) MULSCALE(3) MULSCALE(4) -MULSCALE(5) MULSCALE(6) MULSCALE(7) MULSCALE(8) -MULSCALE(9) MULSCALE(10) MULSCALE(11) MULSCALE(12) -MULSCALE(13) MULSCALE(14) MULSCALE(15) MULSCALE(16) -MULSCALE(17) MULSCALE(18) MULSCALE(19) MULSCALE(20) -MULSCALE(21) MULSCALE(22) MULSCALE(23) MULSCALE(24) -MULSCALE(25) MULSCALE(26) MULSCALE(27) MULSCALE(28) -MULSCALE(29) MULSCALE(30) MULSCALE(31) -#undef MULSCALE -static __inline int32_t mulscale32(int32_t a, int32_t d) -{ - _asm { - mov eax, a - imul d - mov eax, edx - } -} - -static __inline int32_t dmulscale(int32_t a, int32_t d, int32_t S, int32_t D, int32_t c) -{ - _asm { - mov ecx, c - mov eax, a - imul d - mov ebx, eax - mov eax, S - mov esi, edx - imul D - add eax, ebx - adc edx, esi - shrd eax, edx, cl - } -} - -#define DMULSCALE(x) \ -static __inline int32_t dmulscale##x (int32_t a, int32_t d, int32_t S, int32_t D) \ -{ \ - _asm mov eax, a \ - _asm imul d \ - _asm mov ebx, eax \ - _asm mov eax, S \ - _asm mov esi, edx \ - _asm imul D \ - _asm add eax, ebx \ - _asm adc edx, esi \ - _asm shrd eax, edx, x \ -} 
- -DMULSCALE(1) DMULSCALE(2) DMULSCALE(3) DMULSCALE(4) -DMULSCALE(5) DMULSCALE(6) DMULSCALE(7) DMULSCALE(8) -DMULSCALE(9) DMULSCALE(10) DMULSCALE(11) DMULSCALE(12) -DMULSCALE(13) DMULSCALE(14) DMULSCALE(15) DMULSCALE(16) -DMULSCALE(17) DMULSCALE(18) DMULSCALE(19) DMULSCALE(20) -DMULSCALE(21) DMULSCALE(22) DMULSCALE(23) DMULSCALE(24) -DMULSCALE(25) DMULSCALE(26) DMULSCALE(27) DMULSCALE(28) -DMULSCALE(29) DMULSCALE(30) DMULSCALE(31) -#undef DMULSCALE -static __inline int32_t dmulscale32(int32_t a, int32_t d, int32_t S, int32_t D) -{ - _asm { - mov eax, a - imul d - mov ebx, eax - mov eax, S - mov esi, edx - imul D - add eax, ebx - adc edx, esi - mov eax, edx - } -} - -#define TMULSCALE(x) \ -static __inline int32_t tmulscale##x (int32_t a, int32_t d, int32_t b, int32_t c, int32_t S, int32_t D) \ -{ \ - _asm mov eax, a \ - _asm mov ebx, b \ - _asm imul d \ - _asm xchg eax, ebx \ - _asm mov ecx, c \ - _asm xchg edx, ecx \ - _asm imul edx \ - _asm add ebx, eax \ - _asm adc ecx, edx \ - _asm mov eax, S \ - _asm imul D \ - _asm add eax, ebx \ - _asm adc edx, ecx \ - _asm shrd eax, edx, x \ -} - -TMULSCALE(1) TMULSCALE(2) TMULSCALE(3) TMULSCALE(4) -TMULSCALE(5) TMULSCALE(6) TMULSCALE(7) TMULSCALE(8) -TMULSCALE(9) TMULSCALE(10) TMULSCALE(11) TMULSCALE(12) -TMULSCALE(13) TMULSCALE(14) TMULSCALE(15) TMULSCALE(16) -TMULSCALE(17) TMULSCALE(18) TMULSCALE(19) TMULSCALE(20) -TMULSCALE(21) TMULSCALE(22) TMULSCALE(23) TMULSCALE(24) -TMULSCALE(25) TMULSCALE(26) TMULSCALE(27) TMULSCALE(28) -TMULSCALE(29) TMULSCALE(30) TMULSCALE(31) -#undef TMULSCALE -static __inline int32_t tmulscale32(int32_t a, int32_t d, int32_t b, int32_t c, int32_t S, int32_t D) -{ - _asm { - mov eax, a - mov ebx, b - imul d - xchg eax, ebx - mov ecx, c - xchg edx, ecx - imul edx - add ebx, eax - adc ecx, edx - mov eax, S - imul D - add eax, ebx - adc edx, ecx - mov eax, edx - } -} - -static __inline int32_t boundmulscale(int32_t a, int32_t b, int32_t c) -{ - _asm { - mov eax, a - mov ecx, c - imul b - mov ebx, 
edx - shrd eax, edx, cl - sar edx, cl - xor edx, eax - js checkit - xor edx, eax - jz skipboundit - cmp edx, 0xffffffff - je skipboundit - checkit: - mov eax, ebx - sar eax, 31 - xor eax, 0x7fffffff - skipboundit: - } -} - -#ifdef USE_ASM_DIVSCALE -static __inline int32_t divscale(int32_t a, int32_t b, int32_t c) -{ - _asm { - mov eax, a - mov ecx, c - mov edx, eax - shl eax, cl - neg cl - sar edx, cl - idiv b - } -} - -static __inline int32_t divscale1(int32_t a, int32_t b) -{ - _asm { - mov eax, a - add eax, eax - sbb edx, edx - idiv b - } -} - -static __inline int32_t divscale2(int32_t a, int32_t b) -{ - _asm { - mov eax, a - mov edx, eax - sar edx, 30 - lea eax, [eax*4] - idiv b - } -} - -static __inline int32_t divscale3(int32_t a, int32_t b) -{ - _asm { - mov eax, a - mov edx, eax - sar edx, 29 - lea eax, [eax*8] - idiv b - } -} - -#define DIVSCALE(x,y) \ -static __inline int32_t divscale##y(int32_t a, int32_t b) \ -{ \ - _asm mov eax, a \ - _asm mov edx, eax \ - _asm sar edx, x \ - _asm shl eax, y \ - _asm idiv b \ -} - -DIVSCALE(28,4) DIVSCALE(27,5) DIVSCALE(26,6) DIVSCALE(25,7) -DIVSCALE(24,8) DIVSCALE(23,9) DIVSCALE(22,10) DIVSCALE(21,11) -DIVSCALE(20,12) DIVSCALE(19,13) DIVSCALE(18,14) DIVSCALE(17,15) -DIVSCALE(16,16) DIVSCALE(15,17) DIVSCALE(14,18) DIVSCALE(13,19) -DIVSCALE(12,20) DIVSCALE(11,21) DIVSCALE(10,22) DIVSCALE(9,23) -DIVSCALE(8,24) DIVSCALE(7,25) DIVSCALE(6,26) DIVSCALE(5,27) -DIVSCALE(4,28) DIVSCALE(3,29) DIVSCALE(2,30) DIVSCALE(1,31) - -static __inline int32_t divscale32(int32_t d, int32_t b) -{ - _asm { - mov edx, d - xor eax, eax - idiv b - } -} -#endif // defined USE_ASM_DIVSCALE - -static __inline char readpixel(void *d) -{ - _asm { - mov edx, d - mov al, byte ptr [edx] - } -} - -static __inline void drawpixel(void *d, char a) -{ - _asm { - mov edx, d - mov al, a - mov byte ptr [edx], al - } -} - -static __inline void drawpixels(void *d, int16_t a) -{ - _asm { - mov edx, d - mov ax, a - mov word ptr [edx], ax - } -} - -static __inline 
void drawpixelses(void *d, int32_t a) -{ - _asm { - mov edx, d - mov eax, a - mov dword ptr [edx], eax - } -} - -static __inline void clearbuf(void *d, int32_t c, int32_t a) -{ - _asm { - mov edi, d - mov ecx, c - mov eax, a - rep stosd - } -} - -static __inline void clearbufbyte(void *d, int32_t c, int32_t a) -{ - _asm { - mov edi, d - mov ecx, c - mov eax, a - cmp ecx, 4 - jae longcopy - test cl, 1 - jz preskip - stosb - preskip: - shr ecx, 1 - rep stosw - jmp endit - longcopy: - test edi, 1 - jz skip1 - stosb - dec ecx - skip1: - test edi, 2 - jz skip2 - stosw - sub ecx, 2 - skip2: - mov ebx, ecx - shr ecx, 2 - rep stosd - test bl, 2 - jz skip3 - stosw - skip3: - test bl, 1 - jz endit - stosb - endit: - } -} - -static __inline void copybuf(const void *s, void *d, int32_t c) -{ - _asm { - mov esi, s - mov edi, d - mov ecx, c - rep movsd - } -} - -static __inline void copybufbyte(const void *s, void *d, int32_t c) -{ - _asm { - mov esi, s - mov edi, d - mov ecx, c - cmp ecx, 4 - jae longcopy - test cl, 1 - jz preskip - movsb - preskip: - shr ecx, 1 - rep movsw - jmp endit - longcopy: - test edi, 1 - jz skip1 - movsb - dec ecx - skip1: - test edi, 2 - jz skip2 - movsw - sub ecx, 2 - skip2: - mov ebx, ecx - shr ecx, 2 - rep movsd - test bl, 2 - jz skip3 - movsw - skip3: - test bl, 1 - jz endit - movsb - endit: - } -} - -static __inline void copybufreverse(const void *s, void *d, int32_t c) -{ - _asm { - mov esi, s - mov edi, d - mov ecx, c - shr ecx, 1 - jnc skipit1 - mov al, byte ptr [esi] - dec esi - mov byte ptr [edi], al - inc edi - skipit1: - shr ecx, 1 - jnc skipit2 - mov ax, word ptr [esi-1] - sub esi, 2 - ror ax, 8 - mov word ptr [edi], ax - add edi, 2 - skipit2: - test ecx, ecx - jz endloop - begloop: - mov eax, dword ptr [esi-3] - sub esi, 4 - bswap eax - mov dword ptr [edi], eax - add edi, 4 - dec ecx - jnz begloop - endloop: - } -} - -static __inline void qinterpolatedown16(int32_t a, int32_t c, int32_t d, int32_t s) -{ - _asm { - mov eax, a - mov ecx, c 
- mov edx, d - mov esi, s - mov ebx, ecx - shr ecx, 1 - jz skipbegcalc - begqcalc: - lea edi, [edx+esi] - sar edx, 16 - mov dword ptr [eax], edx - lea edx, [edi+esi] - sar edi, 16 - mov dword ptr [eax+4], edi - add eax, 8 - dec ecx - jnz begqcalc - test ebx, 1 - jz skipbegqcalc2 - skipbegcalc: - sar edx, 16 - mov dword ptr [eax], edx - skipbegqcalc2: - } -} - -static __inline void qinterpolatedown16short(int32_t a, int32_t c, int32_t d, int32_t s) -{ - _asm { - mov eax, a - mov ecx, c - mov edx, d - mov esi, s - test ecx, ecx - jz endit - test al, 2 - jz skipalignit - mov ebx, edx - sar ebx, 16 - mov word ptr [eax], bx - add edx, esi - add eax, 2 - dec ecx - jz endit - skipalignit: - sub ecx, 2 - jc finishit - begqcalc: - mov ebx, edx - add edx, esi - sar ebx, 16 - mov edi, edx - and edi, 0ffff0000h - add edx, esi - add ebx, edi - mov dword ptr [eax], ebx - add eax, 4 - sub ecx, 2 - jnc begqcalc - test cl, 1 - jz endit - finishit: - mov ebx, edx - sar ebx, 16 - mov word ptr [eax], bx - endit: - } -} - -static __inline int32_t mul3(int32_t a) -{ - _asm { - mov eax, a - lea eax, [eax+eax*2] - } -} - -static __inline int32_t mul5(int32_t a) -{ - _asm { - mov eax, a - lea eax, [eax+eax*4] - } -} - -static __inline int32_t mul9(int32_t a) -{ - _asm { - mov eax, a - lea eax, [eax+eax*8] - } -} - - //returns eax/ebx, dmval = eax%edx; -static __inline int32_t divmod(int32_t a, int32_t b) -{ - _asm { - mov eax, a - xor edx, edx - div b - mov dmval, edx - } -} - - //returns eax%ebx, dmval = eax/edx; -static __inline int32_t moddiv(int32_t a, int32_t b) -{ - _asm { - mov eax, a - xor edx, edx - div b - mov dmval, eax - mov eax, edx - } -} - -static __inline int32_t klabs(int32_t a) -{ - _asm { - mov eax, a - test eax, eax - jns skipnegate - neg eax - skipnegate: - } -} - -static __inline int32_t ksgn(int32_t b) -{ - _asm { - mov ebx, b - add ebx, ebx - sbb eax, eax - cmp eax, ebx - adc al, 0 - } -} - - //eax = (unsigned min)umin(eax,ebx) -static __inline int32_t umin(int32_t 
a, int32_t b) -{ - _asm { - mov eax, a - sub eax, b - sbb ecx, ecx - and eax, ecx - add eax, b - } -} - - //eax = (unsigned max)umax(eax,ebx) -static __inline int32_t umax(int32_t a, int32_t b) -{ - _asm { - mov eax, a - sub eax, b - sbb ecx, ecx - xor ecx, 0xffffffff - and eax, ecx - add eax, b - } -} - -static __inline int32_t kmin(int32_t a, int32_t b) -{ - _asm { - mov eax, a - mov ebx, b - cmp eax, ebx - jl skipit - mov eax, ebx - skipit: - } -} - -static __inline int32_t kmax(int32_t a, int32_t b) -{ - _asm { - mov eax, a - mov ebx, b - cmp eax, ebx - jg skipit - mov eax, ebx - skipit: - } -} - -static __inline void swapchar(void *a, void *b) -{ - _asm { - mov eax, a - mov ebx, b - mov cl, [eax] - mov ch, [ebx] - mov [ebx], cl - mov [eax], ch - } -} - -static __inline void swapshort(void *a, void *b) -{ - _asm { - mov eax, a - mov ebx, b - mov cx, [eax] - mov dx, [ebx] - mov [ebx], cx - mov [eax], dx - } -} - -static __inline void swaplong(void *a, void *b) -{ - _asm { - mov eax, a - mov ebx, b - mov ecx, [eax] - mov edx, [ebx] - mov [ebx], ecx - mov [eax], edx - } -} - -static __inline void swapbuf4(void *a, void *b, int32_t c) -{ - _asm { - mov eax, a - mov ebx, b - mov ecx, c - begswap: - mov esi, [eax] - mov edi, [ebx] - mov [ebx], esi - mov [eax], edi - add eax, 4 - add ebx, 4 - dec ecx - jnz short begswap - } -} - -static __inline void swap64bit(void *a, void *b) -{ - _asm { - mov eax, a - mov ebx, b - mov ecx, [eax] - mov edx, [ebx] - mov [ebx], ecx - mov ecx, [eax+4] - mov [eax], edx - mov edx, [ebx+4] - mov [ebx+4], ecx - mov [eax+4], edx - } -} - - //swapchar2(ptr1,ptr2,xsiz); is the same as: - //swapchar(ptr1,ptr2); swapchar(ptr1+1,ptr2+xsiz); -static __inline void swapchar2(void *a, void *b, int32_t s) -{ - _asm { - mov eax, a - mov ebx, b - mov esi, s - add esi, ebx - mov cx, [eax] - mov dl, [ebx] - mov [ebx], cl - mov dh, [esi] - mov [esi], ch - mov [eax], dx - } -} -//}}} - -#else // _MSC_VER +#else // // Generic C // -#define qw(x) 
((int64_t)(x)) // quadword cast -#define dw(x) ((int32_t)(x)) // doubleword cast -#define wo(x) ((int16_t)(x)) // word cast -#define by(x) ((uint8_t)(x)) // byte cast - #define _scaler(a) \ static inline int32_t mulscale##a(int32_t eax, int32_t edx) \ { \ - return dw((qw(eax) * qw(edx)) >> a); \ + return dw((qw(eax) * qw(edx)) >> by(a)); \ } \ \ static inline int32_t dmulscale##a(int32_t eax, int32_t edx, int32_t esi, int32_t edi) \ { \ - return dw(((qw(eax) * qw(edx)) + (qw(esi) * qw(edi))) >> a); \ + return dw(((qw(eax) * qw(edx)) + (qw(esi) * qw(edi))) >> by(a)); \ } \ \ static inline int32_t tmulscale##a(int32_t eax, int32_t edx, int32_t ebx, int32_t ecx, int32_t esi, int32_t edi) \ { \ - return dw(((qw(eax) * qw(edx)) + (qw(ebx) * qw(ecx)) + (qw(esi) * qw(edi))) >> a); \ + return dw(((qw(eax) * qw(edx)) + (qw(ebx) * qw(ecx)) + (qw(esi) * qw(edi))) >> by(a)); \ } \ -_scaler(1) _scaler(2) _scaler(3) _scaler(4) -_scaler(5) _scaler(6) _scaler(7) _scaler(8) -_scaler(9) _scaler(10) _scaler(11) _scaler(12) -_scaler(13) _scaler(14) _scaler(15) _scaler(16) -_scaler(17) _scaler(18) _scaler(19) _scaler(20) -_scaler(21) _scaler(22) _scaler(23) _scaler(24) -_scaler(25) _scaler(26) _scaler(27) _scaler(28) -_scaler(29) _scaler(30) _scaler(31) _scaler(32) +PRAGMA_FUNCS _scaler(32) + +#undef _scaler static inline void swapchar(void* a, void* b) { char t = *((char*)b); *((char*)b) = *((char*)a); *((char*)a) = t; } static inline void swapchar2(void* a, void* b, int32_t s) { swapchar(a,b); swapchar((char*)a+1,(char*)b+s); } @@ -2274,21 +141,6 @@ static inline int32_t scale(int32_t eax, int32_t edx, int32_t ecx) { return dw(( static inline int32_t mulscale(int32_t eax, int32_t edx, int32_t ecx) { return dw((qw(eax) * qw(edx)) >> by(ecx)); } static inline int32_t dmulscale(int32_t eax, int32_t edx, int32_t esi, int32_t edi, int32_t ecx) { return dw(((qw(eax) * qw(edx)) + (qw(esi) * qw(edi))) >> by(ecx)); } -static inline int32_t boundmulscale(int32_t a, int32_t d, int32_t c) -{ // 
courtesy of Ken - int64_t p; - p = (((int64_t)a)*((int64_t)d))>>c; - if (p >= INT32_MAX) p = INT32_MAX; - if (p < INT32_MIN) p = INT32_MIN; - return((int32_t)p); -} - -#undef qw -#undef dw -#undef wo -#undef by -#undef _scaler - void qinterpolatedown16 (intptr_t bufptr, int32_t num, int32_t val, int32_t add); void qinterpolatedown16short (intptr_t bufptr, int32_t num, int32_t val, int32_t add); @@ -2302,6 +154,11 @@ void copybufreverse(const void *S, void *D, int32_t c); #endif +#undef qw +#undef dw +#undef wo +#undef by + static inline void swapbufreverse(void *s, void *d, int32_t c) { uint8_t *src = (uint8_t*)s, *dst = (uint8_t*)d; diff --git a/polymer/eduke32/build/include/pragmas_arm.h b/polymer/eduke32/build/include/pragmas_arm.h new file mode 100644 index 000000000..df276874a --- /dev/null +++ b/polymer/eduke32/build/include/pragmas_arm.h @@ -0,0 +1,69 @@ +// +// GCC Inline Assembler version (ARM) +// + +#ifndef __pragmas_arm_h__ +#define __pragmas_arm_h__ + +#define _scaler(a) \ +static inline int32_t mulscale##a(int32_t eax, int32_t edx) \ +{ \ + return dw((qw(eax) * qw(edx)) >> by(a)); \ +} \ +\ +static inline int32_t dmulscale##a(int32_t eax, int32_t edx, int32_t esi, int32_t edi) \ +{ \ + return dw(((qw(eax) * qw(edx)) + (qw(esi) * qw(edi))) >> by(a)); \ +} \ +\ +static inline int32_t tmulscale##a(int32_t eax, int32_t edx, int32_t ebx, int32_t ecx, int32_t esi, int32_t edi) \ +{ \ + return dw(((qw(eax) * qw(edx)) + (qw(ebx) * qw(ecx)) + (qw(esi) * qw(edi))) >> by(a)); \ +} \ + +PRAGMA_FUNCS _scaler(32) + +#undef _scaler + +static inline void swapchar(void* a, void* b) { char t = *((char*) b); *((char*) b) = *((char*) a); *((char*) a) = t; } +static inline void swapchar2(void* a, void* b, int32_t s) { swapchar(a, b); swapchar((char*) a+1, (char*) b+s); } +static inline void swapshort(void* a, void* b) { int16_t t = *((int16_t*) b); *((int16_t*) b) = *((int16_t*) a); *((int16_t*) a) = t; } +static inline void swaplong(void* a, void* b) { int32_t t = 
*((int32_t*) b); *((int32_t*) b) = *((int32_t*) a); *((int32_t*) a) = t; } +static inline void swap64bit(void* a, void* b) { int64_t t = *((int64_t*) b); *((int64_t*) b) = *((int64_t*) a); *((int64_t*) a) = t; } + +static inline char readpixel(void* s) { return (*((char*) (s))); } +static inline void drawpixel(void* s, char a) { *((char*) (s)) = a; } +static inline void drawpixels(void* s, int16_t a) { *((int16_t*) (s)) = a; } +static inline void drawpixelses(void* s, int32_t a) { *((int32_t*) (s)) = a; } + +static inline int32_t mul3(int32_t a) { return (a<<1)+a; } +static inline int32_t mul5(int32_t a) { return (a<<2)+a; } +static inline int32_t mul9(int32_t a) { return (a<<3)+a; } + +static inline int32_t divmod(int32_t a, int32_t b) { uint32_t _a=(uint32_t) a, _b=(uint32_t) b; dmval = _a%_b; return _a/_b; } +static inline int32_t moddiv(int32_t a, int32_t b) { uint32_t _a=(uint32_t) a, _b=(uint32_t) b; dmval = _a/_b; return _a%_b; } + +static inline int32_t klabs(int32_t a) { if (a < 0) return -a; return a; } +static inline int32_t ksgn(int32_t a) { if (a > 0) return 1; if (a < 0) return -1; return 0; } + +static inline int32_t umin(int32_t a, int32_t b) { if ((uint32_t) a < (uint32_t) b) return a; return b; } +static inline int32_t umax(int32_t a, int32_t b) { if ((uint32_t) a < (uint32_t) b) return b; return a; } +static inline int32_t kmin(int32_t a, int32_t b) { if ((int32_t) a < (int32_t) b) return a; return b; } +static inline int32_t kmax(int32_t a, int32_t b) { if ((int32_t) a < (int32_t) b) return b; return a; } + +static inline int32_t sqr(int32_t eax) { return (eax) * (eax); } +static inline int32_t scale(int32_t eax, int32_t edx, int32_t ecx) { return dw((qw(eax) * qw(edx)) / qw(ecx)); } +static inline int32_t mulscale(int32_t eax, int32_t edx, int32_t ecx) { return dw((qw(eax) * qw(edx)) >> by(ecx)); } +static inline int32_t dmulscale(int32_t eax, int32_t edx, int32_t esi, int32_t edi, int32_t ecx) { return dw(((qw(eax) * qw(edx)) + (qw(esi) * 
qw(edi))) >> by(ecx)); } + +void qinterpolatedown16(intptr_t bufptr, int32_t num, int32_t val, int32_t add); +void qinterpolatedown16short(intptr_t bufptr, int32_t num, int32_t val, int32_t add); + +void clearbuf(void* d, int32_t c, int32_t a); +void copybuf(const void* s, void* d, int32_t c); +void swapbuf4(void* a, void* b, int32_t c); + +void clearbufbyte(void *D, int32_t c, int32_t a); +void copybufbyte(const void *S, void *D, int32_t c); +void copybufreverse(const void *S, void *D, int32_t c); +#endif diff --git a/polymer/eduke32/build/include/pragmas_ppc.h b/polymer/eduke32/build/include/pragmas_ppc.h new file mode 100644 index 000000000..5458489c4 --- /dev/null +++ b/polymer/eduke32/build/include/pragmas_ppc.h @@ -0,0 +1,338 @@ +// GCC Inline Assembler version (PowerPC) + +#ifdef __pragmas_h__ +#ifndef __pragmas_ppc_h__ +#define __pragmas_ppc_h__ + +#define sqr(a) ((a)*(a)) + +int32_t scale(int32_t a, int32_t d, int32_t c); + +static inline int32_t mulscale(int32_t a, int32_t d, int32_t c) +{ + int32_t mullo, mulhi; + __asm__( + " mullw %0, %2, %3\n" + " mulhw %1, %2, %3\n" + " srw %0, %0, %4\n" + " slw %1, %1, %5\n" + " or %0, %0, %1\n" + : "=&r"(mullo), "=&r"(mulhi) + : "r"(a), "r"(d), "r"(c), "r"(32-c) + : "xer" + ); + return mullo; +} + +#define _scaler(x) \ +static inline int32_t mulscale##x(int32_t a, int32_t d) \ +{ \ + int32_t mullo, mulhi; \ + __asm__ ( \ + " mullw %0, %2, %3\n" \ + " mulhw %1, %2, %3\n" \ + " srwi %0, %0, %4\n" \ + " insrwi %0, %1, %4, 0\n" \ + : "=&r"(mullo), "=r"(mulhi) \ + : "r"(a), "r"(d), "i"(x) \ + ); \ + return mullo; \ +} + +PRAGMA_FUNCS +#undef _scaler + +static inline int32_t mulscale32(int32_t a, int32_t d) +{ + int32_t mulhi; + __asm__( + " mulhw %0, %1, %2\n" + : "=r"(mulhi) + : "r"(a), "r"(d) + ); + return mulhi; +} + +static inline int32_t dmulscale(int32_t a, int32_t d, int32_t S, int32_t D, int32_t c) +{ + int32_t mulhi, mullo, sumhi, sumlo; + __asm__( + " mullw %0, %4, %5\n" + " mulhw %1, %4, %5\n" + " mullw %2, 
%6, %7\n" + " mulhw %3, %6, %7\n" + " addc %0, %0, %2\n" + " adde %1, %1, %3\n" + " srw %0, %0, %8\n" + " slw %1, %1, %9\n" + " or %0, %0, %1\n" + : "=&r"(sumlo), "=&r"(sumhi), "=&r"(mullo), "=&r"(mulhi) + : "r"(a), "r"(d), "r"(S), "r"(D), "r"(c), "r"(32-c) + : "xer" + ); + return sumlo; +} + +#define _scaler(x) \ +static inline int32_t dmulscale##x(int32_t a, int32_t d, int32_t S, int32_t D) \ +{ \ + int32_t mulhi, mullo, sumhi, sumlo; \ + __asm__ ( \ + " mullw %0, %4, %5\n" \ + " mulhw %1, %4, %5\n" \ + " mullw %2, %6, %7\n" \ + " mulhw %3, %6, %7\n" \ + " addc %0, %0, %2\n" \ + " adde %1, %1, %3\n" \ + " srwi %0, %0, %8\n" \ + " insrwi %0, %1, %8, 0\n" \ + : "=&r"(sumlo), "=&r"(sumhi), "=&r"(mullo), "=r"(mulhi) \ + : "r"(a), "r"(d), "r"(S), "r"(D), "i"(x) \ + : "xer" \ + ); \ + return sumlo; \ +} + +PRAGMA_FUNCS +#undef _scaler + +static inline int32_t dmulscale32(int32_t a, int32_t d, int32_t S, int32_t D) +{ + int32_t mulhi, mullo, sumhi, sumlo; + __asm__(\ + " mullw %0, %4, %5\n" \ + " mulhw %1, %4, %5\n" \ + " mullw %2, %6, %7\n" \ + " mulhw %3, %6, %7\n" \ + " addc %0, %0, %2\n" \ + " adde %1, %1, %3\n" \ + : "=&r"(sumlo), "=&r"(sumhi), "=&r"(mullo), "=r"(mulhi) + : "r"(a), "r"(d), "r"(S), "r"(D) + : "xer" + ); + return sumhi; +} + +// tmulscale only seems to be used in one place... 
+static inline int32_t tmulscale11(int32_t a, int32_t d, int32_t b, int32_t c, int32_t S, int32_t D) +{ + int32_t mulhi, mullo, sumhi, sumlo; + __asm__( + " mullw %0, %4, %5\n" \ + " mulhw %1, %4, %5\n" \ + " mullw %2, %6, %7\n" \ + " mulhw %3, %6, %7\n" \ + " addc %0, %0, %2\n" \ + " adde %1, %1, %3\n" \ + " mullw %2, %8, %9\n" \ + " mulhw %3, %8, %9\n" \ + " addc %0, %0, %2\n" \ + " adde %1, %1, %3\n" \ + " srwi %0, %0, 11\n" \ + " insrwi %0, %1, 11, 0\n" \ + : "=&r"(sumlo), "=&r"(sumhi), "=&r"(mullo), "=&r"(mulhi) + : "r"(a), "r"(d), "r"(b), "r"(c), "r"(S), "r"(D) + : "xer" + ); + return sumlo; +} + +static inline char readpixel(void *d) +{ + return *(char*) d; +} + +static inline void drawpixel(void *d, char a) +{ + *(char*) d = a; +} + +static inline void drawpixels(void *d, int16_t a) +{ + __asm__( + " sthbrx %0, 0, %1\n" + : + : "r"(&a), "r"(d) + : "memory" + ); +} + +static inline void drawpixelses(void *d, int32_t a) +{ + __asm__( + " stwbrx %0, 0, %1\n" + : + : "r"(&a), "r"(d) + : "memory" + ); +} + +void clearbufbyte(void *d, int32_t c, int32_t a); + +static inline void clearbuf(void *d, int32_t c, int32_t a) +{ + int32_t *p = (int32_t*) d; + if (a==0) { + clearbufbyte(d, c<<2, 0); + return; + } + while (c--) { + *p++ = a; + } +} + +static inline void copybuf(void *s, void *d, int32_t c) +{ + int32_t *p = (int32_t*) s, *q = (int32_t*) d; + while (c--) { + *q++ = *p++; + } +} + +static inline void copybufbyte(void *s, void *d, int32_t c) +{ + uint8_t *src = (uint8_t*) s, *dst = (uint8_t*) d; + while (c--) { + *dst++ = *src++; + } +} + +static inline void copybufreverse(void *s, void *d, int32_t c) +{ + uint8_t *src = (uint8_t*) s, *dst = (uint8_t*) d; + while (c--) { + *dst++ = *src--; + } +} + +static inline void qinterpolatedown16(intptr_t bufptr, int32_t num, int32_t val, int32_t add) +{ + int i; + int32_t *lptr = (int32_t *) bufptr; + for (i=0; i>16); + val += add; + } +} + +static inline void qinterpolatedown16short(intptr_t bufptr, int32_t num, 
int32_t val, int32_t add) +{ + int i; + int16_t *sptr = (int16_t *) bufptr; + for (i=0; i>16); + val += add; + } +} + +static inline int32_t mul3(int32_t a) +{ + return (a<<1)+a; +} + +static inline int32_t mul5(int32_t a) +{ + return (a<<2)+a; +} + +static inline int32_t mul9(int32_t a) +{ + return (a<<3)+a; +} + +static inline int32_t klabs(int32_t a) +{ + int32_t mask; + __asm__( + " srawi %0, %1, 31\n" + " xor %1, %0, %1\n" + " subf %1, %0, %1\n" + : "=&r"(mask), "+r"(a) + : + : "xer" + ); + return a; +} + +static inline int32_t ksgn(int32_t a) +{ + int32_t s, t; + __asm__( + " neg %1, %2\n" + " srawi %0, %2, 31\n" + " srwi %1, %1, 31\n" + " or %1, %1, %0\n" + : "=r"(t), "=&r"(s) + : "r"(a) + : "xer" + ); + return s; +} + +static inline void swapchar(void *a, void *b) +{ + char t = *(char*) a; + *(char*) a = *(char*) b; + *(char*) b = t; +} + +static inline void swapchar2(void *a, void *b, int32_t s) +{ + swapchar(a, b); + swapchar((char*) a+1, (char*) b+s); +} + +static inline void swapshort(void *a, void *b) +{ + int16_t t = *(int16_t*) a; + *(int16_t*) a = *(int16_t*) b; + *(int16_t*) b = t; +} + +static inline void swaplong(void *a, void *b) +{ + int32_t t = *(int32_t*) a; + *(int32_t*) a = *(int32_t*) b; + *(int32_t*) b = t; +} + +static inline void swap64bit(void *a, void *b) +{ + double t = *(double*) a; + *(double*) a = *(double*) b; + *(double*) b = t; +} + +static inline int32_t divmod(int32_t a, int32_t b) +{ + int32_t div; + __asm__( + " divwu %0, %2, %3\n" + " mullw %1, %0, %3\n" + " subf %1, %1, %2\n" + : "=&r"(div), "=&r"(dmval) + : "r"(a), "r"(b) + ); + return div; +} + +static inline int32_t moddiv(int32_t a, int32_t b) +{ + int32_t mod; + __asm__( + " divwu %0, %2, %3\n" + " mullw %1, %0, %3\n" + " subf %1, %1, %2\n" + : "=&r"(dmval), "=&r"(mod) + : "r"(a), "r"(b) + ); + return mod; +} + +static inline int32_t umin(int32_t a, int32_t b) { if ((uint32_t) a < (uint32_t) b) return a; return b; } +static inline int32_t umax(int32_t a, int32_t b) { 
if ((uint32_t) a < (uint32_t) b) return b; return a; } +static inline int32_t kmin(int32_t a, int32_t b) { if ((int32_t) a < (int32_t) b) return a; return b; } +static inline int32_t kmax(int32_t a, int32_t b) { if ((int32_t) a < (int32_t) b) return b; return a; } + +#endif // __pragmas_ppc_h__ +#endif // __pragmas_h__ diff --git a/polymer/eduke32/build/include/pragmas_x86_gcc.h b/polymer/eduke32/build/include/pragmas_x86_gcc.h new file mode 100644 index 000000000..0cbb45c77 --- /dev/null +++ b/polymer/eduke32/build/include/pragmas_x86_gcc.h @@ -0,0 +1,1062 @@ +// +// GCC Inline Assembler version (x86) +// + +//{{{ + +#ifdef __pragmas_h__ +#ifndef __pragmas_x86_h__ +#define __pragmas_x86_h__ + +#ifndef UNDERSCORES +#define _DMVAL "dmval" +#else +#define _DMVAL "_dmval" +#endif + + +// maybe one day I'll make these into macros +void clearbufbyte(void *D, int32_t c, int32_t a); +void copybufbyte(const void *S, void *D, int32_t c); +void copybufreverse(const void *S, void *D, int32_t c); + + +#ifdef NO_GCC_BUILTINS +#define sqr(a) \ + ({ int32_t __a=(a); \ + __asm__ __volatile__ ("imull %0, %0" \ + : "=q" (__a) \ + : "0" (__a) \ + : "cc"); \ + __a; }) +#else +#define sqr(a) __builtin_sqr(a) +#endif + +#define scale(a,d,c) \ + ({ int32_t __a=(a), __d=(d), __c=(c); \ + __asm__ __volatile__ ("imull %%edx; idivl %%ecx" \ + : "=a" (__a), "=d" (__d) \ + : "0" (__a), "1" (__d), "c" (__c) : "cc"); \ + __a; }) + +#define mulscale(a,d,c) \ + ({ int32_t __a=(a), __d=(d), __c=(c); \ + __asm__ __volatile__ ("imull %%edx; shrdl %%cl, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d) \ + : "a" (__a), "d" (__d), "c" (__c) : "cc"); \ + __a; }) +#define mulscale1(a,d) \ + ({ int32_t __a=(a), __d=(d); \ + __asm__ __volatile__ ("imull %%edx; shrdl $1, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d) \ + : "a" (__a), "d" (__d) : "cc"); \ + __a; }) +#define mulscale2(a,d) \ + ({ int32_t __a=(a), __d=(d); \ + __asm__ __volatile__ ("imull %%edx; shrdl $2, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d) \ + : 
"a" (__a), "d" (__d) : "cc"); \ + __a; }) +#define mulscale3(a,d) \ + ({ int32_t __a=(a), __d=(d); \ + __asm__ __volatile__ ("imull %%edx; shrdl $3, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d) \ + : "a" (__a), "d" (__d) : "cc"); \ + __a; }) +#define mulscale4(a,d) \ + ({ int32_t __a=(a), __d=(d); \ + __asm__ __volatile__ ("imull %%edx; shrdl $4, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d) \ + : "a" (__a), "d" (__d) : "cc"); \ + __a; }) +#define mulscale5(a,d) \ + ({ int32_t __a=(a), __d=(d); \ + __asm__ __volatile__ ("imull %%edx; shrdl $5, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d) \ + : "a" (__a), "d" (__d) : "cc"); \ + __a; }) +#define mulscale6(a,d) \ + ({ int32_t __a=(a), __d=(d); \ + __asm__ __volatile__ ("imull %%edx; shrdl $6, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d) \ + : "a" (__a), "d" (__d) : "cc"); \ + __a; }) +#define mulscale7(a,d) \ + ({ int32_t __a=(a), __d=(d); \ + __asm__ __volatile__ ("imull %%edx; shrdl $7, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d) \ + : "a" (__a), "d" (__d) : "cc"); \ + __a; }) +#define mulscale8(a,d) \ + ({ int32_t __a=(a), __d=(d); \ + __asm__ __volatile__ ("imull %%edx; shrdl $8, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d) \ + : "a" (__a), "d" (__d) : "cc"); \ + __a; }) +#define mulscale9(a,d) \ + ({ int32_t __a=(a), __d=(d); \ + __asm__ __volatile__ ("imull %%edx; shrdl $9, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d) \ + : "a" (__a), "d" (__d) : "cc"); \ + __a; }) +#define mulscale10(a,d) \ + ({ int32_t __a=(a), __d=(d); \ + __asm__ __volatile__ ("imull %%edx; shrdl $10, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d) \ + : "a" (__a), "d" (__d) : "cc"); \ + __a; }) +#define mulscale11(a,d) \ + ({ int32_t __a=(a), __d=(d); \ + __asm__ __volatile__ ("imull %%edx; shrdl $11, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d) \ + : "a" (__a), "d" (__d) : "cc"); \ + __a; }) +#define mulscale12(a,d) \ + ({ int32_t __a=(a), __d=(d); \ + __asm__ __volatile__ ("imull %%edx; shrdl $12, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d) \ + : "a" (__a), 
"d" (__d) : "cc"); \ + __a; }) +#define mulscale13(a,d) \ + ({ int32_t __a=(a), __d=(d); \ + __asm__ __volatile__ ("imull %%edx; shrdl $13, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d) \ + : "a" (__a), "d" (__d) : "cc"); \ + __a; }) +#define mulscale14(a,d) \ + ({ int32_t __a=(a), __d=(d); \ + __asm__ __volatile__ ("imull %%edx; shrdl $14, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d) \ + : "a" (__a), "d" (__d) : "cc"); \ + __a; }) +#define mulscale15(a,d) \ + ({ int32_t __a=(a), __d=(d); \ + __asm__ __volatile__ ("imull %%edx; shrdl $15, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d) \ + : "a" (__a), "d" (__d) : "cc"); \ + __a; }) +#define mulscale16(a,d) \ + ({ int32_t __a=(a), __d=(d); \ + __asm__ __volatile__ ("imull %%edx; shrdl $16, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d) \ + : "a" (__a), "d" (__d) : "cc"); \ + __a; }) +#define mulscale17(a,d) \ + ({ int32_t __a=(a), __d=(d); \ + __asm__ __volatile__ ("imull %%edx; shrdl $17, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d) \ + : "a" (__a), "d" (__d) : "cc"); \ + __a; }) +#define mulscale18(a,d) \ + ({ int32_t __a=(a), __d=(d); \ + __asm__ __volatile__ ("imull %%edx; shrdl $18, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d) \ + : "a" (__a), "d" (__d) : "cc"); \ + __a; }) +#define mulscale19(a,d) \ + ({ int32_t __a=(a), __d=(d); \ + __asm__ __volatile__ ("imull %%edx; shrdl $19, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d) \ + : "a" (__a), "d" (__d) : "cc"); \ + __a; }) +#define mulscale20(a,d) \ + ({ int32_t __a=(a), __d=(d); \ + __asm__ __volatile__ ("imull %%edx; shrdl $20, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d) \ + : "a" (__a), "d" (__d) : "cc"); \ + __a; }) +#define mulscale21(a,d) \ + ({ int32_t __a=(a), __d=(d); \ + __asm__ __volatile__ ("imull %%edx; shrdl $21, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d) \ + : "a" (__a), "d" (__d) : "cc"); \ + __a; }) +#define mulscale22(a,d) \ + ({ int32_t __a=(a), __d=(d); \ + __asm__ __volatile__ ("imull %%edx; shrdl $22, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d) \ + : "a" (__a), 
"d" (__d) : "cc"); \ + __a; }) +#define mulscale23(a,d) \ + ({ int32_t __a=(a), __d=(d); \ + __asm__ __volatile__ ("imull %%edx; shrdl $23, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d) \ + : "a" (__a), "d" (__d) : "cc"); \ + __a; }) +#define mulscale24(a,d) \ + ({ int32_t __a=(a), __d=(d); \ + __asm__ __volatile__ ("imull %%edx; shrdl $24, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d) \ + : "a" (__a), "d" (__d) : "cc"); \ + __a; }) +#define mulscale25(a,d) \ + ({ int32_t __a=(a), __d=(d); \ + __asm__ __volatile__ ("imull %%edx; shrdl $25, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d) \ + : "a" (__a), "d" (__d) : "cc"); \ + __a; }) +#define mulscale26(a,d) \ + ({ int32_t __a=(a), __d=(d); \ + __asm__ __volatile__ ("imull %%edx; shrdl $26, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d) \ + : "a" (__a), "d" (__d) : "cc"); \ + __a; }) +#define mulscale27(a,d) \ + ({ int32_t __a=(a), __d=(d); \ + __asm__ __volatile__ ("imull %%edx; shrdl $27, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d) \ + : "a" (__a), "d" (__d) : "cc"); \ + __a; }) +#define mulscale28(a,d) \ + ({ int32_t __a=(a), __d=(d); \ + __asm__ __volatile__ ("imull %%edx; shrdl $28, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d) \ + : "a" (__a), "d" (__d) : "cc"); \ + __a; }) +#define mulscale29(a,d) \ + ({ int32_t __a=(a), __d=(d); \ + __asm__ __volatile__ ("imull %%edx; shrdl $29, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d) \ + : "a" (__a), "d" (__d) : "cc"); \ + __a; }) +#define mulscale30(a,d) \ + ({ int32_t __a=(a), __d=(d); \ + __asm__ __volatile__ ("imull %%edx; shrdl $30, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d) \ + : "a" (__a), "d" (__d) : "cc"); \ + __a; }) +#define mulscale31(a,d) \ + ({ int32_t __a=(a), __d=(d); \ + __asm__ __volatile__ ("imull %%edx; shrdl $31, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d) \ + : "a" (__a), "d" (__d) : "cc"); \ + __a; }) +#define mulscale32(a,d) \ + ({ int32_t __a=(a), __d=(d); \ + __asm__ __volatile__ ("imull %%edx" \ + : "=a" (__a), "=d" (__d) \ + : "a" (__a), "d" (__d) : "cc"); \ + 
__d; }) + +#define dmulscale(a,d,S,D,c) \ + ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D), __c=(c); \ + __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl %%cl, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=S" (__S) \ + : "a" (__a), "d" (__d), "S" (__S), "D" (__D), "c" (__c) : "ebx", "cc"); \ + __a; }) +#define dmulscale1(a,d,S,D) \ + ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $1, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=S" (__S) \ + : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ + __a; }) +#define dmulscale2(a,d,S,D) \ + ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $2, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=S" (__S) \ + : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ + __a; }) +#define dmulscale3(a,d,S,D) \ + ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $3, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=S" (__S) \ + : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ + __a; }) +#define dmulscale4(a,d,S,D) \ + ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $4, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=S" (__S) \ + : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ + __a; }) +#define dmulscale5(a,d,S,D) \ + ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ 
+ __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $5, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=S" (__S) \ + : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ + __a; }) +#define dmulscale6(a,d,S,D) \ + ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $6, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=S" (__S) \ + : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ + __a; }) +#define dmulscale7(a,d,S,D) \ + ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $7, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=S" (__S) \ + : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ + __a; }) +#define dmulscale8(a,d,S,D) \ + ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $8, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=S" (__S) \ + : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ + __a; }) +#define dmulscale9(a,d,S,D) \ + ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $9, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=S" (__S) \ + : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ + __a; }) +#define dmulscale10(a,d,S,D) \ + ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ + "imull 
%%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $10, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=S" (__S) \ + : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ + __a; }) +#define dmulscale11(a,d,S,D) \ + ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $11, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=S" (__S) \ + : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ + __a; }) +#define dmulscale12(a,d,S,D) \ + ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $12, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=S" (__S) \ + : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ + __a; }) +#define dmulscale13(a,d,S,D) \ + ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $13, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=S" (__S) \ + : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ + __a; }) +#define dmulscale14(a,d,S,D) \ + ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $14, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=S" (__S) \ + : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ + __a; }) +#define dmulscale15(a,d,S,D) \ + ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $15, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=S" 
(__S) \ + : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ + __a; }) +#define dmulscale16(a,d,S,D) \ + ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $16, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=S" (__S) \ + : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ + __a; }) +#define dmulscale17(a,d,S,D) \ + ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $17, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=S" (__S) \ + : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ + __a; }) +#define dmulscale18(a,d,S,D) \ + ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $18, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=S" (__S) \ + : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ + __a; }) +#define dmulscale19(a,d,S,D) \ + ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $19, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=S" (__S) \ + : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ + __a; }) +#define dmulscale20(a,d,S,D) \ + ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $20, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=S" (__S) \ + : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ + __a; }) +#define 
dmulscale21(a,d,S,D) \ + ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $21, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=S" (__S) \ + : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ + __a; }) +#define dmulscale22(a,d,S,D) \ + ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $22, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=S" (__S) \ + : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ + __a; }) +#define dmulscale23(a,d,S,D) \ + ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $23, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=S" (__S) \ + : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ + __a; }) +#define dmulscale24(a,d,S,D) \ + ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $24, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=S" (__S) \ + : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ + __a; }) +#define dmulscale25(a,d,S,D) \ + ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $25, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=S" (__S) \ + : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ + __a; }) +#define dmulscale26(a,d,S,D) \ + ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull 
%%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $26, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=S" (__S) \ + : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ + __a; }) +#define dmulscale27(a,d,S,D) \ + ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $27, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=S" (__S) \ + : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ + __a; }) +#define dmulscale28(a,d,S,D) \ + ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $28, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=S" (__S) \ + : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ + __a; }) +#define dmulscale29(a,d,S,D) \ + ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $29, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=S" (__S) \ + : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ + __a; }) +#define dmulscale30(a,d,S,D) \ + ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx; shrdl $30, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=S" (__S) \ + : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ + __a; }) +#define dmulscale31(a,d,S,D) \ + ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ + "imull %%edi; addl %%ebx, %%eax; 
adcl %%esi, %%edx; shrdl $31, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=S" (__S) \ + : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ + __a; }) +#define dmulscale32(a,d,S,D) \ + ({ int32_t __a=(a), __d=(d), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; movl %%eax, %%ebx; movl %%esi, %%eax; movl %%edx, %%esi; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%esi, %%edx" \ + : "=a" (__a), "=d" (__d), "=S" (__S) \ + : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ + __d; }) + +#define tmulscale1(a,d,b,c,S,D) \ + ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ + "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $1, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ + : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ + __a; }) +#define tmulscale2(a,d,b,c,S,D) \ + ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ + "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $2, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ + : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ + __a; }) +#define tmulscale3(a,d,b,c,S,D) \ + ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ + "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $3, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ + : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ + __a; }) +#define 
tmulscale4(a,d,b,c,S,D) \ + ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ + "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $4, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ + : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ + __a; }) +#define tmulscale5(a,d,b,c,S,D) \ + ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ + "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $5, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ + : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ + __a; }) +#define tmulscale6(a,d,b,c,S,D) \ + ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ + "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $6, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ + : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ + __a; }) +#define tmulscale7(a,d,b,c,S,D) \ + ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ + "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $7, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ + : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ + __a; }) +#define tmulscale8(a,d,b,c,S,D) \ + ({ int32_t __a=(a), 
__d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ + "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $8, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ + : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ + __a; }) +#define tmulscale9(a,d,b,c,S,D) \ + ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ + "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $9, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ + : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ + __a; }) +#define tmulscale10(a,d,b,c,S,D) \ + ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ + "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $10, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ + : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ + __a; }) +#define tmulscale11(a,d,b,c,S,D) \ + ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ + "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $11, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ + : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ + __a; }) +#define tmulscale12(a,d,b,c,S,D) \ + ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ + 
__asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ + "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $12, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ + : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ + __a; }) +#define tmulscale13(a,d,b,c,S,D) \ + ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ + "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $13, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ + : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ + __a; }) +#define tmulscale14(a,d,b,c,S,D) \ + ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ + "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $14, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ + : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ + __a; }) +#define tmulscale15(a,d,b,c,S,D) \ + ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ + "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $15, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ + : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ + __a; }) +#define tmulscale16(a,d,b,c,S,D) \ + ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; xchgl 
%%ebx, %%eax; xchgl %%ecx, %%edx; " \ + "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $16, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ + : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ + __a; }) +#define tmulscale17(a,d,b,c,S,D) \ + ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ + "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $17, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ + : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ + __a; }) +#define tmulscale18(a,d,b,c,S,D) \ + ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ + "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $18, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ + : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ + __a; }) +#define tmulscale19(a,d,b,c,S,D) \ + ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ + "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $19, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ + : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ + __a; }) +#define tmulscale20(a,d,b,c,S,D) \ + ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ + "imull 
%%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $20, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ + : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ + __a; }) +#define tmulscale21(a,d,b,c,S,D) \ + ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ + "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $21, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ + : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ + __a; }) +#define tmulscale22(a,d,b,c,S,D) \ + ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ + "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $22, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ + : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ + __a; }) +#define tmulscale23(a,d,b,c,S,D) \ + ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ + "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $23, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ + : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ + __a; }) +#define tmulscale24(a,d,b,c,S,D) \ + ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ + "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; 
movl %%esi, %%eax; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $24, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ + : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ + __a; }) +#define tmulscale25(a,d,b,c,S,D) \ + ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ + "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $25, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ + : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ + __a; }) +#define tmulscale26(a,d,b,c,S,D) \ + ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ + "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $26, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ + : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ + __a; }) +#define tmulscale27(a,d,b,c,S,D) \ + ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ + "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $27, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ + : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ + __a; }) +#define tmulscale28(a,d,b,c,S,D) \ + ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ + "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ + "imull %%edi; addl 
%%ebx, %%eax; adcl %%ecx, %%edx; shrdl $28, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ + : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ + __a; }) +#define tmulscale29(a,d,b,c,S,D) \ + ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ + "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $29, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ + : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ + __a; }) +#define tmulscale30(a,d,b,c,S,D) \ + ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ + "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $30, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ + : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ + __a; }) +#define tmulscale31(a,d,b,c,S,D) \ + ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ + "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx; shrdl $31, %%edx, %%eax" \ + : "=a" (__a), "=d" (__d), "=b" (__b), "=c" (__c) \ + : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ + __a; }) +#define tmulscale32(a,d,b,c,S,D) \ + ({ int32_t __a=(a), __d=(d), __b=(b), __c=(c), __S=(S), __D=(D); \ + __asm__ __volatile__ ("imull %%edx; xchgl %%ebx, %%eax; xchgl %%ecx, %%edx; " \ + "imull %%edx; addl %%eax, %%ebx; adcl %%edx, %%ecx; movl %%esi, %%eax; " \ + "imull %%edi; addl %%ebx, %%eax; adcl %%ecx, %%edx" \ + : "=a" 
(__a), "=d" (__d), "=b" (__b), "=c" (__c) \ + : "a" (__a), "d" (__d), "b" (__b), "c" (__c), "S" (__S), "D" (__D) : "cc"); \ + __d; }) + +#ifdef USE_ASM_DIVSCALE +#define divscale(a,b,c) /* (a << c) / b: eax receives a << c, edx receives a shifted right arithmetically by the negated count, and idivl divides that edx:eax pair by b; evaluates to the quotient. NOTE(review): negb rewrites cl although c is bound as an input-only operand, and for c == 0 the sarl count is also 0 so edx keeps a instead of its sign extension -- confirm both against callers. */ \ + ({ int32_t __a=(a), __b=(b), __c=(c); \ + __asm__ __volatile__ ("movl %%eax, %%edx; shll %%cl, %%eax; negb %%cl; sarl %%cl, %%edx; idivl %%ebx" \ + : "=a" (__a) : "a" (__a), "c" (__c), "b" (__b) : "edx", "cc"); \ + __a; }) +#define divscale1(a,b) /* (a << 1) / b: addl doubles eax and its carry-out (the old sign bit of a) is spread over edx by sbbl to form the high dword */ \ + ({ int32_t __a=(a), __b=(b); \ + __asm__ __volatile__ ("addl %%eax, %%eax; sbbl %%edx, %%edx; idivl %%ebx" \ + : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ + __a; }) +#define divscale2(a,b) /* (a << 2) / b: high dword is a >> 30 (arithmetic), low dword is a * 4; divscale3..divscale7 below use the same high/low split with their own shift counts */ \ + ({ int32_t __a=(a), __b=(b); \ + __asm__ __volatile__ ("movl %%eax, %%edx; sarl $30, %%edx; leal (,%%eax,4), %%eax; idivl %%ebx" \ + : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ + __a; }) +#define divscale3(a,b) \ + ({ int32_t __a=(a), __b=(b); \ + __asm__ __volatile__ ("movl %%eax, %%edx; sarl $29, %%edx; leal (,%%eax,8), %%eax; idivl %%ebx" \ + : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ + __a; }) +#define divscale4(a,b) \ + ({ int32_t __a=(a), __b=(b); \ + __asm__ __volatile__ ("movl %%eax, %%edx; sarl $28, %%edx; shll $4, %%eax; idivl %%ebx" \ + : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ + __a; }) +#define divscale5(a,b) \ + ({ int32_t __a=(a), __b=(b); \ + __asm__ __volatile__ ("movl %%eax, %%edx; sarl $27, %%edx; shll $5, %%eax; idivl %%ebx" \ + : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ + __a; }) +#define divscale6(a,b) \ + ({ int32_t __a=(a), __b=(b); \ + __asm__ __volatile__ ("movl %%eax, %%edx; sarl $26, %%edx; shll $6, %%eax; idivl %%ebx" \ + : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ + __a; }) +#define divscale7(a,b) \ + ({ int32_t __a=(a), __b=(b); \ + __asm__ __volatile__ ("movl %%eax, %%edx; sarl $25, %%edx; shll $7, %%eax; idivl %%ebx" \ + : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ + __a; }) +#define divscale8(a,b) \ + ({ int32_t __a=(a), __b=(b); \ +
__asm__ __volatile__ ("movl %%eax, %%edx; sarl $24, %%edx; shll $8, %%eax; idivl %%ebx" \ + : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ + __a; }) +#define divscale9(a,b) \ + ({ int32_t __a=(a), __b=(b); \ + __asm__ __volatile__ ("movl %%eax, %%edx; sarl $23, %%edx; shll $9, %%eax; idivl %%ebx" \ + : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ + __a; }) +#define divscale10(a,b) \ + ({ int32_t __a=(a), __b=(b); \ + __asm__ __volatile__ ("movl %%eax, %%edx; sarl $22, %%edx; shll $10, %%eax; idivl %%ebx" \ + : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ + __a; }) +#define divscale11(a,b) \ + ({ int32_t __a=(a), __b=(b); \ + __asm__ __volatile__ ("movl %%eax, %%edx; sarl $21, %%edx; shll $11, %%eax; idivl %%ebx" \ + : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ + __a; }) +#define divscale12(a,b) \ + ({ int32_t __a=(a), __b=(b); \ + __asm__ __volatile__ ("movl %%eax, %%edx; sarl $20, %%edx; shll $12, %%eax; idivl %%ebx" \ + : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ + __a; }) +#define divscale13(a,b) \ + ({ int32_t __a=(a), __b=(b); \ + __asm__ __volatile__ ("movl %%eax, %%edx; sarl $19, %%edx; shll $13, %%eax; idivl %%ebx" \ + : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ + __a; }) +#define divscale14(a,b) \ + ({ int32_t __a=(a), __b=(b); \ + __asm__ __volatile__ ("movl %%eax, %%edx; sarl $18, %%edx; shll $14, %%eax; idivl %%ebx" \ + : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ + __a; }) +#define divscale15(a,b) \ + ({ int32_t __a=(a), __b=(b); \ + __asm__ __volatile__ ("movl %%eax, %%edx; sarl $17, %%edx; shll $15, %%eax; idivl %%ebx" \ + : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ + __a; }) +#define divscale16(a,b) \ + ({ int32_t __a=(a), __b=(b); \ + __asm__ __volatile__ ("movl %%eax, %%edx; sarl $16, %%edx; shll $16, %%eax; idivl %%ebx" \ + : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ + __a; }) +#define divscale17(a,b) \ + ({ int32_t __a=(a), __b=(b); \ + __asm__ __volatile__ ("movl 
%%eax, %%edx; sarl $15, %%edx; shll $17, %%eax; idivl %%ebx" \ + : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ + __a; }) +#define divscale18(a,b) \ + ({ int32_t __a=(a), __b=(b); \ + __asm__ __volatile__ ("movl %%eax, %%edx; sarl $14, %%edx; shll $18, %%eax; idivl %%ebx" \ + : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ + __a; }) +#define divscale19(a,b) \ + ({ int32_t __a=(a), __b=(b); \ + __asm__ __volatile__ ("movl %%eax, %%edx; sarl $13, %%edx; shll $19, %%eax; idivl %%ebx" \ + : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ + __a; }) +#define divscale20(a,b) \ + ({ int32_t __a=(a), __b=(b); \ + __asm__ __volatile__ ("movl %%eax, %%edx; sarl $12, %%edx; shll $20, %%eax; idivl %%ebx" \ + : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ + __a; }) +#define divscale21(a,b) \ + ({ int32_t __a=(a), __b=(b); \ + __asm__ __volatile__ ("movl %%eax, %%edx; sarl $11, %%edx; shll $21, %%eax; idivl %%ebx" \ + : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ + __a; }) +#define divscale22(a,b) \ + ({ int32_t __a=(a), __b=(b); \ + __asm__ __volatile__ ("movl %%eax, %%edx; sarl $10, %%edx; shll $22, %%eax; idivl %%ebx" \ + : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ + __a; }) +#define divscale23(a,b) \ + ({ int32_t __a=(a), __b=(b); \ + __asm__ __volatile__ ("movl %%eax, %%edx; sarl $9, %%edx; shll $23, %%eax; idivl %%ebx" \ + : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ + __a; }) +#define divscale24(a,b) \ + ({ int32_t __a=(a), __b=(b); \ + __asm__ __volatile__ ("movl %%eax, %%edx; sarl $8, %%edx; shll $24, %%eax; idivl %%ebx" \ + : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ + __a; }) +#define divscale25(a,b) \ + ({ int32_t __a=(a), __b=(b); \ + __asm__ __volatile__ ("movl %%eax, %%edx; sarl $7, %%edx; shll $25, %%eax; idivl %%ebx" \ + : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ + __a; }) +#define divscale26(a,b) \ + ({ int32_t __a=(a), __b=(b); \ + __asm__ __volatile__ ("movl %%eax, %%edx; sarl $6, %%edx; 
shll $26, %%eax; idivl %%ebx" \ + : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ + __a; }) +#define divscale27(a,b) \ + ({ int32_t __a=(a), __b=(b); \ + __asm__ __volatile__ ("movl %%eax, %%edx; sarl $5, %%edx; shll $27, %%eax; idivl %%ebx" \ + : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ + __a; }) +#define divscale28(a,b) \ + ({ int32_t __a=(a), __b=(b); \ + __asm__ __volatile__ ("movl %%eax, %%edx; sarl $4, %%edx; shll $28, %%eax; idivl %%ebx" \ + : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ + __a; }) +#define divscale29(a,b) \ + ({ int32_t __a=(a), __b=(b); \ + __asm__ __volatile__ ("movl %%eax, %%edx; sarl $3, %%edx; shll $29, %%eax; idivl %%ebx" \ + : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ + __a; }) +#define divscale30(a,b) \ + ({ int32_t __a=(a), __b=(b); \ + __asm__ __volatile__ ("movl %%eax, %%edx; sarl $2, %%edx; shll $30, %%eax; idivl %%ebx" \ + : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ + __a; }) +#define divscale31(a,b) \ + ({ int32_t __a=(a), __b=(b); \ + __asm__ __volatile__ ("movl %%eax, %%edx; sarl $1, %%edx; shll $31, %%eax; idivl %%ebx" \ + : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ + __a; }) +#define divscale32(d,b) \ + ({ int32_t __d=(d), __b=(b), __r; \ + __asm__ __volatile__ ("xorl %%eax, %%eax; idivl %%ebx" \ + : "=a" (__r), "=d" (__d) : "d" (__d), "b" (__b) : "cc"); \ + __r; }) +#endif // defined USE_ASM_DIVSCALE + +#define readpixel(D) \ + ({ void *__D=(D); int32_t __a; \ + __asm__ __volatile__ ("movb (%%edi), %%al" \ + : "=a" (__a): "D" (__D) : "cc"); \ + __a; }) +#define drawpixel(D,a) \ + ({ void *__D=(D); int32_t __a=(a); \ + __asm__ __volatile__ ("movb %%al, (%%edi)" \ + : : "D" (__D), "a" (__a) : "memory", "cc"); \ + 0; }) +#define drawpixels(D,a) \ + ({ void *__D=(D); int32_t __a=(a); \ + __asm__ __volatile__ ("movw %%ax, (%%edi)" \ + : : "D" (__D), "a" (__a) : "memory", "cc"); \ + 0; }) +#define drawpixelses(D,a) \ + ({ void *__D=(D); int32_t __a=(a); \ + __asm__ 
__volatile__ ("movl %%eax, (%%edi)" \ + : : "D" (__D), "a" (__a) : "memory", "cc"); \ + 0; }) +#define clearbuf(D,c,a) /* fills c dwords starting at D with the 32-bit value a (rep stosl); evaluates to 0 */ \ + ({ void *__D=(D); int32_t __c=(c), __a=(a); \ + __asm__ __volatile__ ("rep; stosl" \ + : "=&D" (__D), "=&c" (__c) : "0" (__D), "1" (__c), "a" (__a) : "memory", "cc"); \ + 0; }) +#define copybuf(S,D,c) /* copies c dwords from S to D (rep movsl); evaluates to 0 */ \ + ({ const void *__S=(S), *__D=(D); int32_t __c=(c); \ + __asm__ __volatile__ ("rep; movsl" \ + : "=&S" (__S), "=&D" (__D), "=&c" (__c) : "0" (__S), "1" (__D), "2" (__c) : "memory", "cc"); \ + 0; }) + +#define mul3(a) /* a + a*2, computed with a single lea */ \ + ({ int32_t __a=(a), __r; \ + __asm__ __volatile__ ("lea (%1,%1,2), %0" \ + : "=r" (__r) : "0" (__a) : "cc"); \ + __r; }) +#define mul5(a) /* a + a*4, computed with a single lea */ \ + ({ int32_t __a=(a), __r; \ + __asm__ __volatile__ ("lea (%1,%1,4), %0" \ + : "=r" (__r) : "0" (__a) : "cc"); \ + __r; }) +#define mul9(a) /* a + a*8, computed with a single lea */ \ + ({ int32_t __a=(a), __r; \ + __asm__ __volatile__ ("lea (%1,%1,8), %0" \ + : "=r" (__r) : "0" (__a) : "cc"); \ + __r; }) + +//returns eax/ebx, dmval = eax%ebx; (unsigned divl, edx is zeroed before the divide) +#define divmod(a,b) \ + ({ int32_t __a=(a), __b=(b); \ + __asm__ __volatile__ ("xorl %%edx, %%edx; divl %%ebx; movl %%edx, "_DMVAL \ + : "+a" (__a) : "b" (__b) : "edx", "memory", "cc"); \ + __a; }) +//returns eax%ebx, dmval = eax/ebx; NOTE(review): eax is bound as an input operand and also named in the clobber list; GCC documents that clobbers must not overlap operands, so confirm this still compiles +#define moddiv(a,b) \ + ({ int32_t __a=(a), __b=(b), __d; \ + __asm__ __volatile__ ("xorl %%edx, %%edx; divl %%ebx; movl %%eax, "_DMVAL \ + : "=d" (__d) : "a" (__a), "b" (__b) : "eax", "memory", "cc"); \ + __d; }) + +#define klabs(a) /* absolute value of a: negl runs only when testl reports a negative eax */ \ + ({ int32_t __a=(a); \ + __asm__ __volatile__ ("testl %%eax, %%eax; jns 0f; negl %%eax; 0:" \ + : "=a" (__a) : "a" (__a) : "cc"); \ + __a; }) +#define ksgn(b) /* sign of b as -1, 0 or 1, derived from the carry of b + b. NOTE(review): addl doubles ebx in place although b is bound as an input-only operand -- confirm */ \ + ({ int32_t __b=(b), __r; \ + __asm__ __volatile__ ("addl %%ebx, %%ebx; sbbl %%eax, %%eax; cmpl %%ebx, %%eax; adcb $0, %%al" \ + : "=a" (__r) : "b" (__b) : "cc"); \ + __r; }) + +#define umin(a,b) /* unsigned minimum of a and b, computed without a branch: the borrow of a - b becomes an all-ones mask in ecx */ \ + ({ int32_t __a=(a), __b=(b); \ + __asm__ __volatile__ ("subl %%ebx, %%eax; sbbl %%ecx, %%ecx; andl %%ecx, %%eax; addl %%ebx, %%eax" \ + : "=a" (__a) : "a" (__a), "b" (__b)
: "ecx", "cc"); \ + __a; }) +#define umax(a,b) /* unsigned maximum of a and b: the borrow mask of a - b is inverted by the xorl before it is applied */ \ + ({ int32_t __a=(a), __b=(b); \ + __asm__ __volatile__ ("subl %%ebx, %%eax; sbbl %%ecx, %%ecx; xorl $0xffffffff, %%ecx; andl %%ecx, %%eax; addl %%ebx, %%eax" \ + : "=a" (__a) : "a" (__a), "b" (__b) : "ecx", "cc"); \ + __a; }) + +#define kmin(a,b) /* signed minimum: jl keeps a when a < b, otherwise b is copied into eax */ \ + ({ int32_t __a=(a), __b=(b); \ + __asm__ __volatile__ ("cmpl %%ebx, %%eax; jl 0f; movl %%ebx, %%eax; 0:" \ + : "=a" (__a) : "a" (__a), "b" (__b) : "cc"); \ + __a; }) +#define kmax(a,b) /* signed maximum: jg keeps a when a > b, otherwise b is copied into eax */ \ + ({ int32_t __a=(a), __b=(b); \ + __asm__ __volatile__ ("cmpl %%ebx, %%eax; jg 0f; movl %%ebx, %%eax; 0:" \ + : "=a" (__a) : "a" (__a), "b" (__b) : "cc"); \ + __a; }) + +#define swapchar(a,b) /* exchanges the single bytes at a and b; evaluates to 0 */ \ + ({ void *__a=(a), *__b=(b); \ + __asm__ __volatile__ ("movb (%%eax), %%cl; movb (%%ebx), %%ch; movb %%cl, (%%ebx); movb %%ch, (%%eax)" \ + : : "a" (__a), "b" (__b) : "ecx", "memory", "cc"); \ + 0; }) +#define swapshort(a,b) /* exchanges the 16-bit words at a and b; evaluates to 0 */ \ + ({ void *__a=(a), *__b=(b); \ + __asm__ __volatile__ ("movw (%%eax), %%cx; movw (%%ebx), %%dx; movw %%cx, (%%ebx); movw %%dx, (%%eax)" \ + : : "a" (__a), "b" (__b) : "ecx", "edx", "memory", "cc"); \ + 0; }) +#define swaplong(a,b) /* exchanges the 32-bit dwords at a and b; evaluates to 0 */ \ + ({ void *__a=(a), *__b=(b); \ + __asm__ __volatile__ ("movl (%%eax), %%ecx; movl (%%ebx), %%edx; movl %%ecx, (%%ebx); movl %%edx, (%%eax)" \ + : : "a" (__a), "b" (__b) : "ecx", "edx", "memory", "cc"); \ + 0; }) +#define swapbuf4(a,b,c) /* exchanges c dwords between the buffers at a and b; evaluates to 0. NOTE(review): the count is only tested by decl/jnz after the first exchange, so c == 0 is not a no-op, and eax, ebx and ecx are modified although bound as input-only operands -- confirm */ \ + ({ void *__a=(a), *__b=(b); int32_t __c=(c); \ + __asm__ __volatile__ ("0: movl (%%eax), %%esi; movl (%%ebx), %%edi; movl %%esi, (%%ebx); " \ + "movl %%edi, (%%eax); addl $4, %%eax; addl $4, %%ebx; decl %%ecx; jnz 0b" \ + : : "a" (__a), "b" (__b), "c" (__c) : "esi", "edi", "memory", "cc"); \ + 0; }) +#define swap64bit(a,b) /* exchanges the 8 bytes at a and b as two dword pairs; evaluates to 0 */ \ + ({ void *__a=(a), *__b=(b); \ + __asm__ __volatile__ ("movl (%%eax), %%ecx; movl (%%ebx), %%edx; movl %%ecx, (%%ebx); " \ + "movl 4(%%eax), %%ecx; movl %%edx, (%%eax); movl 4(%%ebx), %%edx; " \ + "movl %%ecx, 4(%%ebx); movl %%edx, 4(%%eax)" \ + : : "a" (__a), "b" (__b) : "ecx",
"edx", "memory", "cc"); \ + 0; }) + +//swapchar2(ptr1,ptr2,xsiz); is the same as: +//swapchar(ptr1,ptr2); swapchar(ptr1+1,ptr2+xsiz); +#define swapchar2(a,b,S) /* esi is turned into b + S by the first addl, which is why S is also listed as an output */ \ + ({ void *__a=(a), *__b=(b); int32_t __S=(S); \ + __asm__ __volatile__ ("addl %%ebx, %%esi; movw (%%eax), %%cx; movb (%%ebx), %%dl; " \ + "movb %%cl, (%%ebx); movb (%%esi), %%dh; movb %%ch, (%%esi); " \ + "movw %%dx, (%%eax)" \ + : "=S" (__S) : "a" (__a), "b" (__b), "S" (__S) : "ecx", "edx", "memory", "cc"); \ + 0; }) + + +#define qinterpolatedown16(a,c,d,S) /* stores c consecutive 32-bit values at a, each being d >> 16 (arithmetic) with d advanced by S per value; the loop writes two values per pass and label 1 handles an odd tail; evaluates to 0. NOTE(review): c == 0 takes the jz 1f path and still stores one value -- confirm callers never pass 0 */ \ + ({ void *__a=(void*)(a); int32_t __c=(c), __d=(d), __S=(S); \ + __asm__ __volatile__ ("movl %%ecx, %%ebx; shrl $1, %%ecx; jz 1f; " \ + "0: leal (%%edx,%%esi,), %%edi; sarl $16, %%edx; movl %%edx, (%%eax); " \ + "leal (%%edi,%%esi,), %%edx; sarl $16, %%edi; movl %%edi, 4(%%eax); " \ + "addl $8, %%eax; decl %%ecx; jnz 0b; testl $1, %%ebx; jz 2f; " \ + "1: sarl $16, %%edx; movl %%edx, (%%eax); 2:" \ + : "=a" (__a), "=c" (__c), "=d" (__d) : "a" (__a), "c" (__c), "d" (__d), "S" (__S) \ + : "ebx", "edi", "memory", "cc"); \ + 0; }) + +#define qinterpolatedown16short(a,c,d,S) /* 16-bit variant: returns at once when c is 0, writes one leading 16-bit value when bit 1 of the destination address is set, then combines two consecutive values into each 32-bit store (label 1), and finishes with a single 16-bit store for an odd remainder (label 2); evaluates to 0 */ \ + ({ void *__a=(void*)(a); int32_t __c=(c), __d=(d), __S=(S); \ + __asm__ __volatile__ ("testl %%ecx, %%ecx; jz 3f; testb $2, %%al; jz 0f; movl %%edx, %%ebx; " \ + "sarl $16, %%ebx; movw %%bx, (%%eax); addl %%esi, %%edx; addl $2, %%eax; " \ + "decl %%ecx; jz 3f; " \ + "0: subl $2, %%ecx; jc 2f; " \ + "1: movl %%edx, %%ebx; addl %%esi, %%edx; sarl $16, %%ebx; movl %%edx, %%edi; " \ + "andl $0xffff0000, %%edi; addl %%esi, %%edx; addl %%edi, %%ebx; " \ + "movl %%ebx, (%%eax); addl $4, %%eax; subl $2, %%ecx; jnc 1b; testb $1, %%cl; " \ + "jz 3f; " \ + "2: movl %%edx, %%ebx; sarl $16, %%ebx; movw %%bx, (%%eax); 3:" \ + : "=a" (__a), "=c" (__c), "=d" (__d) : "a" (__a), "c" (__c), "d" (__d), "S" (__S) \ + : "ebx", "edi", "memory", "cc"); \ + 0; }) + + +//}}} + +#endif // __pragmas_x86_h__ +#endif // __pragmas_h__ diff --git a/polymer/eduke32/build/include/pragmas_x86_msvc.h
b/polymer/eduke32/build/include/pragmas_x86_msvc.h new file mode 100644 index 000000000..43fd20a8c --- /dev/null +++ b/polymer/eduke32/build/include/pragmas_x86_msvc.h @@ -0,0 +1,690 @@ +// +// Microsoft C inline assembler +// + +//{{{ + +#ifdef __pragmas_h__ +#ifndef __pragmas_x86_h__ +#define __pragmas_x86_h__ + +static __inline int32_t sqr(int32_t a) +{ + _asm { + mov eax, a + imul eax, eax + } +} + +static __inline int32_t scale(int32_t a, int32_t d, int32_t c) +{ + _asm { + mov eax, a + imul d + idiv c + } +} + +static __inline int32_t mulscale(int32_t a, int32_t d, int32_t c) +{ + _asm { + mov ecx, c + mov eax, a + imul d + shrd eax, edx, cl + } +} + +#define MULSCALE(x) \ +static __inline int32_t mulscale##x (int32_t a, int32_t d) \ +{ \ + _asm mov eax, a \ + _asm imul d \ + _asm shrd eax, edx, x \ +} + +MULSCALE(1) MULSCALE(2) MULSCALE(3) MULSCALE(4) +MULSCALE(5) MULSCALE(6) MULSCALE(7) MULSCALE(8) +MULSCALE(9) MULSCALE(10) MULSCALE(11) MULSCALE(12) +MULSCALE(13) MULSCALE(14) MULSCALE(15) MULSCALE(16) +MULSCALE(17) MULSCALE(18) MULSCALE(19) MULSCALE(20) +MULSCALE(21) MULSCALE(22) MULSCALE(23) MULSCALE(24) +MULSCALE(25) MULSCALE(26) MULSCALE(27) MULSCALE(28) +MULSCALE(29) MULSCALE(30) MULSCALE(31) +#undef MULSCALE +static __inline int32_t mulscale32(int32_t a, int32_t d) +{ + _asm { + mov eax, a + imul d + mov eax, edx + } +} + +static __inline int32_t dmulscale(int32_t a, int32_t d, int32_t S, int32_t D, int32_t c) +{ + _asm { + mov ecx, c + mov eax, a + imul d + mov ebx, eax + mov eax, S + mov esi, edx + imul D + add eax, ebx + adc edx, esi + shrd eax, edx, cl + } +} + +#define DMULSCALE(x) \ +static __inline int32_t dmulscale##x (int32_t a, int32_t d, int32_t S, int32_t D) \ +{ \ + _asm mov eax, a \ + _asm imul d \ + _asm mov ebx, eax \ + _asm mov eax, S \ + _asm mov esi, edx \ + _asm imul D \ + _asm add eax, ebx \ + _asm adc edx, esi \ + _asm shrd eax, edx, x \ +} + +DMULSCALE(1) DMULSCALE(2) DMULSCALE(3) DMULSCALE(4) +DMULSCALE(5) DMULSCALE(6) 
DMULSCALE(7) DMULSCALE(8) +DMULSCALE(9) DMULSCALE(10) DMULSCALE(11) DMULSCALE(12) +DMULSCALE(13) DMULSCALE(14) DMULSCALE(15) DMULSCALE(16) +DMULSCALE(17) DMULSCALE(18) DMULSCALE(19) DMULSCALE(20) +DMULSCALE(21) DMULSCALE(22) DMULSCALE(23) DMULSCALE(24) +DMULSCALE(25) DMULSCALE(26) DMULSCALE(27) DMULSCALE(28) +DMULSCALE(29) DMULSCALE(30) DMULSCALE(31) +#undef DMULSCALE +static __inline int32_t dmulscale32(int32_t a, int32_t d, int32_t S, int32_t D) +{ + _asm { + mov eax, a + imul d + mov ebx, eax + mov eax, S + mov esi, edx + imul D + add eax, ebx + adc edx, esi + mov eax, edx + } +} + +#define TMULSCALE(x) \ +static __inline int32_t tmulscale##x (int32_t a, int32_t d, int32_t b, int32_t c, int32_t S, int32_t D) \ +{ \ + _asm mov eax, a \ + _asm mov ebx, b \ + _asm imul d \ + _asm xchg eax, ebx \ + _asm mov ecx, c \ + _asm xchg edx, ecx \ + _asm imul edx \ + _asm add ebx, eax \ + _asm adc ecx, edx \ + _asm mov eax, S \ + _asm imul D \ + _asm add eax, ebx \ + _asm adc edx, ecx \ + _asm shrd eax, edx, x \ +} + +TMULSCALE(1) TMULSCALE(2) TMULSCALE(3) TMULSCALE(4) +TMULSCALE(5) TMULSCALE(6) TMULSCALE(7) TMULSCALE(8) +TMULSCALE(9) TMULSCALE(10) TMULSCALE(11) TMULSCALE(12) +TMULSCALE(13) TMULSCALE(14) TMULSCALE(15) TMULSCALE(16) +TMULSCALE(17) TMULSCALE(18) TMULSCALE(19) TMULSCALE(20) +TMULSCALE(21) TMULSCALE(22) TMULSCALE(23) TMULSCALE(24) +TMULSCALE(25) TMULSCALE(26) TMULSCALE(27) TMULSCALE(28) +TMULSCALE(29) TMULSCALE(30) TMULSCALE(31) +#undef TMULSCALE +static __inline int32_t tmulscale32(int32_t a, int32_t d, int32_t b, int32_t c, int32_t S, int32_t D) +{ + _asm { + mov eax, a + mov ebx, b + imul d + xchg eax, ebx + mov ecx, c + xchg edx, ecx + imul edx + add ebx, eax + adc ecx, edx + mov eax, S + imul D + add eax, ebx + adc edx, ecx + mov eax, edx + } +} + +#ifdef USE_ASM_DIVSCALE +static __inline int32_t divscale(int32_t a, int32_t b, int32_t c) +{ + _asm { + mov eax, a + mov ecx, c + mov edx, eax + shl eax, cl + neg cl + sar edx, cl + idiv b + } +} + +static 
__inline int32_t divscale1(int32_t a, int32_t b) +{ + _asm { + mov eax, a + add eax, eax + sbb edx, edx + idiv b + } +} + +static __inline int32_t divscale2(int32_t a, int32_t b) +{ + _asm { + mov eax, a + mov edx, eax + sar edx, 30 + lea eax, [eax*4] + idiv b + } +} + +static __inline int32_t divscale3(int32_t a, int32_t b) +{ + _asm { + mov eax, a + mov edx, eax + sar edx, 29 + lea eax, [eax*8] + idiv b + } +} + +#define DIVSCALE(x,y) \ +static __inline int32_t divscale##y(int32_t a, int32_t b) \ +{ \ + _asm mov eax, a \ + _asm mov edx, eax \ + _asm sar edx, x \ + _asm shl eax, y \ + _asm idiv b \ +} + +DIVSCALE(28, 4) DIVSCALE(27, 5) DIVSCALE(26, 6) DIVSCALE(25, 7) +DIVSCALE(24, 8) DIVSCALE(23, 9) DIVSCALE(22, 10) DIVSCALE(21, 11) +DIVSCALE(20, 12) DIVSCALE(19, 13) DIVSCALE(18, 14) DIVSCALE(17, 15) +DIVSCALE(16, 16) DIVSCALE(15, 17) DIVSCALE(14, 18) DIVSCALE(13, 19) +DIVSCALE(12, 20) DIVSCALE(11, 21) DIVSCALE(10, 22) DIVSCALE(9, 23) +DIVSCALE(8, 24) DIVSCALE(7, 25) DIVSCALE(6, 26) DIVSCALE(5, 27) +DIVSCALE(4, 28) DIVSCALE(3, 29) DIVSCALE(2, 30) DIVSCALE(1, 31) + +static __inline int32_t divscale32(int32_t d, int32_t b) +{ + _asm { + mov edx, d + xor eax, eax + idiv b + } +} +#endif // defined USE_ASM_DIVSCALE + +static __inline char readpixel(void *d) +{ + _asm { + mov edx, d + mov al, byte ptr[edx] + } +} + +static __inline void drawpixel(void *d, char a) +{ + _asm { + mov edx, d + mov al, a + mov byte ptr[edx], al + } +} + +static __inline void drawpixels(void *d, int16_t a) +{ + _asm { + mov edx, d + mov ax, a + mov word ptr[edx], ax + } +} + +static __inline void drawpixelses(void *d, int32_t a) +{ + _asm { + mov edx, d + mov eax, a + mov dword ptr[edx], eax + } +} + +static __inline void clearbuf(void *d, int32_t c, int32_t a) +{ + _asm { + mov edi, d + mov ecx, c + mov eax, a + rep stosd + } +} + +static __inline void clearbufbyte(void *d, int32_t c, int32_t a) +{ + _asm { + mov edi, d + mov ecx, c + mov eax, a + cmp ecx, 4 + jae longcopy + test cl, 1 + jz 
preskip + stosb + preskip : + shr ecx, 1 + rep stosw + jmp endit + longcopy : + test edi, 1 + jz skip1 + stosb + dec ecx + skip1 : + test edi, 2 + jz skip2 + stosw + sub ecx, 2 + skip2 : + mov ebx, ecx + shr ecx, 2 + rep stosd + test bl, 2 + jz skip3 + stosw + skip3 : + test bl, 1 + jz endit + stosb + endit : + } +} + +static __inline void copybuf(const void *s, void *d, int32_t c) +{ + _asm { + mov esi, s + mov edi, d + mov ecx, c + rep movsd + } +} + +static __inline void copybufbyte(const void *s, void *d, int32_t c) +{ + _asm { + mov esi, s + mov edi, d + mov ecx, c + cmp ecx, 4 + jae longcopy + test cl, 1 + jz preskip + movsb + preskip : + shr ecx, 1 + rep movsw + jmp endit + longcopy : + test edi, 1 + jz skip1 + movsb + dec ecx + skip1 : + test edi, 2 + jz skip2 + movsw + sub ecx, 2 + skip2 : + mov ebx, ecx + shr ecx, 2 + rep movsd + test bl, 2 + jz skip3 + movsw + skip3 : + test bl, 1 + jz endit + movsb + endit : + } +} + +static __inline void copybufreverse(const void *s, void *d, int32_t c) +{ + _asm { + mov esi, s + mov edi, d + mov ecx, c + shr ecx, 1 + jnc skipit1 + mov al, byte ptr[esi] + dec esi + mov byte ptr[edi], al + inc edi + skipit1 : + shr ecx, 1 + jnc skipit2 + mov ax, word ptr[esi-1] + sub esi, 2 + ror ax, 8 + mov word ptr[edi], ax + add edi, 2 + skipit2: + test ecx, ecx + jz endloop + begloop : + mov eax, dword ptr[esi-3] + sub esi, 4 + bswap eax + mov dword ptr[edi], eax + add edi, 4 + dec ecx + jnz begloop + endloop : + } +} + +static __inline void qinterpolatedown16(int32_t a, int32_t c, int32_t d, int32_t s) +{ + _asm { + mov eax, a + mov ecx, c + mov edx, d + mov esi, s + mov ebx, ecx + shr ecx, 1 + jz skipbegcalc + begqcalc : + lea edi, [edx+esi] + sar edx, 16 + mov dword ptr[eax], edx + lea edx, [edi+esi] + sar edi, 16 + mov dword ptr[eax+4], edi + add eax, 8 + dec ecx + jnz begqcalc + test ebx, 1 + jz skipbegqcalc2 + skipbegcalc : + sar edx, 16 + mov dword ptr[eax], edx + skipbegqcalc2 : + } +} + +static __inline void 
qinterpolatedown16short(int32_t a, int32_t c, int32_t d, int32_t s) +{ + _asm { + mov eax, a + mov ecx, c + mov edx, d + mov esi, s + test ecx, ecx + jz endit + test al, 2 + jz skipalignit + mov ebx, edx + sar ebx, 16 + mov word ptr[eax], bx + add edx, esi + add eax, 2 + dec ecx + jz endit + skipalignit : + sub ecx, 2 + jc finishit + begqcalc : + mov ebx, edx + add edx, esi + sar ebx, 16 + mov edi, edx + and edi, 0ffff0000h + add edx, esi + add ebx, edi + mov dword ptr[eax], ebx + add eax, 4 + sub ecx, 2 + jnc begqcalc + test cl, 1 + jz endit + finishit : + mov ebx, edx + sar ebx, 16 + mov word ptr[eax], bx + endit : + } +} + +static __inline int32_t mul3(int32_t a) +{ + _asm { + mov eax, a + lea eax, [eax+eax*2] + } +} + +static __inline int32_t mul5(int32_t a) +{ + _asm { + mov eax, a + lea eax, [eax+eax*4] + } +} + +static __inline int32_t mul9(int32_t a) +{ + _asm { + mov eax, a + lea eax, [eax+eax*8] + } +} + +//returns eax/ebx, dmval = eax%edx; +static __inline int32_t divmod(int32_t a, int32_t b) +{ + _asm { + mov eax, a + xor edx, edx + div b + mov dmval, edx + } +} + +//returns eax%ebx, dmval = eax/edx; +static __inline int32_t moddiv(int32_t a, int32_t b) +{ + _asm { + mov eax, a + xor edx, edx + div b + mov dmval, eax + mov eax, edx + } +} + +static __inline int32_t klabs(int32_t a) +{ + _asm { + mov eax, a + test eax, eax + jns skipnegate + neg eax + skipnegate : + } +} + +static __inline int32_t ksgn(int32_t b) +{ + _asm { + mov ebx, b + add ebx, ebx + sbb eax, eax + cmp eax, ebx + adc al, 0 + } +} + +//eax = (unsigned min)umin(eax,ebx) +static __inline int32_t umin(int32_t a, int32_t b) +{ + _asm { + mov eax, a + sub eax, b + sbb ecx, ecx + and eax, ecx + add eax, b + } +} + +//eax = (unsigned max)umax(eax,ebx) +static __inline int32_t umax(int32_t a, int32_t b) +{ + _asm { + mov eax, a + sub eax, b + sbb ecx, ecx + xor ecx, 0xffffffff + and eax, ecx + add eax, b + } +} + +static __inline int32_t kmin(int32_t a, int32_t b) +{ + _asm { + mov eax, a + 
mov ebx, b + cmp eax, ebx + jl skipit + mov eax, ebx + skipit : + } +} + +static __inline int32_t kmax(int32_t a, int32_t b) +{ + _asm { + mov eax, a + mov ebx, b + cmp eax, ebx + jg skipit + mov eax, ebx + skipit : + } +} + +static __inline void swapchar(void *a, void *b) +{ + _asm { + mov eax, a + mov ebx, b + mov cl, [eax] + mov ch, [ebx] + mov[ebx], cl + mov[eax], ch + } +} + +static __inline void swapshort(void *a, void *b) +{ + _asm { + mov eax, a + mov ebx, b + mov cx, [eax] + mov dx, [ebx] + mov[ebx], cx + mov[eax], dx + } +} + +static __inline void swaplong(void *a, void *b) +{ + _asm { + mov eax, a + mov ebx, b + mov ecx, [eax] + mov edx, [ebx] + mov[ebx], ecx + mov[eax], edx + } +} + +static __inline void swapbuf4(void *a, void *b, int32_t c) +{ + _asm { + mov eax, a + mov ebx, b + mov ecx, c + begswap : + mov esi, [eax] + mov edi, [ebx] + mov[ebx], esi + mov[eax], edi + add eax, 4 + add ebx, 4 + dec ecx + jnz short begswap + } +} + +static __inline void swap64bit(void *a, void *b) +{ + _asm { + mov eax, a + mov ebx, b + mov ecx, [eax] + mov edx, [ebx] + mov[ebx], ecx + mov ecx, [eax+4] + mov[eax], edx + mov edx, [ebx+4] + mov[ebx+4], ecx + mov[eax+4], edx + } +} + +//swapchar2(ptr1,ptr2,xsiz); is the same as: +//swapchar(ptr1,ptr2); swapchar(ptr1+1,ptr2+xsiz); +static __inline void swapchar2(void *a, void *b, int32_t s) +{ + _asm { + mov eax, a + mov ebx, b + mov esi, s + add esi, ebx + mov cx, [eax] + mov dl, [ebx] + mov[ebx], cl + mov dh, [esi] + mov[esi], ch + mov[eax], dx + } +} +//}}} + +#endif // __pragmas_x86_h__ +#endif // __pragmas_h__ diff --git a/polymer/eduke32/build/src/pragmas.c b/polymer/eduke32/build/src/pragmas.c index 8336508c5..db26f7212 100644 --- a/polymer/eduke32/build/src/pragmas.c +++ b/polymer/eduke32/build/src/pragmas.c @@ -171,33 +171,6 @@ void clearbufbyte(void *d, int32_t c, int32_t a) #define ASM __asm__ __volatile__ - -int32_t boundmulscale(int32_t a, int32_t b, int32_t c) -{ - ASM( - "imull %%ebx\n\t" - "movl %%edx, 
%%ebx\n\t" // mov ebx, edx - "shrdl %%cl, %%edx, %%eax\n\t" // mov eax, edx, cl - "sarl %%cl, %%edx\n\t" // sar edx, cl - "xorl %%eax, %%edx\n\t" // xor edx, eax - "js 0f\n\t" // js checkit - "xorl %%eax, %%edx\n\t" // xor edx, eax - "jz 1f\n\t" // js skipboundit - "cmpl $0xffffffff, %%edx\n\t" // cmp edx, 0xffffffff - "je 1f\n\t" // je skipboundit - "0:\n\t" // checkit: - "movl %%ebx, %%eax\n\t" // mov eax, ebx - "sarl $31, %%eax\n\t" // sar eax, 31 - "xorl $0x7fffffff, %%eax\n\t" // xor eax, 0x7fffffff - "1:" // skipboundit: - : "+a"(a), "+b"(b), "+c"(c) // input eax ebx ecx - : - : "edx", "cc" - ); - return a; -} - - void clearbufbyte(void *D, int32_t c, int32_t a) { ASM(