Various additional optimizations: add CLASSIC_SLICE_BY_4 mode to unroll some of the loops in a-c, replace integer divisions by a divisor unknown at compile time with usage of libdivide, clean up pragmas further by removing more old stuff that wasn't used anywhere. This is another one of those nasty commits that make people cry. DONT_BUILD.

git-svn-id: https://svn.eduke32.com/eduke32@4658 1a8010ca-5511-0410-912e-c29ae57300e0
2024-12-24 18:50:47 +00:00 · 2014-10-25 03:29:21 +00:00 · 2014-10-25 03:29:21 +00:00 · 6c6b68d534
commit 6c6b68d534
parent 6f332e645d
37 changed files with 1903 additions and 914 deletions
--- a/polymer/eduke32/build/Makefile.deps
+++ b/polymer/eduke32/build/Makefile.deps
@ -5,7 +5,7 @@ $(ENGINE_OBJ)/a.$o: $(ENGINE_SRC)/a.$(asm)
 $(ENGINE_OBJ)/baselayer.$o: $(ENGINE_SRC)/baselayer.c $(ENGINE_INC)/compat.h $(ENGINE_INC)/baselayer.h $(ENGINE_INC)/build.h $(ENGINE_INC)/osd.h
 $(ENGINE_OBJ)/build.$o: $(ENGINE_SRC)/build.c $(ENGINE_INC)/build.h $(ENGINE_INC)/pragmas.h $(ENGINE_INC)/compat.h $(ENGINE_INC)/baselayer.h $(ENGINE_INC)/editor.h
 $(ENGINE_OBJ)/cache1d.$o: $(ENGINE_SRC)/cache1d.c $(ENGINE_INC)/compat.h $(ENGINE_INC)/cache1d.h $(ENGINE_INC)/pragmas.h $(ENGINE_INC)/baselayer.h
-$(ENGINE_OBJ)/compat.$o: $(ENGINE_SRC)/compat.c $(ENGINE_INC)/compat.h
+$(ENGINE_OBJ)/compat.$o: $(ENGINE_SRC)/compat.c $(ENGINE_INC)/compat.h $(ENGINE_INC)/libdivide.h
 $(ENGINE_OBJ)/config.$o: $(ENGINE_SRC)/config.c $(ENGINE_INC)/compat.h $(ENGINE_INC)/osd.h $(ENGINE_INC)/editor.h
 $(ENGINE_OBJ)/crc32.$o: $(ENGINE_SRC)/crc32.c $(ENGINE_INC)/crc32.h
 $(ENGINE_OBJ)/defs.$o: $(ENGINE_SRC)/defs.c $(ENGINE_INC)/build.h $(ENGINE_INC)/baselayer.h $(ENGINE_INC)/scriptfile.h $(ENGINE_INC)/compat.h
--- a/polymer/eduke32/build/include/a.h
+++ b/polymer/eduke32/build/include/a.h
@ -8,6 +8,8 @@

 #include "compat.h"

+#define CLASSIC_SLICE_BY_4
+
 /** Definitions of high-precision integer types. **/
 // Should be used for values that represent coordinates with which calculations
 // like dot product are carried out. Substituting 32-bit ints for these will
--- a/polymer/eduke32/build/include/compat.h
+++ b/polymer/eduke32/build/include/compat.h
@ -92,6 +92,8 @@

 #define WITHKPLIB

+#include "libdivide.h"
+
 // Define this to rewrite all 'B' versions to library functions. This
 // is for platforms which give us a standard sort of C library so we
 // link directly. Platforms like PalmOS which don't have a standard C
@ -146,31 +148,6 @@
 #if _MSC_VER < 1800
 # define inline __inline

-# ifndef _WIN64
-static inline float nearbyintf(float x) 
-{ 
-    uint32_t w1, w2;
-    __asm fnstcw w1
-    w2 = w1 | 0x00000020;
-    __asm
-    {
-        fldcw w2
-        fld x
-        frndint
-        fclex
-        fldcw w1
-    }
-}
-# else
-#include <math.h>
-static inline float nearbyintf(float x)
-{
-    if (x >= 0.0)
-        return floorf(x + 0.5);
-    else
-        return floorf(x - 0.5);
-}
-# endif
 #endif

 #include <math.h>
@ -178,31 +155,8 @@ static inline long lround(double num)
 {
    return (long) (num > 0 ? num + 0.5 : ceil(num - 0.5));
 }
-
-#if defined(_WIN64)
-#include <emmintrin.h>
-static inline int32_t Blrintf(const float x)
-{
-    __m128 xx = _mm_load_ss(&x);
-   return _mm_cvtss_si32(xx);
-}
-#else
-static inline int32_t Blrintf(const float x)
-{
-    int n;
-    __asm fld x;
-    __asm fistp n;
-    return n;
-}   
-#endif
 #else
 # define longlong(x) x##ll
-#define Blrintf lrintf
-#endif
-
-#if defined __OPENDINGUX__
-//ugly hack
-#define nearbyintf rintf
 #endif

 #if defined(__arm__)
@ -398,60 +352,23 @@ static inline uint16_t B_UNBUF16(const uint8_t *buf) { return (buf[1] << 8) | (b
 static inline uint32_t B_UNBUF32(const uint8_t *buf) { return (buf[3] << 24) | (buf[2] << 16) | (buf[1] << 8) | (buf[0]); }
 static inline uint64_t B_UNBUF64(const uint8_t *buf) { return ((uint64_t)buf[7] << 56) | ((uint64_t)buf[6] << 48) | ((uint64_t)buf[5] << 40) | ((uint64_t)buf[4] << 32) | (buf[3] << 24) | (buf[2] << 16) | (buf[1] << 8) | (buf[0]); }

-#if defined(USE_MSC_PRAGMAS)
-static inline void ftol(float f, int32_t *a)
+#if defined(BITNESS64)
+#include <emmintrin.h>
+static inline int32_t Blrintf(const float x)
 {
-    _asm
-    {
-        mov eax, a
-            fld f
-            fistp dword ptr [eax]
-    }
+    __m128 xx = _mm_load_ss(&x);
+    return _mm_cvtss_si32(xx);
 }
-
-static inline void dtol(double d, int32_t *a)
+#elif defined (_MSC_VER)
+static inline int32_t Blrintf(const float x)
 {
-    _asm
-    {
-        mov eax, a
-            fld d
-            fistp dword ptr [eax]
-    }
+    int n;
+    __asm fld x;
+    __asm fistp n;
+    return n;
 }
-#elif defined(USE_GCC_PRAGMAS)
-
-static inline void ftol(float f, int32_t *a)
-{
-    __asm__ __volatile__(
-#if 0 //(__GNUC__ >= 3)
-        "flds %1; fistpl %0;"
 #else
-        "flds %1; fistpl (%0);"
-#endif
-        : "=r"(a) : "m"(f) : "memory","cc");
-}
-
-static inline void dtol(double d, int32_t *a)
-{
-    __asm__ __volatile__(
-#if 0 //(__GNUC__ >= 3)
-        "fldl %1; fistpl %0;"
-#else
-        "fldl %1; fistpl (%0);"
-#endif
-        : "=r"(a) : "m"(d) : "memory","cc");
-}
-
-#else
-static inline void ftol(float f, int32_t *a)
-{
-    *a = (int32_t)f;
-}
-
-static inline void dtol(double d, int32_t *a)
-{
-    *a = (int32_t)d;
-}
+#define Blrintf lrintf
 #endif

 #if B_LITTLE_ENDIAN == 1
@ -792,7 +709,7 @@ char *Bgetsystemdrives(void);
 int32_t Bfilelength(int32_t fd);
 char *Bstrtoken(char *s, const char *delim, char **ptrptr, int32_t chop);
 char *Bstrtolower(char *str);
-int32_t Bwildmatch (const char *i, const char *j);
+#define Bwildmatch wildmatch

 #if !defined(_WIN32)
 char *Bstrlwr(char *);
--- a/polymer/eduke32/build/include/hightile.h
+++ b/polymer/eduke32/build/include/hightile.h
@ -46,16 +46,16 @@ static inline int have_basepal_tint(void)

 static inline void hictinting_apply(float *color, int32_t palnum)
 {
-    color[0] *= (float)hictinting[palnum].r / 255.f;
-    color[1] *= (float)hictinting[palnum].g / 255.f;
-    color[2] *= (float)hictinting[palnum].b / 255.f;
+    color[0] *= (float)hictinting[palnum].r * (1.f/255.f);
+    color[1] *= (float)hictinting[palnum].g * (1.f/255.f);
+    color[2] *= (float)hictinting[palnum].b * (1.f/255.f);
 }

 static inline void hictinting_apply_ub(uint8_t *color, int32_t palnum)
 {
-    color[0] = (uint8_t)(color[0] * (float)hictinting[palnum].r / 255.f);
-    color[1] = (uint8_t)(color[1] * (float)hictinting[palnum].g / 255.f);
-    color[2] = (uint8_t)(color[2] * (float)hictinting[palnum].b / 255.f);
+    color[0] = (uint8_t)(color[0] * (float)hictinting[palnum].r * (1.f/255.f));
+    color[1] = (uint8_t)(color[1] * (float)hictinting[palnum].g * (1.f/255.f));
+    color[2] = (uint8_t)(color[2] * (float)hictinting[palnum].b * (1.f/255.f));
 }

 // texcacheheader cachead.flags bits
--- a/polymer/eduke32/build/include/kplib.h
+++ b/polymer/eduke32/build/include/kplib.h
@ -2,6 +2,10 @@
 extern "C" {
 #endif

+#ifndef __compat_h__
+#include "compat.h"
+#endif
+
 typedef struct
 {
    FILE *fil;    //0:no file open, !=0:open file (either stand-alone or zip)
@ -52,6 +56,7 @@ static inline int32_t filnamcmp(const char *j, const char *i)
        i++, j++;
    return *i != '\0';
 }
+extern int32_t wildmatch(const char *match, const char *wild);

 #ifdef EXTERNC
 }
--- a/polymer/eduke32/build/include/libdivide.h
+++ b/polymer/eduke32/build/include/libdivide.h
--- a/polymer/eduke32/build/include/pragmas.h
+++ b/polymer/eduke32/build/include/pragmas.h
@ -30,39 +30,93 @@ extern int32_t dmval;
 #define wo(x)	((int16_t)(x))		// word cast
 #define by(x)	((uint8_t)(x))		// byte cast

-// XXX: Only for testing on x86. Don't use from outside; it doesn't account for
-// whether we're compiling for e.g. x86_64 which will never use asm anyway.
-//#define USE_ASM_DIVSCALE
+#define LIBDIVIDE_ALWAYS
+#define DIVTABLESIZE 16384
+
+extern libdivide_s64pad_t divtable64[DIVTABLESIZE];
+extern libdivide_s32pad_t divtable32[DIVTABLESIZE];
+
+#if defined(__arm__) || defined(LIBDIVIDE_ALWAYS)
+static inline uint32_t divideu32(uint32_t n, uint32_t d) // n / d via libdivide (the non-libdivide variant of this function is plain n / d); keeps a one-entry divider cache
+{
+    static libdivide_u32_t udiv; // divider built for lastd; static, so it persists between calls. NOTE(review): unsynchronized state -- confirm callers are single-threaded
+    static uint32_t lastd; // divisor udiv was built for; zero-initialized, so a first call with d == 0 skips generation
+
+    if (d == lastd) // same divisor as the previous call: reuse udiv
+        goto skip;
+
+    lastd = d;
+    udiv = libdivide_u32_gen(d); // divisor changed: rebuild the cached divider
+skip:
+    return libdivide_u32_do(n, &udiv);
+}
+
+static inline int32_t tabledivide64(int64_t n, int32_t d) // n / d via libdivide (the non-libdivide variant falls back to plain n / d); result is returned as int32_t
+{
+    static libdivide_s64_t sdiv; // cached divider for divisors that are not covered by divtable64
+    static int32_t lastd; // divisor sdiv was last generated for
+    libdivide_s64_t *dptr = ((unsigned) d < DIVTABLESIZE) ? (libdivide_s64_t *)&divtable64[d] : &sdiv; // unsigned compare: negative d and d >= DIVTABLESIZE both select sdiv
+
+    if (d == lastd || dptr != &sdiv) // table entry selected, or sdiv already matches d: nothing to generate
+        goto skip;
+
+    lastd = d;
+    sdiv = libdivide_s64_gen(d); // out-of-table divisor changed: rebuild the cached divider
+skip:
+    return libdivide_s64_do(n, dptr);
+}
+
+static inline int32_t tabledivide32(int32_t n, int32_t d) // n / d via libdivide (the non-libdivide variant falls back to plain n / d)
+{
+    static libdivide_s32_t sdiv; // cached divider for divisors that are not covered by divtable32
+    static int32_t lastd; // divisor sdiv was last generated for
+    libdivide_s32_t *dptr = ((unsigned) d < DIVTABLESIZE) ? (libdivide_s32_t *)&divtable32[d] : &sdiv; // unsigned compare: negative d and d >= DIVTABLESIZE both select sdiv
+
+    if (d == lastd || dptr != &sdiv) // table entry selected, or sdiv already matches d: nothing to generate
+        goto skip;
+
+    lastd = d;
+    sdiv = libdivide_s32_gen(d); // out-of-table divisor changed: rebuild the cached divider
+skip:
+    return libdivide_s32_do(n, dptr);
+}
+#else
+static inline uint32_t divideu32(uint32_t n, uint32_t d) { return n / d; }
+
+static inline int32_t tabledivide64(int64_t n, int32_t d) { return ((unsigned) d < DIVTABLESIZE) ?
+    libdivide_s64_do(n, (libdivide_s64_t *) &divtable64[d]) : n / d; }
+
+static inline int32_t tabledivide32(int32_t n, int32_t d) { return ((unsigned) d < DIVTABLESIZE) ?
+    libdivide_s32_do(n, (libdivide_s32_t *) &divtable32[d]) : n / d; }
+#endif
+
+extern uint32_t divideu32_noinline(uint32_t n, uint32_t d);
+extern int32_t tabledivide32_noinline(int32_t n, int32_t d);
+extern int32_t tabledivide64_noinline(int64_t n, int32_t d);

-#if !defined USE_ASM_DIVSCALE
 #ifdef GEKKO
 #include <math.h>
 static inline int32_t divscale(int32_t eax, int32_t ebx, int32_t ecx)
 {
-    return ldexp(eax, ecx) / ebx;
+    return tabledivide64(ldexp(eax, ecx), ebx);
 }
-
-# define _scaler(a) \
-    static inline int32_t divscale##a(int32_t eax, int32_t ebx) \
-{ \
-    return divscale(eax, ebx, a); \
-} \
-
 #else
-static inline int32_t divscale(int32_t eax, int32_t ebx, int32_t ecx) { return dw((qw(eax) << by(ecx)) / ebx); }
-
-# define _scaler(a) \
-    static inline int32_t divscale##a(int32_t eax, int32_t ebx) \
-{ \
-    return dw((qw(eax) << by(a)) / ebx); \
-} \
-
+static inline int32_t divscale(int32_t eax, int32_t ebx, int32_t ecx) // (eax << ecx) / ebx; replaces the removed one-liner that used a plain '/' on the same numerator
+{
+    const int64_t numer = qw(eax) << by(ecx); // by() casts the shift count to uint8_t; qw() presumably widens eax to 64 bits before shifting -- confirm against its definition
+    return dw(tabledivide64(numer, ebx)); // division goes through tabledivide64 instead of the '/' operator
+}
 #endif

+# define _scaler(a) static inline int32_t divscale##a(int32_t eax, int32_t ebx) { return divscale(eax, ebx, a); }
 PRAGMA_FUNCS _scaler(32)
-
 #undef _scaler
-#endif  // !defined USE_ASM_DIVSCALE
+
+static inline int32_t scale(int32_t eax, int32_t edx, int32_t ecx) // (eax * edx) / ecx; replaces the removed per-platform scale() definitions
+{
+    const int64_t numer = qw(eax) * edx; // product formed via qw(eax), presumably a 64-bit widening so it cannot overflow 32 bits -- confirm against qw's definition
+    return dw(tabledivide64(numer, ecx)); // division goes through tabledivide64 instead of the '/' operator
+}

 #if defined(__GNUC__) && defined(GEKKO)

@ -115,11 +169,6 @@ static inline void swap64bit(void* a, void* b) { int64_t t = *((int64_t*)b); *((

 static inline char readpixel(void* s)    { return (*((char*)(s))); }
 static inline void drawpixel(void* s, char a)    { *((char*)(s)) = a; }
-static inline void drawpixels(void* s, int16_t a)  { *((int16_t*)(s)) = a; }
-static inline void drawpixelses(void* s, int32_t a) { *((int32_t*)(s)) = a; }
-
-static inline int32_t divmod(int32_t a, int32_t b) { uint32_t _a=(uint32_t)a, _b=(uint32_t)b; dmval = _a%_b; return _a/_b; }
-static inline int32_t moddiv(int32_t a, int32_t b) { uint32_t _a=(uint32_t)a, _b=(uint32_t)b; dmval = _a/_b; return _a%_b; }

 static inline int32_t klabs(int32_t a) { const uint32_t m = a >> (sizeof(int) * CHAR_BIT - 1); return (a ^ m) - m; }
 static inline int32_t ksgn(int32_t a)  { return (a>0)-(a<0); }
@ -130,7 +179,6 @@ static inline int32_t kmin(int32_t a, int32_t b) { if ((int32_t)a < (int32_t)b)
 static inline int32_t kmax(int32_t a, int32_t b) { if ((int32_t)a < (int32_t)b) return b; return a; }

 static inline int32_t sqr(int32_t eax) { return (eax) * (eax); }
-static inline int32_t scale(int32_t eax, int32_t edx, int32_t ecx) { return dw((qw(eax) * edx) / ecx); }
 static inline int32_t mulscale(int32_t eax, int32_t edx, int32_t ecx) { return dw((qw(eax) * edx) >> by(ecx)); }
 static inline int32_t dmulscale(int32_t eax, int32_t edx, int32_t esi, int32_t edi, int32_t ecx) { return dw(((qw(eax) * edx) + (qw(esi) * edi)) >> by(ecx)); }

@ -155,9 +203,15 @@ void copybufreverse(const void *S, void *D, int32_t c);
 static inline void swapbufreverse(void *s, void *d, int32_t c)
 {
    uint8_t *src = (uint8_t*)s, *dst = (uint8_t*)d;
-    while (c--) {
-        swapchar(dst++, src--);
-    }
+    while (c >= 4) // swaps c byte pairs in total, walking dst forwards and src backwards; unrolled by four while at least four pairs remain
+    {
+        swapchar(dst, src);
+        swapchar(dst+1, src-1);
+        swapchar(dst+2, src-2);
+        swapchar(dst+3, src-3);
+        dst += 4, src -= 4, c -= 4; // c must drop by four per group, otherwise more than c swaps run and both buffers are overrun
+    }
+    while (c--) swapchar(dst++, src--); // remaining 0..3 pairs, one at a time
 }

 #ifdef EXTERNC
--- a/polymer/eduke32/build/include/pragmas_arm.h
+++ b/polymer/eduke32/build/include/pragmas_arm.h
@ -5,7 +5,6 @@
 #ifndef __pragmas_arm_h__
 #define __pragmas_arm_h__

-// TODO: implement libdivide.h
 #define _scaler(a) \
 static inline int32_t mulscale##a(int32_t eax, int32_t edx) \
 { \
@ -30,11 +29,6 @@ static inline void swap64bit(void* a, void* b) { int64_t t = *((int64_t*) b); *(

 static inline char readpixel(void* s)    { return (*((char*) (s))); }
 static inline void drawpixel(void* s, char a)    { *((char*) (s)) = a; }
-static inline void drawpixels(void* s, int16_t a)  { *((int16_t*) (s)) = a; }
-static inline void drawpixelses(void* s, int32_t a) { *((int32_t*) (s)) = a; }
-
-static inline int32_t divmod(int32_t a, int32_t b) { uint32_t _a=(uint32_t) a, _b=(uint32_t) b; dmval = _a%_b; return _a/_b; }
-static inline int32_t moddiv(int32_t a, int32_t b) { uint32_t _a=(uint32_t) a, _b=(uint32_t) b; dmval = _a/_b; return _a%_b; }

 static inline int32_t klabs(int32_t a) { const uint32_t m = a >> (sizeof(int) * CHAR_BIT - 1); return (a ^ m) - m; }
 static inline int32_t ksgn(int32_t a)  { return (a>0)-(a<0); }
@ -45,7 +39,6 @@ static inline int32_t kmin(int32_t a, int32_t b) { if ((int32_t) a < (int32_t) b
 static inline int32_t kmax(int32_t a, int32_t b) { if ((int32_t) a < (int32_t) b) return b; return a; }

 static inline int32_t sqr(int32_t eax) { return (eax) * (eax); }
-static inline int32_t scale(int32_t eax, int32_t edx, int32_t ecx) { return dw((qw(eax) * qw(edx)) / qw(ecx)); }
 static inline int32_t mulscale(int32_t eax, int32_t edx, int32_t ecx) { return dw((qw(eax) * qw(edx)) >> by(ecx)); }
 static inline int32_t dmulscale(int32_t eax, int32_t edx, int32_t esi, int32_t edi, int32_t ecx) { return dw(((qw(eax) * qw(edx)) + (qw(esi) * qw(edi))) >> by(ecx)); }

--- a/polymer/eduke32/build/include/pragmas_ppc.h
+++ b/polymer/eduke32/build/include/pragmas_ppc.h
@ -6,8 +6,6 @@

 #define sqr(a) ((a)*(a))

-int32_t scale(int32_t a, int32_t d, int32_t c);
-
 #define _scaler(x) \
 static inline int32_t mulscale##x(int32_t a, int32_t d) \
 { \
@ -118,26 +116,6 @@ static inline void drawpixel(void *d, char a)
    *(char*) d = a;
 }

-static inline void drawpixels(void *d, int16_t a)
-{
-    __asm__(
-        " sthbrx %0, 0, %1\n"
-        :
-    : "r"(&a), "r"(d)
-        : "memory"
-        );
-}
-
-static inline void drawpixelses(void *d, int32_t a)
-{
-    __asm__(
-        " stwbrx %0, 0, %1\n"
-        :
-    : "r"(&a), "r"(d)
-        : "memory"
-        );
-}
-
 void clearbufbyte(void *d, int32_t c, int32_t a);

 static inline void clearbuf(void *d, int32_t c, int32_t a)
@ -266,32 +244,6 @@ static inline void swap64bit(void *a, void *b)
    *(double*) b = t;
 }

-static inline int32_t divmod(int32_t a, int32_t b)
-{
-    int32_t div;
-    __asm__(
-        " divwu %0, %2, %3\n"
-        " mullw %1, %0, %3\n"
-        " subf  %1, %1, %2\n"
-        : "=&r"(div), "=&r"(dmval)
-        : "r"(a), "r"(b)
-        );
-    return div;
-}
-
-static inline int32_t moddiv(int32_t a, int32_t b)
-{
-    int32_t mod;
-    __asm__(
-        " divwu %0, %2, %3\n"
-        " mullw %1, %0, %3\n"
-        " subf  %1, %1, %2\n"
-        : "=&r"(dmval), "=&r"(mod)
-        : "r"(a), "r"(b)
-        );
-    return mod;
-}
-
 static inline int32_t umin(int32_t a, int32_t b) { if ((uint32_t) a < (uint32_t) b) return a; return b; }
 static inline int32_t umax(int32_t a, int32_t b) { if ((uint32_t) a < (uint32_t) b) return b; return a; }
 static inline int32_t kmin(int32_t a, int32_t b) { if ((int32_t) a < (int32_t) b) return a; return b; }
--- a/polymer/eduke32/build/include/pragmas_x86_gcc.h
+++ b/polymer/eduke32/build/include/pragmas_x86_gcc.h
@ -33,13 +33,6 @@ void copybufreverse(const void *S, void *D, int32_t c);
 #define sqr(a) __builtin_sqr(a)
 #endif

-#define scale(a,d,c) \
-	({ int32_t __a=(a), __d=(d), __c=(c); \
-	   __asm__ __volatile__ ("imull %%edx; idivl %%ecx" \
-		: "=a" (__a), "=d" (__d) \
-		: "0" (__a), "1" (__d), "c" (__c) : "cc"); \
-	 __a; })
-
 #define mulscale(a,d,c) \
 	({ int32_t __a=(a), __d=(d), __c=(c); \
 	   __asm__ __volatile__ ("imull %%edx; shrdl %%cl, %%edx, %%eax" \
@ -471,174 +464,6 @@ void copybufreverse(const void *S, void *D, int32_t c);
 		: "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \
 	 __d; })

-#ifdef USE_ASM_DIVSCALE
-#define divscale(a,b,c) \
-	({ int32_t __a=(a), __b=(b), __c=(c); \
-	   __asm__ __volatile__ ("movl %%eax, %%edx; shll %%cl, %%eax; negb %%cl; sarl %%cl, %%edx; idivl %%ebx" \
-		: "=a" (__a) : "a" (__a), "c" (__c), "b" (__b) : "edx", "cc"); \
-	 __a; })
-#define divscale1(a,b) \
-	({ int32_t __a=(a), __b=(b); \
-	   __asm__ __volatile__ ("addl %%eax, %%eax; sbbl %%edx, %%edx; idivl %%ebx" \
-		: "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \
-	 __a; })
-#define divscale2(a,b) \
-	({ int32_t __a=(a), __b=(b); \
-	   __asm__ __volatile__ ("movl %%eax, %%edx; sarl $30, %%edx; leal (,%%eax,4), %%eax; idivl %%ebx" \
-		: "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \
-	 __a; })
-#define divscale3(a,b) \
-	({ int32_t __a=(a), __b=(b); \
-	   __asm__ __volatile__ ("movl %%eax, %%edx; sarl $29, %%edx; leal (,%%eax,8), %%eax; idivl %%ebx" \
-		: "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \
-	 __a; })
-#define divscale4(a,b) \
-	({ int32_t __a=(a), __b=(b); \
-	   __asm__ __volatile__ ("movl %%eax, %%edx; sarl $28, %%edx; shll $4, %%eax; idivl %%ebx" \
-		: "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \
-	 __a; })
-#define divscale5(a,b) \
-	({ int32_t __a=(a), __b=(b); \
-	   __asm__ __volatile__ ("movl %%eax, %%edx; sarl $27, %%edx; shll $5, %%eax; idivl %%ebx" \
-		: "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \
-	 __a; })
-#define divscale6(a,b) \
-	({ int32_t __a=(a), __b=(b); \
-	   __asm__ __volatile__ ("movl %%eax, %%edx; sarl $26, %%edx; shll $6, %%eax; idivl %%ebx" \
-		: "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \
-	 __a; })
-#define divscale7(a,b) \
-	({ int32_t __a=(a), __b=(b); \
-	   __asm__ __volatile__ ("movl %%eax, %%edx; sarl $25, %%edx; shll $7, %%eax; idivl %%ebx" \
-		: "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \
-	 __a; })
-#define divscale8(a,b) \
-	({ int32_t __a=(a), __b=(b); \
-	   __asm__ __volatile__ ("movl %%eax, %%edx; sarl $24, %%edx; shll $8, %%eax; idivl %%ebx" \
-		: "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \
-	 __a; })
-#define divscale9(a,b) \
-	({ int32_t __a=(a), __b=(b); \
-	   __asm__ __volatile__ ("movl %%eax, %%edx; sarl $23, %%edx; shll $9, %%eax; idivl %%ebx" \
-		: "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \
-	 __a; })
-#define divscale10(a,b) \
-	({ int32_t __a=(a), __b=(b); \
-	   __asm__ __volatile__ ("movl %%eax, %%edx; sarl $22, %%edx; shll $10, %%eax; idivl %%ebx" \
-		: "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \
-	 __a; })
-#define divscale11(a,b) \
-	({ int32_t __a=(a), __b=(b); \
-	   __asm__ __volatile__ ("movl %%eax, %%edx; sarl $21, %%edx; shll $11, %%eax; idivl %%ebx" \
-		: "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \
-	 __a; })
-#define divscale12(a,b) \
-	({ int32_t __a=(a), __b=(b); \
-	   __asm__ __volatile__ ("movl %%eax, %%edx; sarl $20, %%edx; shll $12, %%eax; idivl %%ebx" \
-		: "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \
-	 __a; })
-#define divscale13(a,b) \
-	({ int32_t __a=(a), __b=(b); \
-	   __asm__ __volatile__ ("movl %%eax, %%edx; sarl $19, %%edx; shll $13, %%eax; idivl %%ebx" \
-		: "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \
-	 __a; })
-#define divscale14(a,b) \
-	({ int32_t __a=(a), __b=(b); \
-	   __asm__ __volatile__ ("movl %%eax, %%edx; sarl $18, %%edx; shll $14, %%eax; idivl %%ebx" \
-		: "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \
-	 __a; })
-#define divscale15(a,b) \
-	({ int32_t __a=(a), __b=(b); \
-	   __asm__ __volatile__ ("movl %%eax, %%edx; sarl $17, %%edx; shll $15, %%eax; idivl %%ebx" \
-		: "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \
-	 __a; })
-#define divscale16(a,b) \
-	({ int32_t __a=(a), __b=(b); \
-	   __asm__ __volatile__ ("movl %%eax, %%edx; sarl $16, %%edx; shll $16, %%eax; idivl %%ebx" \
-		: "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \
-	 __a; })
-#define divscale17(a,b) \
-	({ int32_t __a=(a), __b=(b); \
-	   __asm__ __volatile__ ("movl %%eax, %%edx; sarl $15, %%edx; shll $17, %%eax; idivl %%ebx" \
-		: "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \
-	 __a; })
-#define divscale18(a,b) \
-	({ int32_t __a=(a), __b=(b); \
-	   __asm__ __volatile__ ("movl %%eax, %%edx; sarl $14, %%edx; shll $18, %%eax; idivl %%ebx" \
-		: "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \
-	 __a; })
-#define divscale19(a,b) \
-	({ int32_t __a=(a), __b=(b); \
-	   __asm__ __volatile__ ("movl %%eax, %%edx; sarl $13, %%edx; shll $19, %%eax; idivl %%ebx" \
-		: "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \
-	 __a; })
-#define divscale20(a,b) \
-	({ int32_t __a=(a), __b=(b); \
-	   __asm__ __volatile__ ("movl %%eax, %%edx; sarl $12, %%edx; shll $20, %%eax; idivl %%ebx" \
-		: "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \
-	 __a; })
-#define divscale21(a,b) \
-	({ int32_t __a=(a), __b=(b); \
-	   __asm__ __volatile__ ("movl %%eax, %%edx; sarl $11, %%edx; shll $21, %%eax; idivl %%ebx" \
-		: "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \
-	 __a; })
-#define divscale22(a,b) \
-	({ int32_t __a=(a), __b=(b); \
-	   __asm__ __volatile__ ("movl %%eax, %%edx; sarl $10, %%edx; shll $22, %%eax; idivl %%ebx" \
-		: "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \
-	 __a; })
-#define divscale23(a,b) \
-	({ int32_t __a=(a), __b=(b); \
-	   __asm__ __volatile__ ("movl %%eax, %%edx; sarl $9, %%edx; shll $23, %%eax; idivl %%ebx" \
-		: "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \
-	 __a; })
-#define divscale24(a,b) \
-	({ int32_t __a=(a), __b=(b); \
-	   __asm__ __volatile__ ("movl %%eax, %%edx; sarl $8, %%edx; shll $24, %%eax; idivl %%ebx" \
-		: "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \
-	 __a; })
-#define divscale25(a,b) \
-	({ int32_t __a=(a), __b=(b); \
-	   __asm__ __volatile__ ("movl %%eax, %%edx; sarl $7, %%edx; shll $25, %%eax; idivl %%ebx" \
-		: "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \
-	 __a; })
-#define divscale26(a,b) \
-	({ int32_t __a=(a), __b=(b); \
-	   __asm__ __volatile__ ("movl %%eax, %%edx; sarl $6, %%edx; shll $26, %%eax; idivl %%ebx" \
-		: "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \
-	 __a; })
-#define divscale27(a,b) \
-	({ int32_t __a=(a), __b=(b); \
-	   __asm__ __volatile__ ("movl %%eax, %%edx; sarl $5, %%edx; shll $27, %%eax; idivl %%ebx" \
-		: "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \
-	 __a; })
-#define divscale28(a,b) \
-	({ int32_t __a=(a), __b=(b); \
-	   __asm__ __volatile__ ("movl %%eax, %%edx; sarl $4, %%edx; shll $28, %%eax; idivl %%ebx" \
-		: "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \
-	 __a; })
-#define divscale29(a,b) \
-	({ int32_t __a=(a), __b=(b); \
-	   __asm__ __volatile__ ("movl %%eax, %%edx; sarl $3, %%edx; shll $29, %%eax; idivl %%ebx" \
-		: "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \
-	 __a; })
-#define divscale30(a,b) \
-	({ int32_t __a=(a), __b=(b); \
-	   __asm__ __volatile__ ("movl %%eax, %%edx; sarl $2, %%edx; shll $30, %%eax; idivl %%ebx" \
-		: "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \
-	 __a; })
-#define divscale31(a,b) \
-	({ int32_t __a=(a), __b=(b); \
-	   __asm__ __volatile__ ("movl %%eax, %%edx; sarl $1, %%edx; shll $31, %%eax; idivl %%ebx" \
-		: "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \
-	 __a; })
-#define divscale32(d,b) \
-	({ int32_t __d=(d), __b=(b), __r; \
-	   __asm__ __volatile__ ("xorl %%eax, %%eax; idivl %%ebx" \
-		: "=a" (__r), "=d" (__d) : "d" (__d), "b" (__b) : "cc"); \
-	 __r; })
-#endif  // defined USE_ASM_DIVSCALE
-
 #define readpixel(D) \
 	({ void *__D=(D); int32_t __a; \
 	   __asm__ __volatile__ ("movb (%%edi), %%al" \
@ -649,16 +474,6 @@ void copybufreverse(const void *S, void *D, int32_t c);
 	   __asm__ __volatile__ ("movb %%al, (%%edi)" \
 		: : "D" (__D), "a" (__a) : "memory", "cc"); \
 	 0; })
-#define drawpixels(D,a) \
-	({ void *__D=(D); int32_t __a=(a); \
-	   __asm__ __volatile__ ("movw %%ax, (%%edi)" \
-		: : "D" (__D), "a" (__a) : "memory", "cc"); \
-	 0; })
-#define drawpixelses(D,a) \
-	({ void *__D=(D); int32_t __a=(a); \
-	   __asm__ __volatile__ ("movl %%eax, (%%edi)" \
-		: : "D" (__D), "a" (__a) : "memory", "cc"); \
-	 0; })
 #define clearbuf(D,c,a) \
 	({ void *__D=(D); int32_t __c=(c), __a=(a); \
 	   __asm__ __volatile__ ("rep; stosl" \
@ -670,19 +485,6 @@ void copybufreverse(const void *S, void *D, int32_t c);
 		: "=&S" (__S), "=&D" (__D), "=&c" (__c) : "0" (__S), "1" (__D), "2" (__c) : "memory", "cc"); \
 	 0; })

-//returns eax/ebx, dmval = eax%edx;
-#define divmod(a,b) \
-	({ int32_t __a=(a), __b=(b); \
-	   __asm__ __volatile__ ("xorl %%edx, %%edx; divl %%ebx; movl %%edx, "_DMVAL \
-		: "+a" (__a) : "b" (__b) : "edx", "memory", "cc"); \
-	 __a; })
-//returns eax%ebx, dmval = eax/edx;
-#define moddiv(a,b) \
-	({ int32_t __a=(a), __b=(b), __d; \
-	   __asm__ __volatile__ ("xorl %%edx, %%edx; divl %%ebx; movl %%eax, "_DMVAL \
-		: "=d" (__d) : "a" (__a), "b" (__b) : "eax", "memory", "cc"); \
-	 __d; })
-
 #define klabs(a) \
 	({ int32_t __a=(a); \
 	   __asm__ __volatile__ ("testl %%eax, %%eax; jns 0f; negl %%eax; 0:" \
--- a/polymer/eduke32/build/include/pragmas_x86_msvc.h
+++ b/polymer/eduke32/build/include/pragmas_x86_msvc.h
@ -16,15 +16,6 @@ static __inline int32_t sqr(int32_t a)
    }
 }

-static __inline int32_t scale(int32_t a, int32_t d, int32_t c)
-{
-    _asm {
-        mov eax, a
-            imul d
-            idiv c
-    }
-}
-
 static __inline int32_t mulscale(int32_t a, int32_t d, int32_t c)
 {
    _asm {
@ -99,80 +90,6 @@ static __inline int32_t dmulscale32(int32_t a, int32_t d, int32_t S, int32_t D)
    }
 }

-#ifdef USE_ASM_DIVSCALE
-static __inline int32_t divscale(int32_t a, int32_t b, int32_t c)
-{
-    _asm {
-        mov eax, a
-            mov ecx, c
-            mov edx, eax
-            shl eax, cl
-            neg cl
-            sar edx, cl
-            idiv b
-    }
-}
-
-static __inline int32_t divscale1(int32_t a, int32_t b)
-{
-    _asm {
-        mov eax, a
-            add eax, eax
-            sbb edx, edx
-            idiv b
-    }
-}
-
-static __inline int32_t divscale2(int32_t a, int32_t b)
-{
-    _asm {
-        mov eax, a
-            mov edx, eax
-            sar edx, 30
-            lea eax, [eax*4]
-            idiv b
-    }
-}
-
-static __inline int32_t divscale3(int32_t a, int32_t b)
-{
-    _asm {
-        mov eax, a
-            mov edx, eax
-            sar edx, 29
-            lea eax, [eax*8]
-            idiv b
-    }
-}
-
-#define DIVSCALE(x,y) \
-static __inline int32_t divscale##y(int32_t a, int32_t b) \
-{ \
-	_asm mov eax, a \
-	_asm mov edx, eax \
-	_asm sar edx, x \
-	_asm shl eax, y \
-	_asm idiv b \
-}
-
-DIVSCALE(28, 4)	DIVSCALE(27, 5)	DIVSCALE(26, 6)	DIVSCALE(25, 7)
-DIVSCALE(24, 8)	DIVSCALE(23, 9)	DIVSCALE(22, 10)	DIVSCALE(21, 11)
-DIVSCALE(20, 12)	DIVSCALE(19, 13)	DIVSCALE(18, 14)	DIVSCALE(17, 15)
-DIVSCALE(16, 16)	DIVSCALE(15, 17)	DIVSCALE(14, 18)	DIVSCALE(13, 19)
-DIVSCALE(12, 20)	DIVSCALE(11, 21)	DIVSCALE(10, 22)	DIVSCALE(9, 23)
-DIVSCALE(8, 24)	DIVSCALE(7, 25)	DIVSCALE(6, 26)	DIVSCALE(5, 27)
-DIVSCALE(4, 28)	DIVSCALE(3, 29)	DIVSCALE(2, 30)	DIVSCALE(1, 31)
-
-static __inline int32_t divscale32(int32_t d, int32_t b)
-{
-    _asm {
-        mov edx, d
-            xor eax, eax
-            idiv b
-    }
-}
-#endif  // defined USE_ASM_DIVSCALE
-
 static __inline char readpixel(void *d)
 {
    _asm {
@ -190,24 +107,6 @@ static __inline void drawpixel(void *d, char a)
    }
 }

-static __inline void drawpixels(void *d, int16_t a)
-{
-    _asm {
-        mov edx, d
-            mov ax, a
-            mov word ptr[edx], ax
-    }
-}
-
-static __inline void drawpixelses(void *d, int32_t a)
-{
-    _asm {
-        mov edx, d
-            mov eax, a
-            mov dword ptr[edx], eax
-    }
-}
-
 static __inline void clearbuf(void *d, int32_t c, int32_t a)
 {
    _asm {
@ -415,29 +314,6 @@ static __inline void qinterpolatedown16short(int32_t a, int32_t c, int32_t d, in
    }
 }

-//returns eax/ebx, dmval = eax%edx;
-static __inline int32_t divmod(int32_t a, int32_t b)
-{
-    _asm {
-        mov eax, a
-            xor edx, edx
-            div b
-            mov dmval, edx
-    }
-}
-
-//returns eax%ebx, dmval = eax/edx;
-static __inline int32_t moddiv(int32_t a, int32_t b)
-{
-    _asm {
-        mov eax, a
-            xor edx, edx
-            div b
-            mov dmval, eax
-            mov eax, edx
-    }
-}
-
 static __inline int32_t klabs(int32_t a)
 {
    _asm {
--- a/polymer/eduke32/build/src/a-c.c
+++ b/polymer/eduke32/build/src/a-c.c
@ -7,6 +7,7 @@
 // See the included license file "BUILDLIC.TXT" for license info.

 #include "a.h"
+#include "pragmas.h"

 #ifdef ENGINE_USING_A_C

@ -71,6 +72,17 @@ void hlineasm4(int32_t cnt, int32_t skiploadincs, int32_t paloffs, uint32_t by,
        const int32_t logx = glogx, logy = glogy;
        char *pp = (char *)p;

+        for (; cnt>=4; cnt -= 4)
+        {
+            *pp = palptr[buf[((bx>>(32-logx))<<logy)+(by>>(32-logy))]]; pp--;
+            *pp = palptr[buf[(((bx-bxinc)>>(32-logx))<<logy)+((by-byinc)>>(32-logy))]]; pp--;
+            *pp = palptr[buf[(((bx-(bxinc<<1))>>(32-logx))<<logy)+((by-(byinc<<1))>>(32-logy))]]; pp--;
+            *pp = palptr[buf[(((bx-(bxinc*3))>>(32-logx))<<logy)+((by-(byinc*3))>>(32-logy))]]; pp--;
+
+            bx -= bxinc<<2;
+            by -= byinc<<2;
+        }
+
        for (; cnt>=0; cnt--)
        {
            *pp = palptr[buf[((bx>>(32-logx))<<logy)+(by>>(32-logy))]];
@ -121,10 +133,7 @@ static inline uint32_t ourmulscale32(uint32_t a, uint32_t b)

 static inline int32_t getpix(int32_t logy, const char *buf, uint32_t vplc)
 {
-    if (logy != 0)
-        return buf[vplc>>logy];
-    else
-        return buf[ourmulscale32(vplc,globaltilesizy)];
+    return logy ? buf[vplc>>logy] : buf[ourmulscale32(vplc,globaltilesizy)];
 }

 void setupvlineasm(int32_t neglogy) { glogy = neglogy; }
@ -138,18 +147,45 @@ int32_t vlineasm1(int32_t vinc, intptr_t paloffs, int32_t cnt, uint32_t vplc, in

    cnt++;

-    do
+    if (logy)
    {
-        if (logy != 0)
+#ifdef CLASSIC_SLICE_BY_4
+        for (; cnt>=4; cnt-=4)
+        {
            *pp = pal[buf[vplc>>logy]];
-        else
-            *pp = pal[buf[ourmulscale32(vplc,globaltilesizy)]];
-
-        pp += ourbpl;
-        vplc += vinc;
+            *(pp+ourbpl) = pal[buf[(vplc+vinc)>>logy]];
+            *(pp+(ourbpl<<1)) = pal[buf[(vplc+(vinc<<1))>>logy]];
+            *(pp+(ourbpl*3)) = pal[buf[(vplc+(vinc*3))>>logy ]];
+            pp += ourbpl<<2;
+            vplc += vinc<<2;
+        }
+#endif
+        while (cnt--)
+        {
+            *pp = pal[buf[vplc>>logy]];
+            pp += ourbpl;
+            vplc += vinc;
+        }
+    }
+    else
+    {
+#ifdef CLASSIC_SLICE_BY_4
+        for (; cnt>=4; cnt-=4)
+        {
+            *pp = pal[buf[ourmulscale32(vplc, globaltilesizy)]];
+            *(pp+ourbpl) = pal[buf[ourmulscale32((vplc+vinc),globaltilesizy)]];
+            *(pp+(ourbpl<<1)) = pal[buf[ourmulscale32((vplc+(vinc<<1)), globaltilesizy)]];
+            *(pp+(ourbpl*3)) = pal[buf[ourmulscale32((vplc+(vinc*3)), globaltilesizy)]];
+            pp += ourbpl<<2;
+            vplc += vinc<<2;
+        }
+#endif
+        while (cnt--)
+        {
+            *pp = pal[buf[ourmulscale32(vplc,globaltilesizy)]], pp += ourbpl;
+            vplc += vinc;
+        }
    }
-    while (--cnt);
-
    return vplc;
 }

@ -191,9 +227,6 @@ typedef uint32_t uint32_vec4 __attribute__ ((vector_size (16)));
 // cnt >= 1
 void vlineasm4(int32_t cnt, char *p)
 {
-    char ch;
-    int32_t i;
-
    char *const pal[4] = {(char *)palookupoffse[0], (char *)palookupoffse[1], (char *)palookupoffse[2], (char *)palookupoffse[3]};
    char *const buf[4] = {(char *)bufplce[0], (char *)bufplce[1], (char *)bufplce[2], (char *)bufplce[3]};
 #ifdef USE_VECTOR_EXT
@ -205,25 +238,86 @@ void vlineasm4(int32_t cnt, char *p)
 #endif
    const int32_t logy = glogy, ourbpl = bpl;

-    do
+    if (!logy)
    {
-        for (i=0; i<4; i++)
+        do
        {
-            ch = getpix(logy, buf[i], vplc[i]);
-            p[i] = pal[i][ch];
-#if !defined USE_VECTOR_EXT
-            vplc[i] += vinc[i];
+            p[0] = pal[0][buf[0][ourmulscale32(vplc[0],globaltilesizy)]];
+            p[1] = pal[1][buf[1][ourmulscale32(vplc[1],globaltilesizy)]];
+            p[2] = pal[2][buf[2][ourmulscale32(vplc[2],globaltilesizy)]];
+            p[3] = pal[3][buf[3][ourmulscale32(vplc[3],globaltilesizy)]];
+
+#if defined USE_VECTOR_EXT
+            vplc += vinc;
+#else
+            vplc[0] += vinc[0];
+            vplc[1] += vinc[1];
+            vplc[2] += vinc[2];
+            vplc[3] += vinc[3];
 #endif
+            p += ourbpl;
        }
-#ifdef USE_VECTOR_EXT
+        while (--cnt);
+
+        goto skip;
+    }
+    
+    // logy != 0 from here on (the !logy case was handled above): texel index is vplc>>logy
+#ifdef CLASSIC_SLICE_BY_4
+    for (; cnt>=4;cnt-=4)
+    {
+        p[0]                = pal[0][buf[0][ vplc[0]>>logy ]];
+        p[1]                = pal[1][buf[1][ vplc[1]>>logy ]];
+        p[2]                = pal[2][buf[2][ vplc[2]>>logy ]];
+        p[3]                = pal[3][buf[3][ vplc[3]>>logy ]];
+        (p+ourbpl)[0]       = pal[0][buf[0][ (vplc[0]+vinc[0])>>logy ]];
+        (p+ourbpl)[1]       = pal[1][buf[1][ (vplc[1]+vinc[1])>>logy ]];
+        (p+ourbpl)[2]       = pal[2][buf[2][ (vplc[2]+vinc[2])>>logy ]];
+        (p+ourbpl)[3]       = pal[3][buf[3][ (vplc[3]+vinc[3])>>logy ]];
+        (p+(ourbpl<<1))[0]  = pal[0][buf[0][ (vplc[0]+(vinc[0]<<1))>>logy ]];
+        (p+(ourbpl<<1))[1]  = pal[1][buf[1][ (vplc[1]+(vinc[1]<<1))>>logy ]];
+        (p+(ourbpl<<1))[2]  = pal[2][buf[2][ (vplc[2]+(vinc[2]<<1))>>logy ]];
+        (p+(ourbpl<<1))[3]  = pal[3][buf[3][ (vplc[3]+(vinc[3]<<1))>>logy ]];
+        (p+(ourbpl*3))[0]   = pal[0][buf[0][ (vplc[0]+(vinc[0]*3))>>logy ]];
+        (p+(ourbpl*3))[1]   = pal[1][buf[1][ (vplc[1]+(vinc[1]*3))>>logy ]];
+        (p+(ourbpl*3))[2]   = pal[2][buf[2][ (vplc[2]+(vinc[2]*3))>>logy ]];
+        (p+(ourbpl*3))[3]   = pal[3][buf[3][ (vplc[3]+(vinc[3]*3))>>logy ]];
+
+#if defined USE_VECTOR_EXT
+        vplc += vinc<<2;
+#else
+        vplc[0] += vinc[0]<<2;
+        vplc[1] += vinc[1]<<2;
+        vplc[2] += vinc[2]<<2;
+        vplc[3] += vinc[3]<<2;
+#endif
+        p += ourbpl<<2;
+    }
+#endif
+
+    while (cnt--)
+    {
+        p[0] = pal[0][buf[0][vplc[0]>>logy]];
+        p[1] = pal[1][buf[1][vplc[1]>>logy]];
+        p[2] = pal[2][buf[2][vplc[2]>>logy]];
+        p[3] = pal[3][buf[3][vplc[3]>>logy]];
+
+#if defined USE_VECTOR_EXT
        vplc += vinc;
+#else
+        vplc[0] += vinc[0];
+        vplc[1] += vinc[1];
+        vplc[2] += vinc[2];
+        vplc[3] += vinc[3];
 #endif
        p += ourbpl;
    }
-    while (--cnt);

-    for (i=0; i<4; i++)
-        vplce[i] = vplc[i];
+skip:
+    vplce[0] = vplc[0];
+    vplce[1] = vplc[1];
+    vplce[2] = vplc[2];
+    vplce[3] = vplc[3];
 }

 #ifdef USE_SATURATE_VPLC
@ -251,10 +345,26 @@ int32_t mvlineasm1(int32_t vinc, intptr_t paloffs, int32_t cnt, uint32_t vplc, i

    cnt++;

+    if (!logy)
+    {
+        do
+        {
+            ch = buf[ourmulscale32(vplc,globaltilesizy)];
+            if (ch != 255) *pp = pal[ch];
+            pp += ourbpl;
+            vplc += vinc;
+            saturate_vplc(vplc, vinc);
+        }
+        while (--cnt);
+
+        return vplc;
+    }
+
    do
    {
-        ch = getpix(logy, buf, vplc);
-        if (ch != 255) *pp = pal[ch];
+        
+        if (buf[vplc>>logy] != 255)
+            *pp = pal[buf[vplc>>logy]];
        pp += ourbpl;
        vplc += vinc;
        saturate_vplc(vplc, vinc);
@ -267,9 +377,6 @@ int32_t mvlineasm1(int32_t vinc, intptr_t paloffs, int32_t cnt, uint32_t vplc, i
 // cnt >= 1
 void mvlineasm4(int32_t cnt, char *p)
 {
-    char ch;
-    int32_t i;
-
    char *const pal[4] = {(char *)palookupoffse[0], (char *)palookupoffse[1], (char *)palookupoffse[2], (char *)palookupoffse[3]};
    char *const buf[4] = {(char *)bufplce[0], (char *)bufplce[1], (char *)bufplce[2], (char *)bufplce[3]};
 #ifdef USE_VECTOR_EXT
@ -280,28 +387,73 @@ void mvlineasm4(int32_t cnt, char *p)
    uint32_t vplc[4] = {vplce[0], vplce[1], vplce[2], vplce[3]};
 #endif
    const int32_t logy = glogy, ourbpl = bpl;
+    char ch;

-    do
+    if (logy)
    {
-        for (i=0; i<4; i++)
+        do
        {
-            ch = getpix(logy, buf[i], vplc[i]);
-            if (ch != 255) p[i] = pal[i][ch];
-#if !defined USE_VECTOR_EXT
-            vplc[i] += vinc[i];
-            saturate_vplc(vplc[i], vinc[i]);
-#endif
-        }
-#ifdef USE_VECTOR_EXT
-        vplc += vinc;
-        saturate_vplc_vec(vplc, vinc);
-#endif
-        p += ourbpl;
-    }
-    while (--cnt);
+            ch = buf[0][vplc[0]>>logy];
+            if (ch != 255) p[0] = pal[0][ch];
+            ch = buf[1][vplc[1]>>logy];
+            if (ch != 255) p[1] = pal[1][ch];
+            ch = buf[2][vplc[2]>>logy];
+            if (ch != 255) p[2] = pal[2][ch];
+            ch = buf[3][vplc[3]>>logy];
+            if (ch != 255) p[3] = pal[3][ch];

-    for (i=0; i<4; i++)
-        vplce[i] = vplc[i];
+#if !defined USE_VECTOR_EXT
+            vplc[0] += vinc[0];
+            vplc[1] += vinc[1];
+            vplc[2] += vinc[2];
+            vplc[3] += vinc[3];
+            saturate_vplc(vplc[0], vinc[0]);
+            saturate_vplc(vplc[1], vinc[1]);
+            saturate_vplc(vplc[2], vinc[2]);
+            saturate_vplc(vplc[3], vinc[3]);
+#else
+            vplc += vinc;
+            saturate_vplc_vec(vplc, vinc);
+#endif
+            p += ourbpl;
+        }
+        while (--cnt);
+    }
+    else
+    {
+        do
+        {
+            ch = buf[0][ourmulscale32(vplc[0],globaltilesizy)];
+            if (ch != 255) p[0] = pal[0][ch];
+            ch = buf[1][ourmulscale32(vplc[1],globaltilesizy)];
+            if (ch != 255) p[1] = pal[1][ch];
+            ch = buf[2][ourmulscale32(vplc[2],globaltilesizy)];
+            if (ch != 255) p[2] = pal[2][ch];
+            ch = buf[3][ourmulscale32(vplc[3],globaltilesizy)];
+            if (ch != 255) p[3] = pal[3][ch];
+
+#if !defined USE_VECTOR_EXT
+            vplc[0] += vinc[0];
+            vplc[1] += vinc[1];
+            vplc[2] += vinc[2];
+            vplc[3] += vinc[3];
+            saturate_vplc(vplc[0], vinc[0]);
+            saturate_vplc(vplc[1], vinc[1]);
+            saturate_vplc(vplc[2], vinc[2]);
+            saturate_vplc(vplc[3], vinc[3]);
+#else
+            vplc += vinc;
+            saturate_vplc_vec(vplc, vinc);
+#endif
+            p += ourbpl;
+        }
+        while (--cnt);
+    }
+
+    vplce[0] = vplc[0];
+    vplce[1] = vplc[1];
+    vplce[2] = vplc[2];
+    vplce[3] = vplc[3];
 }

 #ifdef USE_ASM64
@ -335,7 +487,8 @@ int32_t tvlineasm1(int32_t vinc, intptr_t paloffs, int32_t cnt, uint32_t vplc, i
        do
        {
            ch = getpix(logy, buf, vplc);
-            if (ch != 255) *pp = trans[(*pp)|(pal[ch]<<8)];
+            if (ch != 255)
+                *pp = trans[(*pp)|(pal[ch]<<8)];
            pp += ourbpl;
            vplc += vinc;
            saturate_vplc_trans(vplc, vinc);
@ -374,7 +527,7 @@ void tvlineasm2(uint32_t vplc2, int32_t vinc1, intptr_t bufplc1, intptr_t bufplc
 {
    char ch;

-    int32_t cnt = (asm2-p-1)/bpl;  // >= 1
+    int32_t cnt = tabledivide32(asm2-p-1, bpl);  // >= 1
    const int32_t vinc2 = asm1;

    const char *const buf1 = (char *)bufplc1;
@ -533,7 +686,7 @@ void mspritevline(int32_t bx, int32_t by, int32_t cnt, intptr_t bufplc, intptr_t
    for (; cnt>1; cnt--)
    {
        ch = gbuf[(bx>>16)*glogy+(by>>16)];
-        if (ch != 255)(*(char *)p) = gpal[ch];
+        if (ch != 255) (*(char *)p) = gpal[ch];
        bx += gbxinc;
        by += gbyinc;
        p += bpl;
@ -557,7 +710,7 @@ void tspritevline(int32_t bx, int32_t by, int32_t cnt, intptr_t bufplc, intptr_t
        for (; cnt>1; cnt--)
        {
            ch = gbuf[(bx>>16)*glogy+(by>>16)];
-            if (ch != 255) *((char *)p) = gtrans[(*((char *)p))+(gpal[ch]<<8)];
+            if (ch != 255) *((char *)p) =  gtrans[(*((char *)p))+(gpal[ch]<<8)];
            bx += gbxinc;
            by += gbyinc;
            p += bpl;
--- a/polymer/eduke32/build/src/cache1d.c
+++ b/polymer/eduke32/build/src/cache1d.c
@ -128,7 +128,7 @@ void initcache(intptr_t dacachestart, int32_t dacachesize)
    int32_t i;

    for (i=1; i<200; i++)
-        lockrecip[i] = (1<<28)/(200-i);
+        lockrecip[i] = tabledivide32_noinline(1<<28, 200-i);

    // The following code was relocated here from engine.c, since this
    // function is only ever called once (from there), and it seems to
--- a/polymer/eduke32/build/src/compat.c
+++ b/polymer/eduke32/build/src/compat.c
@ -797,33 +797,32 @@ char *Bstrtolower(char *str)
 //Brute-force case-insensitive, slash-insensitive, * and ? wildcard matcher
 //Given: string i and string j. string j can have wildcards
 //Returns: 1:matches, 0:doesn't match
-int32_t Bwildmatch(const char *i, const char *j)
-{
-    const char *k;
-    char c0, c1;
+#ifndef WITHKPLIB
+extern char toupperlookup[256];

-    if (!*j) return(1);
+static int32_t wildmatch(const char *match, const char *wild)
+{
    do
    {
-        if (*j == '*')
+        if (*match && (toupperlookup[*wild] == toupperlookup[*match] || *wild == '?'))
        {
-            for (k=i,j++; *k; k++) if (Bwildmatch(k,j)) return(1);
+            wild++, match++;
            continue;
        }
-        if (!*i) return(0);
-        if (*j == '?') { i++; j++; continue; }
-        c0 = *i; if ((c0 >= 'a') && (c0 <= 'z')) c0 -= 32;
-        c1 = *j; if ((c1 >= 'a') && (c1 <= 'z')) c1 -= 32;
-#ifdef _WIN32
-        if (c0 == '/') c0 = '\\';
-        if (c1 == '/') c1 = '\\';
-#endif
-        if (c0 != c1) return(0);
-        i++; j++;
-    }
-    while (*j);
-    return(!*i);
+        else if ((*match|*wild) == '\0')
+            return 1;
+        else if (*wild == '*')
+        {
+            while (*wild == '*') wild++;
+            if (*wild == '\0') return 1;
+            while (*match && toupperlookup[*match] != toupperlookup[*wild]) match++;
+            if (toupperlookup[*match] == toupperlookup[*wild])
+                continue;
+        }
+        return 0;
+    } while (1);
 }
+#endif

 #if !defined(_WIN32)
 char *Bstrlwr(char *s)
@ -917,3 +916,6 @@ int access(const char *pathname, int mode)
 }
 #endif

+#define LIBDIVIDE_BODY
+#include "libdivide.h"
+
--- a/polymer/eduke32/build/src/dxtfilter.c
+++ b/polymer/eduke32/build/src/dxtfilter.c
@ -153,7 +153,7 @@ int32_t dxtfilter(int32_t fil, const texcachepicture *pict, const char *pic, voi
        for (j=stride; (unsigned)j<miplen; j+=stride)
            for (k=0; k<8; k++) *cptr++ = pic[j+k];

-        dxt_handle_io(fil, (miplen/stride)<<3, midbuf, packbuf);
+        dxt_handle_io(fil, tabledivide32(miplen, stride)<<3, midbuf, packbuf);
    }

    //rgb0,rgb1
@ -162,7 +162,7 @@ int32_t dxtfilter(int32_t fil, const texcachepicture *pict, const char *pic, voi
        for (j=0; (unsigned)j<miplen; j+=stride)
            { *(int16_t *)cptr = dxt_hicosub(*(int16_t *)(&pic[offs+j+k])); cptr += 2; }

-    dxt_handle_io(fil, (miplen/stride)<<2, midbuf, packbuf);
+    dxt_handle_io(fil, tabledivide32(miplen, stride)<<2, midbuf, packbuf);

    //index_4x4
    cptr = (char *)midbuf;
@ -176,7 +176,7 @@ int32_t dxtfilter(int32_t fil, const texcachepicture *pict, const char *pic, voi
        cptr += 4;
    }

-    dxt_handle_io(fil, (miplen/stride)<<2, midbuf, packbuf);
+    dxt_handle_io(fil, tabledivide32(miplen, stride)<<2, midbuf, packbuf);

    return 0;
 }
@ -196,7 +196,7 @@ int32_t dedxtfilter(int32_t fil, const texcachepicture *pict, char *pic, void *m
    if (stride == 16) //If DXT3...
    {
        //alpha_4x4
-        if (dedxt_handle_io(fil, (pict->size/stride)*8, midbuf, pict->size, packbuf, ispacked))
+        if (dedxt_handle_io(fil, tabledivide32(pict->size, stride)*8, midbuf, pict->size, packbuf, ispacked))
            return -1;

        cptr = (char *)midbuf;
@ -206,7 +206,7 @@ int32_t dedxtfilter(int32_t fil, const texcachepicture *pict, char *pic, void *m
    }

    //rgb0,rgb1
-    if (dedxt_handle_io(fil, (pict->size/stride)*4, midbuf, pict->size, packbuf, ispacked))
+    if (dedxt_handle_io(fil, tabledivide32(pict->size, stride)*4, midbuf, pict->size, packbuf, ispacked))
        return -1;

    cptr = (char *)midbuf;
@ -220,7 +220,7 @@ int32_t dedxtfilter(int32_t fil, const texcachepicture *pict, char *pic, void *m
    }

    //index_4x4:
-    if (dedxt_handle_io(fil, (pict->size/stride)*4, midbuf, pict->size, packbuf, ispacked))
+    if (dedxt_handle_io(fil, tabledivide32(pict->size, stride)*4, midbuf, pict->size, packbuf, ispacked))
        return -1;

    cptr = (char *)midbuf;
--- a/polymer/eduke32/build/src/engine.c
+++ b/polymer/eduke32/build/src/engine.c
@ -144,7 +144,8 @@ static char voxlock[MAXVOXELS][MAXVOXMIPS];
 int32_t voxscale[MAXVOXELS];

 static int32_t ggxinc[MAXXSIZ+1], ggyinc[MAXXSIZ+1];
-static int32_t lowrecip[1024], nytooclose, nytoofar;
+static int32_t lowrecip[1024], nytooclose;
+static const int32_t nytoofar = 65536*16384-1048576;
 static uint32_t distrecip[65536+256];

 static int32_t *lookups = NULL;
@ -3549,7 +3550,7 @@ static int32_t setup_globals_cf1(const sectortype *sec, int32_t pal, int32_t zd,
        j = sec->wallptr;
        ox = wall[wall[j].point2].x - wall[j].x;
        oy = wall[wall[j].point2].y - wall[j].y;
-        i = nsqrtasm(uhypsq(ox,oy)); if (i == 0) i = 1024; else i = 1048576/i;
+        i = nsqrtasm(uhypsq(ox,oy)); if (i == 0) i = 1024; else i = tabledivide32(1048576, i);
        globalx1 = mulscale10(dmulscale10(ox,singlobalang,-oy,cosglobalang),i);
        globaly1 = mulscale10(dmulscale10(ox,cosglobalang,oy,singlobalang),i);
        globalx2 = -globalx1;
@ -4110,6 +4111,7 @@ static void transmaskwallscan(int32_t x1, int32_t x2, int32_t saturatevplc)
 #endif

 // cntup16>>16 iterations
+
 static void nonpow2_mhline(intptr_t bufplc, uint32_t bx, int32_t cntup16, int32_t junk, uint32_t by, char *p)
 {
    char ch;
@ -4126,7 +4128,7 @@ static void nonpow2_mhline(intptr_t bufplc, uint32_t bx, int32_t cntup16, int32_

    for (cntup16>>=16; cntup16>0; cntup16--)
    {
-        ch = buf[(bx/xdiv)*yspan + by/ydiv];
+        ch = buf[(divideu32(bx, xdiv))*yspan + divideu32(by, ydiv)];

        if (ch != 255) *p = pal[ch];
        bx += xinc;
@ -4155,7 +4157,7 @@ static void nonpow2_thline(intptr_t bufplc, uint32_t bx, int32_t cntup16, int32_
    {
        for (cntup16>>=16; cntup16>0; cntup16--)
        {
-            ch = buf[(bx/xdiv)*yspan + by/ydiv];
+            ch = buf[divideu32(bx, xdiv)*yspan + divideu32(by, ydiv)];
            if (ch != 255) *p = trans[(*p)|(pal[ch]<<8)];
            bx += xinc;
            by += yinc;
@ -4166,7 +4168,7 @@ static void nonpow2_thline(intptr_t bufplc, uint32_t bx, int32_t cntup16, int32_
    {
        for (cntup16>>=16; cntup16>0; cntup16--)
        {
-            ch = buf[(bx/xdiv)*yspan + by/ydiv];
+            ch = buf[divideu32(bx, xdiv)*yspan + divideu32(by, ydiv)];
            if (ch != 255) *p = trans[((*p)<<8)|pal[ch]];
            bx += xinc;
            by += yinc;
@ -4287,14 +4289,8 @@ static void tslopevlin(uint8_t *p, int32_t i, const intptr_t *slopalptr, int32_t
        v = by + ytov*i;
        ch = *(uint8_t *)(slopalptr[0] + buf[((u>>(32-logx))<<logy)+(v>>(32-logy))]);

-        if (transmode)
-        {
-            if (ch != 255) *p = trans[*p|(pal[ch]<<8)];
-        }
-        else
-        {
-            if (ch != 255) *p = trans[(*p<<8)|pal[ch]];
-        }
+        if (ch != 255)
+            *p = trans[transmode ? *p|(pal[ch]<<8) : (*p<<8)|pal[ch]];

        slopalptr--;
        p += pinc;
@ -5314,8 +5310,8 @@ static void drawvox(int32_t dasprx, int32_t daspry, int32_t dasprz, int32_t dasp
    daxscale = scale(daxscale,xdimenscale,xdimen<<8);
    dayscale = scale(dayscale,mulscale16(xdimenscale,viewingrangerecip),xdimen<<8);

-    daxscalerecip = (1<<30)/daxscale;
-    dayscalerecip = (1<<30)/dayscale;
+    daxscalerecip = tabledivide32_noinline(1<<30, daxscale);
+    dayscalerecip = tabledivide32_noinline(1<<30, dayscale);

    longptr = (int32_t *)davoxptr;
    daxsiz = B_LITTLE32(longptr[0]); daysiz = B_LITTLE32(longptr[1]); //dazsiz = B_LITTLE32(longptr[2]);
@ -5772,7 +5768,22 @@ draw_as_face_sprite:
        if ((cstat&8) > 0)
            swaplong(&y1, &y2);

-        for (x=lx; x<=rx; x++)
+        x = lx;
+#ifdef CLASSIC_SLICE_BY_4
+        for (; x<=rx-4; x+=4)
+        {
+            uwall[x] =   max(startumost[x+windowx1]-windowy1,   (int16_t) startum);
+            uwall[x+1] = max(startumost[x+windowx1+1]-windowy1, (int16_t) startum);
+            uwall[x+2] = max(startumost[x+windowx1+2]-windowy1, (int16_t) startum);
+            uwall[x+3] = max(startumost[x+windowx1+3]-windowy1, (int16_t) startum);
+
+            dwall[x] =   min(startdmost[x+windowx1]-windowy1,   (int16_t) startdm);
+            dwall[x+1] = min(startdmost[x+windowx1+1]-windowy1, (int16_t) startdm);
+            dwall[x+2] = min(startdmost[x+windowx1+2]-windowy1, (int16_t) startdm);
+            dwall[x+3] = min(startdmost[x+windowx1+3]-windowy1, (int16_t) startdm);
+        }
+#endif
+        for (; x<=rx; x++)
        {
            uwall[x] = max(startumost[x+windowx1]-windowy1,(int16_t)startum);
            dwall[x] = min(startdmost[x+windowx1]-windowy1,(int16_t)startdm);
@ -5801,13 +5812,31 @@ draw_as_face_sprite:
                break;
            case 1:
                k = smoststart[i] - xb1[j];
-                for (x=dalx2; x<=darx2; x++)
+                x = dalx2;
+#ifdef CLASSIC_SLICE_BY_4 // ok, this one is really by 2 ;)
+                for (x=dalx2; x<=darx2-2; x+=2)
+                {
+                    if (smost[k+x] > uwall[x]) uwall[x] = smost[k+x];
+                    if (smost[k+x+1] > uwall[x+1]) uwall[x+1] = smost[k+x+1];
+                }
+#endif
+                for (; x<=darx2; x++)
                    if (smost[k+x] > uwall[x]) uwall[x] = smost[k+x];
                if ((dalx2 == lx) && (darx2 == rx)) daclip |= 1;
                break;
            case 2:
                k = smoststart[i] - xb1[j];
-                for (x=dalx2; x<=darx2; x++)
+                x = dalx2;
+#ifdef CLASSIC_SLICE_BY_4
+                for (; x<=darx2-4; x+=4)
+                {
+                    if (smost[k+x] < dwall[x]) dwall[x] = smost[k+x];
+                    if (smost[k+x+1] < dwall[x+1]) dwall[x+1] = smost[k+x+1];
+                    if (smost[k+x+2] < dwall[x+2]) dwall[x+2] = smost[k+x+2];
+                    if (smost[k+x+3] < dwall[x+3]) dwall[x+3] = smost[k+x+3];
+                }
+#endif
+                for (; x<=darx2; x++)
                    if (smost[k+x] < dwall[x]) dwall[x] = smost[k+x];
                if ((dalx2 == lx) && (darx2 == rx)) daclip |= 2;
                break;
@ -7842,12 +7871,41 @@ static void dosetaspect(void)
        oxyaspect = xyaspect;
        j = xyaspect*320;
        horizlookup2[horizycent-1] = divscale26(131072,j);
-        for (i=ydim*4-1; i>=0; i--)
-            if (i != (horizycent-1))
-            {
-                horizlookup[i] = divscale28(1,i-(horizycent-1));
-                horizlookup2[i] = divscale14(klabs(horizlookup[i]),j);
-            }
+        for (i=0; i < horizycent-1-4; i += 4)
+        {
+            horizlookup[i]   = divscale28(1, i  -(horizycent-1));
+            horizlookup[i+1] = divscale28(1, i+1-(horizycent-1));
+            horizlookup[i+2] = divscale28(1, i+2-(horizycent-1));
+            horizlookup[i+3] = divscale28(1, i+3-(horizycent-1));
+
+            horizlookup2[i]   = divscale14(klabs(horizlookup[i]),   j);
+            horizlookup2[i+1] = divscale14(klabs(horizlookup[i+1]), j);
+            horizlookup2[i+2] = divscale14(klabs(horizlookup[i+2]), j);
+            horizlookup2[i+3] = divscale14(klabs(horizlookup[i+3]), j);
+        }
+        for (; i < horizycent-1; i++)
+        {
+            horizlookup[i] = divscale28(1, i-(horizycent-1));
+            horizlookup2[i] = divscale14(klabs(horizlookup[i]), j);
+        }
+
+        for (i=horizycent; i < ydim*4-1-4; i += 4)
+        {
+            horizlookup[i]   = divscale28(1, i  -(horizycent-1));
+            horizlookup[i+1] = divscale28(1, i+1-(horizycent-1));
+            horizlookup[i+2] = divscale28(1, i+2-(horizycent-1));
+            horizlookup[i+3] = divscale28(1, i+3-(horizycent-1));
+
+            horizlookup2[i]   = divscale14(klabs(horizlookup[i]),   j);
+            horizlookup2[i+1] = divscale14(klabs(horizlookup[i+1]), j);
+            horizlookup2[i+2] = divscale14(klabs(horizlookup[i+2]), j);
+            horizlookup2[i+3] = divscale14(klabs(horizlookup[i+3]), j);
+        }
+        for (; i < ydim*4; i++)  // include i == ydim*4-1, as the removed descending loop did; i starts past horizycent-1 so the divisor is never 0
+        {
+            horizlookup[i] = divscale28(1, i-(horizycent-1));
+            horizlookup2[i] = divscale14(klabs(horizlookup[i]), j);
+        }
    }

    if (xdimen != oxdimen || viewingrange != oviewingrange)
@ -7856,7 +7914,6 @@ static void dosetaspect(void)

        no_radarang2 = 0;
        oviewingrange = viewingrange;
-        oxdimen = xdimen;

        xinc = mulscale32(viewingrange*320,xdimenrecip);
        x = (640<<16)-mulscale1(xinc,xdimen);
@ -7880,15 +7937,28 @@ static void dosetaspect(void)
            radarang2[i] = (int16_t)((radarang[k]+j)>>6);
        }

+        if (xdimen != oxdimen)
        {
            EDUKE32_STATIC_ASSERT((uint64_t) MAXXDIM*(ARRAY_SIZE(distrecip)-1) <= INT32_MAX);

-            for (i=1; i<(int32_t) ARRAY_SIZE(distrecip); i++)
+            i = 1;
+
+#ifdef CLASSIC_SLICE_BY_4
+            for (; i<(int32_t) ARRAY_SIZE(distrecip)-4; i+=4)
+            {
                distrecip[i] = (xdimen * i)>>20;
+                distrecip[i+1] = (xdimen * (i+1))>>20;
+                distrecip[i+2] = (xdimen * (i+2))>>20;
+                distrecip[i+3] = (xdimen * (i+3))>>20;
+            }
+#endif
+            for (; i<(int32_t) ARRAY_SIZE(distrecip); i++)
+                distrecip[i] = (xdimen * i)>>20;
+
+            nytooclose = xdimen*2100;
        }

-        nytooclose = xdimen*2100;
-        nytoofar = 65536*16384-1048576;
+        oxdimen = xdimen;
    }
 }

@ -7920,9 +7990,19 @@ static int32_t loadtables(void)
    if (tablesloaded == 0)
    {
        int32_t i;
+        libdivide_s64_t d;
+        libdivide_s32_t d32;

        initksqrt();

+        for (i=1; i<DIVTABLESIZE; i++)
+        {
+            d = libdivide_s64_gen(i);
+            divtable64[i].magic = d.magic, divtable64[i].more = d.more;
+            d32 = libdivide_s32_gen(i);
+            divtable32[i].magic = d32.magic, divtable32[i].more = d32.more;
+        }
+
        for (i=0; i<2048; i++)
            reciptable[i] = divscale30(2048, i+2048);

@ -9569,8 +9649,8 @@ killsprite:
            p1eq = equation(pos.x, pos.y, dot.x, dot.y);
            p2eq = equation(pos.x, pos.y, dot2.x, dot2.y);

-            middle.x = (dot.x + dot2.x) / 2;
-            middle.y = (dot.y + dot2.y) / 2;
+            middle.x = (dot.x + dot2.x) * .5f;
+            middle.y = (dot.y + dot2.y) * .5f;

            i = spritesortcnt;
            while (i)
@ -9963,11 +10043,11 @@ void drawmapview(int32_t dax, int32_t day, int32_t zoome, int16_t ang)

            //relative alignment stuff
            ox = x2-x1; oy = y2-y1;
-            i = ox*ox+oy*oy; if (i == 0) continue; i = (65536*16384)/i;
+            i = ox*ox+oy*oy; if (i == 0) continue; i = tabledivide32_noinline(65536*16384, i);
            globalx1 = mulscale10(dmulscale10(ox,bakgxvect,oy,bakgyvect),i);
            globaly1 = mulscale10(dmulscale10(ox,bakgyvect,-oy,bakgxvect),i);
            ox = y1-y4; oy = x4-x1;
-            i = ox*ox+oy*oy; if (i == 0) continue; i = (65536*16384)/i;
+            i = ox*ox+oy*oy; if (i == 0) continue; i = tabledivide32_noinline(65536*16384, i);
            globalx2 = mulscale10(dmulscale10(ox,bakgxvect,oy,bakgyvect),i);
            globaly2 = mulscale10(dmulscale10(ox,bakgyvect,-oy,bakgxvect),i);

@ -13170,14 +13250,14 @@ static int32_t clipsprite_try(const spritetype *spr, int32_t xmin, int32_t ymin,
        if ((spr->cstat&48)!=32)  // face/wall sprite
        {
            int32_t tempint1 = clipmapinfo.sector[k].CM_XREPEAT;
-            maxcorrection = (maxcorrection * (int32_t)spr->xrepeat)/tempint1;
+            maxcorrection = tabledivide32_noinline(maxcorrection * (int32_t)spr->xrepeat, tempint1);
        }
        else  // floor sprite
        {
            int32_t tempint1 = clipmapinfo.sector[k].CM_XREPEAT;
            int32_t tempint2 = clipmapinfo.sector[k].CM_YREPEAT;
-            maxcorrection = max((maxcorrection * (int32_t)spr->xrepeat)/tempint1,
-                                (maxcorrection * (int32_t)spr->yrepeat)/tempint2);
+            maxcorrection = max(tabledivide32_noinline(maxcorrection * (int32_t)spr->xrepeat, tempint1),
+                                tabledivide32_noinline(maxcorrection * (int32_t)spr->yrepeat, tempint2));
        }

        maxcorrection -= MAXCLIPDIST;
@ -15140,9 +15220,9 @@ void clearview(int32_t dacol)
    {
        palette_t p = getpal(dacol);

-        bglClearColor(((float)p.r)/255.0,
-                      ((float)p.g)/255.0,
-                      ((float)p.b)/255.0,
+        bglClearColor((float)p.r * (1.f/255.f),
+                      (float)p.g * (1.f/255.f),
+                      (float)p.b * (1.f/255.f),
                      0);
        bglClear(GL_COLOR_BUFFER_BIT);
        return;
@ -15179,9 +15259,9 @@ void clearallviews(int32_t dacol)
        palette_t p = getpal(dacol);

        bglViewport(0,0,xdim,ydim); glox1 = -1;
-        bglClearColor(((float)p.r)/255.0,
-                      ((float)p.g)/255.0,
-                      ((float)p.b)/255.0,
+        bglClearColor((float)p.r * (1.f/255.f),
+                      (float)p.g * (1.f/255.f),
+                      (float)p.b * (1.f/255.f),
                      0);
        bglClear(GL_COLOR_BUFFER_BIT);
        return;
@ -15740,8 +15820,8 @@ void drawline256(int32_t x1, int32_t y1, int32_t x2, int32_t y2, char col)
        //bglEnable(GL_BLEND);	// When using line antialiasing, this is needed
        bglBegin(GL_LINES);
        bglColor4ub(p.r,p.g,p.b,255);
-        bglVertex2f((float)x1/4096.0,(float)y1/4096.0);
-        bglVertex2f((float)x2/4096.0,(float)y2/4096.0);
+        bglVertex2f((float)x1 * (1.f/4096.f), (float)y1 * (1.f/4096.f));
+        bglVertex2f((float)x2 * (1.f/4096.f), (float)y2 * (1.f/4096.f));
        bglEnd();
        //bglDisable(GL_BLEND);

--- a/polymer/eduke32/build/src/kplib.c
+++ b/polymer/eduke32/build/src/kplib.c
@ -37,6 +37,7 @@ credits.
 #include <sys/stat.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include "pragmas.h"

 #if defined(__POWERPC__) || defined(GEKKO)
 #define BIGENDIAN 1
@ -820,14 +821,14 @@ static int32_t kpngrend(const char *kfilebuf, int32_t kfilength,
            //Save code by making grayscale look like a palette color scheme
            if ((!kcoltype) || (kcoltype == 4))
            {
-                j = 0xff000000; k = (255 / ((1<<bitdepth)-1))*0x10101;
+                j = 0xff000000; k = (tabledivide32(255, ((1<<bitdepth)-1)))*0x10101;
                paleng = (1<<bitdepth);
                for (i=0; i<paleng; i++,j+=k) palcol[i] = LSWAPIB(j);
            }
        }
        else if (i == (int32_t)LSWAPIB(0x45544c50)) //PLTE (must be before IDAT)
        {
-            paleng = leng/3;
+            paleng = tabledivide32(leng, 3);
            for (i=paleng-1; i>=0; i--) palcol[i] = LSWAPIB((LSWAPIL(*(int32_t *)&filptr[i*3])>>8)|0xff000000);
        }
        else if (i == (int32_t)LSWAPIB(0x44474b62)) //bKGD (must be after PLTE and before IDAT)
@ -835,7 +836,7 @@ static int32_t kpngrend(const char *kfilebuf, int32_t kfilength,
            switch (kcoltype)
            {
            case 0: case 4:
-                        bakcol = (((int32_t)filptr[0]<<8)+(int32_t)filptr[1])*255/((1<<bitdepth)-1);
+                        bakcol = tabledivide32((((int32_t)filptr[0]<<8)+(int32_t)filptr[1])*255, (1<<bitdepth)-1);  // multiply before dividing: 255/((1<<16)-1) truncates to 0 for 16-bit gray
                bakcol = bakcol*0x10101+0xff000000; break;
            case 2: case 6:
                        if (bitdepth == 8)
@ -843,7 +844,7 @@ static int32_t kpngrend(const char *kfilebuf, int32_t kfilength,
                        else
                        {
                            for (i=0,bakcol=0xff000000; i<3; i++)
-                                bakcol += ((((((int32_t)filptr[i<<1])<<8)+((int32_t)filptr[(i<<1)+1]))/257)<<(16-(i<<3)));
+                                bakcol += tabledivide32(((((int32_t)filptr[i<<1])<<8)+((int32_t)filptr[(i<<1)+1])), 257)<<(16-(i<<3));
                        }
                break;
            case 3:
@ -2202,7 +2203,7 @@ static int32_t kddsrend(const char *buf, int32_t leng,

    if (!(dxt&1))
    {
-        for (z=256-1; z>0; z--) lut[z] = (255<<16)/z;
+        for (z=256-1; z>0; z--) lut[z] = tabledivide32_noinline(255<<16, z);
        lut[0] = (1<<16);
    }
    if (dxt == 1) stride = (xsiz<<1); else stride = (xsiz<<2);
@ -2419,7 +2420,7 @@ int32_t kprender(const char *buf, int32_t leng, intptr_t frameptr, int32_t bpl,

 extern char toupperlookup[256];

-static int32_t wildmatch(const char *match, const char *wild)
+int32_t wildmatch(const char *match, const char *wild)
 {
    do
    {
@ -2428,13 +2429,13 @@ static int32_t wildmatch(const char *match, const char *wild)
            wild++, match++;
            continue;
        }
-        else if (*match + *wild == '\0')
+        else if ((*match|*wild) == '\0')
            return 1;
        else if (*wild == '*')
        {
            while (*wild == '*') wild++;
            if (*wild == '\0') return 1;
-            while (toupperlookup[*match] != toupperlookup[*wild] && *match) match++;
+            while (*match && toupperlookup[*match] != toupperlookup[*wild]) match++;
            if (toupperlookup[*match] == toupperlookup[*wild])
                continue;
        }
--- a/polymer/eduke32/build/src/mdsprite.c
+++ b/polymer/eduke32/build/src/mdsprite.c
@ -1004,10 +1004,7 @@ void updateanimation(md2model_t *m, const spritetype *tspr, uint8_t lpal)
        return;
    }

-    if (smooth->mdsmooth)  // VERIFY: (smooth->mdsmooth) implies (tile2model[tile].smoothduration!=0) ?
-        ftol((1.0f / (float)(tile2model[tile].smoothduration)) * 66.f, &fps);
-    else
-        fps = anim->fpssc;
+    fps = smooth->mdsmooth ? Blrintf((1.0f / (float) (tile2model[tile].smoothduration)) * 66.f) : anim->fpssc;

    i = (mdtims - sprext->mdanimtims)*((fps*timerticspersec)/120);

--- a/polymer/eduke32/build/src/osd.c
+++ b/polymer/eduke32/build/src/osd.c
@ -1459,7 +1459,7 @@ void OSD_Draw(void)
            while (j > -1)
            {
                osdrowscur++;
-                j -= 200/osd->draw.rows;
+                j -= tabledivide32_noinline(200, osd->draw.rows);
                if (osdrowscur > osd->draw.rows-1)
                    break;
            }
@ -1470,7 +1470,7 @@ void OSD_Draw(void)
            while (j > -1)
            {
                osdrowscur--;
-                j -= 200/osd->draw.rows;
+                j -= tabledivide32_noinline(200, osd->draw.rows);
                if (osdrowscur < 1)
                    break;
            }
--- a/polymer/eduke32/build/src/polymer.c
+++ b/polymer/eduke32/build/src/polymer.c
@ -2830,9 +2830,7 @@ static float calc_ypancoef(char curypanning, int16_t curpicnum, int32_t dopancor

        if (dopancor)
        {
-            int32_t yoffs;
-
-            ftol((ypancoef - tilesiz[curpicnum].y) * (255.0f / ypancoef), &yoffs);
+            int32_t yoffs = Blrintf((ypancoef - tilesiz[curpicnum].y) * (255.0f / ypancoef));
            if (curypanning > 256 - yoffs)
                curypanning -= yoffs;
        }
--- a/polymer/eduke32/build/src/polymost.c
+++ b/polymer/eduke32/build/src/polymost.c
@ -558,8 +558,9 @@ static inline void fogcalc(int32_t tile, int32_t shade, int32_t vis, int32_t pal
        }
        else
        {
-            fogresult = (r_usenewshading == 3 && shade > 0) ? 0 : -(FOGDISTCONST * shade)/combvis;
-            fogresult2 = (FOGDISTCONST * (numshades-1-shade))/combvis;
+            combvis = 1.f/combvis;
+            fogresult = (r_usenewshading == 3 && shade > 0) ? 0 : -(FOGDISTCONST * shade) * combvis;
+            fogresult2 = (FOGDISTCONST * (numshades-1-shade)) * combvis;
        }
    }
 }
@ -663,7 +664,7 @@ static void resizeglcheck(void)
    if ((glox1 != windowx1) || (gloy1 != windowy1) || (glox2 != windowx2) || (gloy2 != windowy2))
    {
        const int32_t ourxdimen = (windowx2-windowx1+1);
-        const float ratio = get_projhack_ratio();
+        float ratio = get_projhack_ratio();
        const int32_t fovcorrect = (ratio==0) ? 0 : (int32_t)(ourxdimen*ratio - ourxdimen);
        float m[4][4];

@ -675,9 +676,10 @@ static void resizeglcheck(void)

        bglMatrixMode(GL_PROJECTION);
        memset(m,0,sizeof(m));
-        m[0][0] = fydimen / ratio; m[0][2] = 1.f;
+        ratio = 1.f/ratio;
+        m[0][0] = fydimen * ratio; m[0][2] = 1.f;
        m[1][1] = fxdimen; m[1][2] = 1.f;
-        m[2][2] = 1.f; m[2][3] = fydimen / ratio;
+        m[2][2] = 1.f; m[2][3] = fydimen * ratio;
        m[3][2] =-1.f;
        bglLoadMatrixf(&m[0][0]);

@ -2380,9 +2382,7 @@ static void calc_ypanning(int32_t refposz, float ryp0, float ryp1,
    {
        // Carry out panning "correction" to make it look like classic in some
        // cases, but failing in the general case.
-        int32_t yoffs;
-
-        ftol((i-tilesiz[globalpicnum].y)*(255.f/i), &yoffs);
+        int32_t yoffs = Blrintf((i-tilesiz[globalpicnum].y)*(255.f/i));

        if (ypan > 256-yoffs)
            ypan -= yoffs;
@ -2543,11 +2543,11 @@ static void polymost_drawalls(int32_t bunch)
                else domost(x0,fy0,x1,fy1);

                if (r_parallaxskypanning)
-                    vv[0] += dd[0]*((float)sec->floorypanning)*((float)i)/256.0;
+                    vv[0] += dd[0]*((float)sec->floorypanning)*((float)i)*(1.f/256.f);

                gdx = 0; gdy = 0; gdo = dd[0];
                gux = gdo *
-                    (t * (float) ((uint64_t) (xdimscale * yxaspect) * viewingrange)) / (16384.0*65536.0*65536.0*5.0*1024.0);
+                    (t * (float) ((uint64_t) (xdimscale * yxaspect) * viewingrange)) * (1.f/(16384.0*65536.0*65536.0*5.0*1024.0));
                guy = 0; //guo calculated later
                gvx = 0; gvy = vv[1]; gvo = vv[0];

@ -2559,7 +2559,7 @@ static void polymost_drawalls(int32_t bunch)
                do
                {
                    globalpicnum = dapskyoff[y&((1<<dapskybits)-1)]+i;
-                    guo = gdo*(t*((float)(globalang-(y<<(11-dapskybits))))/2048.0 + (float)((r_parallaxskypanning)?sec->floorxpanning:0)) - gux*ghalfx;
+                    guo = gdo*(t*((float)(globalang-(y<<(11-dapskybits)))) * (1.f/2048.f) + (float)((r_parallaxskypanning)?sec->floorxpanning:0)) - gux*ghalfx;
                    y++;
                    ox = fx; fx = ((float)((y<<(11-dapskybits))-globalang))*oz+ghalfx;
                    if (fx > x1) { fx = x1; i = -1; }
@ -2798,7 +2798,7 @@ static void polymost_drawalls(int32_t bunch)
                i = (1<<(picsiz[globalpicnum]>>4)); if (i != tilesiz[globalpicnum].y) i += i;

                //Hack to draw black rectangle below sky when looking down...
-                gdx = 0; gdy = gxyaspect / -262144.f; gdo = -ghoriz*gdy;
+                gdx = 0; gdy = gxyaspect * (1.f/-262144.f); gdo = -ghoriz*gdy;
                gux = 0; guy = 0; guo = 0;
                gvx = 0; gvy = 0; gvo = 0;
                oy = -vv[0]/vv[1];
@ -2819,7 +2819,7 @@ static void polymost_drawalls(int32_t bunch)
                else domost(x1,cy1,x0,cy0);

                if (r_parallaxskypanning)
-                    vv[0] += dd[0]*((float)sec->ceilingypanning)*((float)i)/256.f;
+                    vv[0] += dd[0]*(float)sec->ceilingypanning*(float)i/256.f;
                
                gdx = 0; gdy = 0; gdo = dd[0];
                gux = gdo * 
@ -2835,7 +2835,7 @@ static void polymost_drawalls(int32_t bunch)
                do
                {
                    globalpicnum = dapskyoff[y&((1<<dapskybits)-1)]+i;
-                    guo = gdo*(t*((float)(globalang-(y<<(11-dapskybits))))/2048.0 + (float)((r_parallaxskypanning)?sec->ceilingxpanning:0)) - gux*ghalfx;
+                    guo = gdo*(t*((float)(globalang-(y<<(11-dapskybits)))) * 1.f/2048.f + (float)((r_parallaxskypanning)?sec->ceilingxpanning:0)) - gux*ghalfx;
                    y++;
                    ox = fx; fx = ((float)((y<<(11-dapskybits))-globalang))*oz+ghalfx;
                    if (fx > x1) { fx = x1; i = -1; }
--- a/polymer/eduke32/build/src/pragmas.c
+++ b/polymer/eduke32/build/src/pragmas.c
@ -7,167 +7,19 @@
 // inline versions. I'll eventually convert these to macro-inline
 // equivalents.		--Jonathon

-//#include "pragmas.h"
 #include "compat.h"
+#include "pragmas.h"
+
+libdivide_s64pad_t divtable64[DIVTABLESIZE]; // 64-bit libdivide entries; presumably indexed by divisor like divtable32 -- confirm
+libdivide_s32pad_t divtable32[DIVTABLESIZE]; // 32-bit libdivide entries, looked up by divisor value when it is below DIVTABLESIZE
+
+uint32_t divideu32_noinline(uint32_t n, uint32_t d) { return divideu32(n, d); } // out-of-line wrapper around divideu32()
+int32_t tabledivide32_noinline(int32_t n, int32_t d) { return tabledivide32(n, d); } // out-of-line wrapper around tabledivide32()
+int32_t tabledivide64_noinline(int64_t n, int32_t d) { return tabledivide64(n, d); } // NOTE(review): 64-bit numerator but int32_t result; a quotient wider than 32 bits would be truncated -- confirm intended

 int32_t dmval;

-#if defined(__GNUC__) && defined(GEKKO)
-
-// naked function (no prolog/epilog)
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wunused-parameter"
-#pragma GCC diagnostic ignored "-Wreturn-type"
-int32_t scale(int32_t a, int32_t d, int32_t c) ATTRIBUTE((naked));
-int32_t scale(int32_t a, int32_t d, int32_t c)
-{
-//	return ((int64_t)a * d) / c;
-
-	__asm__ __volatile__ (
-		" mullw   6, 3, 4\n"
-		" mulhw   4, 3, 4\n"
-		" mr      3, 6\n"
-
-		" srawi.  0, 5, 31\n"
-		" cmpwi cr1, 4, 0\n"
-		" crxor   7, 0, 4\n"
-
-		" xor     5, 0, 5\n"
-		" subf.   5, 0, 5\n"
-
-		" beq     DivByZero\n"
-		" bge   cr1, Div64Common\n"
-
-		" subfic  3, 3, 0\n"
-		" subfze  4, 4\n"
-
-		"Div64Common:\n"
-		" cmplw   4, 5\n"
-
-		" cntlzw  6, 5\n"
-		" xor     4, 4, 3\n"
-		" slw     5, 5, 6\n"
-		" rotlw   4, 4, 6\n"
-		" slw     3, 3, 6\n"
-		" li      7, 2\n"
-		" xor     4, 4, 3\n"
-
-		" bge DivOverflow\n"
-		" mtctr   7\n"
-
-		"Div64Compute:\n"
-		" srwi    6, 5, 16\n"
-		" divwu   7, 4, 6\n"
-		" mullw   6, 7, 6\n"
-		" subf    4, 6, 4\n"
-		" slwi    4, 4, 16\n"
-		" inslwi  4, 3, 16, 16\n"
-		" slwi    3, 3, 16\n"
-		" clrlwi  6, 5, 16\n"
-		" mullw   6, 7, 6\n"
-		" subfc   4, 6, 4\n"
-		" subfe.  6, 6, 6\n"
-		" add     3, 3, 7\n"
-		" bge Div64Done\n"
-		"Div64Correct:\n"
-		" addc    4, 4, 5\n"
-		" addze.  6, 6\n"
-		" subi    3, 3, 1\n"
-		" blt     Div64Correct\n"
-
-		"Div64Done:\n"
-		" bdnz    Div64Compute\n"
-
-		" cmpwi   3, 0\n"
-		" bso   cr1, Div64QuotientNeg\n"
-
-		" blt     DivOverflow\n"
-		" blr\n"
-
-		"Div64QuotientNeg:\n"
-		" neg.    3, 3\n"
-		" blelr\n"
-
-		"DivOverflow:\n"
-		" cror    4, 7, 7\n"
-
-		"DivByZero:\n"
-		" lis     3, 0x8000\n"
-		" bltlr cr1\n"
-		" subi    3, 3, 1\n"
-		" blr\n"
-	);
-}
-#pragma GCC diagnostic pop
-
-void clearbufbyte(void *d, int32_t c, int32_t a)
-{
-	if (a==0) {
-		uint8_t *dd = (uint8_t*)d;
-		int32_t align = (32 - (int32_t)d) & 31;
-
-		if (align && c >= align) {
-			uint32_t izero = 0;
-			double fzero = 0;
-			c -= align;
-
-			if (align&1) {
-				*dd = izero;
-				dd += 1;
-			}
-			if (align&2) {
-				*(uint16_t*)dd = izero;
-				dd += 2;
-			}
-			if (align&4) {
-				*(uint32_t*)dd = izero;
-				dd += 4;
-			}
-			if (align&8) {
-				*(double*)dd = fzero;
-				dd += 8;
-			}
-			if (align&16) {
-				*(double*)dd = fzero;
-				*(double*)(dd+8) = fzero;
-				dd += 16;
-			}
-		}
-		align = c >> 5;
-		while (align) {
-			__asm__ (
-				" dcbz  0, %0\n"
-				" addi %0, %0, 32\n"
-				: "+r"(dd)
-				:
-				: "memory"
-			);
-			align--;
-		}
-		if ((c &= 31)) {
-			while (c--) {
-				*dd++ = 0;
-			}
-		}
-		return;
-	}
-	__asm__ __volatile__ (
-		" add    %1, %1, %2\n"
-		" neg.   %2, %2\n"
-		" beq 2f\n"
-		"1:\n"
-		" stbx   %0, %1, %2\n"
-		" addic. %2, %2, 1\n"
-		" rotrwi %0, %0, 8\n"
-		" bne 1b\n"
-		"2:\n"
-		: "+r"(a), "+b"(d), "+r"(c)
-		:
-		: "cc", "xer", "memory"
-	);
-}
-
-#elif defined(__GNUC__) && defined(__i386__) && !defined(NOASM)	// NOASM
+#if defined(__GNUC__) && defined(__i386__) && !defined(NOASM)	// NOASM

 //
 // GCC Inline Assembler version
@ -297,7 +149,158 @@ void copybufreverse(const void *S, void *D, int32_t c)
 // Microsoft C Inline Assembler version
 //

-#else				// _MSC_VER
+#elif defined(__GNUC__) && defined(GEKKO)
+
+// Hand-written PowerPC version of scale(). It was declared naked (no prolog/epilog) before this move; NOTE(review): the ATTRIBUTE((naked)) prototype is not present here although the asm still returns via blr -- confirm this is safe
+// FIXME: this function produces unused parameter warnings and a missing return warning
+int32_t scale(int32_t a, int32_t d, int32_t c)
+{
+    // Intended result: ((int64_t)a * d) / c. Assumes a, d, c arrive in r3, r4, r5 and the result leaves in r3 -- TODO confirm ABI
+
+    __asm__ __volatile__ (
+        " mullw   6, 3, 4\n" // low word of r3*r4
+        " mulhw   4, 3, 4\n" // high word of r3*r4
+        " mr      3, 6\n"
+
+        " srawi.  0, 5, 31\n" // r0 = sign mask of r5
+        " cmpwi cr1, 4, 0\n"
+        " crxor   7, 0, 4\n"
+
+        " xor     5, 0, 5\n"
+        " subf.   5, 0, 5\n" // r5 = (r5 ^ mask) - mask, i.e. its magnitude; also sets cr0
+
+        " beq     DivByZero\n" // magnitude of the divisor is zero
+        " bge   cr1, Div64Common\n"
+
+        " subfic  3, 3, 0\n"
+        " subfze  4, 4\n"
+
+        "Div64Common:\n"
+        " cmplw   4, 5\n"
+
+        " cntlzw  6, 5\n"
+        " xor     4, 4, 3\n"
+        " slw     5, 5, 6\n"
+        " rotlw   4, 4, 6\n"
+        " slw     3, 3, 6\n"
+        " li      7, 2\n" // loop count for Div64Compute (moved to CTR below)
+        " xor     4, 4, 3\n"
+
+        " bge DivOverflow\n"
+        " mtctr   7\n"
+
+        "Div64Compute:\n"
+        " srwi    6, 5, 16\n"
+        " divwu   7, 4, 6\n"
+        " mullw   6, 7, 6\n"
+        " subf    4, 6, 4\n"
+        " slwi    4, 4, 16\n"
+        " inslwi  4, 3, 16, 16\n"
+        " slwi    3, 3, 16\n"
+        " clrlwi  6, 5, 16\n"
+        " mullw   6, 7, 6\n"
+        " subfc   4, 6, 4\n"
+        " subfe.  6, 6, 6\n"
+        " add     3, 3, 7\n"
+        " bge Div64Done\n"
+        "Div64Correct:\n"
+        " addc    4, 4, 5\n"
+        " addze.  6, 6\n"
+        " subi    3, 3, 1\n"
+        " blt     Div64Correct\n"
+
+        "Div64Done:\n"
+        " bdnz    Div64Compute\n" // decrement CTR and repeat while it is non-zero
+
+        " cmpwi   3, 0\n"
+        " bso   cr1, Div64QuotientNeg\n"
+
+        " blt     DivOverflow\n"
+        " blr\n"
+
+        "Div64QuotientNeg:\n"
+        " neg.    3, 3\n"
+        " blelr\n"
+
+        "DivOverflow:\n"
+        " cror    4, 7, 7\n"
+
+        "DivByZero:\n"
+        " lis     3, 0x8000\n" // r3 = 0x80000000
+        " bltlr cr1\n" // return that value when cr1 'lt' is set
+        " subi    3, 3, 1\n" // otherwise r3 = 0x7fffffff
+        " blr\n"
+        );
+}
+
+void clearbufbyte(void *d, int32_t c, int32_t a) // fills c bytes at d from the bytes of a; a == 0 takes a cache-block zeroing path
+{
+    if (a==0) {
+        uint8_t *dd = (uint8_t*)d;
+        int32_t align = (32 - (int32_t)d) & 31; // bytes up to the next 32-byte boundary (pointer cast assumes 32-bit pointers)
+
+        if (align && c >= align) {
+            uint32_t izero = 0;
+            double fzero = 0;
+            c -= align;
+
+            // Each set bit of align is handled by one store of that size, stepping dd up to the boundary.
+            if (align&1) {
+                *dd = izero;
+                dd += 1;
+            }
+            if (align&2) {
+                *(uint16_t*)dd = izero;
+                dd += 2;
+            }
+            if (align&4) {
+                *(uint32_t*)dd = izero;
+                dd += 4;
+            }
+            if (align&8) {
+                *(double*)dd = fzero;
+                dd += 8;
+            }
+            if (align&16) {
+                *(double*)dd = fzero;
+                *(double*)(dd+8) = fzero;
+                dd += 16;
+            }
+        }
+        align = c >> 5; // align is reused as the number of whole 32-byte chunks
+        while (align) {
+            __asm__ (
+                " dcbz  0, %0\n" // zero the data cache block at dd; presumably 32 bytes on this target -- confirm
+                " addi %0, %0, 32\n"
+                : "+r"(dd)
+                :
+                : "memory"
+                );
+            align--;
+        }
+        if ((c &= 31)) { // leftover bytes after the 32-byte chunks
+            while (c--) {
+                *dd++ = 0;
+            }
+        }
+        return;
+    }
+    __asm__ __volatile__(
+        " add    %1, %1, %2\n" // d += c
+        " neg.   %2, %2\n" // c = -c; skip the loop when it is zero
+        " beq 2f\n"
+        "1:\n"
+        " stbx   %0, %1, %2\n" // store the low byte of a at d + c
+        " addic. %2, %2, 1\n"
+        " rotrwi %0, %0, 8\n" // rotate a right by 8 so the next byte is stored next
+        " bne 1b\n"
+        "2:\n"
+        : "+r"(a), "+b"(d), "+r"(c)
+        :
+        : "cc", "xer", "memory"
+        );
+}
+
+#else

 //
 // Generic C version
--- a/polymer/eduke32/eduke32.vcxproj
+++ b/polymer/eduke32/eduke32.vcxproj
@ -127,7 +127,7 @@
    <NMakeReBuildCommandLine Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nmake /f Makefile.msvc veryclean all DEBUG=1 WINBITS=64</NMakeReBuildCommandLine>
    <NMakeCleanCommandLine Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nmake /f Makefile.msvc veryclean WINBITS=64</NMakeCleanCommandLine>
    <NMakeOutput Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">eduke32.exe</NMakeOutput>
-    <NMakePreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">USE_OPENGL;POLYMER</NMakePreprocessorDefinitions>
+    <NMakePreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">USE_OPENGL;POLYMER;NOASM</NMakePreprocessorDefinitions>
    <NMakeIncludeSearchPath Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(NMakeIncludeSearchPath);build\include;source\jmact;source\jaudiolib\include;source\enet\include;</NMakeIncludeSearchPath>
    <NMakeForcedIncludes Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(NMakeForcedIncludes)</NMakeForcedIncludes>
    <NMakeAssemblySearchPath Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(NMakeAssemblySearchPath)</NMakeAssemblySearchPath>
@ -138,7 +138,7 @@
    <NMakeReBuildCommandLine Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nmake /f Makefile.msvc veryclean all WINBITS=64</NMakeReBuildCommandLine>
    <NMakeCleanCommandLine Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nmake /f Makefile.msvc veryclean WINBITS=64</NMakeCleanCommandLine>
    <NMakeOutput Condition="'$(Configuration)|$(Platform)'=='Release|x64'">eduke32.exe</NMakeOutput>
-    <NMakePreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">USE_OPENGL;POLYMER</NMakePreprocessorDefinitions>
+    <NMakePreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">USE_OPENGL;POLYMER;NOASM</NMakePreprocessorDefinitions>
    <NMakeIncludeSearchPath Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(NMakeIncludeSearchPath);build\include;source\jmact;source\jaudiolib\include;source\enet\include;</NMakeIncludeSearchPath>
    <NMakeForcedIncludes Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(NMakeForcedIncludes)</NMakeForcedIncludes>
    <NMakeAssemblySearchPath Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(NMakeAssemblySearchPath)</NMakeAssemblySearchPath>
@ -172,7 +172,7 @@
    <NMakeReBuildCommandLine Condition="'$(Configuration)|$(Platform)'=='Debug-SDL|x64'">nmake /f Makefile.msvc veryclean all DEBUG=1 WINBITS=64 RENDERTYPE=SDL</NMakeReBuildCommandLine>
    <NMakeCleanCommandLine Condition="'$(Configuration)|$(Platform)'=='Debug-SDL|x64'">nmake /f Makefile.msvc veryclean WINBITS=64 RENDERTYPE=SDL</NMakeCleanCommandLine>
    <NMakeOutput Condition="'$(Configuration)|$(Platform)'=='Debug-SDL|x64'">eduke32.exe</NMakeOutput>
-    <NMakePreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Debug-SDL|x64'">USE_OPENGL;POLYMER</NMakePreprocessorDefinitions>
+    <NMakePreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Debug-SDL|x64'">USE_OPENGL;POLYMER;NOASM</NMakePreprocessorDefinitions>
    <NMakeIncludeSearchPath Condition="'$(Configuration)|$(Platform)'=='Debug-SDL|x64'">$(NMakeIncludeSearchPath);build\include;source\jmact;source\jaudiolib\include;source\enet\include;</NMakeIncludeSearchPath>
    <NMakeForcedIncludes Condition="'$(Configuration)|$(Platform)'=='Debug-SDL|x64'">$(NMakeForcedIncludes)</NMakeForcedIncludes>
    <NMakeAssemblySearchPath Condition="'$(Configuration)|$(Platform)'=='Debug-SDL|x64'">$(NMakeAssemblySearchPath)</NMakeAssemblySearchPath>
@ -183,7 +183,7 @@
    <NMakeReBuildCommandLine Condition="'$(Configuration)|$(Platform)'=='Release-SDL|x64'">nmake /f Makefile.msvc veryclean all WINBITS=64 RENDERTYPE=SDL</NMakeReBuildCommandLine>
    <NMakeCleanCommandLine Condition="'$(Configuration)|$(Platform)'=='Release-SDL|x64'">nmake /f Makefile.msvc veryclean WINBITS=64 RENDERTYPE=SDL</NMakeCleanCommandLine>
    <NMakeOutput Condition="'$(Configuration)|$(Platform)'=='Release-SDL|x64'">eduke32.exe</NMakeOutput>
-    <NMakePreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Release-SDL|x64'">USE_OPENGL;POLYMER</NMakePreprocessorDefinitions>
+    <NMakePreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Release-SDL|x64'">USE_OPENGL;POLYMER;NOASM</NMakePreprocessorDefinitions>
    <NMakeIncludeSearchPath Condition="'$(Configuration)|$(Platform)'=='Release-SDL|x64'">$(NMakeIncludeSearchPath);build\include;source\jmact;source\jaudiolib\include;source\enet\include;</NMakeIncludeSearchPath>
    <NMakeForcedIncludes Condition="'$(Configuration)|$(Platform)'=='Release-SDL|x64'">$(NMakeForcedIncludes)</NMakeForcedIncludes>
    <NMakeAssemblySearchPath Condition="'$(Configuration)|$(Platform)'=='Release-SDL|x64'">$(NMakeAssemblySearchPath)</NMakeAssemblySearchPath>
--- a/polymer/eduke32/source/actors.c
+++ b/polymer/eduke32/source/actors.c
@ -5029,7 +5029,7 @@ ACTOR_STATIC void G_MoveMisc(void)  // STATNUM 5
            case NEON5__STATIC:
            case NEON6__STATIC:

-                if ((g_globalRandom/(s->lotag+1)&31) > 4) s->shade = -127;
+                if ((tabledivide32_noinline(g_globalRandom, s->lotag+1)&31) > 4) s->shade = -127; // &31 masks the quotient, matching the replaced expression
                else s->shade = 127;
                goto BOLT;

@ -6300,7 +6300,7 @@ ACTOR_STATIC void G_MoveEffectors(void)   //STATNUM 3

            //    if(t[5] > 0) { t[5]--; break; }

-            if ((g_globalRandom/(sh+1)&31) < 4 && !t[2])
+            if ((tabledivide32_noinline(g_globalRandom, sh+1)&31) < 4 && !t[2]) // &31 masks the quotient, matching the replaced expression
            {
                //       t[5] = 4+(g_globalRandom&7);
                sc->ceilingpal = s->owner>>8;
@ -6337,7 +6337,7 @@ ACTOR_STATIC void G_MoveEffectors(void)   //STATNUM 3

        case SE_4_RANDOM_LIGHTS:

-            if ((g_globalRandom/(sh+1)&31) < 4)
+            if ((tabledivide32_noinline(g_globalRandom, sh+1)&31) < 4) // &31 masks the quotient, matching the replaced expression
            {
                t[1] = s->shade + (g_globalRandom&15);//Got really bright
                t[0] = s->shade + (g_globalRandom&15);
--- a/polymer/eduke32/source/android/in_android.c
+++ b/polymer/eduke32/source/android/in_android.c
@ -243,11 +243,11 @@ void CONTROL_Android_PollDevices(ControlInfo *info)
    //LOGI("CONTROL_Android_PollDevices %f %f",forwardmove,sidemove);
    //LOGI("CONTROL_Android_PollDevices %f %f",droidinput.pitch,droidinput.yaw);

-    info->dz     = (int32_t)nearbyintf(-droidinput.forwardmove * ANDROIDFORWARDMOVEFACTOR);
-    info->dx     = (int32_t)nearbyintf(droidinput.sidemove * ANDROIDSIDEMOVEFACTOR);
-    info->dpitch = (int32_t)nearbyint(droidinput.pitch * ANDROIDPITCHFACTOR +
+    info->dz     = (int32_t)Blrintf(-droidinput.forwardmove * ANDROIDFORWARDMOVEFACTOR);
+    info->dx     = (int32_t)Blrintf(droidinput.sidemove * ANDROIDSIDEMOVEFACTOR);
+    info->dpitch = (int32_t)Blrintf(droidinput.pitch * ANDROIDPITCHFACTOR +
            droidinput.pitch_joystick * ANDROIDPITCHFACTORJOYSTICK);
-    info->dyaw   = (int32_t)nearbyint(-droidinput.yaw * ANDROIDYAWFACTOR -
+    info->dyaw   = (int32_t)Blrintf(-droidinput.yaw * ANDROIDYAWFACTOR -
            droidinput.yaw_joystick * ANDROIDYAWFACTORJOYSTICK);

    /*
--- a/polymer/eduke32/source/demo.c
+++ b/polymer/eduke32/source/demo.c
@ -886,7 +886,7 @@ nextdemo_nomenu:
                    rotatesprite(120<<16,16<<16,32768,0,SLIDEBAR,0,0,2+8+16+1024,(xdim*125)/320,0,(xdim*155)/320,ydim-1);
                    rotatesprite(150<<16,16<<16,32768,0,SLIDEBAR,0,0,2+8+16+1024,(xdim*155)/320,0,xdim-1,ydim-1);

-                    j = (182<<16) - ((((120*(g_demo_totalCnt-g_demo_cnt))<<4)/g_demo_totalCnt)<<12);
+                    j = (182<<16) - (tabledivide32_noinline((120*(g_demo_totalCnt-g_demo_cnt))<<4, g_demo_totalCnt)<<12);
                    rotatesprite_fs(j,(16<<16)+(1<<15),32768,0,SLIDEBAR+1,0,0,2+8+16+1024);

                    j=(g_demo_totalCnt-g_demo_cnt)/REALGAMETICSPERSEC;
--- a/polymer/eduke32/source/game.c
+++ b/polymer/eduke32/source/game.c
@ -809,7 +809,7 @@ vec2_t G_ScreenText(const int32_t font,
            {
                size.x = xbetween;

-                xbetween = (length == 1) ? 0 : ((xbetween - linewidth) / (length - 1));
+                xbetween = (length == 1) ? 0 : tabledivide32_noinline((xbetween - linewidth), (length - 1));

                linewidth = size.x;
            }
@ -823,7 +823,7 @@ vec2_t G_ScreenText(const int32_t font,
        if (f & TEXT_YJUSTIFY)
        {
            const int32_t tempswap = ybetween;
-            ybetween = (lines == 1) ? 0 : ((ybetween - size.y) / (lines - 1));
+            ybetween = (lines == 1) ? 0 : tabledivide32_noinline(ybetween - size.y, lines - 1);
            size.y = tempswap;
        }

@ -1001,7 +1001,7 @@ vec2_t G_ScreenText(const int32_t font,

                    if (f & TEXT_XJUSTIFY)
                    {
-                        xbetween = (length == 1) ? 0 : ((xbetween - linewidth) / (length - 1));
+                        xbetween = (length == 1) ? 0 : tabledivide32_noinline(xbetween - linewidth, length - 1);

                        linewidth = size.x;
                    }
@ -2489,7 +2489,7 @@ static void G_PrintFPS(void)

        if (thisSec - LastSec)
        {
-            g_currentFrameRate = LastCount = FrameCount / (thisSec - LastSec);
+            g_currentFrameRate = LastCount = tabledivide32_noinline(FrameCount, thisSec - LastSec);
            LastSec = thisSec;
            FrameCount = 0;

@ -3483,7 +3483,9 @@ static void palaccum_add(palaccum_t *pa, const palette_t *pal, int32_t f)

 static void G_FadePalaccum(const palaccum_t *pa)
 {
-    setpalettefade(pa->r/pa->sumf, pa->g/pa->sumf, pa->b/pa->sumf, pa->maxf);
+    setpalettefade(tabledivide32_noinline(pa->r, pa->sumf), // each accumulated channel is divided by sumf before being passed on
+                   tabledivide32_noinline(pa->g, pa->sumf), // NOTE(review): pa->sumf == 0 is not checked here -- confirm callers rule it out
+                   tabledivide32_noinline(pa->b, pa->sumf), pa->maxf); // maxf is passed through unchanged
 }


@ -4502,7 +4504,7 @@ void G_DrawRooms(int32_t snum, int32_t smoothratio)
        else
        {
            tmpvr = vr;
-            tmpyx = (65536*ydim*8)/(xdim*5);
+            tmpyx = tabledivide32_noinline(65536*ydim*8, xdim*5);

            setaspect(mulscale16(tmpvr,viewingrange), yxaspect);
        }
@ -4581,7 +4583,7 @@ void G_DrawRooms(int32_t snum, int32_t smoothratio)
                setaspect(mulscale16(oviewingrange,i>>1), yxaspect);

                tmpvr = i>>1;
-                tmpyx = (65536*ydim*8)/(xdim*5);
+                tmpyx = tabledivide32_noinline(65536*ydim*8, xdim*5);
            }
        }
        else if (getrendermode() >= REND_POLYMOST && (ud.screen_tilting
@ -7646,7 +7648,7 @@ void G_DoSpriteAnimations(int32_t ourx, int32_t oury, int32_t oura, int32_t smoo
                l = s->z-actor[g_player[p].ps->i].floorz+(3<<8);
                // SET_SPRITE_NOT_TSPRITE
                if (l > 1024 && s->yrepeat > 32 && s->extra > 0)
-                    s->yoffset = (int8_t)(l/(s->yrepeat<<2));
+                    s->yoffset = (int8_t)tabledivide32_noinline(l, s->yrepeat<<2);
                else s->yoffset=0;
            }

@ -12908,8 +12910,8 @@ void A_SpawnWallGlass(int32_t i,int32_t wallnum,int32_t n)
    x1 -= ksgn(yv);
    y1 += ksgn(xv);

-    xv /= j;
-    yv /= j;
+    xv = tabledivide32_noinline(xv, j);
+    yv = tabledivide32_noinline(yv, j);

    for (j=n; j>0; j--)
    {
@ -12949,8 +12951,8 @@ void A_SpawnCeilingGlass(int32_t i,int32_t sectnum,int32_t n)
        x1 = wall[s].x;
        y1 = wall[s].y;

-        xv = (wall[s+1].x-x1)/(n+1);
-        yv = (wall[s+1].y-y1)/(n+1);
+        xv = tabledivide32_noinline(wall[s+1].x-x1, n+1);
+        yv = tabledivide32_noinline(wall[s+1].y-y1, n+1);

        for (j=n; j>0; j--)
        {
@ -12984,8 +12986,8 @@ void A_SpawnRandomGlass(int32_t i,int32_t wallnum,int32_t n)
    x1 = wall[wallnum].x;
    y1 = wall[wallnum].y;

-    xv = (wall[wall[wallnum].point2].x-wall[wallnum].x)/j;
-    yv = (wall[wall[wallnum].point2].y-wall[wallnum].y)/j;
+    xv = tabledivide32_noinline(wall[wall[wallnum].point2].x-wall[wallnum].x, j);
+    yv = tabledivide32_noinline(wall[wall[wallnum].point2].y-wall[wallnum].y, j);

    for (j=n; j>0; j--)
    {
--- a/polymer/eduke32/source/gameexec.c
+++ b/polymer/eduke32/source/gameexec.c
@ -266,7 +266,7 @@ int32_t A_GetFurthestAngle(int32_t iActor, int32_t angs)
        int32_t furthest_angle=0;
        int32_t d, j;
        int32_t greatestd = INT32_MIN;
-        int32_t angincs=2048/angs;
+        int32_t angincs=tabledivide32_noinline(2048, angs);
        hitdata_t hit;

        for (j=s->ang; j<(2048+s->ang); j+=angincs)
@ -303,7 +303,7 @@ int32_t A_FurthestVisiblePoint(int32_t iActor, spritetype *ts, int32_t *dax, int

        if ((!g_netServer && ud.multimode < 2) && ud.player_skill < 3)
            angincs = 2048/2;
-        else angincs = 2048/(1+(krand()&1));
+        else angincs = tabledivide32_noinline(2048, 1+(krand()&1));

        for (j=ts->ang; j<(2048+ts->ang); j+=(angincs-(krand()&511)))
        {
@ -4447,7 +4447,7 @@ finish_qsprintf:
                            /*OSD_Printf(OSDTEXT_GREEN "CON_RESIZEARRAY: resizing array %s from %d to %d\n",
                                aGameArrays[j].szLabel, aGameArrays[j].size, asize / GAR_ELTSZ);*/
                            aGameArrays[j].plValues = (intptr_t *)Xrealloc(aGameArrays[j].plValues, asize);
-                            aGameArrays[j].size = asize / GAR_ELTSZ;
+                            aGameArrays[j].size = asize/GAR_ELTSZ;
                            kread(fil, aGameArrays[j].plValues, asize);
                        }

@ -5513,7 +5513,11 @@ void A_Execute(int32_t iActor, int32_t iPlayer, int32_t lDist)
    else if (actor[vm.g_i].timetosleep > 1)
        actor[vm.g_i].timetosleep--;
    else if (actor[vm.g_i].timetosleep == 1)
+    {
+        if (g_scriptVersion == 13 && (vm.g_sp->picnum == FIRE || vm.g_sp->picnum == FIRE2))
+            return;
        changespritestat(vm.g_i, STAT_ZOMBIEACTOR);
+    }
 }

 void G_SaveMapState(void)
--- a/polymer/eduke32/source/gamevars.h
+++ b/polymer/eduke32/source/gamevars.h
@ -127,35 +127,79 @@ void Gv_FinalizeWeaponDefaults(void);
    { \
    default: \
        aGameVars[id].val.lValue operator lValue; \
-        return; \
+        break; \
    case GAMEVAR_PERPLAYER: \
-        if ((unsigned)vm.g_p > MAXPLAYERS-1) return; \
+        if ((unsigned)vm.g_p > MAXPLAYERS-1) break; \
        aGameVars[id].val.plValues[vm.g_p] operator lValue; \
-        return; \
+        break; \
    case GAMEVAR_PERACTOR: \
-        if ((unsigned)vm.g_i > MAXSPRITES-1) return; \
+        if ((unsigned)vm.g_i > MAXSPRITES-1) break; \
        aGameVars[id].val.plValues[vm.g_i] operator lValue; \
-        return; \
+        break; \
    case GAMEVAR_INTPTR: \
        *((int32_t *)aGameVars[id].val.lValue) operator (int32_t)lValue; \
-        return; \
+        break; \
    case GAMEVAR_SHORTPTR: \
        *((int16_t *)aGameVars[id].val.lValue) operator (int16_t)lValue; \
-        return; \
+        break; \
    case GAMEVAR_CHARPTR: \
        *((uint8_t *)aGameVars[id].val.lValue) operator (uint8_t)lValue; \
-        return; \
+        break; \
    } \
 }

+// even though libdivide is faster than straight division (when using the LUT) the overhead makes this slower on x86
+// ARM, however, has no hardware integer division
+#if defined(__arm__) || defined(LIBDIVIDE_ALWAYS)
+static inline void __fastcall Gv_DivVar(int32_t id, int32_t lValue) // divides gamevar `id` in place by lValue using libdivide
+{
+    static libdivide_s32_t sdiv; // divider generated for the most recent divisor not served by divtable32
+    static int32_t lastlValue; // the divisor sdiv was generated for
+    libdivide_s32_t *dptr = &sdiv;
+    intptr_t *iptr = &aGameVars[id].val.lValue; // default target: the variable's single value
+
+    if ((aGameVars[id].dwFlags & GAMEVAR_PERPLAYER && (unsigned) vm.g_p > MAXPLAYERS-1) ||
+        (aGameVars[id].dwFlags & GAMEVAR_PERACTOR && (unsigned) vm.g_i > MAXSPRITES-1)) return; // player/actor index out of range: leave the variable untouched
+
+    if ((unsigned) lValue < DIVTABLESIZE) // the unsigned compare also routes negative divisors to the else branch
+        dptr = (libdivide_s32_t *)&divtable32[lValue]; // NOTE(review): lValue == 0 selects divtable32[0]; what that entry holds is not visible here -- confirm
+    else if (lValue != lastlValue)
+        sdiv = libdivide_s32_gen(lValue), lastlValue = lValue; // regenerate the cached divider only when the divisor changed
+
+    switch (aGameVars[id].dwFlags & (GAMEVAR_USER_MASK|GAMEVAR_PTR_MASK))
+    {
+    case GAMEVAR_PERPLAYER:
+        iptr = &aGameVars[id].val.plValues[vm.g_p]; // no break: falls through into default's break
+    default:
+        break;
+    case GAMEVAR_PERACTOR:
+        iptr = &aGameVars[id].val.plValues[vm.g_i];
+        break;
+    case GAMEVAR_INTPTR: // pointer-backed variables are divided at their pointed-to width and return directly
+        *((int32_t *) aGameVars[id].val.lValue) = (int32_t) libdivide_s32_do(*((int32_t *) aGameVars[id].val.lValue), dptr);
+        return;
+    case GAMEVAR_SHORTPTR:
+        *((int16_t *) aGameVars[id].val.lValue) = (int16_t) libdivide_s32_do(*((int16_t *) aGameVars[id].val.lValue), dptr);
+        return;
+    case GAMEVAR_CHARPTR:
+        *((uint8_t *) aGameVars[id].val.lValue) = (uint8_t) libdivide_s32_do(*((uint8_t *) aGameVars[id].val.lValue), dptr);
+        return;
+    }
+
+    *iptr = libdivide_s32_do(*iptr, dptr); // shared, per-player and per-actor values are divided through iptr
+}
+#else
+GV_VAROP(Gv_DivVar, /=) // other targets keep the plain /= form generated by the macro
+#endif
+
 GV_VAROP(Gv_AddVar, +=)
 GV_VAROP(Gv_SubVar, -=)
 GV_VAROP(Gv_MulVar, *=)
-GV_VAROP(Gv_DivVar, /=)
 GV_VAROP(Gv_ModVar, %=)
 GV_VAROP(Gv_AndVar, &=)
 GV_VAROP(Gv_XorVar, ^=)
 GV_VAROP(Gv_OrVar, |=)
+
 #endif

 #endif
--- a/polymer/eduke32/source/m32exec.c
+++ b/polymer/eduke32/source/m32exec.c
@ -809,7 +809,7 @@ skip_check:
                float fval = *((float *)&bits);
 // rounding must absolutely be!
 //OSD_Printf("ftoi: bits:%8x, scale=%d, fval=%f, (int32_t)(fval*scale)=%d\n", bits, scale, fval, (int32_t)(fval*scale));
-                Gv_SetVarX(*insptr, (int32_t)nearbyintf(fval * scale));
+                Gv_SetVarX(*insptr, (int32_t)Blrintf(fval * scale));
            }
            insptr += 2;
            continue;
--- a/polymer/eduke32/source/menus.c
+++ b/polymer/eduke32/source/menus.c
@ -726,17 +726,17 @@ static MenuEntry_t *MEL_RENDERERSETUP_GL3[] = {
 #endif

 #ifdef DROIDMENU
-static MenuRangeFloat_t MEO_COLCORR_GAMMA = MAKE_MENURANGE( &MF_Bluefont, 1.f, 1.f, 2.5f, 39.f, 0.f, &vid_gamma );
+static MenuRangeFloat_t MEO_COLCORR_GAMMA = MAKE_MENURANGE( &MF_Bluefont, 1, 1.f, 2.5f, 39.f, 0.f, &vid_gamma );
 #else
-static MenuRangeFloat_t MEO_COLCORR_GAMMA = MAKE_MENURANGE( &MF_Bluefont, 1.f, 0.2f, 4.f, 39.f, 0.f, &vid_gamma );
+static MenuRangeFloat_t MEO_COLCORR_GAMMA = MAKE_MENURANGE( &MF_Bluefont, 1, 0.2f, 4.f, 39, 0.f, &vid_gamma );
 #endif
 static MenuEntry_t ME_COLCORR_GAMMA = MAKE_MENUENTRY( &MF_Redfont, "Gamma:", RangeFloat, &MEO_COLCORR_GAMMA );
-static MenuRangeFloat_t MEO_COLCORR_CONTRAST = MAKE_MENURANGE( &MF_Bluefont, 1.f, 0.1f, 2.7f, 53.f, 0.f, &vid_contrast );
+static MenuRangeFloat_t MEO_COLCORR_CONTRAST = MAKE_MENURANGE( &MF_Bluefont, 1, 0.1f, 2.7f, 53, 0.f, &vid_contrast );
 static MenuEntry_t ME_COLCORR_CONTRAST = MAKE_MENUENTRY( &MF_Redfont, "Contrast:", RangeFloat, &MEO_COLCORR_CONTRAST );
-static MenuRangeFloat_t MEO_COLCORR_BRIGHTNESS = MAKE_MENURANGE( &MF_Bluefont, 1.f, -0.8f, 0.8f, 33.f, 0.f, &vid_brightness );
+static MenuRangeFloat_t MEO_COLCORR_BRIGHTNESS = MAKE_MENURANGE( &MF_Bluefont, 1, -0.8f, 0.8f, 33, 0.f, &vid_brightness );
 static MenuEntry_t ME_COLCORR_BRIGHTNESS = MAKE_MENUENTRY( &MF_Redfont, "Brightness:", RangeFloat, &MEO_COLCORR_BRIGHTNESS );
 static MenuEntry_t ME_COLCORR_RESET = MAKE_MENUENTRY( &MF_Redfont, "Reset To Defaults", Link, &MEO_NULL );
-static MenuRangeFloat_t MEO_COLCORR_AMBIENT = MAKE_MENURANGE(&MF_Bluefont, 1.f, 0.125f, 4.f, 32.f, 0.f, &r_ambientlight);
+static MenuRangeFloat_t MEO_COLCORR_AMBIENT = MAKE_MENURANGE(&MF_Bluefont, 1, 0.125f, 4.f, 32, 0.f, &r_ambientlight);
 static MenuEntry_t ME_COLCORR_AMBIENT = MAKE_MENUENTRY(&MF_Redfont, "Visibility:", RangeFloat, &MEO_COLCORR_AMBIENT);

 static MenuEntry_t *MEL_COLCORR[] = {
@ -3575,7 +3575,7 @@ static int32_t M_RunMenu_MenuMenu(MenuMenu_t *menu, MenuEntry_t *currentry, int3
                            case 2:
                            {
                                int32_t v;
-                                ftol(((float) *object->variable * 100.) / (float) object->onehundredpercent + 0.5, &v);
+                                v = Blrintf(((float) *object->variable * 100.f) / (float) object->onehundredpercent);
                                Bsprintf(tempbuf, "%d%%", v);
                                break;
                            }
@ -3602,7 +3602,7 @@ static int32_t M_RunMenu_MenuMenu(MenuMenu_t *menu, MenuEntry_t *currentry, int3
                    rotatesprite_fs(x, y - menu->scrollPos, z, 0, SLIDEBAR, s, p, 2|8|16|ROTATESPRITE_FULL16);

                    rotatesprite_fs(
-                    x + (1<<16) + ((float) scale((tilesiz[SLIDEBAR].x-2-tilesiz[SLIDEBAR+1].x)<<16, height, tilesiz[SLIDEBAR].y<<16) * (*object->variable - object->min) / (object->max - object->min)),
+                    x + (1<<16) + (int32_t)((float) scale((tilesiz[SLIDEBAR].x-2-tilesiz[SLIDEBAR+1].x)<<16, height, tilesiz[SLIDEBAR].y<<16) * (*object->variable - object->min) / (object->max - object->min)),
                    y + scale((tilesiz[SLIDEBAR].y-tilesiz[SLIDEBAR+1].y)<<15, height, tilesiz[SLIDEBAR].y<<16) - menu->scrollPos,
                    z, 0, SLIDEBAR+1, s, p, 2|8|16|ROTATESPRITE_FULL16);

@ -3621,7 +3621,7 @@ static int32_t M_RunMenu_MenuMenu(MenuMenu_t *menu, MenuEntry_t *currentry, int3
                            case 2:
                            {
                                int32_t v;
-                                ftol((*object->variable * 100.f) / object->onehundredpercent + 0.5f, &v);
+                                v = Blrintf((*object->variable * 100.f) / object->onehundredpercent);
                                Bsprintf(tempbuf, "%d%%", v);
                                break;
                            }
@ -3648,7 +3648,7 @@ static int32_t M_RunMenu_MenuMenu(MenuMenu_t *menu, MenuEntry_t *currentry, int3
                    rotatesprite_fs(x, y - menu->scrollPos, z, 0, SLIDEBAR, s, p, 2|8|16|ROTATESPRITE_FULL16);

                    rotatesprite_fs(
-                    x + (1<<16) + ((double) scale((tilesiz[SLIDEBAR].x-2-tilesiz[SLIDEBAR+1].x)<<16, height, tilesiz[SLIDEBAR].y<<16) * (*object->variable - object->min) / (object->max - object->min)),
+                    x + (1<<16) + (int32_t)((double) scale((tilesiz[SLIDEBAR].x-2-tilesiz[SLIDEBAR+1].x)<<16, height, tilesiz[SLIDEBAR].y<<16) * (*object->variable - object->min) / (object->max - object->min)),
                    y + scale((tilesiz[SLIDEBAR].y-tilesiz[SLIDEBAR+1].y)<<15, height, tilesiz[SLIDEBAR].y<<16) - menu->scrollPos,
                    z, 0, SLIDEBAR+1, s, p, 2|8|16|ROTATESPRITE_FULL16);

@ -3667,7 +3667,7 @@ static int32_t M_RunMenu_MenuMenu(MenuMenu_t *menu, MenuEntry_t *currentry, int3
                            case 2:
                            {
                                int32_t v;
-                                dtol((*object->variable * 100.) / object->onehundredpercent + 0.5, &v);
+                                v = Blrintf((*object->variable * 100.) / object->onehundredpercent);
                                Bsprintf(tempbuf, "%d%%", v);
                                break;
                            }
@ -4403,14 +4403,14 @@ static void M_RunMenuInput(Menu_t *cm)
                    case RangeInt32:
                    {
                        MenuRangeInt32_t *object = (MenuRangeInt32_t*)currentry->entry;
-                        const double interval = (double) (object->max - object->min) / (object->steps - 1);
+                        const float interval = (float) (object->max - object->min) / (float) (object->steps - 1);
                        int32_t step;
                        int32_t modification = 0;

                        if (currentry->disabled)
                            break;

-                        dtol((double) (*object->variable - object->min) / interval + 0.5, &step);
+                        step = Blrintf((float) (*object->variable - object->min) / interval);

                        if (I_SliderLeft())
                        {
@ -4438,7 +4438,7 @@ static void M_RunMenuInput(Menu_t *cm)
                            else if (step >= object->steps)
                                step = object->steps - 1;

-                            dtol(interval * step + object->min + 0.5, &temp);
+                            temp = Blrintf(interval * step + (object->min));

                            if (!M_MenuEntryRangeInt32Modify(currentry, temp))
                                *object->variable = temp;
@ -4456,7 +4456,7 @@ static void M_RunMenuInput(Menu_t *cm)
                        if (currentry->disabled)
                            break;

-                        ftol((*object->variable - object->min) / interval + 0.5, &step);
+                        step = Blrintf((*object->variable - object->min) / interval);

                        if (I_SliderLeft())
                        {
@ -4502,7 +4502,7 @@ static void M_RunMenuInput(Menu_t *cm)
                        if (currentry->disabled)
                            break;

-                        dtol((*object->variable - object->min) / interval + 0.5, &step);
+                        step = Blrintf((*object->variable - object->min) / interval);

                        if (I_SliderLeft())
                        {
--- a/polymer/eduke32/source/midi.c
+++ b/polymer/eduke32/source/midi.c
@ -39,6 +39,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 #include "midi.h"
 #include "mpu401.h"
 #include "compat.h"
+#include "pragmas.h"

 #define WIN32_LEAN_AND_MEAN
 #include <windows.h>
@ -297,7 +298,7 @@ static void _MIDI_MetaEvent
        break;

    case MIDI_TEMPO_CHANGE :
-        tempo = 60000000L / _MIDI_ReadNumber(Track->pos, 3);
+        tempo = tabledivide32_noinline(60000000L, _MIDI_ReadNumber(Track->pos, 3));
        MIDI_SetTempo(tempo);
        break;

@ -318,7 +319,7 @@ static void _MIDI_MetaEvent
            _MIDI_TimeBase += _MIDI_TimeBase;
            denominator--;
        }
-        _MIDI_TicksPerBeat = (_MIDI_Division * 4) / _MIDI_TimeBase;
+        _MIDI_TicksPerBeat = tabledivide32_noinline(_MIDI_Division * 4, _MIDI_TimeBase);
        break;
    }

@ -794,7 +795,7 @@ static void _MIDI_SetChannelVolume
    {
        remotevolume = volume * _MIDI_TotalVolume;
        remotevolume *= _MIDI_UserChannelVolume[ channel ];
-        remotevolume /= MIDI_MaxVolume;
+        remotevolume = tabledivide32_noinline(remotevolume, MIDI_MaxVolume);
        remotevolume >>= 8;

        status = _MIDI_RerouteFunctions[ channel ](0xB0 + channel,
@ -821,7 +822,7 @@ static void _MIDI_SetChannelVolume
    if (_MIDI_Funcs->SetVolume == NULL)
    {
        volume *= _MIDI_TotalVolume;
-        volume /= MIDI_MaxVolume;
+        volume = tabledivide32_noinline(volume, MIDI_MaxVolume);
    }

    // For user volume
@ -1315,8 +1316,8 @@ void MIDI_SetTempo
    int32_t tickspersecond;

    MIDI_Tempo = tempo;
-    tickspersecond = ((tempo) * _MIDI_Division) / 60;
-    _MIDI_FPSecondsPerTick = (1 << TIME_PRECISION) / tickspersecond;
+    tickspersecond = ((tempo) * _MIDI_Division)/60;
+    _MIDI_FPSecondsPerTick = tabledivide32_noinline(1 << TIME_PRECISION, tickspersecond);
    MPU_SetTempo(tempo);
 }

@ -1562,8 +1563,8 @@ void MIDI_SetSongTime

    MIDI_PauseSong();

-    mil = ((milliseconds % 1000) << TIME_PRECISION) / 1000;
-    sec = (milliseconds / 1000) << TIME_PRECISION;
+    mil = tabledivide32_noinline((milliseconds % 1000) << TIME_PRECISION, 1000);
+    sec = tabledivide32_noinline(milliseconds, 1000) << TIME_PRECISION;
    newtime = sec + mil;

    if (newtime < _MIDI_Time)
--- a/polymer/eduke32/source/mpu401.c
+++ b/polymer/eduke32/source/mpu401.c
@ -34,6 +34,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.

 #include "mpu401.h"
 #include "compat.h"
+#include "pragmas.h"

 #define WIN32_LEAN_AND_MEAN
 #include <windows.h>
@ -441,7 +442,7 @@ void MPU_SetTempo(int32_t tempo)
 {
    MIDIPROPTEMPO prop;
    prop.cbStruct = sizeof(MIDIPROPTEMPO);
-    prop.dwTempo = 60000000l/tempo;
+    prop.dwTempo = tabledivide32_noinline(60000000l, tempo);
    midiStreamProperty(hmido, (LPBYTE)&prop, MIDIPROP_SET|MIDIPROP_TEMPO);
 }

--- a/polymer/eduke32/source/player.c
+++ b/polymer/eduke32/source/player.c
@ -113,9 +113,9 @@ static void A_DoWaterTracers(int32_t x1,int32_t y1,int32_t z1,int32_t x2,int32_t
    int16_t sect = -1;

    i = n+1;
-    xv = (x2-x1)/i;
-    yv = (y2-y1)/i;
-    zv = (z2-z1)/i;
+    xv = tabledivide32_noinline(x2-x1, i);
+    yv = tabledivide32_noinline(y2-y1, i);
+    zv = tabledivide32_noinline(z2-z1, i);

    if ((klabs(x1-x2)+klabs(y1-y2)) < 3084)
        return;
@ -147,15 +147,15 @@ static void A_HitscanProjTrail(const vec3_t *sv, const vec3_t *dv, int32_t ang,

    Bmemcpy(&destvect, dv, sizeof(vec3_t));

-    srcvect.x = sv->x + (sintable[(348+ang+512)&2047]/proj->offset);
-    srcvect.y = sv->y + (sintable[(ang+348)&2047]/proj->offset);
+    srcvect.x = sv->x + tabledivide32_noinline(sintable[(348+ang+512)&2047], proj->offset);
+    srcvect.y = sv->y + tabledivide32_noinline(sintable[(ang+348)&2047], proj->offset);
    srcvect.z = sv->z + 1024+(proj->toffset<<8);

    n = ((FindDistance2D(srcvect.x-destvect.x,srcvect.y-destvect.y))>>8)+1;

-    destvect.x = ((destvect.x-srcvect.x)/n);
-    destvect.y = ((destvect.y-srcvect.y)/n);
-    destvect.z = ((destvect.z-srcvect.z)/n);
+    destvect.x = tabledivide32_noinline((destvect.x-srcvect.x), n);
+    destvect.y = tabledivide32_noinline((destvect.y-srcvect.y), n);
+    destvect.z = tabledivide32_noinline((destvect.z-srcvect.z), n);

    srcvect.x += destvect.x>>2;
    srcvect.y += destvect.y>>2;
@ -379,7 +379,7 @@ static int32_t GetAutoAimAngle(int32_t i, int32_t p, int32_t atwith,
        }

        dst = safeldist(g_player[p].ps->i, &sprite[j]);
-        *zvel = ((spr->z - srcvect->z - cen)*vel) / dst;
+        *zvel = tabledivide32_noinline((spr->z - srcvect->z - cen)*vel, dst);

        if (!(flags&2) || sprite[j].picnum != RECON)
            *sa = getangle(spr->x-srcvect->x, spr->y-srcvect->y);
@ -530,7 +530,7 @@ static void A_PreFireHitscan(const spritetype *s, vec3_t *srcvect, int32_t *zvel
    const DukePlayer_t *targetps = g_player[j].ps;

    const int32_t d = safeldist(targetps->i, s);
-    *zvel = ((targetps->pos.z-srcvect->z)<<8) / d;
+    *zvel = tabledivide32_noinline((targetps->pos.z-srcvect->z)<<8, d);

    srcvect->z -= (4<<8);

@ -960,7 +960,7 @@ static int32_t A_ShootCustom(const int32_t i, const int32_t atwith, int16_t sa,
                sa = getangle(g_player[j].ps->opos.x-srcvect->x, g_player[j].ps->opos.y-srcvect->y);

                l = safeldist(g_player[j].ps->i, s);
-                zvel = ((g_player[j].ps->opos.z - srcvect->z)*vel) / l;
+                zvel = tabledivide32_noinline((g_player[j].ps->opos.z - srcvect->z)*vel, l);

                if (A_CheckEnemySprite(s) && (AC_MOVFLAGS(s, &actor[i]) & face_player_smart))
                    sa = s->ang + (krand() & 31) - 16;
@ -974,8 +974,8 @@ static int32_t A_ShootCustom(const int32_t i, const int32_t atwith, int16_t sa,

        zvel = A_GetShootZvel(zvel);
        j = A_InsertSprite(sect,
-            srcvect->x + (sintable[(348 + sa + 512) & 2047] / proj->offset),
-            srcvect->y + (sintable[(sa + 348) & 2047] / proj->offset),
+            srcvect->x + tabledivide32_noinline(sintable[(348 + sa + 512) & 2047], proj->offset),
+            srcvect->y + tabledivide32_noinline(sintable[(sa + 348) & 2047], proj->offset),
            srcvect->z - (1 << 8), atwith, 0, 14, 14, sa, vel, zvel, i, 4);

        sprite[j].xrepeat = proj->xrepeat;
@ -1017,7 +1017,7 @@ static int32_t A_ShootCustom(const int32_t i, const int32_t atwith, int16_t sa,
        {
            int32_t x;
            j = g_player[A_FindPlayer(s, &x)].ps->i;
-            zvel = ((sprite[j].z - srcvect->z) << 8) / (x + 1);
+            zvel = tabledivide32_noinline((sprite[j].z - srcvect->z) << 8, x + 1);
            sa = getangle(sprite[j].x - srcvect->x, sprite[j].y - srcvect->y);
        }

@ -1205,7 +1205,7 @@ int32_t A_ShootWithZvel(int32_t i, int32_t atwith, int32_t override_zvel)
                {
                    int32_t x;
                    j = g_player[A_FindPlayer(s,&x)].ps->i;
-                    zvel = ((sprite[j].z-srcvect.z)<<8) / (x+1);
+                    zvel = tabledivide32_noinline((sprite[j].z-srcvect.z)<<8, x+1);
                    sa = getangle(sprite[j].x-srcvect.x,sprite[j].y-srcvect.y);
                }
            }
@ -1352,7 +1352,7 @@ int32_t A_ShootWithZvel(int32_t i, int32_t atwith, int32_t override_zvel)
                //                sa = getangle(g_player[j].ps->opos.x-sx,g_player[j].ps->opos.y-sy);
                sa += 16-(krand()&31);
                hit.pos.x = safeldist(g_player[j].ps->i, s);
-                zvel = ((g_player[j].ps->opos.z - srcvect.z + (3<<8))*vel) / hit.pos.x;
+                zvel = tabledivide32_noinline((g_player[j].ps->opos.z - srcvect.z + (3<<8))*vel, hit.pos.x);
            }

            zvel = A_GetShootZvel(zvel);
@ -1438,7 +1438,7 @@ int32_t A_ShootWithZvel(int32_t i, int32_t atwith, int32_t override_zvel)
                }

                l = safeldist(g_player[j].ps->i, s);
-                zvel = ((g_player[j].ps->opos.z - srcvect.z)*vel) / l;
+                zvel = tabledivide32_noinline((g_player[j].ps->opos.z - srcvect.z)*vel, l);

                if (A_CheckEnemySprite(s) && (AC_MOVFLAGS(s, &actor[i]) & face_player_smart))
                    sa = s->ang+(krand()&31)-16;
@ -1635,7 +1635,7 @@ int32_t A_ShootWithZvel(int32_t i, int32_t atwith, int32_t override_zvel)
            {
                j = A_FindPlayer(s, NULL);
                l = safeldist(g_player[j].ps->i, s);
-                zvel = ((g_player[j].ps->opos.z-srcvect.z)*512) / l ;
+                zvel = tabledivide32_noinline((g_player[j].ps->opos.z-srcvect.z)*512, l);
            }
            else zvel = 0;

@ -1830,7 +1830,7 @@ static void G_DrawWeaponTile(int32_t x, int32_t y, int32_t tilenum, int32_t shad
                    // HACK: Draw the upper part of the chaingun two screen
                    // pixels (not texels; multiplied by weapon scale) lower
                    // first, preventing ugly horizontal seam.
-                    g_dts_yadd = (65536*2*200)/ydim;
+                    g_dts_yadd = tabledivide32_noinline(65536*2*200, ydim);
                    G_DrawTileScaled(x,y,tilenum,shadef[slot],orientation,p);
                    g_dts_yadd = 0;
                }
@ -2753,8 +2753,8 @@ void P_GetInput(int32_t snum)
    if (ud.config.MouseBias)
    {
        if (klabs(info[0].dyaw) > klabs(info[0].dpitch))
-            info[0].dpitch /= ud.config.MouseBias;
-        else info[0].dyaw /= ud.config.MouseBias;
+            info[0].dpitch = tabledivide32_noinline(info[0].dpitch, ud.config.MouseBias);
+        else info[0].dyaw = tabledivide32_noinline(info[0].dyaw, ud.config.MouseBias);
    }

    tics = totalclock-lastcontroltime;
--- a/polymer/eduke32/source/premap.c
+++ b/polymer/eduke32/source/premap.c
@ -537,7 +537,7 @@ void G_CacheMapData(void)
        if (bpp > 8 && totalclock - tc > TICRATE/4)
        {
            /*Bsprintf(tempbuf,"%d resources remaining\n",g_precacheCount-pc+1);*/
-            tc = min(100,100*pc/g_precacheCount);
+            tc = min(100, tabledivide32_noinline(100 * pc, g_precacheCount));
            Bsprintf(tempbuf,"Loaded %d%% (%d/%d textures)\n",tc,pc,g_precacheCount);
            G_DoLoadScreen(tempbuf, tc);
            tc = totalclock;
--- a/polymer/eduke32/source/savegame.c
+++ b/polymer/eduke32/source/savegame.c
@ -692,7 +692,7 @@ static void docmpsd(const void *ptr, void *dump, uint32_t size, uint32_t cnt, ui
    { \
        const UINT(Datbits) *p=(UINT(Datbits) *)ptr;    \
        UINT(Datbits) *op=(UINT(Datbits) *)dump;        \
-        uint32_t i, nelts=(size*cnt)/BYTES(Datbits);    \
+        uint32_t i, nelts=tabledivide32_noinline(size*cnt, BYTES(Datbits));    \
        if (nelts>65536)                                \
            CPELTS(32,Datbits);                         \
        else if (nelts>256)                             \
@ -831,7 +831,7 @@ readidx_##Idxbits##_##Datbits:               \

 #define CPDATA(Datbits) do \
        {                             \
-            uint32_t nelts=(sp->size*cnt)/BYTES(Datbits); \
+            uint32_t nelts=tabledivide32_noinline(sp->size*cnt, BYTES(Datbits)); \
            if (nelts>65536)          \
                CPELTS(32,Datbits);   \
            else if (nelts>256)       \
--- a/polymer/eduke32/source/sector.c
+++ b/polymer/eduke32/source/sector.c
@ -571,8 +571,9 @@ void G_OperateSectors(int32_t sn, int32_t ii)
            dax += wall[i].x;
            day += wall[i].y;
        }
-        dax /= (endwall-startwall+1);
-        day /= (endwall-startwall+1);
+
+        dax = tabledivide32_noinline(dax, (endwall-startwall+1));
+        day = tabledivide32_noinline(day, (endwall-startwall+1));

        //find any points with either same x or same y coordinate
        //  as center (dax, day) - should be 2 points found.