diff --git a/polymer/eduke32/build/Makefile.deps b/polymer/eduke32/build/Makefile.deps index b63587e3b..879da643f 100644 --- a/polymer/eduke32/build/Makefile.deps +++ b/polymer/eduke32/build/Makefile.deps @@ -5,7 +5,7 @@ $(ENGINE_OBJ)/a.$o: $(ENGINE_SRC)/a.$(asm) $(ENGINE_OBJ)/baselayer.$o: $(ENGINE_SRC)/baselayer.c $(ENGINE_INC)/compat.h $(ENGINE_INC)/baselayer.h $(ENGINE_INC)/build.h $(ENGINE_INC)/osd.h $(ENGINE_OBJ)/build.$o: $(ENGINE_SRC)/build.c $(ENGINE_INC)/build.h $(ENGINE_INC)/pragmas.h $(ENGINE_INC)/compat.h $(ENGINE_INC)/baselayer.h $(ENGINE_INC)/editor.h $(ENGINE_OBJ)/cache1d.$o: $(ENGINE_SRC)/cache1d.c $(ENGINE_INC)/compat.h $(ENGINE_INC)/cache1d.h $(ENGINE_INC)/pragmas.h $(ENGINE_INC)/baselayer.h -$(ENGINE_OBJ)/compat.$o: $(ENGINE_SRC)/compat.c $(ENGINE_INC)/compat.h +$(ENGINE_OBJ)/compat.$o: $(ENGINE_SRC)/compat.c $(ENGINE_INC)/compat.h $(ENGINE_INC)/libdivide.h $(ENGINE_OBJ)/config.$o: $(ENGINE_SRC)/config.c $(ENGINE_INC)/compat.h $(ENGINE_INC)/osd.h $(ENGINE_INC)/editor.h $(ENGINE_OBJ)/crc32.$o: $(ENGINE_SRC)/crc32.c $(ENGINE_INC)/crc32.h $(ENGINE_OBJ)/defs.$o: $(ENGINE_SRC)/defs.c $(ENGINE_INC)/build.h $(ENGINE_INC)/baselayer.h $(ENGINE_INC)/scriptfile.h $(ENGINE_INC)/compat.h diff --git a/polymer/eduke32/build/include/a.h b/polymer/eduke32/build/include/a.h index d6d227a1b..8f5e51242 100644 --- a/polymer/eduke32/build/include/a.h +++ b/polymer/eduke32/build/include/a.h @@ -8,6 +8,8 @@ #include "compat.h" +#define CLASSIC_SLICE_BY_4 + /** Definitions of high-precision integer types. **/ // Should be used for values that represent coordinates with which calculations // like dot product are carried out. 
Substituting 32-bit ints for these will diff --git a/polymer/eduke32/build/include/compat.h b/polymer/eduke32/build/include/compat.h index 140519c7e..2db4843b0 100644 --- a/polymer/eduke32/build/include/compat.h +++ b/polymer/eduke32/build/include/compat.h @@ -92,6 +92,8 @@ #define WITHKPLIB +#include "libdivide.h" + // Define this to rewrite all 'B' versions to library functions. This // is for platforms which give us a standard sort of C library so we // link directly. Platforms like PalmOS which don't have a standard C @@ -146,31 +148,6 @@ #if _MSC_VER < 1800 # define inline __inline -# ifndef _WIN64 -static inline float nearbyintf(float x) -{ - uint32_t w1, w2; - __asm fnstcw w1 - w2 = w1 | 0x00000020; - __asm - { - fldcw w2 - fld x - frndint - fclex - fldcw w1 - } -} -# else -#include -static inline float nearbyintf(float x) -{ - if (x >= 0.0) - return floorf(x + 0.5); - else - return floorf(x - 0.5); -} -# endif #endif #include @@ -178,31 +155,8 @@ static inline long lround(double num) { return (long) (num > 0 ? 
num + 0.5 : ceil(num - 0.5)); } - -#if defined(_WIN64) -#include -static inline int32_t Blrintf(const float x) -{ - __m128 xx = _mm_load_ss(&x); - return _mm_cvtss_si32(xx); -} -#else -static inline int32_t Blrintf(const float x) -{ - int n; - __asm fld x; - __asm fistp n; - return n; -} -#endif #else # define longlong(x) x##ll -#define Blrintf lrintf -#endif - -#if defined __OPENDINGUX__ -//ugly hack -#define nearbyintf rintf #endif #if defined(__arm__) @@ -398,60 +352,23 @@ static inline uint16_t B_UNBUF16(const uint8_t *buf) { return (buf[1] << 8) | (b static inline uint32_t B_UNBUF32(const uint8_t *buf) { return (buf[3] << 24) | (buf[2] << 16) | (buf[1] << 8) | (buf[0]); } static inline uint64_t B_UNBUF64(const uint8_t *buf) { return ((uint64_t)buf[7] << 56) | ((uint64_t)buf[6] << 48) | ((uint64_t)buf[5] << 40) | ((uint64_t)buf[4] << 32) | (buf[3] << 24) | (buf[2] << 16) | (buf[1] << 8) | (buf[0]); } -#if defined(USE_MSC_PRAGMAS) -static inline void ftol(float f, int32_t *a) +#if defined(BITNESS64) +#include +static inline int32_t Blrintf(const float x) { - _asm - { - mov eax, a - fld f - fistp dword ptr [eax] - } + __m128 xx = _mm_load_ss(&x); + return _mm_cvtss_si32(xx); } - -static inline void dtol(double d, int32_t *a) +#elif defined (_MSC_VER) +static inline int32_t Blrintf(const float x) { - _asm - { - mov eax, a - fld d - fistp dword ptr [eax] - } + int n; + __asm fld x; + __asm fistp n; + return n; } -#elif defined(USE_GCC_PRAGMAS) - -static inline void ftol(float f, int32_t *a) -{ - __asm__ __volatile__( -#if 0 //(__GNUC__ >= 3) - "flds %1; fistpl %0;" #else - "flds %1; fistpl (%0);" -#endif - : "=r"(a) : "m"(f) : "memory","cc"); -} - -static inline void dtol(double d, int32_t *a) -{ - __asm__ __volatile__( -#if 0 //(__GNUC__ >= 3) - "fldl %1; fistpl %0;" -#else - "fldl %1; fistpl (%0);" -#endif - : "=r"(a) : "m"(d) : "memory","cc"); -} - -#else -static inline void ftol(float f, int32_t *a) -{ - *a = (int32_t)f; -} - -static inline void dtol(double d, 
int32_t *a) -{ - *a = (int32_t)d; -} +#define Blrintf lrintf #endif #if B_LITTLE_ENDIAN == 1 @@ -792,7 +709,7 @@ char *Bgetsystemdrives(void); int32_t Bfilelength(int32_t fd); char *Bstrtoken(char *s, const char *delim, char **ptrptr, int32_t chop); char *Bstrtolower(char *str); -int32_t Bwildmatch (const char *i, const char *j); +#define Bwildmatch wildmatch #if !defined(_WIN32) char *Bstrlwr(char *); diff --git a/polymer/eduke32/build/include/hightile.h b/polymer/eduke32/build/include/hightile.h index e6a430955..e9db8fed2 100644 --- a/polymer/eduke32/build/include/hightile.h +++ b/polymer/eduke32/build/include/hightile.h @@ -46,16 +46,16 @@ static inline int have_basepal_tint(void) static inline void hictinting_apply(float *color, int32_t palnum) { - color[0] *= (float)hictinting[palnum].r / 255.f; - color[1] *= (float)hictinting[palnum].g / 255.f; - color[2] *= (float)hictinting[palnum].b / 255.f; + color[0] *= (float)hictinting[palnum].r * (1.f/255.f); + color[1] *= (float)hictinting[palnum].g * (1.f/255.f); + color[2] *= (float)hictinting[palnum].b * (1.f/255.f); } static inline void hictinting_apply_ub(uint8_t *color, int32_t palnum) { - color[0] = (uint8_t)(color[0] * (float)hictinting[palnum].r / 255.f); - color[1] = (uint8_t)(color[1] * (float)hictinting[palnum].g / 255.f); - color[2] = (uint8_t)(color[2] * (float)hictinting[palnum].b / 255.f); + color[0] = (uint8_t)(color[0] * (float)hictinting[palnum].r * (1.f/255.f)); + color[1] = (uint8_t)(color[1] * (float)hictinting[palnum].g * (1.f/255.f)); + color[2] = (uint8_t)(color[2] * (float)hictinting[palnum].b * (1.f/255.f)); } // texcacheheader cachead.flags bits diff --git a/polymer/eduke32/build/include/kplib.h b/polymer/eduke32/build/include/kplib.h index 21d919fd1..c430500d6 100644 --- a/polymer/eduke32/build/include/kplib.h +++ b/polymer/eduke32/build/include/kplib.h @@ -2,6 +2,10 @@ extern "C" { #endif +#ifndef __compat_h__ +#include "compat.h" +#endif + typedef struct { FILE *fil; //0:no file open, 
!=0:open file (either stand-alone or zip) @@ -52,6 +56,7 @@ static inline int32_t filnamcmp(const char *j, const char *i) i++, j++; return *i != '\0'; } +extern int32_t wildmatch(const char *match, const char *wild); #ifdef EXTERNC } diff --git a/polymer/eduke32/build/include/libdivide.h b/polymer/eduke32/build/include/libdivide.h new file mode 100644 index 000000000..10683aa39 --- /dev/null +++ b/polymer/eduke32/build/include/libdivide.h @@ -0,0 +1,1101 @@ +/* libdivide.h + Copyright 2010 ridiculous_fish + + Modified for EDuke32. zlib license. +*/ + +#ifndef __libdivide_h__ +#define __libdivide_h__ + +#if defined(_WIN32) || defined(WIN32) +#define LIBDIVIDE_WINDOWS 1 +#endif + +#if defined(_MSC_VER) +#define LIBDIVIDE_VC 1 +#endif + +#include +#include +#include + +#if LIBDIVIDE_USE_SSE2 + #include +#endif + +#if LIBDIVIDE_VC + #include +#endif + +#ifndef __has_builtin +#define __has_builtin(x) 0 // Compatibility with non-clang compilers. +#endif + +#ifdef __ICC +#define HAS_INT128_T 0 +#else +#define HAS_INT128_T __LP64__ +#endif + +#if defined(__x86_64__) || defined(_WIN64) || defined(_M_64) +#define LIBDIVIDE_IS_X86_64 1 +#endif + +#if defined(__i386__) +#define LIBDIVIDE_IS_i386 1 +#endif + +#if __GNUC__ || __clang__ +#define LIBDIVIDE_GCC_STYLE_ASM 1 +#endif + +/* Explanation of "more" field: bit 6 is whether to use shift path. If we are using the shift path, bit 7 is whether the divisor is negative in the signed case; in the unsigned case it is 0. Bits 0-4 is shift value (for shift path or mult path). In 32 bit case, bit 5 is always 0. We use bit 7 as the "negative divisor indicator" so that we can use sign extension to efficiently go to a full-width -1. 
+ + +u32: [0-4] shift value + [5] ignored + [6] add indicator + [7] shift path + +s32: [0-4] shift value + [5] shift path + [6] add indicator + [7] indicates negative divisor + +u64: [0-5] shift value + [6] add indicator + [7] shift path + +s64: [0-5] shift value + [6] add indicator + [7] indicates negative divisor + magic number of 0 indicates shift path (we ran out of bits!) +*/ + +enum { + LIBDIVIDE_32_SHIFT_MASK = 0x1F, + LIBDIVIDE_64_SHIFT_MASK = 0x3F, + LIBDIVIDE_ADD_MARKER = 0x40, + LIBDIVIDE_U32_SHIFT_PATH = 0x80, + LIBDIVIDE_U64_SHIFT_PATH = 0x80, + LIBDIVIDE_S32_SHIFT_PATH = 0x20, + LIBDIVIDE_NEGATIVE_DIVISOR = 0x80 +}; + +// these are padded to optimize access via LUT +typedef struct { + uint32_t magic; + uint8_t more; +} libdivide_u32_t; + +typedef struct { + int32_t magic; + uint8_t more; +} libdivide_s32_t; + +typedef struct { + int32_t magic; + uint8_t more; + uint8_t filler[3]; +} libdivide_s32pad_t; + +typedef struct { + uint64_t magic; + uint8_t more; +} libdivide_u64_t; + +typedef struct { + int64_t magic; + uint8_t more; +} libdivide_s64_t; + +typedef struct { + int64_t magic; + uint8_t more; + uint8_t filler[7]; // should think of something useful to do with this... 
+} libdivide_s64pad_t; + +libdivide_s32_t libdivide_s32_gen(int32_t y); +libdivide_u32_t libdivide_u32_gen(uint32_t y); +libdivide_s64_t libdivide_s64_gen(int64_t y); +libdivide_u64_t libdivide_u64_gen(uint64_t y); + +int32_t libdivide_s32_do(int32_t numer, const libdivide_s32_t *denom); +uint32_t libdivide_u32_do(uint32_t numer, const libdivide_u32_t *denom); +int64_t libdivide_s64_do(int64_t numer, const libdivide_s64_t *denom); +uint64_t libdivide_u64_do(uint64_t y, const libdivide_u64_t *denom); + +int libdivide_u32_get_algorithm(const libdivide_u32_t *denom); +uint32_t libdivide_u32_do_alg0(uint32_t numer, const libdivide_u32_t *denom); +uint32_t libdivide_u32_do_alg1(uint32_t numer, const libdivide_u32_t *denom); +uint32_t libdivide_u32_do_alg2(uint32_t numer, const libdivide_u32_t *denom); + +int libdivide_u64_get_algorithm(const libdivide_u64_t *denom); +uint64_t libdivide_u64_do_alg0(uint64_t numer, const libdivide_u64_t *denom); +uint64_t libdivide_u64_do_alg1(uint64_t numer, const libdivide_u64_t *denom); +uint64_t libdivide_u64_do_alg2(uint64_t numer, const libdivide_u64_t *denom); + +int libdivide_s32_get_algorithm(const libdivide_s32_t *denom); +int32_t libdivide_s32_do_alg0(int32_t numer, const libdivide_s32_t *denom); +int32_t libdivide_s32_do_alg1(int32_t numer, const libdivide_s32_t *denom); +int32_t libdivide_s32_do_alg2(int32_t numer, const libdivide_s32_t *denom); +int32_t libdivide_s32_do_alg3(int32_t numer, const libdivide_s32_t *denom); +int32_t libdivide_s32_do_alg4(int32_t numer, const libdivide_s32_t *denom); + +int libdivide_s64_get_algorithm(const libdivide_s64_t *denom); +int64_t libdivide_s64_do_alg0(int64_t numer, const libdivide_s64_t *denom); +int64_t libdivide_s64_do_alg1(int64_t numer, const libdivide_s64_t *denom); +int64_t libdivide_s64_do_alg2(int64_t numer, const libdivide_s64_t *denom); +int64_t libdivide_s64_do_alg3(int64_t numer, const libdivide_s64_t *denom); +int64_t libdivide_s64_do_alg4(int64_t numer, const 
libdivide_s64_t *denom); + +#if LIBDIVIDE_USE_SSE2 +__m128i libdivide_u32_do_vector(__m128i numers, const libdivide_u32_t * denom); +__m128i libdivide_s32_do_vector(__m128i numers, const libdivide_s32_t * denom); +__m128i libdivide_u64_do_vector(__m128i numers, const libdivide_u64_t * denom); +__m128i libdivide_s64_do_vector(__m128i numers, const libdivide_s64_t * denom); + +__m128i libdivide_u32_do_vector_alg0(__m128i numers, const libdivide_u32_t * denom); +__m128i libdivide_u32_do_vector_alg1(__m128i numers, const libdivide_u32_t * denom); +__m128i libdivide_u32_do_vector_alg2(__m128i numers, const libdivide_u32_t * denom); + +__m128i libdivide_s32_do_vector_alg0(__m128i numers, const libdivide_s32_t * denom); +__m128i libdivide_s32_do_vector_alg1(__m128i numers, const libdivide_s32_t * denom); +__m128i libdivide_s32_do_vector_alg2(__m128i numers, const libdivide_s32_t * denom); +__m128i libdivide_s32_do_vector_alg3(__m128i numers, const libdivide_s32_t * denom); +__m128i libdivide_s32_do_vector_alg4(__m128i numers, const libdivide_s32_t * denom); + +__m128i libdivide_u64_do_vector_alg0(__m128i numers, const libdivide_u64_t * denom); +__m128i libdivide_u64_do_vector_alg1(__m128i numers, const libdivide_u64_t * denom); +__m128i libdivide_u64_do_vector_alg2(__m128i numers, const libdivide_u64_t * denom); + +__m128i libdivide_s64_do_vector_alg0(__m128i numers, const libdivide_s64_t * denom); +__m128i libdivide_s64_do_vector_alg1(__m128i numers, const libdivide_s64_t * denom); +__m128i libdivide_s64_do_vector_alg2(__m128i numers, const libdivide_s64_t * denom); +__m128i libdivide_s64_do_vector_alg3(__m128i numers, const libdivide_s64_t * denom); +__m128i libdivide_s64_do_vector_alg4(__m128i numers, const libdivide_s64_t * denom); +#endif + +#endif + + #ifdef LIBDIVIDE_BODY + +//////// Internal Utility Functions + +static inline uint32_t libdivide__mullhi_u32(uint32_t x, uint32_t y) { + uint64_t xl = x, yl = y; + uint64_t rl = xl * yl; + return (uint32_t)(rl >> 32); 
+} + +static uint64_t libdivide__mullhi_u64(uint64_t x, uint64_t y) { +#if HAS_INT128_T + __uint128_t xl = x, yl = y; + __uint128_t rl = xl * yl; + return (uint64_t)(rl >> 64); +#else + //full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64) + const uint32_t mask = 0xFFFFFFFF; + const uint32_t x0 = (uint32_t)(x & mask), x1 = (uint32_t)(x >> 32); + const uint32_t y0 = (uint32_t)(y & mask), y1 = (uint32_t)(y >> 32); + const uint32_t x0y0_hi = libdivide__mullhi_u32(x0, y0); + const uint64_t x0y1 = x0 * (uint64_t)y1; + const uint64_t x1y0 = x1 * (uint64_t)y0; + const uint64_t x1y1 = x1 * (uint64_t)y1; + + uint64_t temp = x1y0 + x0y0_hi; + uint64_t temp_lo = temp & mask, temp_hi = temp >> 32; + return x1y1 + temp_hi + ((temp_lo + x0y1) >> 32); +#endif +} + +static inline int64_t libdivide__mullhi_s64(int64_t x, int64_t y) { +#if HAS_INT128_T + __int128_t xl = x, yl = y; + __int128_t rl = xl * yl; + return (int64_t)(rl >> 64); +#else + //full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64) + const uint32_t mask = 0xFFFFFFFF; + const uint32_t x0 = (uint32_t)(x & mask), y0 = (uint32_t)(y & mask); + const int32_t x1 = (int32_t)(x >> 32), y1 = (int32_t)(y >> 32); + const uint32_t x0y0_hi = libdivide__mullhi_u32(x0, y0); + const int64_t t = x1*(int64_t)y0 + x0y0_hi; + const int64_t w1 = x0*(int64_t)y1 + (t & mask); + return x1*(int64_t)y1 + (t >> 32) + (w1 >> 32); +#endif +} + +#if LIBDIVIDE_USE_SSE2 + +static inline __m128i libdivide__u64_to_m128(uint64_t x) { +#if LIBDIVIDE_VC && ! 
_WIN64 + //64 bit windows doesn't seem to have an implementation of any of these load intrinsics, and 32 bit Visual C++ crashes + _declspec(align(16)) uint64_t temp[2] = {x, x}; + return _mm_load_si128((const __m128i*)temp); +#elif defined(__ICC) + uint64_t __attribute__((aligned(16))) temp[2] = {x,x}; + return _mm_load_si128((const __m128i*)temp); +#elif __clang__ + // clang does not provide this intrinsic either + return (__m128i){x, x}; +#else + // everyone else gets it right + return _mm_set1_epi64x(x); +#endif +} + +static inline __m128i libdivide_get_FFFFFFFF00000000(void) { + //returns the same as _mm_set1_epi64(0xFFFFFFFF00000000ULL) without touching memory + __m128i result = _mm_set1_epi8(-1); //optimizes to pcmpeqd on OS X + return _mm_slli_epi64(result, 32); +} + +static inline __m128i libdivide_get_00000000FFFFFFFF(void) { + //returns the same as _mm_set1_epi64(0x00000000FFFFFFFFULL) without touching memory + __m128i result = _mm_set1_epi8(-1); //optimizes to pcmpeqd on OS X + result = _mm_srli_epi64(result, 32); + return result; +} + +static inline __m128i libdivide_get_0000FFFF(void) { + //returns the same as _mm_set1_epi32(0x0000FFFFULL) without touching memory + __m128i result; //we don't care what its contents are + result = _mm_cmpeq_epi8(result, result); //all 1s + result = _mm_srli_epi32(result, 16); + return result; +} + +static inline __m128i libdivide_s64_signbits(__m128i v) { + //we want to compute v >> 63, that is, _mm_srai_epi64(v, 63). But there is no 64 bit shift right arithmetic instruction in SSE2. So we have to fake it by first duplicating the high 32 bit values, and then using a 32 bit shift. 
Another option would be to use _mm_srli_epi64(v, 63) and then subtract that from 0, but that approach appears to be substantially slower for unknown reasons + __m128i hiBitsDuped = _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1)); + __m128i signBits = _mm_srai_epi32(hiBitsDuped, 31); + return signBits; +} + +/* Returns an __m128i whose low 32 bits are equal to amt and has zero elsewhere. */ +static inline __m128i libdivide_u32_to_m128i(uint32_t amt) { + return _mm_set_epi32(0, 0, 0, amt); +} + +static inline __m128i libdivide_s64_shift_right_vector(__m128i v, int amt) { + //implementation of _mm_sra_epi64. Here we have two 64 bit values which are shifted right to logically become (64 - amt) values, and are then sign extended from a (64 - amt) bit number. + const int b = 64 - amt; + __m128i m = libdivide__u64_to_m128(1ULL << (b - 1)); + __m128i x = _mm_srl_epi64(v, libdivide_u32_to_m128i(amt)); + __m128i result = _mm_sub_epi64(_mm_xor_si128(x, m), m); //result = x^m - m + return result; +} + +/* Here, b is assumed to contain one 32 bit value repeated four times. If it did not, the function would not work. */ +static inline __m128i libdivide__mullhi_u32_flat_vector(__m128i a, __m128i b) { + __m128i hi_product_0Z2Z = _mm_srli_epi64(_mm_mul_epu32(a, b), 32); + __m128i a1X3X = _mm_srli_epi64(a, 32); + __m128i hi_product_Z1Z3 = _mm_and_si128(_mm_mul_epu32(a1X3X, b), libdivide_get_FFFFFFFF00000000()); + return _mm_or_si128(hi_product_0Z2Z, hi_product_Z1Z3); // = hi_product_0123 +} + + +/* Here, y is assumed to contain one 64 bit value repeated twice. 
*/ +static inline __m128i libdivide_mullhi_u64_flat_vector(__m128i x, __m128i y) { + //full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64) + const __m128i mask = libdivide_get_00000000FFFFFFFF(); + const __m128i x0 = _mm_and_si128(x, mask), x1 = _mm_srli_epi64(x, 32); //x0 is low half of 2 64 bit values, x1 is high half in low slots + const __m128i y0 = _mm_and_si128(y, mask), y1 = _mm_srli_epi64(y, 32); + const __m128i x0y0_hi = _mm_srli_epi64(_mm_mul_epu32(x0, y0), 32); //x0 happens to have the low half of the two 64 bit values in 32 bit slots 0 and 2, so _mm_mul_epu32 computes their full product, and then we shift right by 32 to get just the high values + const __m128i x0y1 = _mm_mul_epu32(x0, y1); + const __m128i x1y0 = _mm_mul_epu32(x1, y0); + const __m128i x1y1 = _mm_mul_epu32(x1, y1); + + const __m128i temp = _mm_add_epi64(x1y0, x0y0_hi); + __m128i temp_lo = _mm_and_si128(temp, mask), temp_hi = _mm_srli_epi64(temp, 32); + temp_lo = _mm_srli_epi64(_mm_add_epi64(temp_lo, x0y1), 32); + temp_hi = _mm_add_epi64(x1y1, temp_hi); + + return _mm_add_epi64(temp_lo, temp_hi); +} + +/* y is one 64 bit value repeated twice */ +static inline __m128i libdivide_mullhi_s64_flat_vector(__m128i x, __m128i y) { + __m128i p = libdivide_mullhi_u64_flat_vector(x, y); + __m128i t1 = _mm_and_si128(libdivide_s64_signbits(x), y); + p = _mm_sub_epi64(p, t1); + __m128i t2 = _mm_and_si128(libdivide_s64_signbits(y), x); + p = _mm_sub_epi64(p, t2); + return p; +} + +/* SSE2 does not have a signed multiplication instruction, but we can convert unsigned to signed pretty efficiently. Again, b is just a 32 bit value repeated four times. 
*/ +static inline __m128i libdivide_mullhi_s32_flat_vector(__m128i a, __m128i b) { + __m128i p = libdivide__mullhi_u32_flat_vector(a, b); + __m128i t1 = _mm_and_si128(_mm_srai_epi32(a, 31), b); //t1 = (a >> 31) & y, arithmetic shift + __m128i t2 = _mm_and_si128(_mm_srai_epi32(b, 31), a); + p = _mm_sub_epi32(p, t1); + p = _mm_sub_epi32(p, t2); + return p; +} +#endif + +/* Counts the trailing zero bits of val, preferring a compiler builtin or intrinsic over the portable loop. The three paths do not agree for val = 0 (the VC path returns 0 when _BitScanForward reports no set bit, the portable loop returns 31), so callers should pass a nonzero value. */ +static inline int32_t libdivide__count_trailing_zeros32(uint32_t val) { +#if __GNUC__ || __has_builtin(__builtin_ctz) + /* Fast way to count trailing zeros */ + return __builtin_ctz(val); +#elif LIBDIVIDE_VC + unsigned long result; + if (_BitScanForward(&result, val)) { + return result; + } + return 0; +#else + /* Dorky way to count trailing zeros. Note that val = 0 does not hang here: (0 ^ (0 - 1)) >> 1 is 0x7FFFFFFF, so the loop runs 31 times and returns 31. */ + int32_t result = 0; + val = (val ^ (val - 1)) >> 1; // Set v's trailing 0s to 1s and zero rest + while (val) { + val >>= 1; + result++; + } + return result; +#endif +} + +static inline int32_t libdivide__count_trailing_zeros64(uint64_t val) { +#if __LP64__ && (__GNUC__ || __has_builtin(__builtin_ctzll)) + /* Fast way to count trailing zeros. Note that we disable this in 32 bit because gcc does something horrible - it calls through to a dynamically bound function. */ + return __builtin_ctzll(val); +#elif LIBDIVIDE_VC && _WIN64 + unsigned long result; + if (_BitScanForward64(&result, val)) { + return result; + } + return 0; +#else + /* Pretty good way to count trailing zeros. Note that val = 0 does not hang here: the result is 32 plus whatever the 32 bit version returns for 0. 
*/ + uint32_t lo = val & 0xFFFFFFFF; + if (lo != 0) return libdivide__count_trailing_zeros32(lo); + return 32 + libdivide__count_trailing_zeros32(val >> 32); +#endif +} + +/* Counts the leading zero bits of a 32 bit value. The __has_builtin probe names __builtin_clz, the builtin that is actually called below (the 64 bit variant probes __builtin_clzll). Callers must pass a nonzero value: the VC path returns 0 when _BitScanReverse reports no set bit, and the portable loop never terminates for val = 0. */ +static inline int32_t libdivide__count_leading_zeros32(uint32_t val) { +#if __GNUC__ || __has_builtin(__builtin_clz) + /* Fast way to count leading zeros */ + return __builtin_clz(val); +#elif LIBDIVIDE_VC + unsigned long result; + if (_BitScanReverse(&result, val)) { + return 31 - result; + } + return 0; +#else + /* Dorky way to count leading zeros. Note that this hangs for val = 0! */ + int32_t result = 0; + while (! (val & (1U << 31))) { + val <<= 1; + result++; + } + return result; +#endif +} + +static inline int32_t libdivide__count_leading_zeros64(uint64_t val) { +#if __GNUC__ || __has_builtin(__builtin_clzll) + /* Fast way to count leading zeros */ + return __builtin_clzll(val); +#elif LIBDIVIDE_VC && _WIN64 + unsigned long result; + if (_BitScanReverse64(&result, val)) { + return 63 - result; + } + return 0; +#else + /* Dorky way to count leading zeros. Note that this hangs for val = 0! */ + int32_t result = 0; + while (! (val & (1ULL << 63))) { + val <<= 1; + result++; + } + return result; +#endif +} + +//libdivide_64_div_32_to_32: divides a 64 bit uint {u1, u0} by a 32 bit uint {v}. The result must fit in 32 bits. 
Returns the quotient directly and the remainder in *r +#if (LIBDIVIDE_IS_i386 || LIBDIVIDE_IS_X86_64) && LIBDIVIDE_GCC_STYLE_ASM +static uint32_t libdivide_64_div_32_to_32(uint32_t u1, uint32_t u0, uint32_t v, uint32_t *r) { + uint32_t result; + __asm__("divl %[v]" + : "=a"(result), "=d"(*r) + : [v] "r"(v), "a"(u0), "d"(u1) + ); + return result; +} +#else +static uint32_t libdivide_64_div_32_to_32(uint32_t u1, uint32_t u0, uint32_t v, uint32_t *r) { + uint64_t n = (((uint64_t)u1) << 32) | u0; + uint32_t result = (uint32_t)(n / v); + *r = (uint32_t)(n - result * (uint64_t)v); + return result; +} +#endif + +#if LIBDIVIDE_IS_X86_64 && LIBDIVIDE_GCC_STYLE_ASM +static uint64_t libdivide_128_div_64_to_64(uint64_t u1, uint64_t u0, uint64_t v, uint64_t *r) { + //u0 -> rax + //u1 -> rdx + //divq + uint64_t result; + __asm__("divq %[v]" + : "=a"(result), "=d"(*r) + : [v] "r"(v), "a"(u0), "d"(u1) + ); + return result; + +} +#else + +/* Code taken from Hacker's Delight, http://www.hackersdelight.org/HDcode/divlu.c . License permits inclusion here per http://www.hackersdelight.org/permissions.htm + + Divides the 128 bit value {u1, u0} (u1 is the high word) by the 64 bit value v, working in 32 bit digits. Returns the quotient and, when r is not NULL, stores the remainder in *r. If u1 >= v the quotient would not fit in 64 bits; in that case all ones is returned and, when r is not NULL, all ones is stored in *r. + */ +static uint64_t libdivide_128_div_64_to_64(uint64_t u1, uint64_t u0, uint64_t v, uint64_t *r) { + const uint64_t b = (1ULL << 32); // Number base (32 bits). + uint64_t un1, un0, // Norm. dividend LSD's. + vn1, vn0, // Norm. divisor digits. + q1, q0, // Quotient digits. + un64, un21, un10,// Dividend digit pairs. + rhat; // A remainder. + int s; // Shift amount for norm. + + if (u1 >= v) { // If overflow, set rem. + if (r != NULL) // to an impossible value, + *r = (uint64_t)(-1); // and return the largest + return (uint64_t)(-1);} // possible quotient. 
+ + /* count leading zeros */ + s = libdivide__count_leading_zeros64(v); // 0 <= s <= 63. + if (s > 0) { + v = v << s; // Normalize divisor. + /* s is 1..63 in this branch, so the shift count 64 - s is always valid and no mask on the u0 term is needed. */ + un64 = (u1 << s) | (u0 >> (64 - s)); + un10 = u0 << s; // Shift dividend left. + } else { + // Avoid undefined behavior. 
+ /* With s == 0 nothing is shifted, so no bits of u0 move into the high word: the normalized high word is u1 alone. */ + un64 = u1; + un10 = u0; + } + + vn1 = v >> 32; // Break divisor up into + vn0 = v & 0xFFFFFFFF; // two 32-bit digits. + + un1 = un10 >> 32; // Break right half of + un0 = un10 & 0xFFFFFFFF; // dividend into two digits. + + q1 = un64/vn1; // Compute the first + rhat = un64 - q1*vn1; // quotient digit, q1. +again1: + if (q1 >= b || q1*vn0 > b*rhat + un1) { + q1 = q1 - 1; + rhat = rhat + vn1; + if (rhat < b) goto again1;} + + un21 = un64*b + un1 - q1*v; // Multiply and subtract. + + q0 = un21/vn1; // Compute the second + rhat = un21 - q0*vn1; // quotient digit, q0. +again2: + if (q0 >= b || q0*vn0 > b*rhat + un0) { + q0 = q0 - 1; + rhat = rhat + vn1; + if (rhat < b) goto again2;} + + if (r != NULL) // If remainder is wanted, + *r = (un21*b + un0 - q0*v) >> s; // return it. + return q1*b + q0; +} +#endif + +#if LIBDIVIDE_ASSERTIONS_ON +#define LIBDIVIDE_ASSERT(x) do { if (! (x)) { fprintf(stderr, "Assertion failure on line %ld: %s\n", (long)__LINE__, #x); exit(-1); } } while (0) +#else +#define LIBDIVIDE_ASSERT(x) +#endif + +////////// UINT32 + +libdivide_u32_t libdivide_u32_gen(uint32_t d) { + libdivide_u32_t result; + if ((d & (d - 1)) == 0) { + result.magic = 0; + result.more = libdivide__count_trailing_zeros32(d) | LIBDIVIDE_U32_SHIFT_PATH; + } + else { + const uint32_t floor_log_2_d = 31 - libdivide__count_leading_zeros32(d); + + uint8_t more; + uint32_t rem, proposed_m = libdivide_64_div_32_to_32(1U << floor_log_2_d, 0, d, &rem); + + const uint32_t e = d - rem; + LIBDIVIDE_ASSERT(rem > 0 && rem < d); + + /* This power works if e < 2**floor_log_2_d. */ + if (e < (1U << floor_log_2_d)) { + /* This power works */ + more = floor_log_2_d; + } + else { + /* We have to use the general 33-bit algorithm. We need to compute (2**power) / d. However, we already have (2**(power-1))/d and its remainder. By doubling both, and then correcting the remainder, we can compute the larger division. 
*/ + const uint32_t twice_rem = rem + rem; + proposed_m += proposed_m; //don't care about overflow here - in fact, we expect it + if (twice_rem >= d || twice_rem < rem) proposed_m += 1; + more = floor_log_2_d | LIBDIVIDE_ADD_MARKER; + } + result.magic = 1 + proposed_m; + result.more = more; + //result.more's shift should in general be ceil_log_2_d. But if we used the smaller power, we subtract one from the shift because we're using the smaller power. If we're using the larger power, we subtract one from the shift because it's taken care of by the add indicator. So floor_log_2_d happens to be correct in both cases. + + } + return result; +} + +uint32_t libdivide_u32_do(uint32_t numer, const libdivide_u32_t *denom) { + uint8_t more = denom->more; + if (more & LIBDIVIDE_U32_SHIFT_PATH) { + return numer >> (more & LIBDIVIDE_32_SHIFT_MASK); + } + else { + uint32_t q = libdivide__mullhi_u32(denom->magic, numer); + if (more & LIBDIVIDE_ADD_MARKER) { + uint32_t t = ((numer - q) >> 1) + q; + return t >> (more & LIBDIVIDE_32_SHIFT_MASK); + } + else { + return q >> more; //all upper bits are 0 - don't need to mask them off + } + } +} + + +int libdivide_u32_get_algorithm(const libdivide_u32_t *denom) { + uint8_t more = denom->more; + if (more & LIBDIVIDE_U32_SHIFT_PATH) return 0; + else if (! 
(more & LIBDIVIDE_ADD_MARKER)) return 1; + else return 2; +} + +uint32_t libdivide_u32_do_alg0(uint32_t numer, const libdivide_u32_t *denom) { + return numer >> (denom->more & LIBDIVIDE_32_SHIFT_MASK); +} + +uint32_t libdivide_u32_do_alg1(uint32_t numer, const libdivide_u32_t *denom) { + uint32_t q = libdivide__mullhi_u32(denom->magic, numer); + return q >> denom->more; +} + +uint32_t libdivide_u32_do_alg2(uint32_t numer, const libdivide_u32_t *denom) { + // denom->add != 0 + uint32_t q = libdivide__mullhi_u32(denom->magic, numer); + uint32_t t = ((numer - q) >> 1) + q; + return t >> (denom->more & LIBDIVIDE_32_SHIFT_MASK); +} + + + + +#if LIBDIVIDE_USE_SSE2 +__m128i libdivide_u32_do_vector(__m128i numers, const libdivide_u32_t *denom) { + uint8_t more = denom->more; + if (more & LIBDIVIDE_U32_SHIFT_PATH) { + return _mm_srl_epi32(numers, libdivide_u32_to_m128i(more & LIBDIVIDE_32_SHIFT_MASK)); + } + else { + __m128i q = libdivide__mullhi_u32_flat_vector(numers, _mm_set1_epi32(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + //uint32_t t = ((numer - q) >> 1) + q; + //return t >> denom->shift; + __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q); + return _mm_srl_epi32(t, libdivide_u32_to_m128i(more & LIBDIVIDE_32_SHIFT_MASK)); + + } + else { + //q >> denom->shift + return _mm_srl_epi32(q, libdivide_u32_to_m128i(more)); + } + } +} + +__m128i libdivide_u32_do_vector_alg0(__m128i numers, const libdivide_u32_t *denom) { + return _mm_srl_epi32(numers, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_32_SHIFT_MASK)); +} + +__m128i libdivide_u32_do_vector_alg1(__m128i numers, const libdivide_u32_t *denom) { + __m128i q = libdivide__mullhi_u32_flat_vector(numers, _mm_set1_epi32(denom->magic)); + return _mm_srl_epi32(q, libdivide_u32_to_m128i(denom->more)); +} + +__m128i libdivide_u32_do_vector_alg2(__m128i numers, const libdivide_u32_t *denom) { + __m128i q = libdivide__mullhi_u32_flat_vector(numers, _mm_set1_epi32(denom->magic)); + __m128i t = 
_mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q); + return _mm_srl_epi32(t, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_32_SHIFT_MASK)); +} + +#endif + +/////////// UINT64 + +libdivide_u64_t libdivide_u64_gen(uint64_t d) { + libdivide_u64_t result; + if ((d & (d - 1)) == 0) { + result.more = libdivide__count_trailing_zeros64(d) | LIBDIVIDE_U64_SHIFT_PATH; + result.magic = 0; + } + else { + const uint32_t floor_log_2_d = 63 - libdivide__count_leading_zeros64(d); + + uint64_t rem, proposed_m = libdivide_128_div_64_to_64(1ULL << floor_log_2_d, 0, d, &rem); //== (1 << (64 + floor_log_2_d)) / d + uint8_t more; + const uint64_t e = d - rem; + + LIBDIVIDE_ASSERT(rem > 0 && rem < d); + + /* This power works if e < 2**floor_log_2_d. */ + if (e < (1ULL << floor_log_2_d)) { + /* This power works */ + more = floor_log_2_d; + } + else { + /* We have to use the general 65-bit algorithm. We need to compute (2**power) / d. However, we already have (2**(power-1))/d and its remainder. By doubling both, and then correcting the remainder, we can compute the larger division. */ + const uint64_t twice_rem = rem + rem; + proposed_m += proposed_m; //don't care about overflow here - in fact, we expect it + if (twice_rem >= d || twice_rem < rem) proposed_m += 1; + more = floor_log_2_d | LIBDIVIDE_ADD_MARKER; + } + result.magic = 1 + proposed_m; + result.more = more; + //result.more's shift should in general be ceil_log_2_d. But if we used the smaller power, we subtract one from the shift because we're using the smaller power. If we're using the larger power, we subtract one from the shift because it's taken care of by the add indicator. So floor_log_2_d happens to be correct in both cases, which is why we do it outside of the if statement. 
+ } + return result; +} + +uint64_t libdivide_u64_do(uint64_t numer, const libdivide_u64_t *denom) { + uint8_t more = denom->more; + if (more & LIBDIVIDE_U64_SHIFT_PATH) { + return numer >> (more & LIBDIVIDE_64_SHIFT_MASK); + } + else { + uint64_t q = libdivide__mullhi_u64(denom->magic, numer); + if (more & LIBDIVIDE_ADD_MARKER) { + uint64_t t = ((numer - q) >> 1) + q; + return t >> (more & LIBDIVIDE_64_SHIFT_MASK); + } + else { + return q >> more; //all upper bits are 0 - don't need to mask them off + } + } +} + + +int libdivide_u64_get_algorithm(const libdivide_u64_t *denom) { + uint8_t more = denom->more; + if (more & LIBDIVIDE_U64_SHIFT_PATH) return 0; + else if (! (more & LIBDIVIDE_ADD_MARKER)) return 1; + else return 2; +} + +uint64_t libdivide_u64_do_alg0(uint64_t numer, const libdivide_u64_t *denom) { + return numer >> (denom->more & LIBDIVIDE_64_SHIFT_MASK); +} + +uint64_t libdivide_u64_do_alg1(uint64_t numer, const libdivide_u64_t *denom) { + uint64_t q = libdivide__mullhi_u64(denom->magic, numer); + return q >> denom->more; +} + +uint64_t libdivide_u64_do_alg2(uint64_t numer, const libdivide_u64_t *denom) { + uint64_t q = libdivide__mullhi_u64(denom->magic, numer); + uint64_t t = ((numer - q) >> 1) + q; + return t >> (denom->more & LIBDIVIDE_64_SHIFT_MASK); +} + +#if LIBDIVIDE_USE_SSE2 +__m128i libdivide_u64_do_vector(__m128i numers, const libdivide_u64_t * denom) { + uint8_t more = denom->more; + if (more & LIBDIVIDE_U64_SHIFT_PATH) { + return _mm_srl_epi64(numers, libdivide_u32_to_m128i(more & LIBDIVIDE_64_SHIFT_MASK)); + } + else { + __m128i q = libdivide_mullhi_u64_flat_vector(numers, libdivide__u64_to_m128(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + //uint32_t t = ((numer - q) >> 1) + q; + //return t >> denom->shift; + __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q); + return _mm_srl_epi64(t, libdivide_u32_to_m128i(more & LIBDIVIDE_64_SHIFT_MASK)); + } + else { + //q >> denom->shift + return _mm_srl_epi64(q, 
libdivide_u32_to_m128i(more)); + } + } +} + +__m128i libdivide_u64_do_vector_alg0(__m128i numers, const libdivide_u64_t *denom) { + return _mm_srl_epi64(numers, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_64_SHIFT_MASK)); +} + +__m128i libdivide_u64_do_vector_alg1(__m128i numers, const libdivide_u64_t *denom) { + __m128i q = libdivide_mullhi_u64_flat_vector(numers, libdivide__u64_to_m128(denom->magic)); + return _mm_srl_epi64(q, libdivide_u32_to_m128i(denom->more)); +} + +__m128i libdivide_u64_do_vector_alg2(__m128i numers, const libdivide_u64_t *denom) { + __m128i q = libdivide_mullhi_u64_flat_vector(numers, libdivide__u64_to_m128(denom->magic)); + __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q); + return _mm_srl_epi64(t, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_64_SHIFT_MASK)); +} + + +#endif + +/////////// SINT32 + + +static inline int32_t libdivide__mullhi_s32(int32_t x, int32_t y) { + int64_t xl = x, yl = y; + int64_t rl = xl * yl; + return (int32_t)(rl >> 32); //needs to be arithmetic shift +} + +libdivide_s32_t libdivide_s32_gen(int32_t d) { + libdivide_s32_t result; + + /* If d is a power of 2, or negative a power of 2, we have to use a shift. This is especially important because the magic algorithm fails for -1. To check if d is a power of 2 or its inverse, it suffices to check whether its absolute value has exactly one bit set. This works even for INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set and is a power of 2. */ + uint32_t absD = (uint32_t)(d < 0 ? -d : d); //gcc optimizes this to the fast abs trick + if ((absD & (absD - 1)) == 0) { //check if exactly one bit is set, don't care if absD is 0 since that's divide by zero + result.magic = 0; + result.more = libdivide__count_trailing_zeros32(absD) | (d < 0 ? 
LIBDIVIDE_NEGATIVE_DIVISOR : 0) | LIBDIVIDE_S32_SHIFT_PATH; + } + else { + const uint32_t floor_log_2_d = 31 - libdivide__count_leading_zeros32(absD); + + uint8_t more; + //the dividend here is 2**(floor_log_2_d + 31), so the low 32 bit word is 0 and the high word is floor_log_2_d - 1 + uint32_t rem, proposed_m = libdivide_64_div_32_to_32(1U << (floor_log_2_d - 1), 0, absD, &rem); + const uint32_t e = absD - rem; + LIBDIVIDE_ASSERT(floor_log_2_d >= 1); + + /* We are going to start with a power of floor_log_2_d - 1. This works if works if e < 2**floor_log_2_d. */ + if (e < (1U << floor_log_2_d)) { + /* This power works */ + more = floor_log_2_d - 1; + } + else { + /* We need to go one higher. This should not make proposed_m overflow, but it will make it negative when interpreted as an int32_t. */ + const uint32_t twice_rem = rem + rem; + proposed_m += proposed_m; + if (twice_rem >= absD || twice_rem < rem) proposed_m += 1; + more = floor_log_2_d | LIBDIVIDE_ADD_MARKER | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0); //use the general algorithm + } + proposed_m += 1; + result.magic = (d < 0 ? 
-(int32_t)proposed_m : (int32_t)proposed_m); + result.more = more; + + } + return result; +} + +int32_t libdivide_s32_do(int32_t numer, const libdivide_s32_t *denom) { + uint8_t more = denom->more; + if (more & LIBDIVIDE_S32_SHIFT_PATH) { + uint8_t shifter = more & LIBDIVIDE_32_SHIFT_MASK; + int32_t q = (numer + ((numer >> 31) & ((1 << shifter) - 1))) >> shifter; + int32_t shiftMask = (int8_t)more >> 7; //must be arithmetic shift and then sign-extend + q = (q ^ shiftMask) - shiftMask; + return q; + } + else { + int32_t q = libdivide__mullhi_s32(denom->magic, numer); + if (more & LIBDIVIDE_ADD_MARKER) { + int32_t sign = (int8_t)more >> 7; //must be arithmetic shift and then sign extend + q += ((numer ^ sign) - sign); + } + q >>= more & LIBDIVIDE_32_SHIFT_MASK; + q += (q < 0); + return q; + } +} + +int libdivide_s32_get_algorithm(const libdivide_s32_t *denom) { + uint8_t more = denom->more; + int positiveDivisor = ! (more & LIBDIVIDE_NEGATIVE_DIVISOR); + if (more & LIBDIVIDE_S32_SHIFT_PATH) return (positiveDivisor ? 0 : 1); + else if (more & LIBDIVIDE_ADD_MARKER) return (positiveDivisor ? 
2 : 3); + else return 4; +} + +int32_t libdivide_s32_do_alg0(int32_t numer, const libdivide_s32_t *denom) { + uint8_t shifter = denom->more & LIBDIVIDE_32_SHIFT_MASK; + int32_t q = numer + ((numer >> 31) & ((1 << shifter) - 1)); + return q >> shifter; +} + +int32_t libdivide_s32_do_alg1(int32_t numer, const libdivide_s32_t *denom) { + uint8_t shifter = denom->more & LIBDIVIDE_32_SHIFT_MASK; + int32_t q = numer + ((numer >> 31) & ((1 << shifter) - 1)); + return - (q >> shifter); +} + +int32_t libdivide_s32_do_alg2(int32_t numer, const libdivide_s32_t *denom) { + int32_t q = libdivide__mullhi_s32(denom->magic, numer); + q += numer; + q >>= denom->more & LIBDIVIDE_32_SHIFT_MASK; + q += (q < 0); + return q; +} + +int32_t libdivide_s32_do_alg3(int32_t numer, const libdivide_s32_t *denom) { + int32_t q = libdivide__mullhi_s32(denom->magic, numer); + q -= numer; + q >>= denom->more & LIBDIVIDE_32_SHIFT_MASK; + q += (q < 0); + return q; +} + +int32_t libdivide_s32_do_alg4(int32_t numer, const libdivide_s32_t *denom) { + int32_t q = libdivide__mullhi_s32(denom->magic, numer); + q >>= denom->more & LIBDIVIDE_32_SHIFT_MASK; + q += (q < 0); + return q; +} + +#if LIBDIVIDE_USE_SSE2 +__m128i libdivide_s32_do_vector(__m128i numers, const libdivide_s32_t * denom) { + uint8_t more = denom->more; + if (more & LIBDIVIDE_S32_SHIFT_PATH) { + uint32_t shifter = more & LIBDIVIDE_32_SHIFT_MASK; + __m128i roundToZeroTweak = _mm_set1_epi32((1 << shifter) - 1); //could use _mm_srli_epi32 with an all -1 register + __m128i q = _mm_add_epi32(numers, _mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak)); //q = numer + ((numer >> 31) & roundToZeroTweak); + q = _mm_sra_epi32(q, libdivide_u32_to_m128i(shifter)); // q = q >> shifter + __m128i shiftMask = _mm_set1_epi32((int32_t)((int8_t)more >> 7)); //set all bits of shift mask = to the sign bit of more + q = _mm_sub_epi32(_mm_xor_si128(q, shiftMask), shiftMask); //q = (q ^ shiftMask) - shiftMask; + return q; + } + else { + __m128i q = 
libdivide_mullhi_s32_flat_vector(numers, _mm_set1_epi32(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + __m128i sign = _mm_set1_epi32((int32_t)(int8_t)more >> 7); //must be arithmetic shift + q = _mm_add_epi32(q, _mm_sub_epi32(_mm_xor_si128(numers, sign), sign)); // q += ((numer ^ sign) - sign); + } + q = _mm_sra_epi32(q, libdivide_u32_to_m128i(more & LIBDIVIDE_32_SHIFT_MASK)); //q >>= shift + q = _mm_add_epi32(q, _mm_srli_epi32(q, 31)); // q += (q < 0) + return q; + } +} + +__m128i libdivide_s32_do_vector_alg0(__m128i numers, const libdivide_s32_t *denom) { + uint8_t shifter = denom->more & LIBDIVIDE_32_SHIFT_MASK; + __m128i roundToZeroTweak = _mm_set1_epi32((1 << shifter) - 1); + __m128i q = _mm_add_epi32(numers, _mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak)); + return _mm_sra_epi32(q, libdivide_u32_to_m128i(shifter)); +} + +__m128i libdivide_s32_do_vector_alg1(__m128i numers, const libdivide_s32_t *denom) { + uint8_t shifter = denom->more & LIBDIVIDE_32_SHIFT_MASK; + __m128i roundToZeroTweak = _mm_set1_epi32((1 << shifter) - 1); + __m128i q = _mm_add_epi32(numers, _mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak)); + return _mm_sub_epi32(_mm_setzero_si128(), _mm_sra_epi32(q, libdivide_u32_to_m128i(shifter))); +} + +__m128i libdivide_s32_do_vector_alg2(__m128i numers, const libdivide_s32_t *denom) { + __m128i q = libdivide_mullhi_s32_flat_vector(numers, _mm_set1_epi32(denom->magic)); + q = _mm_add_epi32(q, numers); + q = _mm_sra_epi32(q, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_32_SHIFT_MASK)); + q = _mm_add_epi32(q, _mm_srli_epi32(q, 31)); + return q; +} + +__m128i libdivide_s32_do_vector_alg3(__m128i numers, const libdivide_s32_t *denom) { + __m128i q = libdivide_mullhi_s32_flat_vector(numers, _mm_set1_epi32(denom->magic)); + q = _mm_sub_epi32(q, numers); + q = _mm_sra_epi32(q, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_32_SHIFT_MASK)); + q = _mm_add_epi32(q, _mm_srli_epi32(q, 31)); + return q; +} + +__m128i 
libdivide_s32_do_vector_alg4(__m128i numers, const libdivide_s32_t *denom) { + __m128i q = libdivide_mullhi_s32_flat_vector(numers, _mm_set1_epi32(denom->magic)); + q = _mm_sra_epi32(q, libdivide_u32_to_m128i(denom->more)); //q >>= shift + q = _mm_add_epi32(q, _mm_srli_epi32(q, 31)); // q += (q < 0) + return q; +} +#endif + +///////////// SINT64 + + +libdivide_s64_t libdivide_s64_gen(int64_t d) { + libdivide_s64_t result; + + /* If d is a power of 2, or negative a power of 2, we have to use a shift. This is especially important because the magic algorithm fails for -1. To check if d is a power of 2 or its inverse, it suffices to check whether its absolute value has exactly one bit set. This works even for INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set and is a power of 2. */ + const uint64_t absD = (uint64_t)(d < 0 ? -d : d); //gcc optimizes this to the fast abs trick + if ((absD & (absD - 1)) == 0) { //check if exactly one bit is set, don't care if absD is 0 since that's divide by zero + result.more = libdivide__count_trailing_zeros64(absD) | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0); + result.magic = 0; + } + else { + const uint32_t floor_log_2_d = 63 - libdivide__count_leading_zeros64(absD); + + //the dividend here is 2**(floor_log_2_d + 63), so the low 64 bit word is 0 and the high word is floor_log_2_d - 1 + uint8_t more; + uint64_t rem, proposed_m = libdivide_128_div_64_to_64(1ULL << (floor_log_2_d - 1), 0, absD, &rem); + const uint64_t e = absD - rem; + + /* We are going to start with a power of floor_log_2_d - 1. This works if works if e < 2**floor_log_2_d. */ + if (e < (1ULL << floor_log_2_d)) { + /* This power works */ + more = floor_log_2_d - 1; + } + else { + /* We need to go one higher. This should not make proposed_m overflow, but it will make it negative when interpreted as an int32_t. 
*/ + const uint64_t twice_rem = rem + rem; + proposed_m += proposed_m; + if (twice_rem >= absD || twice_rem < rem) proposed_m += 1; + more = floor_log_2_d | LIBDIVIDE_ADD_MARKER | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0); + } + proposed_m += 1; + result.more = more; + result.magic = (d < 0 ? -(int64_t)proposed_m : (int64_t)proposed_m); + } + return result; +} + +int64_t libdivide_s64_do(int64_t numer, const libdivide_s64_t *denom) { + uint8_t more = denom->more; + int64_t magic = denom->magic; + if (magic == 0) { //shift path + uint32_t shifter = more & LIBDIVIDE_64_SHIFT_MASK; + int64_t q = (numer + ((numer >> 63) & ((1LL << shifter) - 1))) >> shifter; + int64_t shiftMask = (int8_t)more >> 7; //must be arithmetic shift and then sign-extend + q = (q ^ shiftMask) - shiftMask; + return q; + } + else { + int64_t q = libdivide__mullhi_s64(magic, numer); + if (more & LIBDIVIDE_ADD_MARKER) { + int64_t sign = (int8_t)more >> 7; //must be arithmetic shift and then sign extend + q += ((numer ^ sign) - sign); + } + q >>= more & LIBDIVIDE_64_SHIFT_MASK; + q += (q < 0); + return q; + } +} + + +int libdivide_s64_get_algorithm(const libdivide_s64_t *denom) { + uint8_t more = denom->more; + int positiveDivisor = ! (more & LIBDIVIDE_NEGATIVE_DIVISOR); + if (denom->magic == 0) return (positiveDivisor ? 0 : 1); //shift path + else if (more & LIBDIVIDE_ADD_MARKER) return (positiveDivisor ? 
2 : 3); + else return 4; +} + +int64_t libdivide_s64_do_alg0(int64_t numer, const libdivide_s64_t *denom) { + uint32_t shifter = denom->more & LIBDIVIDE_64_SHIFT_MASK; + int64_t q = numer + ((numer >> 63) & ((1LL << shifter) - 1)); + return q >> shifter; +} + +int64_t libdivide_s64_do_alg1(int64_t numer, const libdivide_s64_t *denom) { + //denom->shifter != -1 && demo->shiftMask != 0 + uint32_t shifter = denom->more & LIBDIVIDE_64_SHIFT_MASK; + int64_t q = numer + ((numer >> 63) & ((1LL << shifter) - 1)); + return - (q >> shifter); +} + +int64_t libdivide_s64_do_alg2(int64_t numer, const libdivide_s64_t *denom) { + int64_t q = libdivide__mullhi_s64(denom->magic, numer); + q += numer; + q >>= denom->more & LIBDIVIDE_64_SHIFT_MASK; + q += (q < 0); + return q; +} + +int64_t libdivide_s64_do_alg3(int64_t numer, const libdivide_s64_t *denom) { + int64_t q = libdivide__mullhi_s64(denom->magic, numer); + q -= numer; + q >>= denom->more & LIBDIVIDE_64_SHIFT_MASK; + q += (q < 0); + return q; +} + +int64_t libdivide_s64_do_alg4(int64_t numer, const libdivide_s64_t *denom) { + int64_t q = libdivide__mullhi_s64(denom->magic, numer); + q >>= denom->more; + q += (q < 0); + return q; +} + + +#if LIBDIVIDE_USE_SSE2 +__m128i libdivide_s64_do_vector(__m128i numers, const libdivide_s64_t * denom) { + uint8_t more = denom->more; + int64_t magic = denom->magic; + if (magic == 0) { //shift path + uint32_t shifter = more & LIBDIVIDE_64_SHIFT_MASK; + __m128i roundToZeroTweak = libdivide__u64_to_m128((1LL << shifter) - 1); + __m128i q = _mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits(numers), roundToZeroTweak)); //q = numer + ((numer >> 63) & roundToZeroTweak); + q = libdivide_s64_shift_right_vector(q, shifter); // q = q >> shifter + __m128i shiftMask = _mm_set1_epi32((int32_t)((int8_t)more >> 7)); + q = _mm_sub_epi64(_mm_xor_si128(q, shiftMask), shiftMask); //q = (q ^ shiftMask) - shiftMask; + return q; + } + else { + __m128i q = libdivide_mullhi_s64_flat_vector(numers, 
libdivide__u64_to_m128(magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + __m128i sign = _mm_set1_epi32((int32_t)((int8_t)more >> 7)); //must be arithmetic shift + q = _mm_add_epi64(q, _mm_sub_epi64(_mm_xor_si128(numers, sign), sign)); // q += ((numer ^ sign) - sign); + } + q = libdivide_s64_shift_right_vector(q, more & LIBDIVIDE_64_SHIFT_MASK); //q >>= denom->mult_path.shift + q = _mm_add_epi64(q, _mm_srli_epi64(q, 63)); // q += (q < 0) + return q; + } +} + +__m128i libdivide_s64_do_vector_alg0(__m128i numers, const libdivide_s64_t *denom) { + uint32_t shifter = denom->more & LIBDIVIDE_64_SHIFT_MASK; + __m128i roundToZeroTweak = libdivide__u64_to_m128((1LL << shifter) - 1); + __m128i q = _mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits(numers), roundToZeroTweak)); + q = libdivide_s64_shift_right_vector(q, shifter); + return q; +} + +__m128i libdivide_s64_do_vector_alg1(__m128i numers, const libdivide_s64_t *denom) { + uint32_t shifter = denom->more & LIBDIVIDE_64_SHIFT_MASK; + __m128i roundToZeroTweak = libdivide__u64_to_m128((1LL << shifter) - 1); + __m128i q = _mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits(numers), roundToZeroTweak)); + q = libdivide_s64_shift_right_vector(q, shifter); + return _mm_sub_epi64(_mm_setzero_si128(), q); +} + +__m128i libdivide_s64_do_vector_alg2(__m128i numers, const libdivide_s64_t *denom) { + __m128i q = libdivide_mullhi_s64_flat_vector(numers, libdivide__u64_to_m128(denom->magic)); + q = _mm_add_epi64(q, numers); + q = libdivide_s64_shift_right_vector(q, denom->more & LIBDIVIDE_64_SHIFT_MASK); + q = _mm_add_epi64(q, _mm_srli_epi64(q, 63)); // q += (q < 0) + return q; +} + +__m128i libdivide_s64_do_vector_alg3(__m128i numers, const libdivide_s64_t *denom) { + __m128i q = libdivide_mullhi_s64_flat_vector(numers, libdivide__u64_to_m128(denom->magic)); + q = _mm_sub_epi64(q, numers); + q = libdivide_s64_shift_right_vector(q, denom->more & LIBDIVIDE_64_SHIFT_MASK); + q = _mm_add_epi64(q, _mm_srli_epi64(q, 63)); // q 
+= (q < 0) + return q; +} + +__m128i libdivide_s64_do_vector_alg4(__m128i numers, const libdivide_s64_t *denom) { + __m128i q = libdivide_mullhi_s64_flat_vector(numers, libdivide__u64_to_m128(denom->magic)); + q = libdivide_s64_shift_right_vector(q, denom->more); + q = _mm_add_epi64(q, _mm_srli_epi64(q, 63)); + return q; +} + +#endif + + +#endif //LIBDIVIDE_HEADER_ONLY diff --git a/polymer/eduke32/build/include/pragmas.h b/polymer/eduke32/build/include/pragmas.h index 98985742d..1cffc97c9 100644 --- a/polymer/eduke32/build/include/pragmas.h +++ b/polymer/eduke32/build/include/pragmas.h @@ -30,39 +30,93 @@ extern int32_t dmval; #define wo(x) ((int16_t)(x)) // word cast #define by(x) ((uint8_t)(x)) // byte cast -// XXX: Only for testing on x86. Don't use from outside; it doesn't account for -// whether we're compiling for e.g. x86_64 which will never use asm anyway. -//#define USE_ASM_DIVSCALE +#define LIBDIVIDE_ALWAYS +#define DIVTABLESIZE 16384 + +extern libdivide_s64pad_t divtable64[DIVTABLESIZE]; +extern libdivide_s32pad_t divtable32[DIVTABLESIZE]; + +#if defined(__arm__) || defined(LIBDIVIDE_ALWAYS) +static inline uint32_t divideu32(uint32_t n, uint32_t d) +{ + static libdivide_u32_t udiv; + static uint32_t lastd; + + if (d == lastd) + goto skip; + + lastd = d; + udiv = libdivide_u32_gen(d); +skip: + return libdivide_u32_do(n, &udiv); +} + +static inline int32_t tabledivide64(int64_t n, int32_t d) +{ + static libdivide_s64_t sdiv; + static int32_t lastd; + libdivide_s64_t *dptr = ((unsigned) d < DIVTABLESIZE) ? (libdivide_s64_t *)&divtable64[d] : &sdiv; + + if (d == lastd || dptr != &sdiv) + goto skip; + + lastd = d; + sdiv = libdivide_s64_gen(d); +skip: + return libdivide_s64_do(n, dptr); +} + +static inline int32_t tabledivide32(int32_t n, int32_t d) +{ + static libdivide_s32_t sdiv; + static int32_t lastd; + libdivide_s32_t *dptr = ((unsigned) d < DIVTABLESIZE) ? 
(libdivide_s32_t *)&divtable32[d] : &sdiv; + + if (d == lastd || dptr != &sdiv) + goto skip; + + lastd = d; + sdiv = libdivide_s32_gen(d); +skip: + return libdivide_s32_do(n, dptr); +} +#else +static inline uint32_t divideu32(uint32_t n, uint32_t d) { return n / d; } + +static inline int32_t tabledivide64(int64_t n, int32_t d) { return ((unsigned) d < DIVTABLESIZE) ? + libdivide_s64_do(n, (libdivide_s64_t *) &divtable64[d]) : n / d; } + +static inline int32_t tabledivide32(int32_t n, int32_t d) { return ((unsigned) d < DIVTABLESIZE) ? + libdivide_s32_do(n, (libdivide_s32_t *) &divtable32[d]) : n / d; } +#endif + +extern uint32_t divideu32_noinline(uint32_t n, uint32_t d); +extern int32_t tabledivide32_noinline(int32_t n, int32_t d); +extern int32_t tabledivide64_noinline(int64_t n, int32_t d); -#if !defined USE_ASM_DIVSCALE #ifdef GEKKO #include static inline int32_t divscale(int32_t eax, int32_t ebx, int32_t ecx) { - return ldexp(eax, ecx) / ebx; + return tabledivide64(ldexp(eax, ecx), ebx); } - -# define _scaler(a) \ - static inline int32_t divscale##a(int32_t eax, int32_t ebx) \ -{ \ - return divscale(eax, ebx, a); \ -} \ - #else -static inline int32_t divscale(int32_t eax, int32_t ebx, int32_t ecx) { return dw((qw(eax) << by(ecx)) / ebx); } - -# define _scaler(a) \ - static inline int32_t divscale##a(int32_t eax, int32_t ebx) \ -{ \ - return dw((qw(eax) << by(a)) / ebx); \ -} \ - +static inline int32_t divscale(int32_t eax, int32_t ebx, int32_t ecx) +{ + const int64_t numer = qw(eax) << by(ecx); + return dw(tabledivide64(numer, ebx)); +} #endif +# define _scaler(a) static inline int32_t divscale##a(int32_t eax, int32_t ebx) { return divscale(eax, ebx, a); } PRAGMA_FUNCS _scaler(32) - #undef _scaler -#endif // !defined USE_ASM_DIVSCALE + +static inline int32_t scale(int32_t eax, int32_t edx, int32_t ecx) +{ + const int64_t numer = qw(eax) * edx; + return dw(tabledivide64(numer, ecx)); +} #if defined(__GNUC__) && defined(GEKKO) @@ -115,11 +169,6 @@ static inline 
void swap64bit(void* a, void* b) { int64_t t = *((int64_t*)b); *(( static inline char readpixel(void* s) { return (*((char*)(s))); } static inline void drawpixel(void* s, char a) { *((char*)(s)) = a; } -static inline void drawpixels(void* s, int16_t a) { *((int16_t*)(s)) = a; } -static inline void drawpixelses(void* s, int32_t a) { *((int32_t*)(s)) = a; } - -static inline int32_t divmod(int32_t a, int32_t b) { uint32_t _a=(uint32_t)a, _b=(uint32_t)b; dmval = _a%_b; return _a/_b; } -static inline int32_t moddiv(int32_t a, int32_t b) { uint32_t _a=(uint32_t)a, _b=(uint32_t)b; dmval = _a/_b; return _a%_b; } static inline int32_t klabs(int32_t a) { const uint32_t m = a >> (sizeof(int) * CHAR_BIT - 1); return (a ^ m) - m; } static inline int32_t ksgn(int32_t a) { return (a>0)-(a<0); } @@ -130,7 +179,6 @@ static inline int32_t kmin(int32_t a, int32_t b) { if ((int32_t)a < (int32_t)b) static inline int32_t kmax(int32_t a, int32_t b) { if ((int32_t)a < (int32_t)b) return b; return a; } static inline int32_t sqr(int32_t eax) { return (eax) * (eax); } -static inline int32_t scale(int32_t eax, int32_t edx, int32_t ecx) { return dw((qw(eax) * edx) / ecx); } static inline int32_t mulscale(int32_t eax, int32_t edx, int32_t ecx) { return dw((qw(eax) * edx) >> by(ecx)); } static inline int32_t dmulscale(int32_t eax, int32_t edx, int32_t esi, int32_t edi, int32_t ecx) { return dw(((qw(eax) * edx) + (qw(esi) * edi)) >> by(ecx)); } @@ -155,9 +203,15 @@ void copybufreverse(const void *S, void *D, int32_t c); static inline void swapbufreverse(void *s, void *d, int32_t c) { uint8_t *src = (uint8_t*)s, *dst = (uint8_t*)d; - while (c--) { - swapchar(dst++, src--); - } + do + { + swapchar(dst, src); + swapchar(dst+1, src-1); + swapchar(dst+2, src-2); + swapchar(dst+3, src-3); + dst += 4, src -= 4; + } while (--c > 4); + while (c--) swapchar(dst++, src--); } #ifdef EXTERNC diff --git a/polymer/eduke32/build/include/pragmas_arm.h b/polymer/eduke32/build/include/pragmas_arm.h index 
a6e1b7749..17db9fe82 100644 --- a/polymer/eduke32/build/include/pragmas_arm.h +++ b/polymer/eduke32/build/include/pragmas_arm.h @@ -5,7 +5,6 @@ #ifndef __pragmas_arm_h__ #define __pragmas_arm_h__ -// TODO: implement libdivide.h #define _scaler(a) \ static inline int32_t mulscale##a(int32_t eax, int32_t edx) \ { \ @@ -30,11 +29,6 @@ static inline void swap64bit(void* a, void* b) { int64_t t = *((int64_t*) b); *( static inline char readpixel(void* s) { return (*((char*) (s))); } static inline void drawpixel(void* s, char a) { *((char*) (s)) = a; } -static inline void drawpixels(void* s, int16_t a) { *((int16_t*) (s)) = a; } -static inline void drawpixelses(void* s, int32_t a) { *((int32_t*) (s)) = a; } - -static inline int32_t divmod(int32_t a, int32_t b) { uint32_t _a=(uint32_t) a, _b=(uint32_t) b; dmval = _a%_b; return _a/_b; } -static inline int32_t moddiv(int32_t a, int32_t b) { uint32_t _a=(uint32_t) a, _b=(uint32_t) b; dmval = _a/_b; return _a%_b; } static inline int32_t klabs(int32_t a) { const uint32_t m = a >> (sizeof(int) * CHAR_BIT - 1); return (a ^ m) - m; } static inline int32_t ksgn(int32_t a) { return (a>0)-(a<0); } @@ -45,7 +39,6 @@ static inline int32_t kmin(int32_t a, int32_t b) { if ((int32_t) a < (int32_t) b static inline int32_t kmax(int32_t a, int32_t b) { if ((int32_t) a < (int32_t) b) return b; return a; } static inline int32_t sqr(int32_t eax) { return (eax) * (eax); } -static inline int32_t scale(int32_t eax, int32_t edx, int32_t ecx) { return dw((qw(eax) * qw(edx)) / qw(ecx)); } static inline int32_t mulscale(int32_t eax, int32_t edx, int32_t ecx) { return dw((qw(eax) * qw(edx)) >> by(ecx)); } static inline int32_t dmulscale(int32_t eax, int32_t edx, int32_t esi, int32_t edi, int32_t ecx) { return dw(((qw(eax) * qw(edx)) + (qw(esi) * qw(edi))) >> by(ecx)); } diff --git a/polymer/eduke32/build/include/pragmas_ppc.h b/polymer/eduke32/build/include/pragmas_ppc.h index 5e7462510..dc490e66b 100644 --- 
a/polymer/eduke32/build/include/pragmas_ppc.h +++ b/polymer/eduke32/build/include/pragmas_ppc.h @@ -6,8 +6,6 @@ #define sqr(a) ((a)*(a)) -int32_t scale(int32_t a, int32_t d, int32_t c); - #define _scaler(x) \ static inline int32_t mulscale##x(int32_t a, int32_t d) \ { \ @@ -118,26 +116,6 @@ static inline void drawpixel(void *d, char a) *(char*) d = a; } -static inline void drawpixels(void *d, int16_t a) -{ - __asm__( - " sthbrx %0, 0, %1\n" - : - : "r"(&a), "r"(d) - : "memory" - ); -} - -static inline void drawpixelses(void *d, int32_t a) -{ - __asm__( - " stwbrx %0, 0, %1\n" - : - : "r"(&a), "r"(d) - : "memory" - ); -} - void clearbufbyte(void *d, int32_t c, int32_t a); static inline void clearbuf(void *d, int32_t c, int32_t a) @@ -266,32 +244,6 @@ static inline void swap64bit(void *a, void *b) *(double*) b = t; } -static inline int32_t divmod(int32_t a, int32_t b) -{ - int32_t div; - __asm__( - " divwu %0, %2, %3\n" - " mullw %1, %0, %3\n" - " subf %1, %1, %2\n" - : "=&r"(div), "=&r"(dmval) - : "r"(a), "r"(b) - ); - return div; -} - -static inline int32_t moddiv(int32_t a, int32_t b) -{ - int32_t mod; - __asm__( - " divwu %0, %2, %3\n" - " mullw %1, %0, %3\n" - " subf %1, %1, %2\n" - : "=&r"(dmval), "=&r"(mod) - : "r"(a), "r"(b) - ); - return mod; -} - static inline int32_t umin(int32_t a, int32_t b) { if ((uint32_t) a < (uint32_t) b) return a; return b; } static inline int32_t umax(int32_t a, int32_t b) { if ((uint32_t) a < (uint32_t) b) return b; return a; } static inline int32_t kmin(int32_t a, int32_t b) { if ((int32_t) a < (int32_t) b) return a; return b; } diff --git a/polymer/eduke32/build/include/pragmas_x86_gcc.h b/polymer/eduke32/build/include/pragmas_x86_gcc.h index 3ea80bd62..77fd0019e 100644 --- a/polymer/eduke32/build/include/pragmas_x86_gcc.h +++ b/polymer/eduke32/build/include/pragmas_x86_gcc.h @@ -33,13 +33,6 @@ void copybufreverse(const void *S, void *D, int32_t c); #define sqr(a) __builtin_sqr(a) #endif -#define scale(a,d,c) \ - ({ int32_t 
__a=(a), __d=(d), __c=(c); \ - __asm__ __volatile__ ("imull %%edx; idivl %%ecx" \ - : "=a" (__a), "=d" (__d) \ - : "0" (__a), "1" (__d), "c" (__c) : "cc"); \ - __a; }) - #define mulscale(a,d,c) \ ({ int32_t __a=(a), __d=(d), __c=(c); \ __asm__ __volatile__ ("imull %%edx; shrdl %%cl, %%edx, %%eax" \ @@ -471,174 +464,6 @@ void copybufreverse(const void *S, void *D, int32_t c); : "a" (__a), "d" (__d), "S" (__S), "D" (__D) : "ebx", "cc"); \ __d; }) -#ifdef USE_ASM_DIVSCALE -#define divscale(a,b,c) \ - ({ int32_t __a=(a), __b=(b), __c=(c); \ - __asm__ __volatile__ ("movl %%eax, %%edx; shll %%cl, %%eax; negb %%cl; sarl %%cl, %%edx; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "c" (__c), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale1(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("addl %%eax, %%eax; sbbl %%edx, %%edx; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale2(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $30, %%edx; leal (,%%eax,4), %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale3(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $29, %%edx; leal (,%%eax,8), %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale4(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $28, %%edx; shll $4, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale5(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $27, %%edx; shll $5, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale6(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $26, %%edx; shll $6, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" 
(__b) : "edx", "cc"); \ - __a; }) -#define divscale7(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $25, %%edx; shll $7, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale8(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $24, %%edx; shll $8, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale9(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $23, %%edx; shll $9, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale10(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $22, %%edx; shll $10, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale11(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $21, %%edx; shll $11, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale12(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $20, %%edx; shll $12, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale13(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $19, %%edx; shll $13, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale14(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $18, %%edx; shll $14, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale15(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $17, %%edx; shll $15, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) 
-#define divscale16(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $16, %%edx; shll $16, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale17(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $15, %%edx; shll $17, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale18(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $14, %%edx; shll $18, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale19(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $13, %%edx; shll $19, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale20(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $12, %%edx; shll $20, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale21(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $11, %%edx; shll $21, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale22(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $10, %%edx; shll $22, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale23(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $9, %%edx; shll $23, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale24(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $8, %%edx; shll $24, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale25(a,b) \ - 
({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $7, %%edx; shll $25, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale26(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $6, %%edx; shll $26, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale27(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $5, %%edx; shll $27, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale28(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $4, %%edx; shll $28, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale29(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $3, %%edx; shll $29, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale30(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $2, %%edx; shll $30, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale31(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("movl %%eax, %%edx; sarl $1, %%edx; shll $31, %%eax; idivl %%ebx" \ - : "=a" (__a) : "a" (__a), "b" (__b) : "edx", "cc"); \ - __a; }) -#define divscale32(d,b) \ - ({ int32_t __d=(d), __b=(b), __r; \ - __asm__ __volatile__ ("xorl %%eax, %%eax; idivl %%ebx" \ - : "=a" (__r), "=d" (__d) : "d" (__d), "b" (__b) : "cc"); \ - __r; }) -#endif // defined USE_ASM_DIVSCALE - #define readpixel(D) \ ({ void *__D=(D); int32_t __a; \ __asm__ __volatile__ ("movb (%%edi), %%al" \ @@ -649,16 +474,6 @@ void copybufreverse(const void *S, void *D, int32_t c); __asm__ __volatile__ ("movb %%al, (%%edi)" \ : : "D" (__D), "a" (__a) : "memory", "cc"); \ 
0; }) -#define drawpixels(D,a) \ - ({ void *__D=(D); int32_t __a=(a); \ - __asm__ __volatile__ ("movw %%ax, (%%edi)" \ - : : "D" (__D), "a" (__a) : "memory", "cc"); \ - 0; }) -#define drawpixelses(D,a) \ - ({ void *__D=(D); int32_t __a=(a); \ - __asm__ __volatile__ ("movl %%eax, (%%edi)" \ - : : "D" (__D), "a" (__a) : "memory", "cc"); \ - 0; }) #define clearbuf(D,c,a) \ ({ void *__D=(D); int32_t __c=(c), __a=(a); \ __asm__ __volatile__ ("rep; stosl" \ @@ -670,19 +485,6 @@ void copybufreverse(const void *S, void *D, int32_t c); : "=&S" (__S), "=&D" (__D), "=&c" (__c) : "0" (__S), "1" (__D), "2" (__c) : "memory", "cc"); \ 0; }) -//returns eax/ebx, dmval = eax%edx; -#define divmod(a,b) \ - ({ int32_t __a=(a), __b=(b); \ - __asm__ __volatile__ ("xorl %%edx, %%edx; divl %%ebx; movl %%edx, "_DMVAL \ - : "+a" (__a) : "b" (__b) : "edx", "memory", "cc"); \ - __a; }) -//returns eax%ebx, dmval = eax/edx; -#define moddiv(a,b) \ - ({ int32_t __a=(a), __b=(b), __d; \ - __asm__ __volatile__ ("xorl %%edx, %%edx; divl %%ebx; movl %%eax, "_DMVAL \ - : "=d" (__d) : "a" (__a), "b" (__b) : "eax", "memory", "cc"); \ - __d; }) - #define klabs(a) \ ({ int32_t __a=(a); \ __asm__ __volatile__ ("testl %%eax, %%eax; jns 0f; negl %%eax; 0:" \ diff --git a/polymer/eduke32/build/include/pragmas_x86_msvc.h b/polymer/eduke32/build/include/pragmas_x86_msvc.h index 1461e6116..27e2f905a 100644 --- a/polymer/eduke32/build/include/pragmas_x86_msvc.h +++ b/polymer/eduke32/build/include/pragmas_x86_msvc.h @@ -16,15 +16,6 @@ static __inline int32_t sqr(int32_t a) } } -static __inline int32_t scale(int32_t a, int32_t d, int32_t c) -{ - _asm { - mov eax, a - imul d - idiv c - } -} - static __inline int32_t mulscale(int32_t a, int32_t d, int32_t c) { _asm { @@ -99,80 +90,6 @@ static __inline int32_t dmulscale32(int32_t a, int32_t d, int32_t S, int32_t D) } } -#ifdef USE_ASM_DIVSCALE -static __inline int32_t divscale(int32_t a, int32_t b, int32_t c) -{ - _asm { - mov eax, a - mov ecx, c - mov edx, eax - shl 
eax, cl - neg cl - sar edx, cl - idiv b - } -} - -static __inline int32_t divscale1(int32_t a, int32_t b) -{ - _asm { - mov eax, a - add eax, eax - sbb edx, edx - idiv b - } -} - -static __inline int32_t divscale2(int32_t a, int32_t b) -{ - _asm { - mov eax, a - mov edx, eax - sar edx, 30 - lea eax, [eax*4] - idiv b - } -} - -static __inline int32_t divscale3(int32_t a, int32_t b) -{ - _asm { - mov eax, a - mov edx, eax - sar edx, 29 - lea eax, [eax*8] - idiv b - } -} - -#define DIVSCALE(x,y) \ -static __inline int32_t divscale##y(int32_t a, int32_t b) \ -{ \ - _asm mov eax, a \ - _asm mov edx, eax \ - _asm sar edx, x \ - _asm shl eax, y \ - _asm idiv b \ -} - -DIVSCALE(28, 4) DIVSCALE(27, 5) DIVSCALE(26, 6) DIVSCALE(25, 7) -DIVSCALE(24, 8) DIVSCALE(23, 9) DIVSCALE(22, 10) DIVSCALE(21, 11) -DIVSCALE(20, 12) DIVSCALE(19, 13) DIVSCALE(18, 14) DIVSCALE(17, 15) -DIVSCALE(16, 16) DIVSCALE(15, 17) DIVSCALE(14, 18) DIVSCALE(13, 19) -DIVSCALE(12, 20) DIVSCALE(11, 21) DIVSCALE(10, 22) DIVSCALE(9, 23) -DIVSCALE(8, 24) DIVSCALE(7, 25) DIVSCALE(6, 26) DIVSCALE(5, 27) -DIVSCALE(4, 28) DIVSCALE(3, 29) DIVSCALE(2, 30) DIVSCALE(1, 31) - -static __inline int32_t divscale32(int32_t d, int32_t b) -{ - _asm { - mov edx, d - xor eax, eax - idiv b - } -} -#endif // defined USE_ASM_DIVSCALE - static __inline char readpixel(void *d) { _asm { @@ -190,24 +107,6 @@ static __inline void drawpixel(void *d, char a) } } -static __inline void drawpixels(void *d, int16_t a) -{ - _asm { - mov edx, d - mov ax, a - mov word ptr[edx], ax - } -} - -static __inline void drawpixelses(void *d, int32_t a) -{ - _asm { - mov edx, d - mov eax, a - mov dword ptr[edx], eax - } -} - static __inline void clearbuf(void *d, int32_t c, int32_t a) { _asm { @@ -415,29 +314,6 @@ static __inline void qinterpolatedown16short(int32_t a, int32_t c, int32_t d, in } } -//returns eax/ebx, dmval = eax%edx; -static __inline int32_t divmod(int32_t a, int32_t b) -{ - _asm { - mov eax, a - xor edx, edx - div b - mov dmval, edx - } 
-} - -//returns eax%ebx, dmval = eax/edx; -static __inline int32_t moddiv(int32_t a, int32_t b) -{ - _asm { - mov eax, a - xor edx, edx - div b - mov dmval, eax - mov eax, edx - } -} - static __inline int32_t klabs(int32_t a) { _asm { diff --git a/polymer/eduke32/build/src/a-c.c b/polymer/eduke32/build/src/a-c.c index d0fc4a62d..a16c35212 100644 --- a/polymer/eduke32/build/src/a-c.c +++ b/polymer/eduke32/build/src/a-c.c @@ -7,6 +7,7 @@ // See the included license file "BUILDLIC.TXT" for license info. #include "a.h" +#include "pragmas.h" #ifdef ENGINE_USING_A_C @@ -71,6 +72,17 @@ void hlineasm4(int32_t cnt, int32_t skiploadincs, int32_t paloffs, uint32_t by, const int32_t logx = glogx, logy = glogy; char *pp = (char *)p; + for (; cnt>=4; cnt -= 4) + { + *pp = palptr[buf[((bx>>(32-logx))<>(32-logy))]]; pp--; + *pp = palptr[buf[(((bx-bxinc)>>(32-logx))<>(32-logy))]]; pp--; + *pp = palptr[buf[(((bx-(bxinc<<1))>>(32-logx))<>(32-logy))]]; pp--; + *pp = palptr[buf[(((bx-(bxinc*3))>>(32-logx))<>(32-logy))]]; pp--; + + bx -= bxinc<<2; + by -= byinc<<2; + } + for (; cnt>=0; cnt--) { *pp = palptr[buf[((bx>>(32-logx))<>(32-logy))]]; @@ -121,10 +133,7 @@ static inline uint32_t ourmulscale32(uint32_t a, uint32_t b) static inline int32_t getpix(int32_t logy, const char *buf, uint32_t vplc) { - if (logy != 0) - return buf[vplc>>logy]; - else - return buf[ourmulscale32(vplc,globaltilesizy)]; + return logy ? 
buf[vplc>>logy] : buf[ourmulscale32(vplc,globaltilesizy)]; } void setupvlineasm(int32_t neglogy) { glogy = neglogy; } @@ -138,18 +147,45 @@ int32_t vlineasm1(int32_t vinc, intptr_t paloffs, int32_t cnt, uint32_t vplc, in cnt++; - do + if (logy) { - if (logy != 0) +#ifdef CLASSIC_SLICE_BY_4 + for (; cnt>=4; cnt-=4) + { *pp = pal[buf[vplc>>logy]]; - else - *pp = pal[buf[ourmulscale32(vplc,globaltilesizy)]]; - - pp += ourbpl; - vplc += vinc; + *(pp+ourbpl) = pal[buf[(vplc+vinc)>>logy]]; + *(pp+(ourbpl<<1)) = pal[buf[(vplc+(vinc<<1))>>logy]]; + *(pp+(ourbpl*3)) = pal[buf[(vplc+(vinc*3))>>logy ]]; + pp += ourbpl<<2; + vplc += vinc<<2; + } +#endif + while (cnt--) + { + *pp = pal[buf[vplc>>logy]]; + pp += ourbpl; + vplc += vinc; + } + } + else + { +#ifdef CLASSIC_SLICE_BY_4 + for (; cnt>=4; cnt-=4) + { + *pp = pal[buf[ourmulscale32(vplc, globaltilesizy)]]; + *(pp+ourbpl) = pal[buf[ourmulscale32((vplc+vinc),globaltilesizy)]]; + *(pp+(ourbpl<<1)) = pal[buf[ourmulscale32((vplc+(vinc<<1)), globaltilesizy)]]; + *(pp+(ourbpl*3)) = pal[buf[ourmulscale32((vplc+(vinc*3)), globaltilesizy)]]; + pp += ourbpl<<2; + vplc += vinc<<2; + } +#endif + while (cnt--) + { + *pp = pal[buf[ourmulscale32(vplc,globaltilesizy)]], pp += ourbpl; + vplc += vinc; + } } - while (--cnt); - return vplc; } @@ -191,9 +227,6 @@ typedef uint32_t uint32_vec4 __attribute__ ((vector_size (16))); // cnt >= 1 void vlineasm4(int32_t cnt, char *p) { - char ch; - int32_t i; - char *const pal[4] = {(char *)palookupoffse[0], (char *)palookupoffse[1], (char *)palookupoffse[2], (char *)palookupoffse[3]}; char *const buf[4] = {(char *)bufplce[0], (char *)bufplce[1], (char *)bufplce[2], (char *)bufplce[3]}; #ifdef USE_VECTOR_EXT @@ -205,25 +238,86 @@ void vlineasm4(int32_t cnt, char *p) #endif const int32_t logy = glogy, ourbpl = bpl; - do + if (!logy) { - for (i=0; i<4; i++) + do { - ch = getpix(logy, buf[i], vplc[i]); - p[i] = pal[i][ch]; -#if !defined USE_VECTOR_EXT - vplc[i] += vinc[i]; + p[0] = 
pal[0][buf[0][ourmulscale32(vplc[0],globaltilesizy)]]; + p[1] = pal[1][buf[1][ourmulscale32(vplc[1],globaltilesizy)]]; + p[2] = pal[2][buf[2][ourmulscale32(vplc[2],globaltilesizy)]]; + p[3] = pal[3][buf[3][ourmulscale32(vplc[3],globaltilesizy)]]; + +#if defined USE_VECTOR_EXT + vplc += vinc; +#else + vplc[0] += vinc[0]; + vplc[1] += vinc[1]; + vplc[2] += vinc[2]; + vplc[3] += vinc[3]; #endif + p += ourbpl; } -#ifdef USE_VECTOR_EXT + while (--cnt); + + goto skip; + } + + // just fucking shoot me +#ifdef CLASSIC_SLICE_BY_4 + for (; cnt>=4;cnt-=4) + { + p[0] = pal[0][buf[0][ vplc[0]>>logy ]]; + p[1] = pal[1][buf[1][ vplc[1]>>logy ]]; + p[2] = pal[2][buf[2][ vplc[2]>>logy ]]; + p[3] = pal[3][buf[3][ vplc[3]>>logy ]]; + (p+ourbpl)[0] = pal[0][buf[0][ (vplc[0]+vinc[0])>>logy ]]; + (p+ourbpl)[1] = pal[1][buf[1][ (vplc[1]+vinc[1])>>logy ]]; + (p+ourbpl)[2] = pal[2][buf[2][ (vplc[2]+vinc[2])>>logy ]]; + (p+ourbpl)[3] = pal[3][buf[3][ (vplc[3]+vinc[3])>>logy ]]; + (p+(ourbpl<<1))[0] = pal[0][buf[0][ (vplc[0]+(vinc[0]<<1))>>logy ]]; + (p+(ourbpl<<1))[1] = pal[1][buf[1][ (vplc[1]+(vinc[1]<<1))>>logy ]]; + (p+(ourbpl<<1))[2] = pal[2][buf[2][ (vplc[2]+(vinc[2]<<1))>>logy ]]; + (p+(ourbpl<<1))[3] = pal[3][buf[3][ (vplc[3]+(vinc[3]<<1))>>logy ]]; + (p+(ourbpl*3))[0] = pal[0][buf[0][ (vplc[0]+(vinc[0]*3))>>logy ]]; + (p+(ourbpl*3))[1] = pal[1][buf[1][ (vplc[1]+(vinc[1]*3))>>logy ]]; + (p+(ourbpl*3))[2] = pal[2][buf[2][ (vplc[2]+(vinc[2]*3))>>logy ]]; + (p+(ourbpl*3))[3] = pal[3][buf[3][ (vplc[3]+(vinc[3]*3))>>logy ]]; + +#if defined USE_VECTOR_EXT + vplc += vinc<<2; +#else + vplc[0] += vinc[0]<<2; + vplc[1] += vinc[1]<<2; + vplc[2] += vinc[2]<<2; + vplc[3] += vinc[3]<<2; +#endif + p += ourbpl<<2; + } +#endif + + while (cnt--) + { + p[0] = pal[0][buf[0][vplc[0]>>logy]]; + p[1] = pal[1][buf[1][vplc[1]>>logy]]; + p[2] = pal[2][buf[2][vplc[2]>>logy]]; + p[3] = pal[3][buf[3][vplc[3]>>logy]]; + +#if defined USE_VECTOR_EXT vplc += vinc; +#else + vplc[0] += vinc[0]; + vplc[1] += vinc[1]; + 
vplc[2] += vinc[2]; + vplc[3] += vinc[3]; #endif p += ourbpl; } - while (--cnt); - for (i=0; i<4; i++) - vplce[i] = vplc[i]; +skip: + vplce[0] = vplc[0]; + vplce[1] = vplc[1]; + vplce[2] = vplc[2]; + vplce[3] = vplc[3]; } #ifdef USE_SATURATE_VPLC @@ -251,10 +345,26 @@ int32_t mvlineasm1(int32_t vinc, intptr_t paloffs, int32_t cnt, uint32_t vplc, i cnt++; + if (!logy) + { + do + { + ch = buf[ourmulscale32(vplc,globaltilesizy)]; + if (ch != 255) *pp = pal[ch]; + pp += ourbpl; + vplc += vinc; + saturate_vplc(vplc, vinc); + } + while (--cnt); + + return vplc; + } + do { - ch = getpix(logy, buf, vplc); - if (ch != 255) *pp = pal[ch]; + + if (buf[vplc>>logy] != 255) + *pp = pal[buf[vplc>>logy]]; pp += ourbpl; vplc += vinc; saturate_vplc(vplc, vinc); @@ -267,9 +377,6 @@ int32_t mvlineasm1(int32_t vinc, intptr_t paloffs, int32_t cnt, uint32_t vplc, i // cnt >= 1 void mvlineasm4(int32_t cnt, char *p) { - char ch; - int32_t i; - char *const pal[4] = {(char *)palookupoffse[0], (char *)palookupoffse[1], (char *)palookupoffse[2], (char *)palookupoffse[3]}; char *const buf[4] = {(char *)bufplce[0], (char *)bufplce[1], (char *)bufplce[2], (char *)bufplce[3]}; #ifdef USE_VECTOR_EXT @@ -280,28 +387,73 @@ void mvlineasm4(int32_t cnt, char *p) uint32_t vplc[4] = {vplce[0], vplce[1], vplce[2], vplce[3]}; #endif const int32_t logy = glogy, ourbpl = bpl; + char ch; - do + if (logy) { - for (i=0; i<4; i++) + do { - ch = getpix(logy, buf[i], vplc[i]); - if (ch != 255) p[i] = pal[i][ch]; -#if !defined USE_VECTOR_EXT - vplc[i] += vinc[i]; - saturate_vplc(vplc[i], vinc[i]); -#endif - } -#ifdef USE_VECTOR_EXT - vplc += vinc; - saturate_vplc_vec(vplc, vinc); -#endif - p += ourbpl; - } - while (--cnt); + ch = buf[0][vplc[0]>>logy]; + if (ch != 255) p[0] = pal[0][ch]; + ch = buf[1][vplc[1]>>logy]; + if (ch != 255) p[1] = pal[1][ch]; + ch = buf[2][vplc[2]>>logy]; + if (ch != 255) p[2] = pal[2][ch]; + ch = buf[3][vplc[3]>>logy]; + if (ch != 255) p[3] = pal[3][ch]; - for (i=0; i<4; i++) - vplce[i] 
= vplc[i]; +#if !defined USE_VECTOR_EXT + vplc[0] += vinc[0]; + vplc[1] += vinc[1]; + vplc[2] += vinc[2]; + vplc[3] += vinc[3]; + saturate_vplc(vplc[0], vinc[0]); + saturate_vplc(vplc[1], vinc[1]); + saturate_vplc(vplc[2], vinc[2]); + saturate_vplc(vplc[3], vinc[3]); +#else + vplc += vinc; + saturate_vplc_vec(vplc, vinc); +#endif + p += ourbpl; + } + while (--cnt); + } + else + { + do + { + ch = buf[0][ourmulscale32(vplc[0],globaltilesizy)]; + if (ch != 255) p[0] = pal[0][ch]; + ch = buf[1][ourmulscale32(vplc[1],globaltilesizy)]; + if (ch != 255) p[1] = pal[1][ch]; + ch = buf[2][ourmulscale32(vplc[2],globaltilesizy)]; + if (ch != 255) p[2] = pal[2][ch]; + ch = buf[3][ourmulscale32(vplc[3],globaltilesizy)]; + if (ch != 255) p[3] = pal[3][ch]; + +#if !defined USE_VECTOR_EXT + vplc[0] += vinc[0]; + vplc[1] += vinc[1]; + vplc[2] += vinc[2]; + vplc[3] += vinc[3]; + saturate_vplc(vplc[0], vinc[0]); + saturate_vplc(vplc[1], vinc[1]); + saturate_vplc(vplc[2], vinc[2]); + saturate_vplc(vplc[3], vinc[3]); +#else + vplc += vinc; + saturate_vplc_vec(vplc, vinc); +#endif + p += ourbpl; + } + while (--cnt); + } + + vplce[0] = vplc[0]; + vplce[1] = vplc[1]; + vplce[2] = vplc[2]; + vplce[3] = vplc[3]; } #ifdef USE_ASM64 @@ -335,7 +487,8 @@ int32_t tvlineasm1(int32_t vinc, intptr_t paloffs, int32_t cnt, uint32_t vplc, i do { ch = getpix(logy, buf, vplc); - if (ch != 255) *pp = trans[(*pp)|(pal[ch]<<8)]; + if (ch != 255) + *pp = trans[(*pp)|(pal[ch]<<8)]; pp += ourbpl; vplc += vinc; saturate_vplc_trans(vplc, vinc); @@ -374,7 +527,7 @@ void tvlineasm2(uint32_t vplc2, int32_t vinc1, intptr_t bufplc1, intptr_t bufplc { char ch; - int32_t cnt = (asm2-p-1)/bpl; // >= 1 + int32_t cnt = tabledivide32(asm2-p-1, bpl); // >= 1 const int32_t vinc2 = asm1; const char *const buf1 = (char *)bufplc1; @@ -533,7 +686,7 @@ void mspritevline(int32_t bx, int32_t by, int32_t cnt, intptr_t bufplc, intptr_t for (; cnt>1; cnt--) { ch = gbuf[(bx>>16)*glogy+(by>>16)]; - if (ch != 255)(*(char *)p) = gpal[ch]; 
+ if (ch != 255) (*(char *)p) = gpal[ch]; bx += gbxinc; by += gbyinc; p += bpl; @@ -557,7 +710,7 @@ void tspritevline(int32_t bx, int32_t by, int32_t cnt, intptr_t bufplc, intptr_t for (; cnt>1; cnt--) { ch = gbuf[(bx>>16)*glogy+(by>>16)]; - if (ch != 255) *((char *)p) = gtrans[(*((char *)p))+(gpal[ch]<<8)]; + if (ch != 255) *((char *)p) = gtrans[(*((char *)p))+(gpal[ch]<<8)]; bx += gbxinc; by += gbyinc; p += bpl; diff --git a/polymer/eduke32/build/src/cache1d.c b/polymer/eduke32/build/src/cache1d.c index 0b2b1d36f..fd99612ca 100644 --- a/polymer/eduke32/build/src/cache1d.c +++ b/polymer/eduke32/build/src/cache1d.c @@ -128,7 +128,7 @@ void initcache(intptr_t dacachestart, int32_t dacachesize) int32_t i; for (i=1; i<200; i++) - lockrecip[i] = (1<<28)/(200-i); + lockrecip[i] = tabledivide32_noinline(1<<28, 200-i); // The following code was relocated here from engine.c, since this // function is only ever called once (from there), and it seems to diff --git a/polymer/eduke32/build/src/compat.c b/polymer/eduke32/build/src/compat.c index 42dbba09a..357be91a7 100644 --- a/polymer/eduke32/build/src/compat.c +++ b/polymer/eduke32/build/src/compat.c @@ -797,33 +797,32 @@ char *Bstrtolower(char *str) //Brute-force case-insensitive, slash-insensitive, * and ? wildcard matcher //Given: string i and string j. 
string j can have wildcards //Returns: 1:matches, 0:doesn't match -int32_t Bwildmatch(const char *i, const char *j) -{ - const char *k; - char c0, c1; +#ifndef WITHKPLIB +extern char toupperlookup[256]; - if (!*j) return(1); +static int32_t wildmatch(const char *match, const char *wild) +{ do { - if (*j == '*') + if (*match && (toupperlookup[*wild] == toupperlookup[*match] || *wild == '?')) { - for (k=i,j++; *k; k++) if (Bwildmatch(k,j)) return(1); + wild++, match++; continue; } - if (!*i) return(0); - if (*j == '?') { i++; j++; continue; } - c0 = *i; if ((c0 >= 'a') && (c0 <= 'z')) c0 -= 32; - c1 = *j; if ((c1 >= 'a') && (c1 <= 'z')) c1 -= 32; -#ifdef _WIN32 - if (c0 == '/') c0 = '\\'; - if (c1 == '/') c1 = '\\'; -#endif - if (c0 != c1) return(0); - i++; j++; - } - while (*j); - return(!*i); + else if ((*match|*wild) == '\0') + return 1; + else if (*wild == '*') + { + while (*wild == '*') wild++; + if (*wild == '\0') return 1; + while (*match && toupperlookup[*match] != toupperlookup[*wild]) match++; + if (toupperlookup[*match] == toupperlookup[*wild]) + continue; + } + return 0; + } while (1); } +#endif #if !defined(_WIN32) char *Bstrlwr(char *s) @@ -917,3 +916,6 @@ int access(const char *pathname, int mode) } #endif +#define LIBDIVIDE_BODY +#include "libdivide.h" + diff --git a/polymer/eduke32/build/src/dxtfilter.c b/polymer/eduke32/build/src/dxtfilter.c index 816aa0adf..220b05ea1 100644 --- a/polymer/eduke32/build/src/dxtfilter.c +++ b/polymer/eduke32/build/src/dxtfilter.c @@ -153,7 +153,7 @@ int32_t dxtfilter(int32_t fil, const texcachepicture *pict, const char *pic, voi for (j=stride; (unsigned)jsize/stride)*8, midbuf, pict->size, packbuf, ispacked)) + if (dedxt_handle_io(fil, tabledivide32(pict->size, stride)*8, midbuf, pict->size, packbuf, ispacked)) return -1; cptr = (char *)midbuf; @@ -206,7 +206,7 @@ int32_t dedxtfilter(int32_t fil, const texcachepicture *pict, char *pic, void *m } //rgb0,rgb1 - if (dedxt_handle_io(fil, (pict->size/stride)*4, midbuf, 
pict->size, packbuf, ispacked)) + if (dedxt_handle_io(fil, tabledivide32(pict->size, stride)*4, midbuf, pict->size, packbuf, ispacked)) return -1; cptr = (char *)midbuf; @@ -220,7 +220,7 @@ int32_t dedxtfilter(int32_t fil, const texcachepicture *pict, char *pic, void *m } //index_4x4: - if (dedxt_handle_io(fil, (pict->size/stride)*4, midbuf, pict->size, packbuf, ispacked)) + if (dedxt_handle_io(fil, tabledivide32(pict->size, stride)*4, midbuf, pict->size, packbuf, ispacked)) return -1; cptr = (char *)midbuf; diff --git a/polymer/eduke32/build/src/engine.c b/polymer/eduke32/build/src/engine.c index 56f42e465..e944064de 100644 --- a/polymer/eduke32/build/src/engine.c +++ b/polymer/eduke32/build/src/engine.c @@ -144,7 +144,8 @@ static char voxlock[MAXVOXELS][MAXVOXMIPS]; int32_t voxscale[MAXVOXELS]; static int32_t ggxinc[MAXXSIZ+1], ggyinc[MAXXSIZ+1]; -static int32_t lowrecip[1024], nytooclose, nytoofar; +static int32_t lowrecip[1024], nytooclose; +static const int32_t nytoofar = 65536*16384-1048576; static uint32_t distrecip[65536+256]; static int32_t *lookups = NULL; @@ -3549,7 +3550,7 @@ static int32_t setup_globals_cf1(const sectortype *sec, int32_t pal, int32_t zd, j = sec->wallptr; ox = wall[wall[j].point2].x - wall[j].x; oy = wall[wall[j].point2].y - wall[j].y; - i = nsqrtasm(uhypsq(ox,oy)); if (i == 0) i = 1024; else i = 1048576/i; + i = nsqrtasm(uhypsq(ox,oy)); if (i == 0) i = 1024; else i = tabledivide32(1048576, i); globalx1 = mulscale10(dmulscale10(ox,singlobalang,-oy,cosglobalang),i); globaly1 = mulscale10(dmulscale10(ox,cosglobalang,oy,singlobalang),i); globalx2 = -globalx1; @@ -4110,6 +4111,7 @@ static void transmaskwallscan(int32_t x1, int32_t x2, int32_t saturatevplc) #endif // cntup16>>16 iterations + static void nonpow2_mhline(intptr_t bufplc, uint32_t bx, int32_t cntup16, int32_t junk, uint32_t by, char *p) { char ch; @@ -4126,7 +4128,7 @@ static void nonpow2_mhline(intptr_t bufplc, uint32_t bx, int32_t cntup16, int32_ for (cntup16>>=16; cntup16>0; 
cntup16--) { - ch = buf[(bx/xdiv)*yspan + by/ydiv]; + ch = buf[(divideu32(bx, xdiv))*yspan + divideu32(by, ydiv)]; if (ch != 255) *p = pal[ch]; bx += xinc; @@ -4155,7 +4157,7 @@ static void nonpow2_thline(intptr_t bufplc, uint32_t bx, int32_t cntup16, int32_ { for (cntup16>>=16; cntup16>0; cntup16--) { - ch = buf[(bx/xdiv)*yspan + by/ydiv]; + ch = buf[divideu32(bx, xdiv)*yspan + divideu32(by, ydiv)]; if (ch != 255) *p = trans[(*p)|(pal[ch]<<8)]; bx += xinc; by += yinc; @@ -4166,7 +4168,7 @@ static void nonpow2_thline(intptr_t bufplc, uint32_t bx, int32_t cntup16, int32_ { for (cntup16>>=16; cntup16>0; cntup16--) { - ch = buf[(bx/xdiv)*yspan + by/ydiv]; + ch = buf[divideu32(bx, xdiv)*yspan + divideu32(by, ydiv)]; if (ch != 255) *p = trans[((*p)<<8)|pal[ch]]; bx += xinc; by += yinc; @@ -4287,14 +4289,8 @@ static void tslopevlin(uint8_t *p, int32_t i, const intptr_t *slopalptr, int32_t v = by + ytov*i; ch = *(uint8_t *)(slopalptr[0] + buf[((u>>(32-logx))<>(32-logy))]); - if (transmode) - { - if (ch != 255) *p = trans[*p|(pal[ch]<<8)]; - } - else - { - if (ch != 255) *p = trans[(*p<<8)|pal[ch]]; - } + if (ch != 255) + *p = trans[transmode ? 
*p|(pal[ch]<<8) : (*p<<8)|pal[ch]]; slopalptr--; p += pinc; @@ -5314,8 +5310,8 @@ static void drawvox(int32_t dasprx, int32_t daspry, int32_t dasprz, int32_t dasp daxscale = scale(daxscale,xdimenscale,xdimen<<8); dayscale = scale(dayscale,mulscale16(xdimenscale,viewingrangerecip),xdimen<<8); - daxscalerecip = (1<<30)/daxscale; - dayscalerecip = (1<<30)/dayscale; + daxscalerecip = tabledivide32_noinline(1<<30, daxscale); + dayscalerecip = tabledivide32_noinline(1<<30, dayscale); longptr = (int32_t *)davoxptr; daxsiz = B_LITTLE32(longptr[0]); daysiz = B_LITTLE32(longptr[1]); //dazsiz = B_LITTLE32(longptr[2]); @@ -5772,7 +5768,22 @@ draw_as_face_sprite: if ((cstat&8) > 0) swaplong(&y1, &y2); - for (x=lx; x<=rx; x++) + x = lx; +#ifdef CLASSIC_SLICE_BY_4 + for (; x<=rx-4; x+=4) + { + uwall[x] = max(startumost[x+windowx1]-windowy1, (int16_t) startum); + uwall[x+1] = max(startumost[x+windowx1+1]-windowy1, (int16_t) startum); + uwall[x+2] = max(startumost[x+windowx1+2]-windowy1, (int16_t) startum); + uwall[x+3] = max(startumost[x+windowx1+3]-windowy1, (int16_t) startum); + + dwall[x] = min(startdmost[x+windowx1]-windowy1, (int16_t) startdm); + dwall[x+1] = min(startdmost[x+windowx1+1]-windowy1, (int16_t) startdm); + dwall[x+2] = min(startdmost[x+windowx1+2]-windowy1, (int16_t) startdm); + dwall[x+3] = min(startdmost[x+windowx1+3]-windowy1, (int16_t) startdm); + } +#endif + for (; x<=rx; x++) { uwall[x] = max(startumost[x+windowx1]-windowy1,(int16_t)startum); dwall[x] = min(startdmost[x+windowx1]-windowy1,(int16_t)startdm); @@ -5801,13 +5812,31 @@ draw_as_face_sprite: break; case 1: k = smoststart[i] - xb1[j]; - for (x=dalx2; x<=darx2; x++) + x = dalx2; +#ifdef CLASSIC_SLICE_BY_4 // ok, this one is really by 2 ;) + for (x=dalx2; x<=darx2-2; x+=2) + { + if (smost[k+x] > uwall[x]) uwall[x] = smost[k+x]; + if (smost[k+x+1] > uwall[x+1]) uwall[x+1] = smost[k+x+1]; + } +#endif + for (; x<=darx2; x++) if (smost[k+x] > uwall[x]) uwall[x] = smost[k+x]; if ((dalx2 == lx) && (darx2 
== rx)) daclip |= 1; break; case 2: k = smoststart[i] - xb1[j]; - for (x=dalx2; x<=darx2; x++) + x = dalx2; +#ifdef CLASSIC_SLICE_BY_4 + for (; x<=darx2-4; x+=4) + { + if (smost[k+x] < dwall[x]) dwall[x] = smost[k+x]; + if (smost[k+x+1] < dwall[x+1]) dwall[x+1] = smost[k+x+1]; + if (smost[k+x+2] < dwall[x+2]) dwall[x+2] = smost[k+x+2]; + if (smost[k+x+3] < dwall[x+3]) dwall[x+3] = smost[k+x+3]; + } +#endif + for (; x<=darx2; x++) if (smost[k+x] < dwall[x]) dwall[x] = smost[k+x]; if ((dalx2 == lx) && (darx2 == rx)) daclip |= 2; break; @@ -7842,12 +7871,41 @@ static void dosetaspect(void) oxyaspect = xyaspect; j = xyaspect*320; horizlookup2[horizycent-1] = divscale26(131072,j); - for (i=ydim*4-1; i>=0; i--) - if (i != (horizycent-1)) - { - horizlookup[i] = divscale28(1,i-(horizycent-1)); - horizlookup2[i] = divscale14(klabs(horizlookup[i]),j); - } + for (i=0; i < horizycent-1-4; i += 4) + { + horizlookup[i] = divscale28(1, i -(horizycent-1)); + horizlookup[i+1] = divscale28(1, i+1-(horizycent-1)); + horizlookup[i+2] = divscale28(1, i+2-(horizycent-1)); + horizlookup[i+3] = divscale28(1, i+3-(horizycent-1)); + + horizlookup2[i] = divscale14(klabs(horizlookup[i]), j); + horizlookup2[i+1] = divscale14(klabs(horizlookup[i+1]), j); + horizlookup2[i+2] = divscale14(klabs(horizlookup[i+2]), j); + horizlookup2[i+3] = divscale14(klabs(horizlookup[i+3]), j); + } + for (; i < horizycent-1; i++) + { + horizlookup[i] = divscale28(1, i-(horizycent-1)); + horizlookup2[i] = divscale14(klabs(horizlookup[i]), j); + } + + for (i=horizycent; i < ydim*4-1-4; i += 4) + { + horizlookup[i] = divscale28(1, i -(horizycent-1)); + horizlookup[i+1] = divscale28(1, i+1-(horizycent-1)); + horizlookup[i+2] = divscale28(1, i+2-(horizycent-1)); + horizlookup[i+3] = divscale28(1, i+3-(horizycent-1)); + + horizlookup2[i] = divscale14(klabs(horizlookup[i]), j); + horizlookup2[i+1] = divscale14(klabs(horizlookup[i+1]), j); + horizlookup2[i+2] = divscale14(klabs(horizlookup[i+2]), j); + horizlookup2[i+3] 
= divscale14(klabs(horizlookup[i+3]), j); + } + for (; i < ydim*4-1; i++) + { + horizlookup[i] = divscale28(1, i-(horizycent-1)); + horizlookup2[i] = divscale14(klabs(horizlookup[i]), j); + } } if (xdimen != oxdimen || viewingrange != oviewingrange) @@ -7856,7 +7914,6 @@ static void dosetaspect(void) no_radarang2 = 0; oviewingrange = viewingrange; - oxdimen = xdimen; xinc = mulscale32(viewingrange*320,xdimenrecip); x = (640<<16)-mulscale1(xinc,xdimen); @@ -7880,15 +7937,28 @@ static void dosetaspect(void) radarang2[i] = (int16_t)((radarang[k]+j)>>6); } + if (xdimen != oxdimen) { EDUKE32_STATIC_ASSERT((uint64_t) MAXXDIM*(ARRAY_SIZE(distrecip)-1) <= INT32_MAX); - for (i=1; i<(int32_t) ARRAY_SIZE(distrecip); i++) + i = 1; + +#ifdef CLASSIC_SLICE_BY_4 + for (; i<(int32_t) ARRAY_SIZE(distrecip)-4; i+=4) + { distrecip[i] = (xdimen * i)>>20; + distrecip[i+1] = (xdimen * (i+1))>>20; + distrecip[i+2] = (xdimen * (i+2))>>20; + distrecip[i+3] = (xdimen * (i+3))>>20; + } +#endif + for (; i<(int32_t) ARRAY_SIZE(distrecip); i++) + distrecip[i] = (xdimen * i)>>20; + + nytooclose = xdimen*2100; } - nytooclose = xdimen*2100; - nytoofar = 65536*16384-1048576; + oxdimen = xdimen; } } @@ -7920,9 +7990,19 @@ static int32_t loadtables(void) if (tablesloaded == 0) { int32_t i; + libdivide_s64_t d; + libdivide_s32_t d32; initksqrt(); + for (i=1; icstat&48)!=32) // face/wall sprite { int32_t tempint1 = clipmapinfo.sector[k].CM_XREPEAT; - maxcorrection = (maxcorrection * (int32_t)spr->xrepeat)/tempint1; + maxcorrection = tabledivide32_noinline(maxcorrection * (int32_t)spr->xrepeat, tempint1); } else // floor sprite { int32_t tempint1 = clipmapinfo.sector[k].CM_XREPEAT; int32_t tempint2 = clipmapinfo.sector[k].CM_YREPEAT; - maxcorrection = max((maxcorrection * (int32_t)spr->xrepeat)/tempint1, - (maxcorrection * (int32_t)spr->yrepeat)/tempint2); + maxcorrection = max(tabledivide32_noinline(maxcorrection * (int32_t)spr->xrepeat, tempint1), + tabledivide32_noinline(maxcorrection * 
(int32_t)spr->yrepeat, tempint2)); } maxcorrection -= MAXCLIPDIST; @@ -15140,9 +15220,9 @@ void clearview(int32_t dacol) { palette_t p = getpal(dacol); - bglClearColor(((float)p.r)/255.0, - ((float)p.g)/255.0, - ((float)p.b)/255.0, + bglClearColor((float)p.r * (1.f/255.f), + (float)p.g * (1.f/255.f), + (float)p.b * (1.f/255.f), 0); bglClear(GL_COLOR_BUFFER_BIT); return; @@ -15179,9 +15259,9 @@ void clearallviews(int32_t dacol) palette_t p = getpal(dacol); bglViewport(0,0,xdim,ydim); glox1 = -1; - bglClearColor(((float)p.r)/255.0, - ((float)p.g)/255.0, - ((float)p.b)/255.0, + bglClearColor((float)p.r * (1.f/255.f), + (float)p.g * (1.f/255.f), + (float)p.b * (1.f/255.f), 0); bglClear(GL_COLOR_BUFFER_BIT); return; @@ -15740,8 +15820,8 @@ void drawline256(int32_t x1, int32_t y1, int32_t x2, int32_t y2, char col) //bglEnable(GL_BLEND); // When using line antialiasing, this is needed bglBegin(GL_LINES); bglColor4ub(p.r,p.g,p.b,255); - bglVertex2f((float)x1/4096.0,(float)y1/4096.0); - bglVertex2f((float)x2/4096.0,(float)y2/4096.0); + bglVertex2f((float)x1 * (1.f/4096.f), (float)y1 * (1.f/4096.f)); + bglVertex2f((float)x2 * (1.f/4096.f), (float)y2 * (1.f/4096.f)); bglEnd(); //bglDisable(GL_BLEND); diff --git a/polymer/eduke32/build/src/kplib.c b/polymer/eduke32/build/src/kplib.c index 813a36c34..b89fe3444 100644 --- a/polymer/eduke32/build/src/kplib.c +++ b/polymer/eduke32/build/src/kplib.c @@ -37,6 +37,7 @@ credits. 
#include #include #include +#include "pragmas.h" #if defined(__POWERPC__) || defined(GEKKO) #define BIGENDIAN 1 @@ -820,14 +821,14 @@ static int32_t kpngrend(const char *kfilebuf, int32_t kfilength, //Save code by making grayscale look like a palette color scheme if ((!kcoltype) || (kcoltype == 4)) { - j = 0xff000000; k = (255 / ((1<=0; i--) palcol[i] = LSWAPIB((LSWAPIL(*(int32_t *)&filptr[i*3])>>8)|0xff000000); } else if (i == (int32_t)LSWAPIB(0x44474b62)) //bKGD (must be after PLTE and before IDAT) @@ -835,7 +836,7 @@ static int32_t kpngrend(const char *kfilebuf, int32_t kfilength, switch (kcoltype) { case 0: case 4: - bakcol = (((int32_t)filptr[0]<<8)+(int32_t)filptr[1])*255/((1<0; z--) lut[z] = (255<<16)/z; + for (z=256-1; z>0; z--) lut[z] = tabledivide32_noinline(255<<16, z); lut[0] = (1<<16); } if (dxt == 1) stride = (xsiz<<1); else stride = (xsiz<<2); @@ -2419,7 +2420,7 @@ int32_t kprender(const char *buf, int32_t leng, intptr_t frameptr, int32_t bpl, extern char toupperlookup[256]; -static int32_t wildmatch(const char *match, const char *wild) +int32_t wildmatch(const char *match, const char *wild) { do { @@ -2428,13 +2429,13 @@ static int32_t wildmatch(const char *match, const char *wild) wild++, match++; continue; } - else if (*match + *wild == '\0') + else if ((*match|*wild) == '\0') return 1; else if (*wild == '*') { while (*wild == '*') wild++; if (*wild == '\0') return 1; - while (toupperlookup[*match] != toupperlookup[*wild] && *match) match++; + while (*match && toupperlookup[*match] != toupperlookup[*wild]) match++; if (toupperlookup[*match] == toupperlookup[*wild]) continue; } diff --git a/polymer/eduke32/build/src/mdsprite.c b/polymer/eduke32/build/src/mdsprite.c index eab159169..94c7f2e97 100644 --- a/polymer/eduke32/build/src/mdsprite.c +++ b/polymer/eduke32/build/src/mdsprite.c @@ -1004,10 +1004,7 @@ void updateanimation(md2model_t *m, const spritetype *tspr, uint8_t lpal) return; } - if (smooth->mdsmooth) // VERIFY: (smooth->mdsmooth) implies 
(tile2model[tile].smoothduration!=0) ? - ftol((1.0f / (float)(tile2model[tile].smoothduration)) * 66.f, &fps); - else - fps = anim->fpssc; + fps = smooth->mdsmooth ? Blrintf((1.0f / (float) (tile2model[tile].smoothduration)) * 66.f) : anim->fpssc; i = (mdtims - sprext->mdanimtims)*((fps*timerticspersec)/120); diff --git a/polymer/eduke32/build/src/osd.c b/polymer/eduke32/build/src/osd.c index 9f42d80b7..c08782374 100644 --- a/polymer/eduke32/build/src/osd.c +++ b/polymer/eduke32/build/src/osd.c @@ -1459,7 +1459,7 @@ void OSD_Draw(void) while (j > -1) { osdrowscur++; - j -= 200/osd->draw.rows; + j -= tabledivide32_noinline(200, osd->draw.rows); if (osdrowscur > osd->draw.rows-1) break; } @@ -1470,7 +1470,7 @@ void OSD_Draw(void) while (j > -1) { osdrowscur--; - j -= 200/osd->draw.rows; + j -= tabledivide32_noinline(200, osd->draw.rows); if (osdrowscur < 1) break; } diff --git a/polymer/eduke32/build/src/polymer.c b/polymer/eduke32/build/src/polymer.c index 49dad50d9..4eb520c2b 100644 --- a/polymer/eduke32/build/src/polymer.c +++ b/polymer/eduke32/build/src/polymer.c @@ -2830,9 +2830,7 @@ static float calc_ypancoef(char curypanning, int16_t curpicnum, int32_t dopancor if (dopancor) { - int32_t yoffs; - - ftol((ypancoef - tilesiz[curpicnum].y) * (255.0f / ypancoef), &yoffs); + int32_t yoffs = Blrintf((ypancoef - tilesiz[curpicnum].y) * (255.0f / ypancoef)); if (curypanning > 256 - yoffs) curypanning -= yoffs; } diff --git a/polymer/eduke32/build/src/polymost.c b/polymer/eduke32/build/src/polymost.c index 5a8a43540..3d2678ef3 100644 --- a/polymer/eduke32/build/src/polymost.c +++ b/polymer/eduke32/build/src/polymost.c @@ -558,8 +558,9 @@ static inline void fogcalc(int32_t tile, int32_t shade, int32_t vis, int32_t pal } else { - fogresult = (r_usenewshading == 3 && shade > 0) ? 0 : -(FOGDISTCONST * shade)/combvis; - fogresult2 = (FOGDISTCONST * (numshades-1-shade))/combvis; + combvis = 1.f/combvis; + fogresult = (r_usenewshading == 3 && shade > 0) ? 
0 : -(FOGDISTCONST * shade) * combvis; + fogresult2 = (FOGDISTCONST * (numshades-1-shade)) * combvis; } } } @@ -663,7 +664,7 @@ static void resizeglcheck(void) if ((glox1 != windowx1) || (gloy1 != windowy1) || (glox2 != windowx2) || (gloy2 != windowy2)) { const int32_t ourxdimen = (windowx2-windowx1+1); - const float ratio = get_projhack_ratio(); + float ratio = get_projhack_ratio(); const int32_t fovcorrect = (ratio==0) ? 0 : (int32_t)(ourxdimen*ratio - ourxdimen); float m[4][4]; @@ -675,9 +676,10 @@ static void resizeglcheck(void) bglMatrixMode(GL_PROJECTION); memset(m,0,sizeof(m)); - m[0][0] = fydimen / ratio; m[0][2] = 1.f; + ratio = 1.f/ratio; + m[0][0] = fydimen * ratio; m[0][2] = 1.f; m[1][1] = fxdimen; m[1][2] = 1.f; - m[2][2] = 1.f; m[2][3] = fydimen / ratio; + m[2][2] = 1.f; m[2][3] = fydimen * ratio; m[3][2] =-1.f; bglLoadMatrixf(&m[0][0]); @@ -2380,9 +2382,7 @@ static void calc_ypanning(int32_t refposz, float ryp0, float ryp1, { // Carry out panning "correction" to make it look like classic in some // cases, but failing in the general case. 
- int32_t yoffs; - - ftol((i-tilesiz[globalpicnum].y)*(255.f/i), &yoffs); + int32_t yoffs = Blrintf((i-tilesiz[globalpicnum].y)*(255.f/i)); if (ypan > 256-yoffs) ypan -= yoffs; @@ -2543,11 +2543,11 @@ static void polymost_drawalls(int32_t bunch) else domost(x0,fy0,x1,fy1); if (r_parallaxskypanning) - vv[0] += dd[0]*((float)sec->floorypanning)*((float)i)/256.0; + vv[0] += dd[0]*((float)sec->floorypanning)*((float)i)*(1.f/256.f); gdx = 0; gdy = 0; gdo = dd[0]; gux = gdo * - (t * (float) ((uint64_t) (xdimscale * yxaspect) * viewingrange)) / (16384.0*65536.0*65536.0*5.0*1024.0); + (t * (float) ((uint64_t) (xdimscale * yxaspect) * viewingrange)) * (1.f/(16384.0*65536.0*65536.0*5.0*1024.0)); guy = 0; //guo calculated later gvx = 0; gvy = vv[1]; gvo = vv[0]; @@ -2559,7 +2559,7 @@ static void polymost_drawalls(int32_t bunch) do { globalpicnum = dapskyoff[y&((1<floorxpanning:0)) - gux*ghalfx; + guo = gdo*(t*((float)(globalang-(y<<(11-dapskybits)))) * (1.f/2048.f) + (float)((r_parallaxskypanning)?sec->floorxpanning:0)) - gux*ghalfx; y++; ox = fx; fx = ((float)((y<<(11-dapskybits))-globalang))*oz+ghalfx; if (fx > x1) { fx = x1; i = -1; } @@ -2798,7 +2798,7 @@ static void polymost_drawalls(int32_t bunch) i = (1<<(picsiz[globalpicnum]>>4)); if (i != tilesiz[globalpicnum].y) i += i; //Hack to draw black rectangle below sky when looking down... 
- gdx = 0; gdy = gxyaspect / -262144.f; gdo = -ghoriz*gdy; + gdx = 0; gdy = gxyaspect * (1.f/-262144.f); gdo = -ghoriz*gdy; gux = 0; guy = 0; guo = 0; gvx = 0; gvy = 0; gvo = 0; oy = -vv[0]/vv[1]; @@ -2819,7 +2819,7 @@ static void polymost_drawalls(int32_t bunch) else domost(x1,cy1,x0,cy0); if (r_parallaxskypanning) - vv[0] += dd[0]*((float)sec->ceilingypanning)*((float)i)/256.f; + vv[0] += dd[0]*(float)sec->ceilingypanning*(float)i/256.f; gdx = 0; gdy = 0; gdo = dd[0]; gux = gdo * @@ -2835,7 +2835,7 @@ static void polymost_drawalls(int32_t bunch) do { globalpicnum = dapskyoff[y&((1<ceilingxpanning:0)) - gux*ghalfx; + guo = gdo*(t*((float)(globalang-(y<<(11-dapskybits)))) * 1.f/2048.f + (float)((r_parallaxskypanning)?sec->ceilingxpanning:0)) - gux*ghalfx; y++; ox = fx; fx = ((float)((y<<(11-dapskybits))-globalang))*oz+ghalfx; if (fx > x1) { fx = x1; i = -1; } diff --git a/polymer/eduke32/build/src/pragmas.c b/polymer/eduke32/build/src/pragmas.c index c14a1ed2d..ed441c786 100644 --- a/polymer/eduke32/build/src/pragmas.c +++ b/polymer/eduke32/build/src/pragmas.c @@ -7,167 +7,19 @@ // inline versions. I'll eventually convert these to macro-inline // equivalents. 
--Jonathon -//#include "pragmas.h" #include "compat.h" +#include "pragmas.h" + +libdivide_s64pad_t divtable64[DIVTABLESIZE]; +libdivide_s32pad_t divtable32[DIVTABLESIZE]; + +uint32_t divideu32_noinline(uint32_t n, uint32_t d) { return divideu32(n, d); } +int32_t tabledivide32_noinline(int32_t n, int32_t d) { return tabledivide32(n, d); } +int32_t tabledivide64_noinline(int64_t n, int32_t d) { return tabledivide64(n, d); } int32_t dmval; -#if defined(__GNUC__) && defined(GEKKO) - -// naked function (no prolog/epilog) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wreturn-type" -int32_t scale(int32_t a, int32_t d, int32_t c) ATTRIBUTE((naked)); -int32_t scale(int32_t a, int32_t d, int32_t c) -{ -// return ((int64_t)a * d) / c; - - __asm__ __volatile__ ( - " mullw 6, 3, 4\n" - " mulhw 4, 3, 4\n" - " mr 3, 6\n" - - " srawi. 0, 5, 31\n" - " cmpwi cr1, 4, 0\n" - " crxor 7, 0, 4\n" - - " xor 5, 0, 5\n" - " subf. 5, 0, 5\n" - - " beq DivByZero\n" - " bge cr1, Div64Common\n" - - " subfic 3, 3, 0\n" - " subfze 4, 4\n" - - "Div64Common:\n" - " cmplw 4, 5\n" - - " cntlzw 6, 5\n" - " xor 4, 4, 3\n" - " slw 5, 5, 6\n" - " rotlw 4, 4, 6\n" - " slw 3, 3, 6\n" - " li 7, 2\n" - " xor 4, 4, 3\n" - - " bge DivOverflow\n" - " mtctr 7\n" - - "Div64Compute:\n" - " srwi 6, 5, 16\n" - " divwu 7, 4, 6\n" - " mullw 6, 7, 6\n" - " subf 4, 6, 4\n" - " slwi 4, 4, 16\n" - " inslwi 4, 3, 16, 16\n" - " slwi 3, 3, 16\n" - " clrlwi 6, 5, 16\n" - " mullw 6, 7, 6\n" - " subfc 4, 6, 4\n" - " subfe. 6, 6, 6\n" - " add 3, 3, 7\n" - " bge Div64Done\n" - "Div64Correct:\n" - " addc 4, 4, 5\n" - " addze. 6, 6\n" - " subi 3, 3, 1\n" - " blt Div64Correct\n" - - "Div64Done:\n" - " bdnz Div64Compute\n" - - " cmpwi 3, 0\n" - " bso cr1, Div64QuotientNeg\n" - - " blt DivOverflow\n" - " blr\n" - - "Div64QuotientNeg:\n" - " neg. 
3, 3\n" - " blelr\n" - - "DivOverflow:\n" - " cror 4, 7, 7\n" - - "DivByZero:\n" - " lis 3, 0x8000\n" - " bltlr cr1\n" - " subi 3, 3, 1\n" - " blr\n" - ); -} -#pragma GCC diagnostic pop - -void clearbufbyte(void *d, int32_t c, int32_t a) -{ - if (a==0) { - uint8_t *dd = (uint8_t*)d; - int32_t align = (32 - (int32_t)d) & 31; - - if (align && c >= align) { - uint32_t izero = 0; - double fzero = 0; - c -= align; - - if (align&1) { - *dd = izero; - dd += 1; - } - if (align&2) { - *(uint16_t*)dd = izero; - dd += 2; - } - if (align&4) { - *(uint32_t*)dd = izero; - dd += 4; - } - if (align&8) { - *(double*)dd = fzero; - dd += 8; - } - if (align&16) { - *(double*)dd = fzero; - *(double*)(dd+8) = fzero; - dd += 16; - } - } - align = c >> 5; - while (align) { - __asm__ ( - " dcbz 0, %0\n" - " addi %0, %0, 32\n" - : "+r"(dd) - : - : "memory" - ); - align--; - } - if ((c &= 31)) { - while (c--) { - *dd++ = 0; - } - } - return; - } - __asm__ __volatile__ ( - " add %1, %1, %2\n" - " neg. %2, %2\n" - " beq 2f\n" - "1:\n" - " stbx %0, %1, %2\n" - " addic. %2, %2, 1\n" - " rotrwi %0, %0, 8\n" - " bne 1b\n" - "2:\n" - : "+r"(a), "+b"(d), "+r"(c) - : - : "cc", "xer", "memory" - ); -} - -#elif defined(__GNUC__) && defined(__i386__) && !defined(NOASM) // NOASM +#if defined(__GNUC__) && defined(__i386__) && !defined(NOASM) // NOASM // // GCC Inline Assembler version @@ -297,7 +149,158 @@ void copybufreverse(const void *S, void *D, int32_t c) // Microsoft C Inline Assembler version // -#else // _MSC_VER +#elif defined(__GNUC__) && defined(GEKKO) + +// naked function (no prolog/epilog) +// FIXME: this function produces unused parameter warnings and a missing return warning +int32_t scale(int32_t a, int32_t d, int32_t c) +{ + // return ((int64_t)a * d) / c; + + __asm__ __volatile__ ( + " mullw 6, 3, 4\n" + " mulhw 4, 3, 4\n" + " mr 3, 6\n" + + " srawi. 0, 5, 31\n" + " cmpwi cr1, 4, 0\n" + " crxor 7, 0, 4\n" + + " xor 5, 0, 5\n" + " subf. 
5, 0, 5\n" + + " beq DivByZero\n" + " bge cr1, Div64Common\n" + + " subfic 3, 3, 0\n" + " subfze 4, 4\n" + + "Div64Common:\n" + " cmplw 4, 5\n" + + " cntlzw 6, 5\n" + " xor 4, 4, 3\n" + " slw 5, 5, 6\n" + " rotlw 4, 4, 6\n" + " slw 3, 3, 6\n" + " li 7, 2\n" + " xor 4, 4, 3\n" + + " bge DivOverflow\n" + " mtctr 7\n" + + "Div64Compute:\n" + " srwi 6, 5, 16\n" + " divwu 7, 4, 6\n" + " mullw 6, 7, 6\n" + " subf 4, 6, 4\n" + " slwi 4, 4, 16\n" + " inslwi 4, 3, 16, 16\n" + " slwi 3, 3, 16\n" + " clrlwi 6, 5, 16\n" + " mullw 6, 7, 6\n" + " subfc 4, 6, 4\n" + " subfe. 6, 6, 6\n" + " add 3, 3, 7\n" + " bge Div64Done\n" + "Div64Correct:\n" + " addc 4, 4, 5\n" + " addze. 6, 6\n" + " subi 3, 3, 1\n" + " blt Div64Correct\n" + + "Div64Done:\n" + " bdnz Div64Compute\n" + + " cmpwi 3, 0\n" + " bso cr1, Div64QuotientNeg\n" + + " blt DivOverflow\n" + " blr\n" + + "Div64QuotientNeg:\n" + " neg. 3, 3\n" + " blelr\n" + + "DivOverflow:\n" + " cror 4, 7, 7\n" + + "DivByZero:\n" + " lis 3, 0x8000\n" + " bltlr cr1\n" + " subi 3, 3, 1\n" + " blr\n" + ); +} + +void clearbufbyte(void *d, int32_t c, int32_t a) +{ + if (a==0) { + uint8_t *dd = (uint8_t*)d; + int32_t align = (32 - (int32_t)d) & 31; + + if (align && c >= align) { + uint32_t izero = 0; + double fzero = 0; + c -= align; + + if (align&1) { + *dd = izero; + dd += 1; + } + if (align&2) { + *(uint16_t*)dd = izero; + dd += 2; + } + if (align&4) { + *(uint32_t*)dd = izero; + dd += 4; + } + if (align&8) { + *(double*)dd = fzero; + dd += 8; + } + if (align&16) { + *(double*)dd = fzero; + *(double*)(dd+8) = fzero; + dd += 16; + } + } + align = c >> 5; + while (align) { + __asm__ ( + " dcbz 0, %0\n" + " addi %0, %0, 32\n" + : "+r"(dd) + : + : "memory" + ); + align--; + } + if ((c &= 31)) { + while (c--) { + *dd++ = 0; + } + } + return; + } + __asm__ __volatile__( + " add %1, %1, %2\n" + " neg. %2, %2\n" + " beq 2f\n" + "1:\n" + " stbx %0, %1, %2\n" + " addic. 
%2, %2, 1\n" + " rotrwi %0, %0, 8\n" + " bne 1b\n" + "2:\n" + : "+r"(a), "+b"(d), "+r"(c) + : + : "cc", "xer", "memory" + ); +} + +#else // // Generic C version diff --git a/polymer/eduke32/eduke32.vcxproj b/polymer/eduke32/eduke32.vcxproj index 0bcfec2ec..f5ced22cc 100644 --- a/polymer/eduke32/eduke32.vcxproj +++ b/polymer/eduke32/eduke32.vcxproj @@ -127,7 +127,7 @@ nmake /f Makefile.msvc veryclean all DEBUG=1 WINBITS=64 nmake /f Makefile.msvc veryclean WINBITS=64 eduke32.exe - USE_OPENGL;POLYMER + USE_OPENGL;POLYMER;NOASM $(NMakeIncludeSearchPath);build\include;source\jmact;source\jaudiolib\include;source\enet\include; $(NMakeForcedIncludes) $(NMakeAssemblySearchPath) @@ -138,7 +138,7 @@ nmake /f Makefile.msvc veryclean all WINBITS=64 nmake /f Makefile.msvc veryclean WINBITS=64 eduke32.exe - USE_OPENGL;POLYMER + USE_OPENGL;POLYMER;NOASM $(NMakeIncludeSearchPath);build\include;source\jmact;source\jaudiolib\include;source\enet\include; $(NMakeForcedIncludes) $(NMakeAssemblySearchPath) @@ -172,7 +172,7 @@ nmake /f Makefile.msvc veryclean all DEBUG=1 WINBITS=64 RENDERTYPE=SDL nmake /f Makefile.msvc veryclean WINBITS=64 RENDERTYPE=SDL eduke32.exe - USE_OPENGL;POLYMER + USE_OPENGL;POLYMER;NOASM $(NMakeIncludeSearchPath);build\include;source\jmact;source\jaudiolib\include;source\enet\include; $(NMakeForcedIncludes) $(NMakeAssemblySearchPath) @@ -183,7 +183,7 @@ nmake /f Makefile.msvc veryclean all WINBITS=64 RENDERTYPE=SDL nmake /f Makefile.msvc veryclean WINBITS=64 RENDERTYPE=SDL eduke32.exe - USE_OPENGL;POLYMER + USE_OPENGL;POLYMER;NOASM $(NMakeIncludeSearchPath);build\include;source\jmact;source\jaudiolib\include;source\enet\include; $(NMakeForcedIncludes) $(NMakeAssemblySearchPath) diff --git a/polymer/eduke32/source/actors.c b/polymer/eduke32/source/actors.c index 03c76a042..9ed46ef1f 100644 --- a/polymer/eduke32/source/actors.c +++ b/polymer/eduke32/source/actors.c @@ -5029,7 +5029,7 @@ ACTOR_STATIC void G_MoveMisc(void) // STATNUM 5 case NEON5__STATIC: case 
NEON6__STATIC: - if ((g_globalRandom/(s->lotag+1)&31) > 4) s->shade = -127; + if ((tabledivide32_noinline(g_globalRandom, s->lotag+1)&31) > 4) s->shade = -127; else s->shade = 127; goto BOLT; @@ -6300,7 +6300,7 @@ ACTOR_STATIC void G_MoveEffectors(void) //STATNUM 3 // if(t[5] > 0) { t[5]--; break; } - if ((g_globalRandom/(sh+1)&31) < 4 && !t[2]) + if ((tabledivide32_noinline(g_globalRandom, sh+1)&31) < 4 && !t[2]) { // t[5] = 4+(g_globalRandom&7); sc->ceilingpal = s->owner>>8; @@ -6337,7 +6337,7 @@ ACTOR_STATIC void G_MoveEffectors(void) //STATNUM 3 case SE_4_RANDOM_LIGHTS: - if ((g_globalRandom/(sh+1)&31) < 4) + if ((tabledivide32_noinline(g_globalRandom, sh+1)&31) < 4) { t[1] = s->shade + (g_globalRandom&15);//Got really bright t[0] = s->shade + (g_globalRandom&15); diff --git a/polymer/eduke32/source/android/in_android.c b/polymer/eduke32/source/android/in_android.c index 7642ad222..1e40a5a81 100644 --- a/polymer/eduke32/source/android/in_android.c +++ b/polymer/eduke32/source/android/in_android.c @@ -243,11 +243,11 @@ void CONTROL_Android_PollDevices(ControlInfo *info) //LOGI("CONTROL_Android_PollDevices %f %f",forwardmove,sidemove); //LOGI("CONTROL_Android_PollDevices %f %f",droidinput.pitch,droidinput.yaw); - info->dz = (int32_t)nearbyintf(-droidinput.forwardmove * ANDROIDFORWARDMOVEFACTOR); - info->dx = (int32_t)nearbyintf(droidinput.sidemove * ANDROIDSIDEMOVEFACTOR); - info->dpitch = (int32_t)nearbyint(droidinput.pitch * ANDROIDPITCHFACTOR + + info->dz = (int32_t)Blrintf(-droidinput.forwardmove * ANDROIDFORWARDMOVEFACTOR); + info->dx = (int32_t)Blrintf(droidinput.sidemove * ANDROIDSIDEMOVEFACTOR); + info->dpitch = (int32_t)Blrintf(droidinput.pitch * ANDROIDPITCHFACTOR + droidinput.pitch_joystick * ANDROIDPITCHFACTORJOYSTICK); - info->dyaw = (int32_t)nearbyint(-droidinput.yaw * ANDROIDYAWFACTOR - + info->dyaw = (int32_t)Blrintf(-droidinput.yaw * ANDROIDYAWFACTOR - droidinput.yaw_joystick * ANDROIDYAWFACTORJOYSTICK); /* diff --git
a/polymer/eduke32/source/demo.c b/polymer/eduke32/source/demo.c index 1a6b90ecd..5975a6e30 100644 --- a/polymer/eduke32/source/demo.c +++ b/polymer/eduke32/source/demo.c @@ -886,7 +886,7 @@ nextdemo_nomenu: rotatesprite(120<<16,16<<16,32768,0,SLIDEBAR,0,0,2+8+16+1024,(xdim*125)/320,0,(xdim*155)/320,ydim-1); rotatesprite(150<<16,16<<16,32768,0,SLIDEBAR,0,0,2+8+16+1024,(xdim*155)/320,0,xdim-1,ydim-1); - j = (182<<16) - ((((120*(g_demo_totalCnt-g_demo_cnt))<<4)/g_demo_totalCnt)<<12); + j = (182<<16) - (tabledivide32_noinline((120*(g_demo_totalCnt-g_demo_cnt))<<4, g_demo_totalCnt)<<12); rotatesprite_fs(j,(16<<16)+(1<<15),32768,0,SLIDEBAR+1,0,0,2+8+16+1024); j=(g_demo_totalCnt-g_demo_cnt)/REALGAMETICSPERSEC; diff --git a/polymer/eduke32/source/game.c b/polymer/eduke32/source/game.c index c16db8b07..19598a6d2 100644 --- a/polymer/eduke32/source/game.c +++ b/polymer/eduke32/source/game.c @@ -809,7 +809,7 @@ vec2_t G_ScreenText(const int32_t font, { size.x = xbetween; - xbetween = (length == 1) ? 0 : ((xbetween - linewidth) / (length - 1)); + xbetween = (length == 1) ? 0 : tabledivide32_noinline((xbetween - linewidth), (length - 1)); linewidth = size.x; } @@ -823,7 +823,7 @@ vec2_t G_ScreenText(const int32_t font, if (f & TEXT_YJUSTIFY) { const int32_t tempswap = ybetween; - ybetween = (lines == 1) ? 0 : ((ybetween - size.y) / (lines - 1)); + ybetween = (lines == 1) ? 0 : tabledivide32_noinline(ybetween - size.y, lines - 1); size.y = tempswap; } @@ -1001,7 +1001,7 @@ vec2_t G_ScreenText(const int32_t font, if (f & TEXT_XJUSTIFY) { - xbetween = (length == 1) ? 0 : ((xbetween - linewidth) / (length - 1)); + xbetween = (length == 1) ? 
0 : tabledivide32_noinline(xbetween - linewidth, length - 1); linewidth = size.x; } @@ -2489,7 +2489,7 @@ static void G_PrintFPS(void) if (thisSec - LastSec) { - g_currentFrameRate = LastCount = FrameCount / (thisSec - LastSec); + g_currentFrameRate = LastCount = tabledivide32_noinline(FrameCount, thisSec - LastSec); LastSec = thisSec; FrameCount = 0; @@ -3483,7 +3483,9 @@ static void palaccum_add(palaccum_t *pa, const palette_t *pal, int32_t f) static void G_FadePalaccum(const palaccum_t *pa) { - setpalettefade(pa->r/pa->sumf, pa->g/pa->sumf, pa->b/pa->sumf, pa->maxf); + setpalettefade(tabledivide32_noinline(pa->r, pa->sumf), + tabledivide32_noinline(pa->g, pa->sumf), + tabledivide32_noinline(pa->b, pa->sumf), pa->maxf); } @@ -4502,7 +4504,7 @@ void G_DrawRooms(int32_t snum, int32_t smoothratio) else { tmpvr = vr; - tmpyx = (65536*ydim*8)/(xdim*5); + tmpyx = tabledivide32_noinline(65536*ydim*8, xdim*5); setaspect(mulscale16(tmpvr,viewingrange), yxaspect); } @@ -4581,7 +4583,7 @@ void G_DrawRooms(int32_t snum, int32_t smoothratio) setaspect(mulscale16(oviewingrange,i>>1), yxaspect); tmpvr = i>>1; - tmpyx = (65536*ydim*8)/(xdim*5); + tmpyx = tabledivide32_noinline(65536*ydim*8, xdim*5); } } else if (getrendermode() >= REND_POLYMOST && (ud.screen_tilting @@ -7646,7 +7648,7 @@ void G_DoSpriteAnimations(int32_t ourx, int32_t oury, int32_t oura, int32_t smoo l = s->z-actor[g_player[p].ps->i].floorz+(3<<8); // SET_SPRITE_NOT_TSPRITE if (l > 1024 && s->yrepeat > 32 && s->extra > 0) - s->yoffset = (int8_t)(l/(s->yrepeat<<2)); + s->yoffset = (int8_t)tabledivide32_noinline(l, s->yrepeat<<2); else s->yoffset=0; } @@ -12908,8 +12910,8 @@ void A_SpawnWallGlass(int32_t i,int32_t wallnum,int32_t n) x1 -= ksgn(yv); y1 += ksgn(xv); - xv /= j; - yv /= j; + xv = tabledivide32_noinline(xv, j); + yv = tabledivide32_noinline(yv, j); for (j=n; j>0; j--) { @@ -12949,8 +12951,8 @@ void A_SpawnCeilingGlass(int32_t i,int32_t sectnum,int32_t n) x1 = wall[s].x; y1 = wall[s].y; - xv = 
(wall[s+1].x-x1)/(n+1); - yv = (wall[s+1].y-y1)/(n+1); + xv = tabledivide32_noinline(wall[s+1].x-x1, n+1); + yv = tabledivide32_noinline(wall[s+1].y-y1, n+1); for (j=n; j>0; j--) { @@ -12984,8 +12986,8 @@ void A_SpawnRandomGlass(int32_t i,int32_t wallnum,int32_t n) x1 = wall[wallnum].x; y1 = wall[wallnum].y; - xv = (wall[wall[wallnum].point2].x-wall[wallnum].x)/j; - yv = (wall[wall[wallnum].point2].y-wall[wallnum].y)/j; + xv = tabledivide32_noinline(wall[wall[wallnum].point2].x-wall[wallnum].x, j); + yv = tabledivide32_noinline(wall[wall[wallnum].point2].y-wall[wallnum].y, j); for (j=n; j>0; j--) { diff --git a/polymer/eduke32/source/gameexec.c b/polymer/eduke32/source/gameexec.c index 2293c8cd4..a69be5465 100644 --- a/polymer/eduke32/source/gameexec.c +++ b/polymer/eduke32/source/gameexec.c @@ -266,7 +266,7 @@ int32_t A_GetFurthestAngle(int32_t iActor, int32_t angs) int32_t furthest_angle=0; int32_t d, j; int32_t greatestd = INT32_MIN; - int32_t angincs=2048/angs; + int32_t angincs=tabledivide32_noinline(2048, angs); hitdata_t hit; for (j=s->ang; j<(2048+s->ang); j+=angincs) @@ -303,7 +303,7 @@ int32_t A_FurthestVisiblePoint(int32_t iActor, spritetype *ts, int32_t *dax, int if ((!g_netServer && ud.multimode < 2) && ud.player_skill < 3) angincs = 2048/2; - else angincs = 2048/(1+(krand()&1)); + else angincs = tabledivide32_noinline(2048, 1+(krand()&1)); for (j=ts->ang; j<(2048+ts->ang); j+=(angincs-(krand()&511))) { @@ -4447,7 +4447,7 @@ finish_qsprintf: /*OSD_Printf(OSDTEXT_GREEN "CON_RESIZEARRAY: resizing array %s from %d to %d\n", aGameArrays[j].szLabel, aGameArrays[j].size, asize / GAR_ELTSZ);*/ aGameArrays[j].plValues = (intptr_t *)Xrealloc(aGameArrays[j].plValues, asize); - aGameArrays[j].size = asize / GAR_ELTSZ; + aGameArrays[j].size = asize/GAR_ELTSZ; kread(fil, aGameArrays[j].plValues, asize); } @@ -5513,7 +5513,11 @@ void A_Execute(int32_t iActor, int32_t iPlayer, int32_t lDist) else if (actor[vm.g_i].timetosleep > 1) actor[vm.g_i].timetosleep--; else if 
(actor[vm.g_i].timetosleep == 1) + { + if (g_scriptVersion == 13 && (vm.g_sp->picnum == FIRE || vm.g_sp->picnum == FIRE2)) + return; changespritestat(vm.g_i, STAT_ZOMBIEACTOR); + } } void G_SaveMapState(void) diff --git a/polymer/eduke32/source/gamevars.h b/polymer/eduke32/source/gamevars.h index dcb8e7ffd..8cdec7111 100644 --- a/polymer/eduke32/source/gamevars.h +++ b/polymer/eduke32/source/gamevars.h @@ -127,35 +127,79 @@ void Gv_FinalizeWeaponDefaults(void); { \ default: \ aGameVars[id].val.lValue operator lValue; \ - return; \ + break; \ case GAMEVAR_PERPLAYER: \ - if ((unsigned)vm.g_p > MAXPLAYERS-1) return; \ + if ((unsigned)vm.g_p > MAXPLAYERS-1) break; \ aGameVars[id].val.plValues[vm.g_p] operator lValue; \ - return; \ + break; \ case GAMEVAR_PERACTOR: \ - if ((unsigned)vm.g_i > MAXSPRITES-1) return; \ + if ((unsigned)vm.g_i > MAXSPRITES-1) break; \ aGameVars[id].val.plValues[vm.g_i] operator lValue; \ - return; \ + break; \ case GAMEVAR_INTPTR: \ *((int32_t *)aGameVars[id].val.lValue) operator (int32_t)lValue; \ - return; \ + break; \ case GAMEVAR_SHORTPTR: \ *((int16_t *)aGameVars[id].val.lValue) operator (int16_t)lValue; \ - return; \ + break; \ case GAMEVAR_CHARPTR: \ *((uint8_t *)aGameVars[id].val.lValue) operator (uint8_t)lValue; \ - return; \ + break; \ } \ } +// even though libdivide is faster than straight division (when using the LUT) the overhead makes this slower on x86 +// ARM, however, has no hardware integer division +#if defined(__arm__) || defined(LIBDIVIDE_ALWAYS) +static inline void __fastcall Gv_DivVar(int32_t id, int32_t lValue) +{ + static libdivide_s32_t sdiv; + static int32_t lastlValue; + libdivide_s32_t *dptr = &sdiv; + intptr_t *iptr = &aGameVars[id].val.lValue; + + if ((aGameVars[id].dwFlags & GAMEVAR_PERPLAYER && (unsigned) vm.g_p > MAXPLAYERS-1) || + (aGameVars[id].dwFlags & GAMEVAR_PERACTOR && (unsigned) vm.g_i > MAXSPRITES-1)) return; + + if ((unsigned) lValue < DIVTABLESIZE) + dptr = (libdivide_s32_t *)&divtable32[lValue]; + 
else if (lValue != lastlValue) + sdiv = libdivide_s32_gen(lValue), lastlValue = lValue; + + switch (aGameVars[id].dwFlags & (GAMEVAR_USER_MASK|GAMEVAR_PTR_MASK)) + { + case GAMEVAR_PERPLAYER: + iptr = &aGameVars[id].val.plValues[vm.g_p]; + default: + break; + case GAMEVAR_PERACTOR: + iptr = &aGameVars[id].val.plValues[vm.g_i]; + break; + case GAMEVAR_INTPTR: + *((int32_t *) aGameVars[id].val.lValue) = (int32_t) libdivide_s32_do(*((int32_t *) aGameVars[id].val.lValue), dptr); + return; + case GAMEVAR_SHORTPTR: + *((int16_t *) aGameVars[id].val.lValue) = (int16_t) libdivide_s32_do(*((int16_t *) aGameVars[id].val.lValue), dptr); + return; + case GAMEVAR_CHARPTR: + *((uint8_t *) aGameVars[id].val.lValue) = (uint8_t) libdivide_s32_do(*((uint8_t *) aGameVars[id].val.lValue), dptr); + return; + } + + *iptr = libdivide_s32_do(*iptr, dptr); +} +#else +GV_VAROP(Gv_DivVar, /=) +#endif + GV_VAROP(Gv_AddVar, +=) GV_VAROP(Gv_SubVar, -=) GV_VAROP(Gv_MulVar, *=) -GV_VAROP(Gv_DivVar, /=) GV_VAROP(Gv_ModVar, %=) GV_VAROP(Gv_AndVar, &=) GV_VAROP(Gv_XorVar, ^=) GV_VAROP(Gv_OrVar, |=) + #endif #endif diff --git a/polymer/eduke32/source/m32exec.c b/polymer/eduke32/source/m32exec.c index c628994fa..2180c3f43 100644 --- a/polymer/eduke32/source/m32exec.c +++ b/polymer/eduke32/source/m32exec.c @@ -809,7 +809,7 @@ skip_check: float fval = *((float *)&bits); // rounding must absolutely be! 
//OSD_Printf("ftoi: bits:%8x, scale=%d, fval=%f, (int32_t)(fval*scale)=%d\n", bits, scale, fval, (int32_t)(fval*scale)); - Gv_SetVarX(*insptr, (int32_t)nearbyintf(fval * scale)); + Gv_SetVarX(*insptr, (int32_t)Blrintf(fval * scale)); } insptr += 2; continue; diff --git a/polymer/eduke32/source/menus.c b/polymer/eduke32/source/menus.c index fb252c99d..dbbe1bf71 100644 --- a/polymer/eduke32/source/menus.c +++ b/polymer/eduke32/source/menus.c @@ -726,17 +726,17 @@ static MenuEntry_t *MEL_RENDERERSETUP_GL3[] = { #endif #ifdef DROIDMENU -static MenuRangeFloat_t MEO_COLCORR_GAMMA = MAKE_MENURANGE( &MF_Bluefont, 1.f, 1.f, 2.5f, 39.f, 0.f, &vid_gamma ); +static MenuRangeFloat_t MEO_COLCORR_GAMMA = MAKE_MENURANGE( &MF_Bluefont, 1, 1.f, 2.5f, 39.f, 0.f, &vid_gamma ); #else -static MenuRangeFloat_t MEO_COLCORR_GAMMA = MAKE_MENURANGE( &MF_Bluefont, 1.f, 0.2f, 4.f, 39.f, 0.f, &vid_gamma ); +static MenuRangeFloat_t MEO_COLCORR_GAMMA = MAKE_MENURANGE( &MF_Bluefont, 1, 0.2f, 4.f, 39, 0.f, &vid_gamma ); #endif static MenuEntry_t ME_COLCORR_GAMMA = MAKE_MENUENTRY( &MF_Redfont, "Gamma:", RangeFloat, &MEO_COLCORR_GAMMA ); -static MenuRangeFloat_t MEO_COLCORR_CONTRAST = MAKE_MENURANGE( &MF_Bluefont, 1.f, 0.1f, 2.7f, 53.f, 0.f, &vid_contrast ); +static MenuRangeFloat_t MEO_COLCORR_CONTRAST = MAKE_MENURANGE( &MF_Bluefont, 1, 0.1f, 2.7f, 53, 0.f, &vid_contrast ); static MenuEntry_t ME_COLCORR_CONTRAST = MAKE_MENUENTRY( &MF_Redfont, "Contrast:", RangeFloat, &MEO_COLCORR_CONTRAST ); -static MenuRangeFloat_t MEO_COLCORR_BRIGHTNESS = MAKE_MENURANGE( &MF_Bluefont, 1.f, -0.8f, 0.8f, 33.f, 0.f, &vid_brightness ); +static MenuRangeFloat_t MEO_COLCORR_BRIGHTNESS = MAKE_MENURANGE( &MF_Bluefont, 1, -0.8f, 0.8f, 33, 0.f, &vid_brightness ); static MenuEntry_t ME_COLCORR_BRIGHTNESS = MAKE_MENUENTRY( &MF_Redfont, "Brightness:", RangeFloat, &MEO_COLCORR_BRIGHTNESS ); static MenuEntry_t ME_COLCORR_RESET = MAKE_MENUENTRY( &MF_Redfont, "Reset To Defaults", Link, &MEO_NULL ); -static MenuRangeFloat_t 
MEO_COLCORR_AMBIENT = MAKE_MENURANGE(&MF_Bluefont, 1.f, 0.125f, 4.f, 32.f, 0.f, &r_ambientlight); +static MenuRangeFloat_t MEO_COLCORR_AMBIENT = MAKE_MENURANGE(&MF_Bluefont, 1, 0.125f, 4.f, 32, 0.f, &r_ambientlight); static MenuEntry_t ME_COLCORR_AMBIENT = MAKE_MENUENTRY(&MF_Redfont, "Visibility:", RangeFloat, &MEO_COLCORR_AMBIENT); static MenuEntry_t *MEL_COLCORR[] = { @@ -3575,7 +3575,7 @@ static int32_t M_RunMenu_MenuMenu(MenuMenu_t *menu, MenuEntry_t *currentry, int3 case 2: { int32_t v; - ftol(((float) *object->variable * 100.) / (float) object->onehundredpercent + 0.5, &v); + v = Blrintf(((float) *object->variable * 100.f) / (float) object->onehundredpercent); Bsprintf(tempbuf, "%d%%", v); break; } @@ -3602,7 +3602,7 @@ static int32_t M_RunMenu_MenuMenu(MenuMenu_t *menu, MenuEntry_t *currentry, int3 rotatesprite_fs(x, y - menu->scrollPos, z, 0, SLIDEBAR, s, p, 2|8|16|ROTATESPRITE_FULL16); rotatesprite_fs( - x + (1<<16) + ((float) scale((tilesiz[SLIDEBAR].x-2-tilesiz[SLIDEBAR+1].x)<<16, height, tilesiz[SLIDEBAR].y<<16) * (*object->variable - object->min) / (object->max - object->min)), + x + (1<<16) + (int32_t)((float) scale((tilesiz[SLIDEBAR].x-2-tilesiz[SLIDEBAR+1].x)<<16, height, tilesiz[SLIDEBAR].y<<16) * (*object->variable - object->min) / (object->max - object->min)), y + scale((tilesiz[SLIDEBAR].y-tilesiz[SLIDEBAR+1].y)<<15, height, tilesiz[SLIDEBAR].y<<16) - menu->scrollPos, z, 0, SLIDEBAR+1, s, p, 2|8|16|ROTATESPRITE_FULL16); @@ -3621,7 +3621,7 @@ static int32_t M_RunMenu_MenuMenu(MenuMenu_t *menu, MenuEntry_t *currentry, int3 case 2: { int32_t v; - ftol((*object->variable * 100.f) / object->onehundredpercent + 0.5f, &v); + v = Blrintf((*object->variable * 100.f) / object->onehundredpercent); Bsprintf(tempbuf, "%d%%", v); break; } @@ -3648,7 +3648,7 @@ static int32_t M_RunMenu_MenuMenu(MenuMenu_t *menu, MenuEntry_t *currentry, int3 rotatesprite_fs(x, y - menu->scrollPos, z, 0, SLIDEBAR, s, p, 2|8|16|ROTATESPRITE_FULL16); rotatesprite_fs( - x + (1<<16) 
+ ((double) scale((tilesiz[SLIDEBAR].x-2-tilesiz[SLIDEBAR+1].x)<<16, height, tilesiz[SLIDEBAR].y<<16) * (*object->variable - object->min) / (object->max - object->min)), + x + (1<<16) + (int32_t)((double) scale((tilesiz[SLIDEBAR].x-2-tilesiz[SLIDEBAR+1].x)<<16, height, tilesiz[SLIDEBAR].y<<16) * (*object->variable - object->min) / (object->max - object->min)), y + scale((tilesiz[SLIDEBAR].y-tilesiz[SLIDEBAR+1].y)<<15, height, tilesiz[SLIDEBAR].y<<16) - menu->scrollPos, z, 0, SLIDEBAR+1, s, p, 2|8|16|ROTATESPRITE_FULL16); @@ -3667,7 +3667,7 @@ static int32_t M_RunMenu_MenuMenu(MenuMenu_t *menu, MenuEntry_t *currentry, int3 case 2: { int32_t v; - dtol((*object->variable * 100.) / object->onehundredpercent + 0.5, &v); + v = Blrintf((*object->variable * 100.) / object->onehundredpercent); Bsprintf(tempbuf, "%d%%", v); break; } @@ -4403,14 +4403,14 @@ static void M_RunMenuInput(Menu_t *cm) case RangeInt32: { MenuRangeInt32_t *object = (MenuRangeInt32_t*)currentry->entry; - const double interval = (double) (object->max - object->min) / (object->steps - 1); + const float interval = (float) (object->max - object->min) / (float) (object->steps - 1); int32_t step; int32_t modification = 0; if (currentry->disabled) break; - dtol((double) (*object->variable - object->min) / interval + 0.5, &step); + step = Blrintf((float) (*object->variable - object->min) / interval); if (I_SliderLeft()) { @@ -4438,7 +4438,7 @@ static void M_RunMenuInput(Menu_t *cm) else if (step >= object->steps) step = object->steps - 1; - dtol(interval * step + object->min + 0.5, &temp); + temp = Blrintf(interval * step + (object->min)); if (!M_MenuEntryRangeInt32Modify(currentry, temp)) *object->variable = temp; @@ -4456,7 +4456,7 @@ static void M_RunMenuInput(Menu_t *cm) if (currentry->disabled) break; - ftol((*object->variable - object->min) / interval + 0.5, &step); + step = Blrintf((*object->variable - object->min) / interval); if (I_SliderLeft()) { @@ -4502,7 +4502,7 @@ static void 
M_RunMenuInput(Menu_t *cm) if (currentry->disabled) break; - dtol((*object->variable - object->min) / interval + 0.5, &step); + step = Blrintf((*object->variable - object->min) / interval); if (I_SliderLeft()) { diff --git a/polymer/eduke32/source/midi.c b/polymer/eduke32/source/midi.c index f6aceff9a..abed1ccac 100644 --- a/polymer/eduke32/source/midi.c +++ b/polymer/eduke32/source/midi.c @@ -39,6 +39,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. #include "midi.h" #include "mpu401.h" #include "compat.h" +#include "pragmas.h" #define WIN32_LEAN_AND_MEAN #include @@ -297,7 +298,7 @@ static void _MIDI_MetaEvent break; case MIDI_TEMPO_CHANGE : - tempo = 60000000L / _MIDI_ReadNumber(Track->pos, 3); + tempo = tabledivide32_noinline(60000000L, _MIDI_ReadNumber(Track->pos, 3)); MIDI_SetTempo(tempo); break; @@ -318,7 +319,7 @@ static void _MIDI_MetaEvent _MIDI_TimeBase += _MIDI_TimeBase; denominator--; } - _MIDI_TicksPerBeat = (_MIDI_Division * 4) / _MIDI_TimeBase; + _MIDI_TicksPerBeat = tabledivide32_noinline(_MIDI_Division * 4, _MIDI_TimeBase); break; } @@ -794,7 +795,7 @@ static void _MIDI_SetChannelVolume { remotevolume = volume * _MIDI_TotalVolume; remotevolume *= _MIDI_UserChannelVolume[ channel ]; - remotevolume /= MIDI_MaxVolume; + remotevolume = tabledivide32_noinline(remotevolume, MIDI_MaxVolume); remotevolume >>= 8; status = _MIDI_RerouteFunctions[ channel ](0xB0 + channel, @@ -821,7 +822,7 @@ static void _MIDI_SetChannelVolume if (_MIDI_Funcs->SetVolume == NULL) { volume *= _MIDI_TotalVolume; - volume /= MIDI_MaxVolume; + volume = tabledivide32_noinline(volume, MIDI_MaxVolume); } // For user volume @@ -1315,8 +1316,8 @@ void MIDI_SetTempo int32_t tickspersecond; MIDI_Tempo = tempo; - tickspersecond = ((tempo) * _MIDI_Division) / 60; - _MIDI_FPSecondsPerTick = (1 << TIME_PRECISION) / tickspersecond; + tickspersecond = ((tempo) * _MIDI_Division)/60; + _MIDI_FPSecondsPerTick = tabledivide32_noinline(1 << TIME_PRECISION, 
tickspersecond); MPU_SetTempo(tempo); } @@ -1562,8 +1563,8 @@ void MIDI_SetSongTime MIDI_PauseSong(); - mil = ((milliseconds % 1000) << TIME_PRECISION) / 1000; - sec = (milliseconds / 1000) << TIME_PRECISION; + mil = tabledivide32_noinline((milliseconds % 1000) << TIME_PRECISION, 1000); + sec = tabledivide32_noinline(milliseconds, 1000) << TIME_PRECISION; newtime = sec + mil; if (newtime < _MIDI_Time) diff --git a/polymer/eduke32/source/mpu401.c b/polymer/eduke32/source/mpu401.c index a56aed729..d62e1978f 100644 --- a/polymer/eduke32/source/mpu401.c +++ b/polymer/eduke32/source/mpu401.c @@ -34,6 +34,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. #include "mpu401.h" #include "compat.h" +#include "pragmas.h" #define WIN32_LEAN_AND_MEAN #include @@ -441,7 +442,7 @@ void MPU_SetTempo(int32_t tempo) { MIDIPROPTEMPO prop; prop.cbStruct = sizeof(MIDIPROPTEMPO); - prop.dwTempo = 60000000l/tempo; + prop.dwTempo = tabledivide32_noinline(60000000l, tempo); midiStreamProperty(hmido, (LPBYTE)&prop, MIDIPROP_SET|MIDIPROP_TEMPO); } diff --git a/polymer/eduke32/source/player.c b/polymer/eduke32/source/player.c index c2f33f0b3..4286742fb 100644 --- a/polymer/eduke32/source/player.c +++ b/polymer/eduke32/source/player.c @@ -113,9 +113,9 @@ static void A_DoWaterTracers(int32_t x1,int32_t y1,int32_t z1,int32_t x2,int32_t int16_t sect = -1; i = n+1; - xv = (x2-x1)/i; - yv = (y2-y1)/i; - zv = (z2-z1)/i; + xv = tabledivide32_noinline(x2-x1, i); + yv = tabledivide32_noinline(y2-y1, i); + zv = tabledivide32_noinline(z2-z1, i); if ((klabs(x1-x2)+klabs(y1-y2)) < 3084) return; @@ -147,15 +147,15 @@ static void A_HitscanProjTrail(const vec3_t *sv, const vec3_t *dv, int32_t ang, Bmemcpy(&destvect, dv, sizeof(vec3_t)); - srcvect.x = sv->x + (sintable[(348+ang+512)&2047]/proj->offset); - srcvect.y = sv->y + (sintable[(ang+348)&2047]/proj->offset); + srcvect.x = sv->x + tabledivide32_noinline(sintable[(348+ang+512)&2047], proj->offset); + srcvect.y = sv->y + 
tabledivide32_noinline(sintable[(ang+348)&2047], proj->offset); srcvect.z = sv->z + 1024+(proj->toffset<<8); n = ((FindDistance2D(srcvect.x-destvect.x,srcvect.y-destvect.y))>>8)+1; - destvect.x = ((destvect.x-srcvect.x)/n); - destvect.y = ((destvect.y-srcvect.y)/n); - destvect.z = ((destvect.z-srcvect.z)/n); + destvect.x = tabledivide32_noinline((destvect.x-srcvect.x), n); + destvect.y = tabledivide32_noinline((destvect.y-srcvect.y), n); + destvect.z = tabledivide32_noinline((destvect.z-srcvect.z), n); srcvect.x += destvect.x>>2; srcvect.y += destvect.y>>2; @@ -379,7 +379,7 @@ static int32_t GetAutoAimAngle(int32_t i, int32_t p, int32_t atwith, } dst = safeldist(g_player[p].ps->i, &sprite[j]); - *zvel = ((spr->z - srcvect->z - cen)*vel) / dst; + *zvel = tabledivide32_noinline((spr->z - srcvect->z - cen)*vel, dst); if (!(flags&2) || sprite[j].picnum != RECON) *sa = getangle(spr->x-srcvect->x, spr->y-srcvect->y); @@ -530,7 +530,7 @@ static void A_PreFireHitscan(const spritetype *s, vec3_t *srcvect, int32_t *zvel const DukePlayer_t *targetps = g_player[j].ps; const int32_t d = safeldist(targetps->i, s); - *zvel = ((targetps->pos.z-srcvect->z)<<8) / d; + *zvel = tabledivide32_noinline((targetps->pos.z-srcvect->z)<<8, d); srcvect->z -= (4<<8); @@ -960,7 +960,7 @@ static int32_t A_ShootCustom(const int32_t i, const int32_t atwith, int16_t sa, sa = getangle(g_player[j].ps->opos.x-srcvect->x, g_player[j].ps->opos.y-srcvect->y); l = safeldist(g_player[j].ps->i, s); - zvel = ((g_player[j].ps->opos.z - srcvect->z)*vel) / l; + zvel = tabledivide32_noinline((g_player[j].ps->opos.z - srcvect->z)*vel, l); if (A_CheckEnemySprite(s) && (AC_MOVFLAGS(s, &actor[i]) & face_player_smart)) sa = s->ang + (krand() & 31) - 16; @@ -974,8 +974,8 @@ static int32_t A_ShootCustom(const int32_t i, const int32_t atwith, int16_t sa, zvel = A_GetShootZvel(zvel); j = A_InsertSprite(sect, - srcvect->x + (sintable[(348 + sa + 512) & 2047] / proj->offset), - srcvect->y + (sintable[(sa + 348) & 2047] / 
proj->offset), + srcvect->x + tabledivide32_noinline(sintable[(348 + sa + 512) & 2047], proj->offset), + srcvect->y + tabledivide32_noinline(sintable[(sa + 348) & 2047], proj->offset), srcvect->z - (1 << 8), atwith, 0, 14, 14, sa, vel, zvel, i, 4); sprite[j].xrepeat = proj->xrepeat; @@ -1017,7 +1017,7 @@ static int32_t A_ShootCustom(const int32_t i, const int32_t atwith, int16_t sa, { int32_t x; j = g_player[A_FindPlayer(s, &x)].ps->i; - zvel = ((sprite[j].z - srcvect->z) << 8) / (x + 1); + zvel = tabledivide32_noinline((sprite[j].z - srcvect->z) << 8, x + 1); sa = getangle(sprite[j].x - srcvect->x, sprite[j].y - srcvect->y); } @@ -1205,7 +1205,7 @@ int32_t A_ShootWithZvel(int32_t i, int32_t atwith, int32_t override_zvel) { int32_t x; j = g_player[A_FindPlayer(s,&x)].ps->i; - zvel = ((sprite[j].z-srcvect.z)<<8) / (x+1); + zvel = tabledivide32_noinline((sprite[j].z-srcvect.z)<<8, x+1); sa = getangle(sprite[j].x-srcvect.x,sprite[j].y-srcvect.y); } } @@ -1352,7 +1352,7 @@ int32_t A_ShootWithZvel(int32_t i, int32_t atwith, int32_t override_zvel) // sa = getangle(g_player[j].ps->opos.x-sx,g_player[j].ps->opos.y-sy); sa += 16-(krand()&31); hit.pos.x = safeldist(g_player[j].ps->i, s); - zvel = ((g_player[j].ps->opos.z - srcvect.z + (3<<8))*vel) / hit.pos.x; + zvel = tabledivide32_noinline((g_player[j].ps->opos.z - srcvect.z + (3<<8))*vel, hit.pos.x); } zvel = A_GetShootZvel(zvel); @@ -1438,7 +1438,7 @@ int32_t A_ShootWithZvel(int32_t i, int32_t atwith, int32_t override_zvel) } l = safeldist(g_player[j].ps->i, s); - zvel = ((g_player[j].ps->opos.z - srcvect.z)*vel) / l; + zvel = tabledivide32_noinline((g_player[j].ps->opos.z - srcvect.z)*vel, l); if (A_CheckEnemySprite(s) && (AC_MOVFLAGS(s, &actor[i]) & face_player_smart)) sa = s->ang+(krand()&31)-16; @@ -1635,7 +1635,7 @@ int32_t A_ShootWithZvel(int32_t i, int32_t atwith, int32_t override_zvel) { j = A_FindPlayer(s, NULL); l = safeldist(g_player[j].ps->i, s); - zvel = ((g_player[j].ps->opos.z-srcvect.z)*512) / l ; + zvel 
= tabledivide32_noinline((g_player[j].ps->opos.z-srcvect.z)*512, l); } else zvel = 0; @@ -1830,7 +1830,7 @@ static void G_DrawWeaponTile(int32_t x, int32_t y, int32_t tilenum, int32_t shad // HACK: Draw the upper part of the chaingun two screen // pixels (not texels; multiplied by weapon scale) lower // first, preventing ugly horizontal seam. - g_dts_yadd = (65536*2*200)/ydim; + g_dts_yadd = tabledivide32_noinline(65536*2*200, ydim); G_DrawTileScaled(x,y,tilenum,shadef[slot],orientation,p); g_dts_yadd = 0; } @@ -2753,8 +2753,8 @@ void P_GetInput(int32_t snum) if (ud.config.MouseBias) { if (klabs(info[0].dyaw) > klabs(info[0].dpitch)) - info[0].dpitch /= ud.config.MouseBias; - else info[0].dyaw /= ud.config.MouseBias; + info[0].dpitch = tabledivide32_noinline(info[0].dpitch, ud.config.MouseBias); + else info[0].dyaw = tabledivide32_noinline(info[0].dyaw, ud.config.MouseBias); } tics = totalclock-lastcontroltime; diff --git a/polymer/eduke32/source/premap.c b/polymer/eduke32/source/premap.c index 8d3a627ea..0fc6e1fe3 100644 --- a/polymer/eduke32/source/premap.c +++ b/polymer/eduke32/source/premap.c @@ -537,7 +537,7 @@ void G_CacheMapData(void) if (bpp > 8 && totalclock - tc > TICRATE/4) { /*Bsprintf(tempbuf,"%d resources remaining\n",g_precacheCount-pc+1);*/ - tc = min(100,100*pc/g_precacheCount); + tc = min(100, tabledivide32_noinline(100 * pc, g_precacheCount)); Bsprintf(tempbuf,"Loaded %d%% (%d/%d textures)\n",tc,pc,g_precacheCount); G_DoLoadScreen(tempbuf, tc); tc = totalclock; diff --git a/polymer/eduke32/source/savegame.c b/polymer/eduke32/source/savegame.c index a109329f3..522b723d0 100644 --- a/polymer/eduke32/source/savegame.c +++ b/polymer/eduke32/source/savegame.c @@ -692,7 +692,7 @@ static void docmpsd(const void *ptr, void *dump, uint32_t size, uint32_t cnt, ui { \ const UINT(Datbits) *p=(UINT(Datbits) *)ptr; \ UINT(Datbits) *op=(UINT(Datbits) *)dump; \ - uint32_t i, nelts=(size*cnt)/BYTES(Datbits); \ + uint32_t i, nelts=tabledivide32_noinline(size*cnt, 
BYTES(Datbits)); \ if (nelts>65536) \ CPELTS(32,Datbits); \ else if (nelts>256) \ @@ -831,7 +831,7 @@ readidx_##Idxbits##_##Datbits: \ #define CPDATA(Datbits) do \ { \ - uint32_t nelts=(sp->size*cnt)/BYTES(Datbits); \ + uint32_t nelts=tabledivide32_noinline(sp->size*cnt, BYTES(Datbits)); \ if (nelts>65536) \ CPELTS(32,Datbits); \ else if (nelts>256) \ diff --git a/polymer/eduke32/source/sector.c b/polymer/eduke32/source/sector.c index fe4cbc879..e68f3ec97 100644 --- a/polymer/eduke32/source/sector.c +++ b/polymer/eduke32/source/sector.c @@ -571,8 +571,9 @@ void G_OperateSectors(int32_t sn, int32_t ii) dax += wall[i].x; day += wall[i].y; } - dax /= (endwall-startwall+1); - day /= (endwall-startwall+1); + + dax = tabledivide32_noinline(dax, (endwall-startwall+1)); + day = tabledivide32_noinline(day, (endwall-startwall+1)); //find any points with either same x or same y coordinate // as center (dax, day) - should be 2 points found.