From 7d4a5898ac68d000e8aeaba224acd78b6a2c91ee Mon Sep 17 00:00:00 2001
From: Christoph Oelckers <c.oelckers@users.noreply.github.com>
Date: Sat, 10 Dec 2016 13:58:18 +0100
Subject: [PATCH] - removed most inline assembly. Integer multiplication gets
 handled fine by all current compilers and fixed point division is too
 infrequently used to justify this mess.

That only leaves the Scale function which is still being used in a few places and which would create considerably worse code without assembly on 32 bit platforms. This is also far too primitive (2 or 3 assembly instructions) to claim any copyright on it, so I think m_fixed.h can now be considered free of Build-related issues. The deficated inline headers have been removed because that sole remaining function could be easily moved into m_fixed.h.
---
 src/basicinlines.h |  40 -----------------
 src/gccinlines.h   |  95 ----------------------------------------
 src/m_fixed.h      |  80 ++++++++++++++++++++++++++++++----
 src/mscinlines.h   | 105 ---------------------------------------------
 src/p_maputl.cpp   |   4 +-
 5 files changed, 73 insertions(+), 251 deletions(-)
 delete mode 100644 src/basicinlines.h
 delete mode 100644 src/gccinlines.h
 delete mode 100644 src/mscinlines.h

diff --git a/src/basicinlines.h b/src/basicinlines.h
deleted file mode 100644
index db03232b3..000000000
--- a/src/basicinlines.h
+++ /dev/null
@@ -1,40 +0,0 @@
-// "Build Engine & Tools" Copyright (c) 1993-1997 Ken Silverman
-// Ken Silverman's official web site: "http://www.advsys.net/ken"
-// See the included license file "BUILDLIC.TXT" for license info.
-//
-// This file is based on pragmas.h from Ken Silverman's original Build
-// source code release but is meant for use with any compiler and does not
-// rely on any inline assembly.
-//
-
-#if _MSC_VER
-#pragma once
-#endif
-
-#if defined(__GNUC__) && !defined(__forceinline)
-#define __forceinline __inline__ __attribute__((always_inline))
-#endif
-
-static __forceinline SDWORD Scale (SDWORD a, SDWORD b, SDWORD c)
-{
-	return (SDWORD)(((SQWORD)a*b)/c);
-}
-
-static __forceinline SDWORD MulScale14 (SDWORD a, SDWORD b) { return (SDWORD)(((SQWORD)a * b) >> 14); }
-static __forceinline SDWORD MulScale16 (SDWORD a, SDWORD b) { return (SDWORD)(((SQWORD)a * b) >> 16); }
-static __forceinline SDWORD MulScale30 (SDWORD a, SDWORD b) { return (SDWORD)(((SQWORD)a * b) >> 30); }
-static __forceinline SDWORD MulScale32 (SDWORD a, SDWORD b) { return (SDWORD)(((SQWORD)a * b) >> 32); }
-
-static __forceinline DWORD UMulScale16 (DWORD a, DWORD b) { return (DWORD)(((QWORD)a * b) >> 16); }
-
-static __forceinline SDWORD DMulScale3 (SDWORD a, SDWORD b, SDWORD c, SDWORD d) { return (SDWORD)(((SQWORD)a*b + (SQWORD)c*d) >> 3); }
-static __forceinline SDWORD DMulScale6 (SDWORD a, SDWORD b, SDWORD c, SDWORD d) { return (SDWORD)(((SQWORD)a*b + (SQWORD)c*d) >> 6); }
-static __forceinline SDWORD DMulScale10 (SDWORD a, SDWORD b, SDWORD c, SDWORD d) { return (SDWORD)(((SQWORD)a*b + (SQWORD)c*d) >> 10); }
-static __forceinline SDWORD DMulScale18 (SDWORD a, SDWORD b, SDWORD c, SDWORD d) { return (SDWORD)(((SQWORD)a*b + (SQWORD)c*d) >> 18); }
-static __forceinline SDWORD DMulScale32 (SDWORD a, SDWORD b, SDWORD c, SDWORD d) { return (SDWORD)(((SQWORD)a*b + (SQWORD)c*d) >> 32); }
-
-static inline SDWORD DivScale6 (SDWORD a, SDWORD b) { return (SDWORD)(((SQWORD)a << 6) / b); }
-static inline SDWORD DivScale16 (SDWORD a, SDWORD b) { return (SDWORD)(((SQWORD)a << 16) / b); }
-static inline SDWORD DivScale21 (SDWORD a, SDWORD b) { return (SDWORD)(((SQWORD)a << 21) / b); }
-static inline SDWORD DivScale30 (SDWORD a, SDWORD b) { return (SDWORD)(((SQWORD)a << 30) / b); }
-
diff --git a/src/gccinlines.h b/src/gccinlines.h
deleted file mode 100644
index 20506a5a8..000000000
--- a/src/gccinlines.h
+++ /dev/null
@@ -1,95 +0,0 @@
-// "Build Engine & Tools" Copyright (c) 1993-1997 Ken Silverman
-// Ken Silverman's official web site: "http://www.advsys.net/ken"
-// See the included license file "BUILDLIC.TXT" for license info.
-//
-// This file is based on pragmas.h from Ken Silverman's original Build
-// source code release but is meant for use with GCC instead of Watcom C.
-//
-// Some of the inline assembly has been turned into C code, because
-// modern compilers are smart enough to produce code at least as good as
-// Ken's inline assembly.
-//
-
-// I can come up with several different operand constraints for the
-// following that work just fine when used all by themselves, but
-// when I concatenate them together so that the compiler can choose
-// between them, I get the following (but only if enough parameters
-// are passed as the result of some operations instead of as
-// variables/constants--e.g. DMulScale16 (a*2, b*2, c*2, d*2) instead of
-// DMulScale16 (a, b, c, d)):
-//
-// `asm' operand requires impossible reload
-//
-// Why?
-
-#include <stdlib.h>
-
-#ifndef alloca
-// MinGW does not seem to come with alloca defined.
-#define alloca __builtin_alloca
-#endif
-
-static inline SDWORD Scale (SDWORD a, SDWORD b, SDWORD c)
-{
-	SDWORD result, dummy;
-
-	asm volatile
-		("imull %3\n\t"
-		 "idivl %4"
-		 : "=a,a,a,a,a,a" (result),
-		   "=&d,&d,&d,&d,d,d" (dummy)
-		 :   "a,a,a,a,a,a" (a),
-		     "m,r,m,r,d,d" (b),
-		     "r,r,m,m,r,m" (c)
-		 : "cc"
-			);
-
-	return result;
-}
-
-#define MAKECONSTMulScale(s) \
-static inline SDWORD MulScale##s (SDWORD a, SDWORD b) { return ((SQWORD)a * b) >> s; }
-
-MAKECONSTMulScale(14)
-MAKECONSTMulScale(16)
-MAKECONSTMulScale(30)
-MAKECONSTMulScale(32)
-#undef MAKECONSTMulScale
-
-static inline DWORD UMulScale16(DWORD a, DWORD b) { return ((QWORD)a * b) >> 16; }
-
-#define MAKECONSTDMulScale(s) \
-	static inline SDWORD DMulScale##s (SDWORD a, SDWORD b, SDWORD c, SDWORD d) \
-	{ \
-		return (((SQWORD)a * b) + ((SQWORD)c * d)) >> s; \
-	}
-
-MAKECONSTDMulScale(3)
-MAKECONSTDMulScale(6)
-MAKECONSTDMulScale(10)
-MAKECONSTDMulScale(18)
-MAKECONSTDMulScale(32)
-#undef MAKECONSTDMulScale
-
-
-#define MAKECONSTDivScale(s) \
-	static inline SDWORD DivScale##s (SDWORD a, SDWORD b) \
-	{ \
-		SDWORD result, dummy; \
-		asm volatile \
-			("idivl %4" \
-			:"=a,a" (result), \
-			 "=d,d" (dummy) \
-			: "a,a" (a<<s), \
-			  "d,d" (a>>(32-s)), \
-			  "r,m" (b) \
-			: "cc"); \
-		return result; \
-	}
-
-MAKECONSTDivScale(6)
-MAKECONSTDivScale(16)
-MAKECONSTDivScale(21)
-MAKECONSTDivScale(30)
-#undef MAKECONSTDivScale
-
diff --git a/src/m_fixed.h b/src/m_fixed.h
index 33d339608..dc7c3f4c9 100644
--- a/src/m_fixed.h
+++ b/src/m_fixed.h
@@ -4,14 +4,72 @@
 #include <stdlib.h>
 #include "doomtype.h"
 
+// Unfortunately, the Scale function still gets badly handled on 32 bit x86 platforms so it's the last remaining piece of inline assembly 
+
+// GCC inlines
 #if defined(__GNUC__) && defined(__i386__) && !defined(__clang__)
-#include "gccinlines.h"
-#elif defined(_MSC_VER) && defined(_M_IX86)
-#include "mscinlines.h"
-#else
-#include "basicinlines.h"
+#ifndef alloca
+// MinGW does not seem to come with alloca defined.
+#define alloca __builtin_alloca
 #endif
 
+static inline int32_t Scale(int32_t a, int32_t b, int32_t c)
+{
+	int32_t result, dummy;
+
+	asm volatile
+		("imull %3\n\t"
+			"idivl %4"
+			: "=a,a,a,a,a,a" (result),
+			"=&d,&d,&d,&d,d,d" (dummy)
+			: "a,a,a,a,a,a" (a),
+			"m,r,m,r,d,d" (b),
+			"r,r,m,m,r,m" (c)
+			: "cc"
+			);
+
+	return result;
+}
+
+// MSVC inlines
+#elif defined(_MSC_VER) && defined(_M_IX86)
+#pragma warning (disable: 4035)
+
+__forceinline int32_t Scale(int32_t a, int32_t b, int32_t c)
+{
+	__asm mov eax, a
+	__asm imul b
+	__asm idiv c
+}
+
+#pragma warning (default: 4035)
+#else
+
+static __forceinline int32_t Scale(int32_t a, int32_t b, int32_t c)
+{
+	return (int32_t)(((int64_t)a*b) / c);
+}
+
+#endif
+
+// Modern compilers are smart enough to do these multiplications intelligently.
+__forceinline int32_t MulScale14(int32_t a, int32_t b) { return (int32_t)(((int64_t)a * b) >> 14); } // only used by R_DrawVoxel
+__forceinline int32_t MulScale30(int32_t a, int32_t b) { return (int32_t)(((int64_t)a * b) >> 30); } // only used once in the node builder
+__forceinline int32_t MulScale32(int32_t a, int32_t b) { return (int32_t)(((int64_t)a * b) >> 32); } // only used by R_DrawVoxel
+
+__forceinline uint32_t UMulScale16(uint32_t a, uint32_t b) { return (uint32_t)(((uint64_t)a * b) >> 16); } // used for sky drawing
+
+__forceinline int32_t DMulScale3(int32_t a, int32_t b, int32_t c, int32_t d) { return (int32_t)(((int64_t)a*b + (int64_t)c*d) >> 3); } // used for setting up slopes for Build maps
+__forceinline int32_t DMulScale6(int32_t a, int32_t b, int32_t c, int32_t d) { return (int32_t)(((int64_t)a*b + (int64_t)c*d) >> 6); } // only used by R_DrawVoxel
+__forceinline int32_t DMulScale10(int32_t a, int32_t b, int32_t c, int32_t d) { return (int32_t)(((int64_t)a*b + (int64_t)c*d) >> 10); } // only used by R_DrawVoxel
+__forceinline int32_t DMulScale18(int32_t a, int32_t b, int32_t c, int32_t d) { return (int32_t)(((int64_t)a*b + (int64_t)c*d) >> 18); } // only used by R_DrawVoxel
+__forceinline int32_t DMulScale32(int32_t a, int32_t b, int32_t c, int32_t d) { return (int32_t)(((int64_t)a*b + (int64_t)c*d) >> 32); } // used by R_PointOnSide.
+
+// Sadly, for divisions this is not true but these are so infrequently used that the C versions are just fine, despite not being fully optimal.
+__forceinline int32_t DivScale6(int32_t a, int32_t b) { return (int32_t)(((int64_t)a << 6) / b); } // only used by R_DrawVoxel
+__forceinline int32_t DivScale21(int32_t a, int32_t b) { return (int32_t)(((int64_t)a << 21) / b); } // only used by R_DrawVoxel
+__forceinline int32_t DivScale30(int32_t a, int32_t b) { return (int32_t)(((int64_t)a << 30) / b); } // only used once in the node builder
+
 __forceinline void fillshort(void *buff, unsigned int count, WORD clear)
 {
 	SWORD *b2 = (SWORD *)buff;
@@ -23,14 +81,18 @@ __forceinline void fillshort(void *buff, unsigned int count, WORD clear)
 
 #include "xs_Float.h"
 
-inline SDWORD FixedDiv (SDWORD a, SDWORD b) 
+inline int32_t FixedDiv (int32_t a, int32_t b) 
 { 
-	if ((DWORD)abs(a) >> (31-16) >= (DWORD)abs (b)) 
+	if ((uint32_t)abs(a) >> (31-16) >= (uint32_t)abs (b)) 
 		return (a^b)<0 ? FIXED_MIN : FIXED_MAX; 
-	return DivScale16 (a, b); 
+
+	return (int32_t)(((int64_t)a << 16) / b);
 }
 
-#define FixedMul MulScale16
+__forceinline int32_t FixedMul(int32_t a, int32_t b) 
+{ 
+	return (int32_t)(((int64_t)a * b) >> 16); 
+}
 
 inline fixed_t FloatToFixed(double f)
 {
diff --git a/src/mscinlines.h b/src/mscinlines.h
deleted file mode 100644
index cc16c319d..000000000
--- a/src/mscinlines.h
+++ /dev/null
@@ -1,105 +0,0 @@
-// "Build Engine & Tools" Copyright (c) 1993-1997 Ken Silverman
-// Ken Silverman's official web site: "http://www.advsys.net/ken"
-// See the included license file "BUILDLIC.TXT" for license info.
-//
-// This file is based on pragmas.h from Ken Silverman's original Build
-// source code release but is meant for use with Visual C++ instead of
-// Watcom C.
-//
-// Some of the inline assembly has been turned into C code, because VC++
-// is smart enough to produce code at least as good as Ken's inlines.
-// The more used functions are still inline assembly, because they do
-// things that can't really be done in C. (I consider this a bad thing,
-// because VC++ has considerably poorer support for inline assembly than
-// Watcom, so it's better to rely on its C optimizer to produce fast code.)
-//
-
-
-#include <string.h>
-#include <stddef.h>
-
-#pragma warning (disable: 4035)
-
-__forceinline SDWORD Scale (SDWORD a, SDWORD b, SDWORD c)
-{
-	__asm mov eax,a
-	__asm imul b
-	__asm idiv c
-}
-
-#define MAKECONSTMulScale(s) \
-	__forceinline SDWORD MulScale##s (SDWORD a, SDWORD b) \
-	{ \
-		__asm mov eax,a \
-		__asm imul b \
-		__asm shrd eax,edx,s \
-	}
-MAKECONSTMulScale(14)
-MAKECONSTMulScale(16)
-MAKECONSTMulScale(30)
-#undef MAKECONSTMulScale
-
-__forceinline SDWORD MulScale32 (SDWORD a, SDWORD b)
-{
-	__asm mov eax,a
-	__asm imul b
-	__asm mov eax,edx
-}
-
-__forceinline DWORD UMulScale16(DWORD a, DWORD b)
-{
-	__asm mov eax,a
-	__asm mul b
-	__asm shrd eax,edx,16
-}
-
-#define MAKECONSTDMulScale(s) \
-	__forceinline SDWORD DMulScale##s (SDWORD a, SDWORD b, SDWORD c, SDWORD d) \
-	{ \
-		__asm mov eax,a \
-		__asm imul b \
-		__asm mov ebx,eax \
-		__asm mov eax,c \
-		__asm mov esi,edx \
-		__asm imul d \
-		__asm add eax,ebx \
-		__asm adc edx,esi \
-		__asm shrd eax,edx,s \
-	}
-
-MAKECONSTDMulScale(3)
-MAKECONSTDMulScale(6)
-MAKECONSTDMulScale(10)
-MAKECONSTDMulScale(18)
-#undef MAKCONSTDMulScale
-
-__forceinline SDWORD DMulScale32 (SDWORD a, SDWORD b, SDWORD c, SDWORD d)
-{
-	__asm mov eax,a
-	__asm imul b
-	__asm mov ebx,eax
-	__asm mov eax,c
-	__asm mov esi,edx
-	__asm imul d
-	__asm add eax,ebx
-	__asm adc edx,esi
-	__asm mov eax,edx
-}
-
-#define MAKECONSTDivScale(s) \
-	__forceinline SDWORD DivScale##s (SDWORD a, SDWORD b) \
-	{ \
-		__asm mov edx,a \
-		__asm sar edx,32-s \
-		__asm mov eax,a \
-		__asm shl eax,s \
-		__asm idiv b \
-	}
-
-MAKECONSTDivScale(6)
-MAKECONSTDivScale(16)
-MAKECONSTDivScale(21)
-MAKECONSTDivScale(30)
-#undef MAKECONSTDivScale
-
-#pragma warning (default: 4035)
diff --git a/src/p_maputl.cpp b/src/p_maputl.cpp
index fdf66e969..d676e966c 100644
--- a/src/p_maputl.cpp
+++ b/src/p_maputl.cpp
@@ -2001,8 +2001,8 @@ int P_VanillaPointOnLineSide(double x, double y, const line_t* line)
 	auto dx = FloatToFixed(x - line->v1->fX());
 	auto dy = FloatToFixed(y - line->v1->fY());
 
-	auto left = MulScale16( int(delta.Y * 256) , dx );
-	auto right = MulScale16( dy , int(delta.X * 256) );
+	auto left = FixedMul( int(delta.Y * 256) , dx );
+	auto right = FixedMul( dy , int(delta.X * 256) );
 
 	if (right < left)
 		return 0;		// front side