From 6111d84c1b6498be50c4717e24097598ac2567d8 Mon Sep 17 00:00:00 2001
From: Randy Heit <rheit@zdoom.fake>
Date: Tue, 29 Jun 2010 03:32:10 +0000
Subject: [PATCH] - Added a vectorized SSE2 version of ClassifyLine. Since
 compiler support for SSE intrinsics pretty much sucks, this is slower than
 the unvectorized version I get when I let the compiler compile the regular
 function with SSE2 instructions. It will have to be converted to assembly,
 and then it ought to be a bit faster. Since more than half of ZDBSP's time is
 spent in this one function, it will hopefully be a measurable speedup.

SVN r2392 (trunk)
---
 nodebuild.h                      |  10 +-
 nodebuild_classify_sse2_vect.cpp | 213 +++++++++++++++++++++++++++++++
 workdata.h                       |  12 +-
 zdbsp_vs2005.vcproj              |  44 ++++---
 4 files changed, 256 insertions(+), 23 deletions(-)
 create mode 100644 nodebuild_classify_sse2_vect.cpp

diff --git a/nodebuild.h b/nodebuild.h
index 541cf85..f78206f 100644
--- a/nodebuild.h
+++ b/nodebuild.h
@@ -1,4 +1,5 @@
 #include <math.h>
+#include <xmmintrin.h>
 #include "doomdata.h"
 #include "workdata.h"
 #include "tarray.h"
@@ -69,7 +70,11 @@ class FNodeBuilder
 	};
 	struct FPrivVert
 	{
-		fixed_t x, y;
+		union
+		{
+			struct { fixed_t x, y; };
+			__m64 p64;
+		};
 		DWORD segs;		// segs that use this vertex as v1
 		DWORD segs2;	// segs that use this vertex as v2
 		int index;
@@ -297,7 +302,8 @@ inline int FNodeBuilder::ClassifyLine (node_t &node, const FPrivSeg *seg, int &s
 	return ClassifyLineBackpatch (node, seg, sidev1, sidev2);
 #else
 	if (SSELevel == 2)
-		return ClassifyLineSSE2 (node, seg, sidev1, sidev2);
+	{ int foo = ClassifyLineSSE2 (node, seg, sidev1, sidev2); assert(foo == ClassifyLine2(node,seg,sidev1,sidev2));
+	return foo; }
 	else if (SSELevel == 1)
 		return ClassifyLineSSE1 (node, seg, sidev1, sidev2);
 	else
diff --git a/nodebuild_classify_sse2_vect.cpp b/nodebuild_classify_sse2_vect.cpp
new file mode 100644
index 0000000..ef3475c
--- /dev/null
+++ b/nodebuild_classify_sse2_vect.cpp
@@ -0,0 +1,213 @@
+/*
+    Determine what side of a splitter a seg lies on. (SSE2 version)
+    Copyright (C) 2002-2006 Randy Heit
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+*/
+
+#ifndef DISABLE_SSE
+
+#include "zdbsp.h"
+#include "nodebuild.h"
+#include <emmintrin.h>
+
+#define FAR_ENOUGH 17179869184.f		// 4<<32
+
+// Hand-vectorized SSE2 counterpart of ClassifyLine2 (the caller asserts both return the same value).
+// Stores the side of each seg endpoint relative to the splitter in sidev1/sidev2 (-1, 0, or 1) and
+// returns 0 (front child), 1 (back child), or -1 when the endpoints fall on opposite sides.
+
+int FNodeBuilder::ClassifyLineSSE2 (node_t &node, const FPrivSeg *seg, int &sidev1, int &sidev2)
+{
+	const FPrivVert *v1 = &Vertices[seg->v1];
+	const FPrivVert *v2 = &Vertices[seg->v2];
+
+	__m128d xy, dxy, xyv1, xyv2;
+
+	// _mm_cvtpi32_pd takes its source as an __m64 (MMX) value instead of a memory operand, so the
+	// integer pairs pass through MMX state here; the original note says this is what rules out x64.
+	xy = _mm_cvtpi32_pd(node.p64);		// d_y1  d_x1
+	dxy = _mm_cvtpi32_pd(node.d64);		// d_dy  d_dx
+	xyv1 = _mm_cvtpi32_pd(v1->p64);		// d_yv1 d_xv1
+	xyv2 = _mm_cvtpi32_pd(v2->p64);		// d_yv2 d_xv2
+	_mm_empty();	// done with MMX state (EMMS) now that all four conversions are finished
+
+	__m128d num1, num2, dyx;
+
+	dyx = _mm_shuffle_pd(dxy, dxy, _MM_SHUFFLE2(0,1));	// halves swapped: [0] = d_dy, [1] = d_dx
+	num1 = _mm_mul_pd(_mm_sub_pd(xy, xyv1), dyx);		// [0] = (x1-xv1)*dy, [1] = (y1-yv1)*dx
+	num2 = _mm_mul_pd(_mm_sub_pd(xy, xyv2), dyx);		// [0] = (x1-xv2)*dy, [1] = (y1-yv2)*dx
+
+	__m128d pnuma, pnumb, num;
+
+	pnuma = _mm_shuffle_pd(num1, num2, _MM_SHUFFLE2(1,1));	// [0] = num1[1], [1] = num2[1] (the *dx products)
+	pnumb = _mm_shuffle_pd(num1, num2, _MM_SHUFFLE2(0,0));	// [0] = num1[0], [1] = num2[0] (the *dy products)
+	num = _mm_sub_pd(pnuma, pnumb);
+	// s_num1 (for v1) is at num[0]; s_num2 (for v2) is at num[1]
+
+	__m128d neg_enough, pos_enough;
+	__m128d neg_check, pos_check;
+
+	neg_enough = _mm_set1_pd(-FAR_ENOUGH);
+	pos_enough = _mm_set1_pd( FAR_ENOUGH);
+
+	// The compares produce per-lane all-ones/all-zero 64-bit masks, but typed as __m128d.
+	neg_check = _mm_cmple_pd(num, neg_enough);
+	pos_check = _mm_cmpge_pd(num, pos_enough);
+
+	union	// overlays the stored masks with integers so they can be tested as zero/nonzero
+	{
+		struct
+		{
+			double n[2], p[2];
+		};
+		struct
+		{
+			__int64 ni[2], pi[2];
+		};
+	};
+
+	_mm_storeu_pd(n, neg_check);
+	_mm_storeu_pd(p, pos_check);
+
+	int nears = 0;	// bit 1 (2): v1 needs the epsilon test; bit 0 (1): v2 needs it
+
+	if (ni[0])	// s_num1 <= -FAR_ENOUGH
+	{
+		if (ni[1])
+		{
+			sidev1 = sidev2 = 1;
+			return 1;
+		}
+		if (pi[1])
+		{
+			sidev1 = 1;
+			sidev2 = -1;
+			return -1;
+		}
+		nears = 1;
+	}
+	else if (pi[0])	// s_num1 >= FAR_ENOUGH
+	{
+		if (pi[1])
+		{
+			sidev1 = sidev2 = -1;
+			return 0;
+		}
+		if (ni[1])
+		{
+			sidev1 = -1;
+			sidev2 = 1;
+			return -1;
+		}
+		nears = 1;
+	}
+	else
+	{
+		nears = 2 | ((ni[1] | pi[1]) ? 0 : 1);	// v1 is near; v2 is near only if it passed neither far test
+	}
+
+	__m128d zero = _mm_setzero_pd();
+	__m128d posi = _mm_cmpgt_pd(num, zero);
+	_mm_storeu_pd(p, posi);	// from here on p/pi hold the sign test (num > 0), not pos_check
+
+	if (nears)
+	{
+		__m128d sqnum = _mm_mul_pd(num, num);
+		__m128d sqdyx = _mm_mul_pd(dyx, dyx);
+		__m128d sqdxy = _mm_mul_pd(dxy, dxy);
+		__m128d l = _mm_add_pd(sqdyx, sqdxy);	// dx*dx + dy*dy in both lanes
+		__m128d dist = _mm_div_pd(sqnum, l);	// num^2 / (dx^2 + dy^2), i.e. a squared quantity
+		__m128d epsilon = _mm_set1_pd(SIDE_EPSILON);	// NOTE(review): dist is squared; confirm SIDE_EPSILON rather than its square is intended
+		__m128d close = _mm_cmplt_pd(dist, epsilon);
+		_mm_storeu_pd(n, close);	// from here on n/ni hold the "within epsilon" masks, not neg_check
+		if (nears & 2)
+		{
+			if (ni[0])
+			{
+				sidev1 = 0;
+			}
+			else
+			{
+				sidev1 = pi[0] ? -1 : 1;
+			}
+		}
+		else
+		{
+			sidev1 = pi[0] ? -1 : 1;
+		}
+		if (nears & 1)
+		{
+			if (ni[1])
+			{
+				sidev2 = 0;
+			}
+			else
+			{
+				sidev2 = pi[1] ? -1 : 1;
+			}
+		}
+		else
+		{
+			sidev2 = pi[1] ? -1 : 1;
+		}
+	}
+	else
+	{
+		sidev1 = pi[0] ? -1 : 1;
+		sidev2 = pi[1] ? -1 : 1;
+	}
+
+	if ((sidev1 | sidev2) == 0)
+	{ // seg is coplanar with the splitter, so use its orientation to determine
+	  // which child it ends up in. If it faces the same direction as the splitter,
+	  // it goes in front. Otherwise, it goes in back.
+
+		if (node.dx != 0)
+		{
+			if ((node.dx > 0 && v2->x > v1->x) || (node.dx < 0 && v2->x < v1->x))
+			{
+				return 0;
+			}
+			else
+			{
+				return 1;
+			}
+		}
+		else
+		{
+			if ((node.dy > 0 && v2->y > v1->y) || (node.dy < 0 && v2->y < v1->y))
+			{
+				return 0;
+			}
+			else
+			{
+				return 1;
+			}
+		}
+	}
+	else if (sidev1 <= 0 && sidev2 <= 0)
+	{
+		return 0;
+	}
+	else if (sidev1 >= 0 && sidev2 >= 0)
+	{
+		return 1;
+	}
+	return -1;	// one endpoint on each side of the splitter
+}
+
+#endif
diff --git a/workdata.h b/workdata.h
index e1647cc..4e09531 100644
--- a/workdata.h
+++ b/workdata.h
@@ -6,6 +6,7 @@
 #endif
 
 #include "zdbsp.h"
+#include <xmmintrin.h>
 
 struct vertex_t
 {
@@ -14,7 +15,16 @@ struct vertex_t
 
 struct node_t
 {
-	fixed_t x, y, dx, dy;
+	union	// x/y, also readable as a single __m64 by the SSE2 classifier
+	{
+		struct { fixed_t x, y; };
+		__m64 p64;	// NOTE(review): assumes fixed_t is 32 bits so x,y exactly overlay p64 -- confirm
+	};
+	union	// dx/dy, also readable as a single __m64 by the SSE2 classifier
+	{
+		struct { fixed_t dx, dy; };
+		__m64 d64;	// NOTE(review): same 32-bit fixed_t assumption as p64 -- confirm
+	};
 	fixed_t bbox[2][4];
 	unsigned int intchildren[2];
 };
diff --git a/zdbsp_vs2005.vcproj b/zdbsp_vs2005.vcproj
index 4278109..a4dc36e 100644
--- a/zdbsp_vs2005.vcproj
+++ b/zdbsp_vs2005.vcproj
@@ -617,50 +617,54 @@
 				>
 				<FileConfiguration
 					Name="Debug|Win32"
+					ExcludedFromBuild="true"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
-						EnableEnhancedInstructionSet="2"
-					/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Debug|x64"
-					>
-					<Tool
-						Name="VCCLCompilerTool"
-						EnableEnhancedInstructionSet="2"
 					/>
 				</FileConfiguration>
 				<FileConfiguration
 					Name="Release|Win32"
+					ExcludedFromBuild="true"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
-						EnableEnhancedInstructionSet="2"
-					/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Release|x64"
-					>
-					<Tool
-						Name="VCCLCompilerTool"
-						EnableEnhancedInstructionSet="2"
 					/>
 				</FileConfiguration>
 				<FileConfiguration
 					Name="Release, SSE2|Win32"
+					ExcludedFromBuild="true"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath=".\nodebuild_classify_sse2_vect.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|x64"
+					ExcludedFromBuild="true"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|x64"
+					ExcludedFromBuild="true"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
-						EnableEnhancedInstructionSet="2"
 					/>
 				</FileConfiguration>
 				<FileConfiguration
 					Name="Release, SSE2|x64"
+					ExcludedFromBuild="true"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
-						EnableEnhancedInstructionSet="2"
 					/>
 				</FileConfiguration>
 			</File>