From 6111d84c1b6498be50c4717e24097598ac2567d8 Mon Sep 17 00:00:00 2001 From: Randy Heit Date: Tue, 29 Jun 2010 03:32:10 +0000 Subject: [PATCH] - Added a vectorized SSE2 version of ClassifyLine. Since compiler support for SSE intrinsics pretty much sucks, this is slower than the unvectored version I get when I let the compiler compile the regular function with SSE2 instructions. It will have to be converted to assembly, and then it ought to be a bit faster. Since more than half of ZDBSP's time is spent in this one function, it will hopefully be a measurable speedup. SVN r2392 (trunk) --- nodebuild.h | 10 +- nodebuild_classify_sse2_vect.cpp | 213 +++++++++++++++++++++++++++++++ workdata.h | 12 +- zdbsp_vs2005.vcproj | 44 ++++--- 4 files changed, 256 insertions(+), 23 deletions(-) create mode 100644 nodebuild_classify_sse2_vect.cpp diff --git a/nodebuild.h b/nodebuild.h index 541cf85..f78206f 100644 --- a/nodebuild.h +++ b/nodebuild.h @@ -1,4 +1,5 @@ #include +#include #include "doomdata.h" #include "workdata.h" #include "tarray.h" @@ -69,7 +70,11 @@ class FNodeBuilder }; struct FPrivVert { - fixed_t x, y; + union + { + struct { fixed_t x, y; }; + __m64 p64; + }; DWORD segs; // segs that use this vertex as v1 DWORD segs2; // segs that use this vertex as v2 int index; @@ -297,7 +302,8 @@ inline int FNodeBuilder::ClassifyLine (node_t &node, const FPrivSeg *seg, int &s return ClassifyLineBackpatch (node, seg, sidev1, sidev2); #else if (SSELevel == 2) - return ClassifyLineSSE2 (node, seg, sidev1, sidev2); + { int foo = ClassifyLineSSE2 (node, seg, sidev1, sidev2); assert(foo == ClassifyLine2(node,seg,sidev1,sidev2)); + return foo; } else if (SSELevel == 1) return ClassifyLineSSE1 (node, seg, sidev1, sidev2); else diff --git a/nodebuild_classify_sse2_vect.cpp b/nodebuild_classify_sse2_vect.cpp new file mode 100644 index 0000000..ef3475c --- /dev/null +++ b/nodebuild_classify_sse2_vect.cpp @@ -0,0 +1,213 @@ +/* + Determine what side of a splitter a seg lies on. 
(SSE2 version)
+    Copyright (C) 2002-2006 Randy Heit
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+*/
+
+#ifndef DISABLE_SSE
+
+#include "zdbsp.h"
+#include "nodebuild.h"
+#include <emmintrin.h>
+
+#define FAR_ENOUGH 17179869184.f		// 4<<32
+
+// Hand-vectorized SSE2 classifier: both seg vertices are tested against the
+// splitter at once in packed doubles. Stores each vertex's side in sidev1 and
+// sidev2; returns 0 (front), 1 (back), or -1 (vertices on opposite sides).
+
+int FNodeBuilder::ClassifyLineSSE2 (node_t &node, const FPrivSeg *seg, int &sidev1, int &sidev2)
+{
+	const FPrivVert *v1 = &Vertices[seg->v1];
+	const FPrivVert *v2 = &Vertices[seg->v2];
+
+	__m128d xy, dxy, xyv1, xyv2;
+
+	// Why does this intrinsic go through an MMX register, when it can just go through memory?
+	// That would let it work with x64, too.
+	xy = _mm_cvtpi32_pd(node.p64);		// d_y1		d_x1
+	dxy = _mm_cvtpi32_pd(node.d64);		// d_dy		d_dx
+	xyv1 = _mm_cvtpi32_pd(v1->p64);		// d_yv1	d_xv1
+	xyv2 = _mm_cvtpi32_pd(v2->p64);		// d_yv2	d_xv2
+	_mm_empty();
+
+	__m128d num1, num2, dyx;
+
+	dyx = _mm_shuffle_pd(dxy, dxy, _MM_SHUFFLE2(0,1));	// d_dx		d_dy
+	num1 = _mm_mul_pd(_mm_sub_pd(xy, xyv1), dyx);		// (y1-yv1)*dx	(x1-xv1)*dy
+	num2 = _mm_mul_pd(_mm_sub_pd(xy, xyv2), dyx);		// (y1-yv2)*dx	(x1-xv2)*dy
+
+	__m128d pnuma, pnumb, num;
+
+	pnuma = _mm_shuffle_pd(num1, num2, _MM_SHUFFLE2(1,1));	// (y1-yv2)*dx	(y1-yv1)*dx
+	pnumb = _mm_shuffle_pd(num1, num2, _MM_SHUFFLE2(0,0));	// (x1-xv2)*dy	(x1-xv1)*dy
+	num = _mm_sub_pd(pnuma, pnumb);
+	// s_num1 is at num[0]; s_num2 is at num[1]
+
+	__m128d neg_enough, pos_enough;
+	__m128d neg_check, pos_check;
+
+	neg_enough = _mm_set1_pd(-FAR_ENOUGH);
+	pos_enough = _mm_set1_pd( FAR_ENOUGH);
+
+	// Why do the comparison instructions return __m128d and not __m128i?
+	neg_check = _mm_cmple_pd(num, neg_enough);
+	pos_check = _mm_cmpge_pd(num, pos_enough);
+
+	union	// compare masks are stored as doubles and read back as integers
+	{
+		struct
+		{
+			double n[2], p[2];
+		};
+		struct
+		{
+			__int64 ni[2], pi[2];	// nonzero where the comparison was true
+		};
+	};
+
+	_mm_storeu_pd(n, neg_check);
+	_mm_storeu_pd(p, pos_check);
+
+	int nears = 0;		// 2: vertex 1 needs the epsilon test; 1: vertex 2 does
+
+	if (ni[0])
+	{
+		if (ni[1])
+		{
+			sidev1 = sidev2 = 1;
+			return 1;
+		}
+		if (pi[1])
+		{
+			sidev1 = 1;
+			sidev2 = -1;
+			return -1;
+		}
+		nears = 1;
+	}
+	else if (pi[0])
+	{
+		if (pi[1])
+		{
+			sidev1 = sidev2 = -1;
+			return 0;
+		}
+		if (ni[1])
+		{
+			sidev1 = -1;
+			sidev2 = 1;
+			return -1;
+		}
+		nears = 1;
+	}
+	else
+	{
+		nears = 2 | ((ni[1] | pi[1]) ? 0 : 1);
+	}
+
+	__m128d zero = _mm_setzero_pd();
+	__m128d posi = _mm_cmpgt_pd(num, zero);
+	_mm_storeu_pd(p, posi);		// p is reused: pi[i] is now nonzero where num[i] > 0
+
+	if (nears)
+	{
+		__m128d sqnum = _mm_mul_pd(num, num);
+		__m128d sqdyx = _mm_mul_pd(dyx, dyx);
+		__m128d sqdxy = _mm_mul_pd(dxy, dxy);
+		__m128d l = _mm_add_pd(sqdyx, sqdxy);		// dx*dx + dy*dy in both lanes
+		__m128d dist = _mm_div_pd(sqnum, l);		// squared distance from the splitter
+		__m128d epsilon = _mm_set1_pd(SIDE_EPSILON*SIDE_EPSILON);	// squared, because dist is squared
+		__m128d close = _mm_cmplt_pd(dist, epsilon);
+		_mm_storeu_pd(n, close);	// n is reused: ni[i] is now nonzero where vertex i is within epsilon
+		if (nears & 2)
+		{
+			if (ni[0])
+			{
+				sidev1 = 0;
+			}
+			else
+			{
+				sidev1 = pi[0] ? -1 : 1;
+			}
+		}
+		else
+		{
+			sidev1 = pi[0] ? -1 : 1;
+		}
+		if (nears & 1)
+		{
+			if (ni[1])
+			{
+				sidev2 = 0;
+			}
+			else
+			{
+				sidev2 = pi[1] ? -1 : 1;
+			}
+		}
+		else
+		{
+			sidev2 = pi[1] ? -1 : 1;
+		}
+	}
+	else
+	{
+		sidev1 = pi[0] ? -1 : 1;
+		sidev2 = pi[1] ? -1 : 1;
+	}
+
+	if ((sidev1 | sidev2) == 0)
+	{ // seg is coplanar with the splitter, so use its orientation to determine
+	  // which child it ends up in. If it faces the same direction as the splitter,
+	  // it goes in front. Otherwise, it goes in back.
+
+		if (node.dx != 0)
+		{
+			if ((node.dx > 0 && v2->x > v1->x) || (node.dx < 0 && v2->x < v1->x))
+			{
+				return 0;
+			}
+			else
+			{
+				return 1;
+			}
+		}
+		else
+		{
+			if ((node.dy > 0 && v2->y > v1->y) || (node.dy < 0 && v2->y < v1->y))
+			{
+				return 0;
+			}
+			else
+			{
+				return 1;
+			}
+		}
+	}
+	else if (sidev1 <= 0 && sidev2 <= 0)
+	{
+		return 0;
+	}
+	else if (sidev1 >= 0 && sidev2 >= 0)
+	{
+		return 1;
+	}
+	return -1;
+}
+
+#endif
diff --git a/workdata.h b/workdata.h
index e1647cc..4e09531 100644
--- a/workdata.h
+++ b/workdata.h
@@ -6,6 +6,7 @@
 #endif
 
 #include "zdbsp.h"
+#include
 
 struct vertex_t
 {
@@ -14,7 +15,16 @@ struct vertex_t
 
 struct node_t
 {
-	fixed_t x, y, dx, dy;
+	union
+	{
+		struct { fixed_t x, y; };
+		__m64 p64;
+	};
+	union
+	{
+		struct { fixed_t dx, dy; };
+		__m64 d64;
+	};
 	fixed_t bbox[2][4];
 	unsigned int intchildren[2];
 };
diff --git a/zdbsp_vs2005.vcproj b/zdbsp_vs2005.vcproj
index 4278109..a4dc36e 100644
--- a/zdbsp_vs2005.vcproj
+++ b/zdbsp_vs2005.vcproj
@@ -617,50 +617,54 @@
 > - - - - - - + + + + + + + +