mirror of
https://github.com/ZDoom/zdbsp.git
synced 2024-11-28 14:41:59 +00:00
- Added a vectorized SSE2 version of ClassifyLine. Since compiler support for SSE intrinsics
pretty much sucks, this is slower than the unvectorized version I get when I let the compiler compile the regular function with SSE2 instructions. It will have to be converted to assembly, and then it ought to be a bit faster. Since more than half of ZDBSP's time is spent in this one function, that will hopefully be a measurable speedup. SVN r2392 (trunk)
This commit is contained in:
parent
594e1fd562
commit
6111d84c1b
4 changed files with 256 additions and 23 deletions
10
nodebuild.h
10
nodebuild.h
|
@ -1,4 +1,5 @@
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
|
#include <xmmintrin.h>
|
||||||
#include "doomdata.h"
|
#include "doomdata.h"
|
||||||
#include "workdata.h"
|
#include "workdata.h"
|
||||||
#include "tarray.h"
|
#include "tarray.h"
|
||||||
|
@ -69,7 +70,11 @@ class FNodeBuilder
|
||||||
};
|
};
|
||||||
struct FPrivVert
|
struct FPrivVert
|
||||||
{
|
{
|
||||||
fixed_t x, y;
|
union
|
||||||
|
{
|
||||||
|
struct { fixed_t x, y; };
|
||||||
|
__m64 p64;
|
||||||
|
};
|
||||||
DWORD segs; // segs that use this vertex as v1
|
DWORD segs; // segs that use this vertex as v1
|
||||||
DWORD segs2; // segs that use this vertex as v2
|
DWORD segs2; // segs that use this vertex as v2
|
||||||
int index;
|
int index;
|
||||||
|
@ -297,7 +302,8 @@ inline int FNodeBuilder::ClassifyLine (node_t &node, const FPrivSeg *seg, int &s
|
||||||
return ClassifyLineBackpatch (node, seg, sidev1, sidev2);
|
return ClassifyLineBackpatch (node, seg, sidev1, sidev2);
|
||||||
#else
|
#else
|
||||||
if (SSELevel == 2)
|
if (SSELevel == 2)
|
||||||
return ClassifyLineSSE2 (node, seg, sidev1, sidev2);
|
{ int foo = ClassifyLineSSE2 (node, seg, sidev1, sidev2); assert(foo == ClassifyLine2(node,seg,sidev1,sidev2));
|
||||||
|
return foo; }
|
||||||
else if (SSELevel == 1)
|
else if (SSELevel == 1)
|
||||||
return ClassifyLineSSE1 (node, seg, sidev1, sidev2);
|
return ClassifyLineSSE1 (node, seg, sidev1, sidev2);
|
||||||
else
|
else
|
||||||
|
|
213
nodebuild_classify_sse2_vect.cpp
Normal file
213
nodebuild_classify_sse2_vect.cpp
Normal file
|
@ -0,0 +1,213 @@
|
||||||
|
/*
|
||||||
|
Determine what side of a splitter a seg lies on. (SSE2 version)
|
||||||
|
Copyright (C) 2002-2006 Randy Heit
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License
|
||||||
|
along with this program; if not, write to the Free Software
|
||||||
|
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef DISABLE_SSE
|
||||||
|
|
||||||
|
#include "zdbsp.h"
|
||||||
|
#include "nodebuild.h"
|
||||||
|
#include <emmintrin.h>
|
||||||
|
|
||||||
|
#define FAR_ENOUGH 17179869184.f // 4<<32
|
||||||
|
|
||||||
|
// You may notice that this function is identical to ClassifyLine2.
|
||||||
|
// The reason it is SSE2 is because this file is explicitly compiled
|
||||||
|
// with SSE2 math enabled, but the other files are not.
|
||||||
|
|
||||||
|
int FNodeBuilder::ClassifyLineSSE2 (node_t &node, const FPrivSeg *seg, int &sidev1, int &sidev2)
|
||||||
|
{
|
||||||
|
const FPrivVert *v1 = &Vertices[seg->v1];
|
||||||
|
const FPrivVert *v2 = &Vertices[seg->v2];
|
||||||
|
|
||||||
|
__m128d xy, dxy, xyv1, xyv2;
|
||||||
|
|
||||||
|
// Why does this intrinsic go through an MMX register, when it can just go through memory?
|
||||||
|
// That would let it work with x64, too.
|
||||||
|
xy = _mm_cvtpi32_pd(node.p64); // d_y1 d_x1
|
||||||
|
dxy = _mm_cvtpi32_pd(node.d64); // d_dy d_dx
|
||||||
|
xyv1 = _mm_cvtpi32_pd(v1->p64); // d_yv1 d_xv1
|
||||||
|
xyv2 = _mm_cvtpi32_pd(v2->p64); // d_yv2 d_xv2
|
||||||
|
_mm_empty();
|
||||||
|
|
||||||
|
__m128d num1, num2, dyx;
|
||||||
|
|
||||||
|
dyx = _mm_shuffle_pd(dxy, dxy, _MM_SHUFFLE2(0,1));
|
||||||
|
num1 = _mm_mul_pd(_mm_sub_pd(xy, xyv1), dyx);
|
||||||
|
num2 = _mm_mul_pd(_mm_sub_pd(xy, xyv2), dyx);
|
||||||
|
|
||||||
|
__m128d pnuma, pnumb, num;
|
||||||
|
|
||||||
|
pnuma = _mm_shuffle_pd(num1, num2, _MM_SHUFFLE2(1,1));
|
||||||
|
pnumb = _mm_shuffle_pd(num1, num2, _MM_SHUFFLE2(0,0));
|
||||||
|
num = _mm_sub_pd(pnuma, pnumb);
|
||||||
|
// s_num1 is at num[0]; s_num2 is at num[1]
|
||||||
|
|
||||||
|
__m128d neg_enough, pos_enough;
|
||||||
|
__m128d neg_check, pos_check;
|
||||||
|
|
||||||
|
neg_enough = _mm_set1_pd(-FAR_ENOUGH);
|
||||||
|
pos_enough = _mm_set1_pd( FAR_ENOUGH);
|
||||||
|
|
||||||
|
// Why do the comparison instructions return __m128d and not __m128i?
|
||||||
|
neg_check = _mm_cmple_pd(num, neg_enough);
|
||||||
|
pos_check = _mm_cmpge_pd(num, pos_enough);
|
||||||
|
|
||||||
|
union
|
||||||
|
{
|
||||||
|
struct
|
||||||
|
{
|
||||||
|
double n[2], p[2];
|
||||||
|
};
|
||||||
|
struct
|
||||||
|
{
|
||||||
|
__int64 ni[2], pi[2];
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
_mm_storeu_pd(n, neg_check);
|
||||||
|
_mm_storeu_pd(p, pos_check);
|
||||||
|
|
||||||
|
int nears = 0;
|
||||||
|
|
||||||
|
if (ni[0])
|
||||||
|
{
|
||||||
|
if (ni[1])
|
||||||
|
{
|
||||||
|
sidev1 = sidev2 = 1;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
if (pi[1])
|
||||||
|
{
|
||||||
|
sidev1 = 1;
|
||||||
|
sidev2 = -1;
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
nears = 1;
|
||||||
|
}
|
||||||
|
else if (pi[0])
|
||||||
|
{
|
||||||
|
if (pi[1])
|
||||||
|
{
|
||||||
|
sidev1 = sidev2 = -1;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
if (ni[1])
|
||||||
|
{
|
||||||
|
sidev1 = -1;
|
||||||
|
sidev2 = 1;
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
nears = 1;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
nears = 2 | ((ni[1] | pi[1]) ? 0 : 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
__m128d zero = _mm_setzero_pd();
|
||||||
|
__m128d posi = _mm_cmpgt_pd(num, zero);
|
||||||
|
_mm_storeu_pd(p, posi);
|
||||||
|
|
||||||
|
if (nears)
|
||||||
|
{
|
||||||
|
__m128d sqnum = _mm_mul_pd(num, num);
|
||||||
|
__m128d sqdyx = _mm_mul_pd(dyx, dyx);
|
||||||
|
__m128d sqdxy = _mm_mul_pd(dxy, dxy);
|
||||||
|
__m128d l = _mm_add_pd(sqdyx, sqdxy);
|
||||||
|
__m128d dist = _mm_div_pd(sqnum, l);
|
||||||
|
__m128d epsilon = _mm_set1_pd(SIDE_EPSILON);
|
||||||
|
__m128d close = _mm_cmplt_pd(dist, epsilon);
|
||||||
|
_mm_storeu_pd(n, close);
|
||||||
|
if (nears & 2)
|
||||||
|
{
|
||||||
|
if (ni[0])
|
||||||
|
{
|
||||||
|
sidev1 = 0;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
sidev1 = pi[0] ? -1 : 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
sidev1 = pi[0] ? -1 : 1;
|
||||||
|
}
|
||||||
|
if (nears & 1)
|
||||||
|
{
|
||||||
|
if (ni[1])
|
||||||
|
{
|
||||||
|
sidev2 = 0;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
sidev2 = pi[1] ? -1 : 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
sidev2 = pi[1] ? -1 : 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
sidev1 = pi[0] ? -1 : 1;
|
||||||
|
sidev2 = pi[1] ? -1 : 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ((sidev1 | sidev2) == 0)
|
||||||
|
{ // seg is coplanar with the splitter, so use its orientation to determine
|
||||||
|
// which child it ends up in. If it faces the same direction as the splitter,
|
||||||
|
// it goes in front. Otherwise, it goes in back.
|
||||||
|
|
||||||
|
if (node.dx != 0)
|
||||||
|
{
|
||||||
|
if ((node.dx > 0 && v2->x > v1->x) || (node.dx < 0 && v2->x < v1->x))
|
||||||
|
{
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if ((node.dy > 0 && v2->y > v1->y) || (node.dy < 0 && v2->y < v1->y))
|
||||||
|
{
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (sidev1 <= 0 && sidev2 <= 0)
|
||||||
|
{
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
else if (sidev1 >= 0 && sidev2 >= 0)
|
||||||
|
{
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
12
workdata.h
12
workdata.h
|
@ -6,6 +6,7 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include "zdbsp.h"
|
#include "zdbsp.h"
|
||||||
|
#include <xmmintrin.h>
|
||||||
|
|
||||||
struct vertex_t
|
struct vertex_t
|
||||||
{
|
{
|
||||||
|
@ -14,7 +15,16 @@ struct vertex_t
|
||||||
|
|
||||||
struct node_t
|
struct node_t
|
||||||
{
|
{
|
||||||
fixed_t x, y, dx, dy;
|
union
|
||||||
|
{
|
||||||
|
struct { fixed_t x, y; };
|
||||||
|
__m64 p64;
|
||||||
|
};
|
||||||
|
union
|
||||||
|
{
|
||||||
|
struct { fixed_t dx, dy; };
|
||||||
|
__m64 d64;
|
||||||
|
};
|
||||||
fixed_t bbox[2][4];
|
fixed_t bbox[2][4];
|
||||||
unsigned int intchildren[2];
|
unsigned int intchildren[2];
|
||||||
};
|
};
|
||||||
|
|
|
@ -617,50 +617,54 @@
|
||||||
>
|
>
|
||||||
<FileConfiguration
|
<FileConfiguration
|
||||||
Name="Debug|Win32"
|
Name="Debug|Win32"
|
||||||
|
ExcludedFromBuild="true"
|
||||||
>
|
>
|
||||||
<Tool
|
<Tool
|
||||||
Name="VCCLCompilerTool"
|
Name="VCCLCompilerTool"
|
||||||
EnableEnhancedInstructionSet="2"
|
|
||||||
/>
|
|
||||||
</FileConfiguration>
|
|
||||||
<FileConfiguration
|
|
||||||
Name="Debug|x64"
|
|
||||||
>
|
|
||||||
<Tool
|
|
||||||
Name="VCCLCompilerTool"
|
|
||||||
EnableEnhancedInstructionSet="2"
|
|
||||||
/>
|
/>
|
||||||
</FileConfiguration>
|
</FileConfiguration>
|
||||||
<FileConfiguration
|
<FileConfiguration
|
||||||
Name="Release|Win32"
|
Name="Release|Win32"
|
||||||
|
ExcludedFromBuild="true"
|
||||||
>
|
>
|
||||||
<Tool
|
<Tool
|
||||||
Name="VCCLCompilerTool"
|
Name="VCCLCompilerTool"
|
||||||
EnableEnhancedInstructionSet="2"
|
|
||||||
/>
|
|
||||||
</FileConfiguration>
|
|
||||||
<FileConfiguration
|
|
||||||
Name="Release|x64"
|
|
||||||
>
|
|
||||||
<Tool
|
|
||||||
Name="VCCLCompilerTool"
|
|
||||||
EnableEnhancedInstructionSet="2"
|
|
||||||
/>
|
/>
|
||||||
</FileConfiguration>
|
</FileConfiguration>
|
||||||
<FileConfiguration
|
<FileConfiguration
|
||||||
Name="Release, SSE2|Win32"
|
Name="Release, SSE2|Win32"
|
||||||
|
ExcludedFromBuild="true"
|
||||||
|
>
|
||||||
|
<Tool
|
||||||
|
Name="VCCLCompilerTool"
|
||||||
|
/>
|
||||||
|
</FileConfiguration>
|
||||||
|
</File>
|
||||||
|
<File
|
||||||
|
RelativePath=".\nodebuild_classify_sse2_vect.cpp"
|
||||||
|
>
|
||||||
|
<FileConfiguration
|
||||||
|
Name="Debug|x64"
|
||||||
|
ExcludedFromBuild="true"
|
||||||
|
>
|
||||||
|
<Tool
|
||||||
|
Name="VCCLCompilerTool"
|
||||||
|
/>
|
||||||
|
</FileConfiguration>
|
||||||
|
<FileConfiguration
|
||||||
|
Name="Release|x64"
|
||||||
|
ExcludedFromBuild="true"
|
||||||
>
|
>
|
||||||
<Tool
|
<Tool
|
||||||
Name="VCCLCompilerTool"
|
Name="VCCLCompilerTool"
|
||||||
EnableEnhancedInstructionSet="2"
|
|
||||||
/>
|
/>
|
||||||
</FileConfiguration>
|
</FileConfiguration>
|
||||||
<FileConfiguration
|
<FileConfiguration
|
||||||
Name="Release, SSE2|x64"
|
Name="Release, SSE2|x64"
|
||||||
|
ExcludedFromBuild="true"
|
||||||
>
|
>
|
||||||
<Tool
|
<Tool
|
||||||
Name="VCCLCompilerTool"
|
Name="VCCLCompilerTool"
|
||||||
EnableEnhancedInstructionSet="2"
|
|
||||||
/>
|
/>
|
||||||
</FileConfiguration>
|
</FileConfiguration>
|
||||||
</File>
|
</File>
|
||||||
|
|
Loading…
Reference in a new issue