- removed the separate SSE2 version of the node builder's ClassifyLine function and all code associated with it.

Like everything else related to doing standard math with SSE2 vs. x87, there's nothing to be gained here with anything but first generation SSE2 systems which are irrelevant these days.

Taking 'thespir2.wad' from https://forum.zdoom.org/viewtopic.php?f=1&t=10655 the SSE2 version is reproducably ~3% slower than the x87 version on my Core i7, which quite closely mirrors all my previous tests since 2007.
Overall this just looks like an optimization not worth doing.
This commit is contained in:
Christoph Oelckers 2017-02-26 12:47:16 +01:00
parent bfa7a2d737
commit 90b8dbb096
5 changed files with 2 additions and 311 deletions

View file

@ -397,22 +397,6 @@ if (NOT ZDOOM_USE_SSE2)
endif()
endif()
if( SSE_MATTERS )
if( WIN32 )
set( BACKPATCH 1 CACHE BOOL "Enable backpatching." )
else()
CHECK_FUNCTION_EXISTS(mprotect HAVE_MPROTECT)
if( HAVE_MPROTECT )
set( BACKPATCH 1 CACHE BOOL "Enable backpatching." )
else()
set( BACKPATCH 0 )
endif()
endif()
set( SSE 1 CACHE BOOL "Build SSE and SSE2 versions of key code." )
else()
set( BACKPATCH 0 )
endif()
if( X64 )
set( HAVE_MMX 1 )
else( X64 )
@ -577,10 +561,6 @@ endif()
# Flags
if( BACKPATCH )
add_definitions( -DBACKPATCH )
endif()
# Update gitinfo.h
add_custom_target( revision_check ALL
@ -726,18 +706,6 @@ add_custom_command( OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/sc_man_scanner.h
include_directories( ${CMAKE_CURRENT_BINARY_DIR} )
if( SSE_MATTERS )
if( SSE )
set( X86_SOURCES nodebuild_classify_sse2.cpp )
set_source_files_properties( nodebuild_classify_sse2.cpp PROPERTIES COMPILE_FLAGS "${SSE2_ENABLE}" )
else()
add_definitions( -DDISABLE_SSE )
endif()
else()
add_definitions( -DDISABLE_SSE )
set( X86_SOURCES )
endif()
if( SNDFILE_FOUND )
add_definitions( -DHAVE_SNDFILE )
endif()

View file

@ -1062,95 +1062,3 @@ void FNodeBuilder::PrintSet (int l, DWORD set)
}
Printf (PRINT_LOG, "*\n");
}
#ifdef BACKPATCH
#ifdef _WIN32
extern "C" {
__declspec(dllimport) int __stdcall VirtualProtect(void *, unsigned long, unsigned long, unsigned long *);
}
#define PAGE_EXECUTE_READWRITE 64
#else
#include <sys/mman.h>
#include <limits.h>
#include <unistd.h>
#endif
#ifdef __GNUC__
extern "C" int ClassifyLineBackpatch (node_t &node, const FSimpleVert *v1, const FSimpleVert *v2, int sidev[2])
#else
static int *CallerOffset;
int ClassifyLineBackpatchC (node_t &node, const FSimpleVert *v1, const FSimpleVert *v2, int sidev[2])
#endif
{
// Select the routine based on SSE2 availability and patch the caller so that
// they call that routine directly next time instead of going through here.
int *calleroffset;
int diff;
int (*func)(node_t &, const FSimpleVert *, const FSimpleVert *, int[2]);
#ifdef __GNUC__
calleroffset = (int *)__builtin_return_address(0);
#else
calleroffset = CallerOffset;
#endif
// printf ("Patching for SSE %d @ %p %d\n", SSELevel, calleroffset, *calleroffset);
#ifndef DISABLE_SSE
if (CPU.bSSE2)
{
func = ClassifyLineSSE2;
diff = int((char *)ClassifyLineSSE2 - (char *)calleroffset);
}
else
#endif
{
func = ClassifyLine2;
diff = int((char *)ClassifyLine2 - (char *)calleroffset);
}
calleroffset--;
// Patch the caller.
#ifdef _WIN32
unsigned long oldprotect;
if (VirtualProtect (calleroffset, 4, PAGE_EXECUTE_READWRITE, &oldprotect))
#else
// must make this page-aligned for mprotect
long pagesize = sysconf(_SC_PAGESIZE);
char *callerpage = (char *)((intptr_t)calleroffset & ~(pagesize - 1));
size_t protectlen = (intptr_t)calleroffset + sizeof(void*) - (intptr_t)callerpage;
int ptect;
if (!(ptect = mprotect(callerpage, protectlen, PROT_READ|PROT_WRITE|PROT_EXEC)))
#endif
{
*calleroffset = diff;
#ifdef _WIN32
VirtualProtect (calleroffset, sizeof(void*), oldprotect, &oldprotect);
#else
mprotect(callerpage, protectlen, PROT_READ|PROT_EXEC);
#endif
}
// And return by calling the real function.
return func (node, v1, v2, sidev);
}
#ifndef __GNUC__
// The ClassifyLineBackpatch() function here is a stub that uses inline assembly and nakedness
// to retrieve the return address of the stack before sending control to the real
// ClassifyLineBackpatchC() function. Since BACKPATCH shouldn't be defined on 64-bit builds,
// we're okay that VC++ can't do inline assembly on that target.
extern "C" __declspec(noinline) __declspec(naked) int ClassifyLineBackpatch (node_t &node, const FSimpleVert *v1, const FSimpleVert *v2, int sidev[2])
{
// We store the return address in a global, so as not to need to mess with the parameter list.
__asm
{
mov eax, [esp]
mov CallerOffset, eax
jmp ClassifyLineBackpatchC
}
}
#endif
#endif

View file

@ -53,22 +53,6 @@ struct FSimpleVert
fixed_t x, y;
};
extern "C"
{
int ClassifyLine2 (node_t &node, const FSimpleVert *v1, const FSimpleVert *v2, int sidev[2]);
#ifndef DISABLE_SSE
int ClassifyLineSSE1 (node_t &node, const FSimpleVert *v1, const FSimpleVert *v2, int sidev[2]);
int ClassifyLineSSE2 (node_t &node, const FSimpleVert *v1, const FSimpleVert *v2, int sidev[2]);
#ifdef BACKPATCH
#ifdef __GNUC__
int ClassifyLineBackpatch (node_t &node, const FSimpleVert *v1, const FSimpleVert *v2, int sidev[2]) __attribute__((noinline));
#else
int __declspec(noinline) ClassifyLineBackpatch (node_t &node, const FSimpleVert *v1, const FSimpleVert *v2, int sidev[2]);
#endif
#endif
#endif
}
class FNodeBuilder
{
struct FPrivSeg
@ -282,7 +266,7 @@ private:
// 1 = seg is in back
// -1 = seg cuts the node
inline int ClassifyLine (node_t &node, const FPrivVert *v1, const FPrivVert *v2, int sidev[2]);
int ClassifyLine (node_t &node, const FPrivVert *v1, const FPrivVert *v2, int sidev[2]);
void FixSplitSharers (const node_t &node);
double AddIntersection (const node_t &node, int vertex);
@ -341,28 +325,3 @@ inline int FNodeBuilder::PointOnSide (int x, int y, int x1, int y1, int dx, int
}
return s_num > 0.0 ? -1 : 1;
}
inline int FNodeBuilder::ClassifyLine (node_t &node, const FPrivVert *v1, const FPrivVert *v2, int sidev[2])
{
#ifdef DISABLE_SSE
return ClassifyLine2 (node, v1, v2, sidev);
#else
#if defined(__SSE2__) || defined(_M_X64)
// If compiling with SSE2 support everywhere, just use the SSE2 version.
return ClassifyLineSSE2 (node, v1, v2, sidev);
#elif defined(_MSC_VER) && _MSC_VER < 1300
// VC 6 does not support SSE optimizations.
return ClassifyLine2 (node, v1, v2, sidev);
#else
// Select the routine based on our flag.
#ifdef BACKPATCH
return ClassifyLineBackpatch (node, v1, v2, sidev);
#else
if (CPU.bSSE2)
return ClassifyLineSSE2 (node, v1, v2, sidev);
else
return ClassifyLine2 (node, v1, v2, sidev);
#endif
#endif
#endif
}

View file

@ -3,7 +3,7 @@
#define FAR_ENOUGH 17179869184.f // 4<<32
extern "C" int ClassifyLine2 (node_t &node, const FSimpleVert *v1, const FSimpleVert *v2, int sidev[2])
int FNodeBuilder::ClassifyLine(node_t &node, const FPrivVert *v1, const FPrivVert *v2, int sidev[2])
{
double d_x1 = double(node.x);
double d_y1 = double(node.y);

View file

@ -1,144 +0,0 @@
#ifndef DISABLE_SSE
#include "doomtype.h"
#include "nodebuild.h"
#define FAR_ENOUGH 17179869184.f // 4<<32
// You may notice that this function is identical to ClassifyLine2.
// The reason it is SSE2 is because this file is explicitly compiled
// with SSE2 math enabled, but the other files are not.
extern "C" int ClassifyLineSSE2 (node_t &node, const FSimpleVert *v1, const FSimpleVert *v2, int sidev[2])
{
double d_x1 = double(node.x);
double d_y1 = double(node.y);
double d_dx = double(node.dx);
double d_dy = double(node.dy);
double d_xv1 = double(v1->x);
double d_xv2 = double(v2->x);
double d_yv1 = double(v1->y);
double d_yv2 = double(v2->y);
double s_num1 = (d_y1 - d_yv1) * d_dx - (d_x1 - d_xv1) * d_dy;
double s_num2 = (d_y1 - d_yv2) * d_dx - (d_x1 - d_xv2) * d_dy;
int nears = 0;
if (s_num1 <= -FAR_ENOUGH)
{
if (s_num2 <= -FAR_ENOUGH)
{
sidev[0] = sidev[1] = 1;
return 1;
}
if (s_num2 >= FAR_ENOUGH)
{
sidev[0] = 1;
sidev[1] = -1;
return -1;
}
nears = 1;
}
else if (s_num1 >= FAR_ENOUGH)
{
if (s_num2 >= FAR_ENOUGH)
{
sidev[0] = sidev[1] = -1;
return 0;
}
if (s_num2 <= -FAR_ENOUGH)
{
sidev[0] = -1;
sidev[1] = 1;
return -1;
}
nears = 1;
}
else
{
nears = 2 | int(fabs(s_num2) < FAR_ENOUGH);
}
if (nears)
{
double l = 1.f / (d_dx*d_dx + d_dy*d_dy);
if (nears & 2)
{
double dist = s_num1 * s_num1 * l;
if (dist < SIDE_EPSILON*SIDE_EPSILON)
{
sidev[0] = 0;
}
else
{
sidev[0] = s_num1 > 0.0 ? -1 : 1;
}
}
else
{
sidev[0] = s_num1 > 0.0 ? -1 : 1;
}
if (nears & 1)
{
double dist = s_num2 * s_num2 * l;
if (dist < SIDE_EPSILON*SIDE_EPSILON)
{
sidev[1] = 0;
}
else
{
sidev[1] = s_num2 > 0.0 ? -1 : 1;
}
}
else
{
sidev[1] = s_num2 > 0.0 ? -1 : 1;
}
}
else
{
sidev[0] = s_num1 > 0.0 ? -1 : 1;
sidev[1] = s_num2 > 0.0 ? -1 : 1;
}
if ((sidev[0] | sidev[1]) == 0)
{ // seg is coplanar with the splitter, so use its orientation to determine
// which child it ends up in. If it faces the same direction as the splitter,
// it goes in front. Otherwise, it goes in back.
if (node.dx != 0)
{
if ((node.dx > 0 && v2->x > v1->x) || (node.dx < 0 && v2->x < v1->x))
{
return 0;
}
else
{
return 1;
}
}
else
{
if ((node.dy > 0 && v2->y > v1->y) || (node.dy < 0 && v2->y < v1->y))
{
return 0;
}
else
{
return 1;
}
}
}
else if (sidev[0] <= 0 && sidev[1] <= 0)
{
return 0;
}
else if (sidev[0] >= 0 && sidev[1] >= 0)
{
return 1;
}
return -1;
}
#endif