mirror of
https://github.com/ZDoom/ZDRay.git
synced 2025-02-09 15:51:08 +00:00
- add SSE implementation for the most time-critical function
This commit is contained in:
parent
e21679a4d7
commit
ce518820ff
2 changed files with 38 additions and 0 deletions
|
@ -23,6 +23,9 @@
|
||||||
#include "collision.h"
|
#include "collision.h"
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <functional>
|
#include <functional>
|
||||||
|
#ifndef NO_SSE
|
||||||
|
#include <immintrin.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
TriangleMeshShape::TriangleMeshShape(const kexVec3 *vertices, int num_vertices, const unsigned int *elements, int num_elements, const int *surfaces)
|
TriangleMeshShape::TriangleMeshShape(const kexVec3 *vertices, int num_vertices, const unsigned int *elements, int num_elements, const int *surfaces)
|
||||||
: vertices(vertices), num_vertices(num_vertices), elements(elements), num_elements(num_elements), surfaces(surfaces)
|
: vertices(vertices), num_vertices(num_vertices), elements(elements), num_elements(num_elements), surfaces(surfaces)
|
||||||
|
@ -848,8 +851,40 @@ IntersectionTest::Result IntersectionTest::frustum_obb(const FrustumPlanes &frus
|
||||||
return inside;
|
return inside;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Four identical 32-bit words with every bit set except the topmost one.
// ray_aabb loads this table as four packed floats and ANDs it with a vector,
// which clears the sign bit of each lane (a per-lane absolute value).
static const uint32_t clearsignbitmask[4] =
{
	0x7fffffff,
	0x7fffffff,
	0x7fffffff,
	0x7fffffff
};
|
||||||
|
|
||||||
IntersectionTest::OverlapResult IntersectionTest::ray_aabb(const RayBBox &ray, const CollisionBBox &aabb)
|
IntersectionTest::OverlapResult IntersectionTest::ray_aabb(const RayBBox &ray, const CollisionBBox &aabb)
|
||||||
{
|
{
|
||||||
|
#ifndef NO_SSE
|
||||||
|
|
||||||
|
__m128 v = _mm_loadu_ps(&ray.v.x);
|
||||||
|
__m128 w = _mm_loadu_ps(&ray.w.x);
|
||||||
|
__m128 h = _mm_loadu_ps(&aabb.Extents.x);
|
||||||
|
__m128 c = _mm_sub_ps(_mm_loadu_ps(&ray.c.x), _mm_loadu_ps(&aabb.Center.x));
|
||||||
|
|
||||||
|
__m128 clearsignbit = _mm_loadu_ps(reinterpret_cast<const float*>(clearsignbitmask));
|
||||||
|
|
||||||
|
__m128 abs_c = _mm_and_ps(c, clearsignbit);
|
||||||
|
int mask = _mm_movemask_ps(_mm_cmpgt_ps(abs_c, _mm_add_ps(v, h)));
|
||||||
|
if (mask & 7)
|
||||||
|
return disjoint;
|
||||||
|
|
||||||
|
__m128 c1 = _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 0, 0, 1)); // c.y, c.x, c.x
|
||||||
|
__m128 c2 = _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 1, 2, 2)); // c.z, c.z, c.y
|
||||||
|
__m128 w1 = _mm_shuffle_ps(w, w, _MM_SHUFFLE(3, 1, 2, 2)); // w.z, w.z, w.y
|
||||||
|
__m128 w2 = _mm_shuffle_ps(w, w, _MM_SHUFFLE(3, 0, 0, 1)); // w.y, w.x, w.x
|
||||||
|
__m128 lhs = _mm_and_ps(_mm_sub_ps(_mm_mul_ps(c1, w1), _mm_mul_ps(c2, w2)), clearsignbit);
|
||||||
|
|
||||||
|
__m128 h1 = _mm_shuffle_ps(h, h, _MM_SHUFFLE(3, 0, 0, 1)); // h.y, h.x, h.x
|
||||||
|
__m128 h2 = _mm_shuffle_ps(h, h, _MM_SHUFFLE(3, 1, 2, 2)); // h.z, h.z, h.y
|
||||||
|
__m128 v1 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 1, 2, 2)); // v.z, v.z, v.y
|
||||||
|
__m128 v2 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 0, 0, 1)); // v.y, v.x, v.x
|
||||||
|
__m128 rhs = _mm_add_ps(_mm_mul_ps(h1, v1), _mm_mul_ps(h2, v2));
|
||||||
|
|
||||||
|
mask = _mm_movemask_ps(_mm_cmpgt_ps(lhs, rhs));
|
||||||
|
return (mask & 7) ? disjoint : overlap;
|
||||||
|
|
||||||
|
#else
|
||||||
const kexVec3 &v = ray.v;
|
const kexVec3 &v = ray.v;
|
||||||
const kexVec3 &w = ray.w;
|
const kexVec3 &w = ray.w;
|
||||||
const kexVec3 &h = aabb.Extents;
|
const kexVec3 &h = aabb.Extents;
|
||||||
|
@ -864,6 +899,7 @@ IntersectionTest::OverlapResult IntersectionTest::ray_aabb(const RayBBox &ray, c
|
||||||
return disjoint;
|
return disjoint;
|
||||||
|
|
||||||
return overlap;
|
return overlap;
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
|
@ -56,6 +56,7 @@ public:
|
||||||
|
|
||||||
kexVec3 Center;
|
kexVec3 Center;
|
||||||
kexVec3 Extents;
|
kexVec3 Extents;
|
||||||
|
float ssePadding = 0.0f; // Needed to safely load Extents directly into a sse register
|
||||||
};
|
};
|
||||||
|
|
||||||
class RayBBox
|
class RayBBox
|
||||||
|
@ -72,6 +73,7 @@ public:
|
||||||
|
|
||||||
kexVec3 start, end;
|
kexVec3 start, end;
|
||||||
kexVec3 c, w, v;
|
kexVec3 c, w, v;
|
||||||
|
float ssePadding = 0.0f; // Needed to safely load v directly into a sse register
|
||||||
};
|
};
|
||||||
|
|
||||||
class TriangleMeshShape
|
class TriangleMeshShape
|
||||||
|
|
Loading…
Reference in a new issue