mirror of
https://github.com/ZDoom/ZDRay.git
synced 2025-01-25 00:51:20 +00:00
- add SSE implementation for the most time critical function
This commit is contained in:
parent
e21679a4d7
commit
ce518820ff
2 changed files with 38 additions and 0 deletions
|
@ -23,6 +23,9 @@
|
|||
#include "collision.h"
|
||||
#include <algorithm>
|
||||
#include <functional>
|
||||
#ifndef NO_SSE
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
|
||||
TriangleMeshShape::TriangleMeshShape(const kexVec3 *vertices, int num_vertices, const unsigned int *elements, int num_elements, const int *surfaces)
|
||||
: vertices(vertices), num_vertices(num_vertices), elements(elements), num_elements(num_elements), surfaces(surfaces)
|
||||
|
@ -848,8 +851,40 @@ IntersectionTest::Result IntersectionTest::frustum_obb(const FrustumPlanes &frus
|
|||
return inside;
|
||||
}
|
||||
|
||||
#ifndef NO_SSE
// Four copies of 0x7fffffff: every bit set except bit 31. ray_aabb loads this
// table as four packed floats and ANDs it with a vector to clear the sign bit
// of each lane, which yields a branch-free per-lane absolute value.
// Only the SSE code path reads it, so it is compiled out together with that
// path when NO_SSE is defined instead of being left behind as an unused constant.
static const uint32_t clearsignbitmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
#endif
|
||||
|
||||
IntersectionTest::OverlapResult IntersectionTest::ray_aabb(const RayBBox &ray, const CollisionBBox &aabb)
|
||||
{
|
||||
#ifndef NO_SSE
|
||||
|
||||
__m128 v = _mm_loadu_ps(&ray.v.x);
|
||||
__m128 w = _mm_loadu_ps(&ray.w.x);
|
||||
__m128 h = _mm_loadu_ps(&aabb.Extents.x);
|
||||
__m128 c = _mm_sub_ps(_mm_loadu_ps(&ray.c.x), _mm_loadu_ps(&aabb.Center.x));
|
||||
|
||||
__m128 clearsignbit = _mm_loadu_ps(reinterpret_cast<const float*>(clearsignbitmask));
|
||||
|
||||
__m128 abs_c = _mm_and_ps(c, clearsignbit);
|
||||
int mask = _mm_movemask_ps(_mm_cmpgt_ps(abs_c, _mm_add_ps(v, h)));
|
||||
if (mask & 7)
|
||||
return disjoint;
|
||||
|
||||
__m128 c1 = _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 0, 0, 1)); // c.y, c.x, c.x
|
||||
__m128 c2 = _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 1, 2, 2)); // c.z, c.z, c.y
|
||||
__m128 w1 = _mm_shuffle_ps(w, w, _MM_SHUFFLE(3, 1, 2, 2)); // w.z, w.z, w.y
|
||||
__m128 w2 = _mm_shuffle_ps(w, w, _MM_SHUFFLE(3, 0, 0, 1)); // w.y, w.x, w.x
|
||||
__m128 lhs = _mm_and_ps(_mm_sub_ps(_mm_mul_ps(c1, w1), _mm_mul_ps(c2, w2)), clearsignbit);
|
||||
|
||||
__m128 h1 = _mm_shuffle_ps(h, h, _MM_SHUFFLE(3, 0, 0, 1)); // h.y, h.x, h.x
|
||||
__m128 h2 = _mm_shuffle_ps(h, h, _MM_SHUFFLE(3, 1, 2, 2)); // h.z, h.z, h.y
|
||||
__m128 v1 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 1, 2, 2)); // v.z, v.z, v.y
|
||||
__m128 v2 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 0, 0, 1)); // v.y, v.x, v.x
|
||||
__m128 rhs = _mm_add_ps(_mm_mul_ps(h1, v1), _mm_mul_ps(h2, v2));
|
||||
|
||||
mask = _mm_movemask_ps(_mm_cmpgt_ps(lhs, rhs));
|
||||
return (mask & 7) ? disjoint : overlap;
|
||||
|
||||
#else
|
||||
const kexVec3 &v = ray.v;
|
||||
const kexVec3 &w = ray.w;
|
||||
const kexVec3 &h = aabb.Extents;
|
||||
|
@ -864,6 +899,7 @@ IntersectionTest::OverlapResult IntersectionTest::ray_aabb(const RayBBox &ray, c
|
|||
return disjoint;
|
||||
|
||||
return overlap;
|
||||
#endif
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
|
|
|
@ -56,6 +56,7 @@ public:
|
|||
|
||||
kexVec3 Center;
|
||||
kexVec3 Extents;
|
||||
float ssePadding = 0.0f; // Needed to safely load Extents directly into a sse register
|
||||
};
|
||||
|
||||
class RayBBox
|
||||
|
@ -72,6 +73,7 @@ public:
|
|||
|
||||
kexVec3 start, end;
|
||||
kexVec3 c, w, v;
|
||||
float ssePadding = 0.0f; // Needed to safely load v directly into a sse register
|
||||
};
|
||||
|
||||
class TriangleMeshShape
|
||||
|
|
Loading…
Reference in a new issue