raze/source/common/rendering/polyrenderer/drawers/poly_thread.cpp
Mitch Richters 9894729fc2 - Replace MAX() from templates.h with version provided in STL.
# Conflicts:
#	source/common/textures/hw_ihwtexture.cpp
#	source/common/utility/templates.h
2021-10-30 10:36:02 +02:00

827 lines
24 KiB
C++

/*
** Polygon Doom software renderer
** Copyright (c) 2016 Magnus Norddahl
**
** This software is provided 'as-is', without any express or implied
** warranty. In no event will the authors be held liable for any damages
** arising from the use of this software.
**
** Permission is granted to anyone to use this software for any purpose,
** including commercial applications, and to alter it and redistribute it
** freely, subject to the following restrictions:
**
** 1. The origin of this software must not be misrepresented; you must not
** claim that you wrote the original software. If you use this software
** in a product, an acknowledgment in the product documentation would be
** appreciated but is not required.
** 2. Altered source versions must be plainly marked as such, and must not be
** misrepresented as being the original software.
** 3. This notice may not be removed or altered from any source distribution.
**
*/
#include <stddef.h>
#include "templates.h"
#include "filesystem.h"
#include "v_video.h"
#include "model.h"
#include "poly_thread.h"
#include "screen_triangle.h"
#ifndef NO_SSE
#include <immintrin.h>
#endif
PolyTriangleThreadData::PolyTriangleThreadData(int32_t core, int32_t num_cores, int32_t numa_node, int32_t num_numa_nodes, int numa_start_y, int numa_end_y)
: core(core), num_cores(num_cores), numa_node(numa_node), num_numa_nodes(num_numa_nodes), numa_start_y(numa_start_y), numa_end_y(numa_end_y)
{
}
void PolyTriangleThreadData::ClearDepth(float value)
{
int width = depthstencil->Width();
int height = depthstencil->Height();
float *data = depthstencil->DepthValues();
int skip = skipped_by_thread(0);
int count = count_for_thread(0, height);
data += skip * width;
for (int i = 0; i < count; i++)
{
for (int x = 0; x < width; x++)
data[x] = value;
data += num_cores * width;
}
}
void PolyTriangleThreadData::ClearStencil(uint8_t value)
{
int width = depthstencil->Width();
int height = depthstencil->Height();
uint8_t *data = depthstencil->StencilValues();
int skip = skipped_by_thread(0);
int count = count_for_thread(0, height);
data += skip * width;
for (int i = 0; i < count; i++)
{
memset(data, value, width);
data += num_cores * width;
}
}
void PolyTriangleThreadData::SetViewport(int x, int y, int width, int height, uint8_t *new_dest, int new_dest_width, int new_dest_height, int new_dest_pitch, bool new_dest_bgra, PolyDepthStencil *new_depthstencil, bool new_topdown)
{
viewport_x = x;
viewport_y = y;
viewport_width = width;
viewport_height = height;
dest = new_dest;
dest_width = new_dest_width;
dest_height = new_dest_height;
dest_pitch = new_dest_pitch;
dest_bgra = new_dest_bgra;
depthstencil = new_depthstencil;
topdown = new_topdown;
UpdateClip();
}
void PolyTriangleThreadData::SetScissor(int x, int y, int w, int h)
{
scissor.left = x;
scissor.right = x + w;
scissor.top = y;
scissor.bottom = y + h;
UpdateClip();
}
void PolyTriangleThreadData::UpdateClip()
{
clip.left = max(max(viewport_x, scissor.left), 0);
clip.top = max(max(viewport_y, scissor.top), 0);
clip.right = MIN(MIN(viewport_x + viewport_width, scissor.right), dest_width);
clip.bottom = MIN(MIN(viewport_y + viewport_height, scissor.bottom), dest_height);
}
void PolyTriangleThreadData::PushStreamData(const StreamData &data, const PolyPushConstants &constants)
{
mainVertexShader.Data = data;
mainVertexShader.uClipSplit = constants.uClipSplit;
PushConstants = &constants;
AlphaThreshold = clamp((int)(PushConstants->uAlphaThreshold * 255.0f + 0.5f), 0, 255) << 24;
numPolyLights = 0;
if (constants.uLightIndex >= 0)
{
const FVector4 &lightRange = lights[constants.uLightIndex];
static_assert(sizeof(FVector4) == 16, "sizeof(FVector4) is not 16 bytes");
if (lightRange.Y > lightRange.X)
{
int start = constants.uLightIndex + 1;
int modulatedStart = static_cast<int>(lightRange.X) + start;
int modulatedEnd = static_cast<int>(lightRange.Y) + start;
for (int i = modulatedStart; i < modulatedEnd; i += 4)
{
if (numPolyLights == maxPolyLights)
break;
auto &lightpos = lights[i];
auto &lightcolor = lights[i + 1];
//auto &lightspot1 = lights[i + 2];
//auto &lightspot2 = lights[i + 3];
uint32_t r = (int)clamp(lightcolor.X * 255.0f, 0.0f, 255.0f);
uint32_t g = (int)clamp(lightcolor.Y * 255.0f, 0.0f, 255.0f);
uint32_t b = (int)clamp(lightcolor.Z * 255.0f, 0.0f, 255.0f);
auto& polylight = polyLights[numPolyLights++];
polylight.x = lightpos.X;
polylight.y = lightpos.Y;
polylight.z = lightpos.Z;
polylight.radius = 256.0f / lightpos.W;
polylight.color = (r << 16) | (g << 8) | b;
if (lightcolor.W < 0.0f)
polylight.radius = -polylight.radius;
}
}
}
}
void PolyTriangleThreadData::PushMatrices(const VSMatrix &modelMatrix, const VSMatrix &normalModelMatrix, const VSMatrix &textureMatrix)
{
mainVertexShader.ModelMatrix = modelMatrix;
mainVertexShader.NormalModelMatrix = normalModelMatrix;
mainVertexShader.TextureMatrix = textureMatrix;
}
void PolyTriangleThreadData::SetViewpointUniforms(const HWViewpointUniforms *uniforms)
{
mainVertexShader.Viewpoint = uniforms;
}
void PolyTriangleThreadData::SetDepthClamp(bool on)
{
DepthClamp = on;
}
void PolyTriangleThreadData::SetDepthMask(bool on)
{
WriteDepth = on;
}
void PolyTriangleThreadData::SetDepthFunc(int func)
{
if (func == DF_LEqual || func == DF_Less)
{
DepthTest = true;
}
else // if (func == DF_Always)
{
DepthTest = false;
}
}
void PolyTriangleThreadData::SetDepthRange(float min, float max)
{
DepthRangeStart = min;
DepthRangeScale = max - min;
}
void PolyTriangleThreadData::SetDepthBias(float depthBiasConstantFactor, float depthBiasSlopeFactor)
{
depthbias = (float)(depthBiasConstantFactor / 2500.0);
}
void PolyTriangleThreadData::SetColorMask(bool r, bool g, bool b, bool a)
{
WriteColor = r;
}
void PolyTriangleThreadData::SetStencil(int stencilRef, int op)
{
StencilTestValue = stencilRef;
if (op == SOP_Increment)
{
StencilWriteValue = MIN(stencilRef + 1, (int)255);
}
else if (op == SOP_Decrement)
{
StencilWriteValue = max(stencilRef - 1, (int)0);
}
else // SOP_Keep
{
StencilWriteValue = stencilRef;
}
WriteStencil = StencilTest && (StencilTestValue != StencilWriteValue);
}
void PolyTriangleThreadData::SetCulling(int mode)
{
SetTwoSided(mode == Cull_None);
SetCullCCW(mode == Cull_CCW);
}
void PolyTriangleThreadData::EnableStencil(bool on)
{
StencilTest = on;
WriteStencil = on && (StencilTestValue != StencilWriteValue);
}
void PolyTriangleThreadData::SetRenderStyle(FRenderStyle style)
{
RenderStyle = style;
}
void PolyTriangleThreadData::SetShader(int specialEffect, int effectState, bool alphaTest, bool colormapShader)
{
SpecialEffect = specialEffect;
EffectState = effectState;
AlphaTest = alphaTest;
ColormapShader = colormapShader;
}
void PolyTriangleThreadData::SetTexture(int unit, const void *pixels, int width, int height, bool bgra)
{
textures[unit].pixels = pixels;
textures[unit].width = width;
textures[unit].height = height;
textures[unit].bgra = bgra;
}
void PolyTriangleThreadData::DrawIndexed(int index, int vcount, PolyDrawMode drawmode)
{
if (vcount < 3)
return;
elements += index;
ShadedTriVertex vertbuffer[3];
ShadedTriVertex *vert[3] = { &vertbuffer[0], &vertbuffer[1], &vertbuffer[2] };
if (drawmode == PolyDrawMode::Triangles)
{
for (int i = 0; i < vcount / 3; i++)
{
for (int j = 0; j < 3; j++)
*vert[j] = ShadeVertex(*(elements++));
DrawShadedTriangle(vert, ccw);
}
}
else if (drawmode == PolyDrawMode::TriangleFan)
{
*vert[0] = ShadeVertex(*(elements++));
*vert[1] = ShadeVertex(*(elements++));
for (int i = 2; i < vcount; i++)
{
*vert[2] = ShadeVertex(*(elements++));
DrawShadedTriangle(vert, ccw);
std::swap(vert[1], vert[2]);
}
}
else if (drawmode == PolyDrawMode::TriangleStrip)
{
bool toggleccw = ccw;
*vert[0] = ShadeVertex(*(elements++));
*vert[1] = ShadeVertex(*(elements++));
for (int i = 2; i < vcount; i++)
{
*vert[2] = ShadeVertex(*(elements++));
DrawShadedTriangle(vert, toggleccw);
ShadedTriVertex *vtmp = vert[0];
vert[0] = vert[1];
vert[1] = vert[2];
vert[2] = vtmp;
toggleccw = !toggleccw;
}
}
else if (drawmode == PolyDrawMode::Lines)
{
for (int i = 0; i < vcount / 2; i++)
{
*vert[0] = ShadeVertex(*(elements++));
*vert[1] = ShadeVertex(*(elements++));
DrawShadedLine(vert);
}
}
else if (drawmode == PolyDrawMode::Points)
{
for (int i = 0; i < vcount; i++)
{
*vert[0] = ShadeVertex(*(elements++));
DrawShadedPoint(vert);
}
}
}
void PolyTriangleThreadData::Draw(int index, int vcount, PolyDrawMode drawmode)
{
if (vcount < 3)
return;
int vinput = index;
ShadedTriVertex vertbuffer[3];
ShadedTriVertex *vert[3] = { &vertbuffer[0], &vertbuffer[1], &vertbuffer[2] };
if (drawmode == PolyDrawMode::Triangles)
{
for (int i = 0; i < vcount / 3; i++)
{
for (int j = 0; j < 3; j++)
*vert[j] = ShadeVertex(vinput++);
DrawShadedTriangle(vert, ccw);
}
}
else if (drawmode == PolyDrawMode::TriangleFan)
{
*vert[0] = ShadeVertex(vinput++);
*vert[1] = ShadeVertex(vinput++);
for (int i = 2; i < vcount; i++)
{
*vert[2] = ShadeVertex(vinput++);
DrawShadedTriangle(vert, ccw);
std::swap(vert[1], vert[2]);
}
}
else if (drawmode == PolyDrawMode::TriangleStrip)
{
bool toggleccw = ccw;
*vert[0] = ShadeVertex(vinput++);
*vert[1] = ShadeVertex(vinput++);
for (int i = 2; i < vcount; i++)
{
*vert[2] = ShadeVertex(vinput++);
DrawShadedTriangle(vert, toggleccw);
ShadedTriVertex *vtmp = vert[0];
vert[0] = vert[1];
vert[1] = vert[2];
vert[2] = vtmp;
toggleccw = !toggleccw;
}
}
else if (drawmode == PolyDrawMode::Lines)
{
for (int i = 0; i < vcount / 2; i++)
{
*vert[0] = ShadeVertex(vinput++);
*vert[1] = ShadeVertex(vinput++);
DrawShadedLine(vert);
}
}
else if (drawmode == PolyDrawMode::Points)
{
for (int i = 0; i < vcount; i++)
{
*vert[0] = ShadeVertex(vinput++);
DrawShadedPoint(vert);
}
}
}
ShadedTriVertex PolyTriangleThreadData::ShadeVertex(int index)
{
inputAssembly->Load(this, vertices, frame0, frame1, index);
mainVertexShader.SIMPLE = (SpecialEffect == EFF_BURN) || (SpecialEffect == EFF_STENCIL);
mainVertexShader.SPHEREMAP = (SpecialEffect == EFF_SPHEREMAP);
mainVertexShader.main();
return mainVertexShader;
}
bool PolyTriangleThreadData::IsDegenerate(const ShadedTriVertex *const* vert)
{
// A degenerate triangle has a zero cross product for two of its sides.
float ax = vert[1]->gl_Position.X - vert[0]->gl_Position.X;
float ay = vert[1]->gl_Position.Y - vert[0]->gl_Position.Y;
float az = vert[1]->gl_Position.W - vert[0]->gl_Position.W;
float bx = vert[2]->gl_Position.X - vert[0]->gl_Position.X;
float by = vert[2]->gl_Position.Y - vert[0]->gl_Position.Y;
float bz = vert[2]->gl_Position.W - vert[0]->gl_Position.W;
float crossx = ay * bz - az * by;
float crossy = az * bx - ax * bz;
float crossz = ax * by - ay * bx;
float crosslengthsqr = crossx * crossx + crossy * crossy + crossz * crossz;
return crosslengthsqr <= 1.e-8f;
}
bool PolyTriangleThreadData::IsFrontfacing(TriDrawTriangleArgs *args)
{
float a =
args->v1->x * args->v2->y - args->v2->x * args->v1->y +
args->v2->x * args->v3->y - args->v3->x * args->v2->y +
args->v3->x * args->v1->y - args->v1->x * args->v3->y;
return a <= 0.0f;
}
void PolyTriangleThreadData::DrawShadedPoint(const ShadedTriVertex *const* vertex)
{
}
void PolyTriangleThreadData::DrawShadedLine(const ShadedTriVertex *const* vert)
{
static const int numclipdistances = 9;
float clipdistance[numclipdistances * 2];
float *clipd = clipdistance;
for (int i = 0; i < 2; i++)
{
const auto &v = *vert[i];
clipd[0] = v.gl_Position.X + v.gl_Position.W;
clipd[1] = v.gl_Position.W - v.gl_Position.X;
clipd[2] = v.gl_Position.Y + v.gl_Position.W;
clipd[3] = v.gl_Position.W - v.gl_Position.Y;
if (DepthClamp)
{
clipd[4] = 1.0f;
clipd[5] = 1.0f;
}
else
{
clipd[4] = v.gl_Position.Z + v.gl_Position.W;
clipd[5] = v.gl_Position.W - v.gl_Position.Z;
}
clipd[6] = v.gl_ClipDistance[0];
clipd[7] = v.gl_ClipDistance[1];
clipd[8] = v.gl_ClipDistance[2];
clipd += numclipdistances;
}
float t1 = 0.0f;
float t2 = 1.0f;
for (int p = 0; p < numclipdistances; p++)
{
float clipdistance1 = clipdistance[0 * numclipdistances + p];
float clipdistance2 = clipdistance[1 * numclipdistances + p];
if (clipdistance1 < 0.0f) t1 = max(-clipdistance1 / (clipdistance2 - clipdistance1), t1);
if (clipdistance2 < 0.0f) t2 = MIN(1.0f + clipdistance2 / (clipdistance1 - clipdistance2), t2);
if (t1 >= t2)
return;
}
float weights[] = { 1.0f - t1, t1, 1.0f - t2, t2 };
ScreenTriVertex clippedvert[2];
for (int i = 0; i < 2; i++)
{
auto &v = clippedvert[i];
memset(&v, 0, sizeof(ScreenTriVertex));
for (int w = 0; w < 2; w++)
{
float weight = weights[i * 2 + w];
v.x += vert[w]->gl_Position.X * weight;
v.y += vert[w]->gl_Position.Y * weight;
v.z += vert[w]->gl_Position.Z * weight;
v.w += vert[w]->gl_Position.W * weight;
}
// Calculate normalized device coordinates:
v.w = 1.0f / v.w;
v.x *= v.w;
v.y *= v.w;
v.z *= v.w;
// Apply viewport scale to get screen coordinates:
v.x = viewport_x + viewport_width * (1.0f + v.x) * 0.5f;
if (topdown)
v.y = viewport_y + viewport_height * (1.0f - v.y) * 0.5f;
else
v.y = viewport_y + viewport_height * (1.0f + v.y) * 0.5f;
}
uint32_t vColorA = (int)(vert[0]->vColor.W * 255.0f + 0.5f);
uint32_t vColorR = (int)(vert[0]->vColor.X * 255.0f + 0.5f);
uint32_t vColorG = (int)(vert[0]->vColor.Y * 255.0f + 0.5f);
uint32_t vColorB = (int)(vert[0]->vColor.Z * 255.0f + 0.5f);
uint32_t color = MAKEARGB(vColorA, vColorR, vColorG, vColorB);
// Slow and naive implementation. Hopefully fast enough..
float x1 = clippedvert[0].x;
float y1 = clippedvert[0].y;
float x2 = clippedvert[1].x;
float y2 = clippedvert[1].y;
float dx = x2 - x1;
float dy = y2 - y1;
float step = (abs(dx) >= abs(dy)) ? abs(dx) : abs(dy);
dx /= step;
dy /= step;
float x = x1;
float y = y1;
int istep = (int)step;
int pixelsize = dest_bgra ? 4 : 1;
for (int i = 0; i <= istep; i++)
{
int scrx = (int)x;
int scry = (int)y;
if (scrx >= clip.left && scrx < clip.right && scry >= clip.top && scry < clip.bottom && !line_skipped_by_thread(scry))
{
uint8_t *destpixel = dest + (scrx + scry * dest_width) * pixelsize;
if (pixelsize == 4)
{
*reinterpret_cast<uint32_t*>(destpixel) = color;
}
else
{
*destpixel = color;
}
}
x += dx;
y += dy;
}
}
void PolyTriangleThreadData::DrawShadedTriangle(const ShadedTriVertex *const* vert, bool ccw)
{
// Reject triangle if degenerate
if (IsDegenerate(vert))
return;
// Cull, clip and generate additional vertices as needed
ScreenTriVertex clippedvert[max_additional_vertices];
int numclipvert = ClipEdge(vert);
// Convert barycentric weights to actual vertices
for (int i = 0; i < numclipvert; i++)
{
auto &v = clippedvert[i];
memset(&v, 0, sizeof(ScreenTriVertex));
for (int w = 0; w < 3; w++)
{
float weight = weights[i * 3 + w];
v.x += vert[w]->gl_Position.X * weight;
v.y += vert[w]->gl_Position.Y * weight;
v.z += vert[w]->gl_Position.Z * weight;
v.w += vert[w]->gl_Position.W * weight;
v.u += vert[w]->vTexCoord.X * weight;
v.v += vert[w]->vTexCoord.Y * weight;
v.worldX += vert[w]->pixelpos.X * weight;
v.worldY += vert[w]->pixelpos.Y * weight;
v.worldZ += vert[w]->pixelpos.Z * weight;
v.a += vert[w]->vColor.W * weight;
v.r += vert[w]->vColor.X * weight;
v.g += vert[w]->vColor.Y * weight;
v.b += vert[w]->vColor.Z * weight;
v.gradientdistZ += vert[w]->gradientdist.Z * weight;
}
}
#ifdef NO_SSE
// Map to 2D viewport:
for (int j = 0; j < numclipvert; j++)
{
auto &v = clippedvert[j];
// Calculate normalized device coordinates:
v.w = 1.0f / v.w;
v.x *= v.w;
v.y *= v.w;
v.z *= v.w;
// Apply viewport scale to get screen coordinates:
v.x = viewport_x + viewport_width * (1.0f + v.x) * 0.5f;
if (topdown)
v.y = viewport_y + viewport_height * (1.0f - v.y) * 0.5f;
else
v.y = viewport_y + viewport_height * (1.0f + v.y) * 0.5f;
}
#else
// Map to 2D viewport:
__m128 mviewport_x = _mm_set1_ps((float)viewport_x);
__m128 mviewport_y = _mm_set1_ps((float)viewport_y);
__m128 mviewport_halfwidth = _mm_set1_ps(viewport_width * 0.5f);
__m128 mviewport_halfheight = _mm_set1_ps(viewport_height * 0.5f);
__m128 mone = _mm_set1_ps(1.0f);
int sse_length = (numclipvert + 3) / 4 * 4;
for (int j = 0; j < sse_length; j += 4)
{
__m128 vx = _mm_loadu_ps(&clippedvert[j].x);
__m128 vy = _mm_loadu_ps(&clippedvert[j + 1].x);
__m128 vz = _mm_loadu_ps(&clippedvert[j + 2].x);
__m128 vw = _mm_loadu_ps(&clippedvert[j + 3].x);
_MM_TRANSPOSE4_PS(vx, vy, vz, vw);
// Calculate normalized device coordinates:
vw = _mm_div_ps(mone, vw);
vx = _mm_mul_ps(vx, vw);
vy = _mm_mul_ps(vy, vw);
vz = _mm_mul_ps(vz, vw);
// Apply viewport scale to get screen coordinates:
vx = _mm_add_ps(mviewport_x, _mm_mul_ps(mviewport_halfwidth, _mm_add_ps(mone, vx)));
if (topdown)
vy = _mm_add_ps(mviewport_y, _mm_mul_ps(mviewport_halfheight, _mm_sub_ps(mone, vy)));
else
vy = _mm_add_ps(mviewport_y, _mm_mul_ps(mviewport_halfheight, _mm_add_ps(mone, vy)));
_MM_TRANSPOSE4_PS(vx, vy, vz, vw);
_mm_storeu_ps(&clippedvert[j].x, vx);
_mm_storeu_ps(&clippedvert[j + 1].x, vy);
_mm_storeu_ps(&clippedvert[j + 2].x, vz);
_mm_storeu_ps(&clippedvert[j + 3].x, vw);
}
#endif
if (!topdown) ccw = !ccw;
TriDrawTriangleArgs args;
if (twosided && numclipvert > 2)
{
args.v1 = &clippedvert[0];
args.v2 = &clippedvert[1];
args.v3 = &clippedvert[2];
ccw = !IsFrontfacing(&args);
}
// Draw screen triangles
if (ccw)
{
for (int i = numclipvert - 1; i > 1; i--)
{
args.v1 = &clippedvert[numclipvert - 1];
args.v2 = &clippedvert[i - 1];
args.v3 = &clippedvert[i - 2];
if (IsFrontfacing(&args) == ccw && args.CalculateGradients())
{
ScreenTriangle::Draw(&args, this);
}
}
}
else
{
for (int i = 2; i < numclipvert; i++)
{
args.v1 = &clippedvert[0];
args.v2 = &clippedvert[i - 1];
args.v3 = &clippedvert[i];
if (IsFrontfacing(&args) != ccw && args.CalculateGradients())
{
ScreenTriangle::Draw(&args, this);
}
}
}
}
int PolyTriangleThreadData::ClipEdge(const ShadedTriVertex *const* verts)
{
// use barycentric weights for clipped vertices
weights = weightsbuffer;
for (int i = 0; i < 3; i++)
{
weights[i * 3 + 0] = 0.0f;
weights[i * 3 + 1] = 0.0f;
weights[i * 3 + 2] = 0.0f;
weights[i * 3 + i] = 1.0f;
}
// Clip and cull so that the following is true for all vertices:
// -v.w <= v.x <= v.w
// -v.w <= v.y <= v.w
// -v.w <= v.z <= v.w
// halfspace clip distances
static const int numclipdistances = 9;
#ifdef NO_SSE
float clipdistance[numclipdistances * 3];
bool needsclipping = false;
float *clipd = clipdistance;
for (int i = 0; i < 3; i++)
{
const auto &v = *verts[i];
clipd[0] = v.gl_Position.X + v.gl_Position.W;
clipd[1] = v.gl_Position.W - v.gl_Position.X;
clipd[2] = v.gl_Position.Y + v.gl_Position.W;
clipd[3] = v.gl_Position.W - v.gl_Position.Y;
if (DepthClamp)
{
clipd[4] = 1.0f;
clipd[5] = 1.0f;
}
else
{
clipd[4] = v.gl_Position.Z + v.gl_Position.W;
clipd[5] = v.gl_Position.W - v.gl_Position.Z;
}
clipd[6] = v.gl_ClipDistance[0];
clipd[7] = v.gl_ClipDistance[1];
clipd[8] = v.gl_ClipDistance[2];
for (int j = 0; j < 9; j++)
needsclipping = needsclipping || clipd[i];
clipd += numclipdistances;
}
// If all halfspace clip distances are positive then the entire triangle is visible. Skip the expensive clipping step.
if (!needsclipping)
{
return 3;
}
#else
__m128 mx = _mm_loadu_ps(&verts[0]->gl_Position.X);
__m128 my = _mm_loadu_ps(&verts[1]->gl_Position.X);
__m128 mz = _mm_loadu_ps(&verts[2]->gl_Position.X);
__m128 mw = _mm_setzero_ps();
_MM_TRANSPOSE4_PS(mx, my, mz, mw);
__m128 clipd0 = _mm_add_ps(mx, mw);
__m128 clipd1 = _mm_sub_ps(mw, mx);
__m128 clipd2 = _mm_add_ps(my, mw);
__m128 clipd3 = _mm_sub_ps(mw, my);
__m128 clipd4 = DepthClamp ? _mm_set1_ps(1.0f) : _mm_add_ps(mz, mw);
__m128 clipd5 = DepthClamp ? _mm_set1_ps(1.0f) : _mm_sub_ps(mw, mz);
__m128 clipd6 = _mm_setr_ps(verts[0]->gl_ClipDistance[0], verts[1]->gl_ClipDistance[0], verts[2]->gl_ClipDistance[0], 0.0f);
__m128 clipd7 = _mm_setr_ps(verts[0]->gl_ClipDistance[1], verts[1]->gl_ClipDistance[1], verts[2]->gl_ClipDistance[1], 0.0f);
__m128 clipd8 = _mm_setr_ps(verts[0]->gl_ClipDistance[2], verts[1]->gl_ClipDistance[2], verts[2]->gl_ClipDistance[2], 0.0f);
__m128 mneedsclipping = _mm_cmplt_ps(clipd0, _mm_setzero_ps());
mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd1, _mm_setzero_ps()));
mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd2, _mm_setzero_ps()));
mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd3, _mm_setzero_ps()));
mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd4, _mm_setzero_ps()));
mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd5, _mm_setzero_ps()));
mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd6, _mm_setzero_ps()));
mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd7, _mm_setzero_ps()));
mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd8, _mm_setzero_ps()));
if (_mm_movemask_ps(mneedsclipping) == 0)
{
return 3;
}
float clipdistance[numclipdistances * 4];
_mm_storeu_ps(clipdistance, clipd0);
_mm_storeu_ps(clipdistance + 4, clipd1);
_mm_storeu_ps(clipdistance + 8, clipd2);
_mm_storeu_ps(clipdistance + 12, clipd3);
_mm_storeu_ps(clipdistance + 16, clipd4);
_mm_storeu_ps(clipdistance + 20, clipd5);
_mm_storeu_ps(clipdistance + 24, clipd6);
_mm_storeu_ps(clipdistance + 28, clipd7);
_mm_storeu_ps(clipdistance + 32, clipd8);
#endif
// Clip against each halfspace
float *input = weights;
float *output = weights + max_additional_vertices * 3;
int inputverts = 3;
for (int p = 0; p < numclipdistances; p++)
{
// Clip each edge
int outputverts = 0;
for (int i = 0; i < inputverts; i++)
{
int j = (i + 1) % inputverts;
#ifdef NO_SSE
float clipdistance1 =
clipdistance[0 * numclipdistances + p] * input[i * 3 + 0] +
clipdistance[1 * numclipdistances + p] * input[i * 3 + 1] +
clipdistance[2 * numclipdistances + p] * input[i * 3 + 2];
float clipdistance2 =
clipdistance[0 * numclipdistances + p] * input[j * 3 + 0] +
clipdistance[1 * numclipdistances + p] * input[j * 3 + 1] +
clipdistance[2 * numclipdistances + p] * input[j * 3 + 2];
#else
float clipdistance1 =
clipdistance[0 + p * 4] * input[i * 3 + 0] +
clipdistance[1 + p * 4] * input[i * 3 + 1] +
clipdistance[2 + p * 4] * input[i * 3 + 2];
float clipdistance2 =
clipdistance[0 + p * 4] * input[j * 3 + 0] +
clipdistance[1 + p * 4] * input[j * 3 + 1] +
clipdistance[2 + p * 4] * input[j * 3 + 2];
#endif
// Clip halfspace
if ((clipdistance1 >= 0.0f || clipdistance2 >= 0.0f) && outputverts + 1 < max_additional_vertices)
{
float t1 = (clipdistance1 < 0.0f) ? max(-clipdistance1 / (clipdistance2 - clipdistance1), 0.0f) : 0.0f;
float t2 = (clipdistance2 < 0.0f) ? MIN(1.0f + clipdistance2 / (clipdistance1 - clipdistance2), 1.0f) : 1.0f;
// add t1 vertex
for (int k = 0; k < 3; k++)
output[outputverts * 3 + k] = input[i * 3 + k] * (1.0f - t1) + input[j * 3 + k] * t1;
outputverts++;
if (t2 != 1.0f && t2 > t1)
{
// add t2 vertex
for (int k = 0; k < 3; k++)
output[outputverts * 3 + k] = input[i * 3 + k] * (1.0f - t2) + input[j * 3 + k] * t2;
outputverts++;
}
}
}
std::swap(input, output);
inputverts = outputverts;
if (inputverts == 0)
break;
}
weights = input;
return inputverts;
}
PolyTriangleThreadData *PolyTriangleThreadData::Get(DrawerThread *thread)
{
if (!thread->poly)
thread->poly = std::make_shared<PolyTriangleThreadData>(thread->core, thread->num_cores, thread->numa_node, thread->num_numa_nodes, thread->numa_start_y, thread->numa_end_y);
return thread->poly.get();
}