From 545241aa06ef73320cf84c3ae57950186a01ca83 Mon Sep 17 00:00:00 2001 From: Magnus Norddahl Date: Sat, 22 Apr 2017 20:38:06 +0200 Subject: [PATCH] - add CPU.bSSE2 branching support to softpoly --- src/polyrenderer/drawers/poly_drawer32_sse2.h | 68 ++-- src/polyrenderer/drawers/poly_triangle.cpp | 201 ++++++++---- src/polyrenderer/drawers/poly_triangle.h | 6 +- src/polyrenderer/drawers/screen_triangle.cpp | 302 +++++++++++------- src/polyrenderer/drawers/screen_triangle.h | 5 + 5 files changed, 372 insertions(+), 210 deletions(-) diff --git a/src/polyrenderer/drawers/poly_drawer32_sse2.h b/src/polyrenderer/drawers/poly_drawer32_sse2.h index 5125c93c7..2f690f7e8 100644 --- a/src/polyrenderer/drawers/poly_drawer32_sse2.h +++ b/src/polyrenderer/drawers/poly_drawer32_sse2.h @@ -27,7 +27,7 @@ namespace TriScreenDrawerModes { template - FORCEINLINE unsigned int VECTORCALL Sample32(int32_t u, int32_t v, const uint32_t *texPixels, int texWidth, int texHeight, uint32_t oneU, uint32_t oneV, uint32_t color, const uint32_t *translation) + FORCEINLINE unsigned int VECTORCALL Sample32_SSE2(int32_t u, int32_t v, const uint32_t *texPixels, int texWidth, int texHeight, uint32_t oneU, uint32_t oneV, uint32_t color, const uint32_t *translation) { uint32_t texel; if (SamplerT::Mode == (int)Samplers::Shaded || SamplerT::Mode == (int)Samplers::Stencil || SamplerT::Mode == (int)Samplers::Fill || SamplerT::Mode == (int)Samplers::Fuzz) @@ -107,7 +107,7 @@ namespace TriScreenDrawerModes } template - FORCEINLINE unsigned int VECTORCALL SampleShade32(int32_t u, int32_t v, const uint32_t *texPixels, int texWidth, int texHeight, int &fuzzpos) + FORCEINLINE unsigned int VECTORCALL SampleShade32_SSE2(int32_t u, int32_t v, const uint32_t *texPixels, int texWidth, int texHeight, int &fuzzpos) { if (SamplerT::Mode == (int)Samplers::Shaded) { @@ -143,7 +143,7 @@ namespace TriScreenDrawerModes } template - FORCEINLINE __m128i VECTORCALL Shade32(__m128i fgcolor, __m128i mlight, unsigned int ifgcolor0, unsigned int ifgcolor1, int desaturate, __m128i inv_desaturate, __m128i shade_fade, __m128i shade_light) + FORCEINLINE __m128i VECTORCALL Shade32_SSE2(__m128i fgcolor, __m128i mlight, unsigned int ifgcolor0, unsigned int ifgcolor1, int desaturate, __m128i inv_desaturate, __m128i shade_fade, __m128i shade_light) { if (ShadeModeT::Mode == (int)ShadeMode::Simple) { @@ -172,7 +172,7 @@ namespace TriScreenDrawerModes } template - FORCEINLINE __m128i VECTORCALL Blend32(__m128i fgcolor, __m128i bgcolor, unsigned int ifgcolor0, unsigned int ifgcolor1, unsigned int ifgshade0, unsigned int ifgshade1, uint32_t srcalpha, uint32_t destalpha) + FORCEINLINE __m128i VECTORCALL Blend32_SSE2(__m128i fgcolor, __m128i bgcolor, unsigned int ifgcolor0, unsigned int ifgcolor1, unsigned int ifgshade0, unsigned int ifgshade1, uint32_t srcalpha, uint32_t destalpha) { if (BlendT::Mode == (int)BlendModes::Opaque) { @@ -275,7 +275,7 @@ namespace TriScreenDrawerModes } template -class TriScreenDrawer32 +class TriScreenDrawer32_SSE2 { public: static void Execute(int x, int y, uint32_t mask0, uint32_t mask1, const TriDrawTriangleArgs *args) @@ -430,13 +430,13 @@ private: // Sample fgcolor unsigned int ifgcolor[2], ifgshade[2]; - ifgcolor[0] = Sample32(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); - ifgshade[0] = SampleShade32(posU, posV, texPixels, texWidth, texHeight, fuzzpos); + ifgcolor[0] = Sample32_SSE2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); + ifgshade[0] = SampleShade32_SSE2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); posU += stepU; posV += stepV; - ifgcolor[1] = Sample32(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); - ifgshade[1] = SampleShade32(posU, posV, texPixels, texWidth, texHeight, fuzzpos); + ifgcolor[1] = Sample32_SSE2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); + ifgshade[1] = SampleShade32_SSE2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); posU += stepU; posV += stepV; @@ -460,8 +460,8 @@ private: // Shade and blend __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - fgcolor = Shade32(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light); - __m128i outcolor = Blend32(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha); + fgcolor = Shade32_SSE2(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light); + __m128i outcolor = Blend32_SSE2(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha); // Store result _mm_storel_epi64((__m128i*)(dest + ix * 2), outcolor); @@ -517,13 +517,13 @@ private: // Sample fgcolor unsigned int ifgcolor[2], ifgshade[2]; - ifgcolor[0] = Sample32(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); - ifgshade[0] = SampleShade32(posU, posV, texPixels, texWidth, texHeight, fuzzpos); + ifgcolor[0] = Sample32_SSE2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); + ifgshade[0] = SampleShade32_SSE2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); posU += stepU; posV += stepV; - ifgcolor[1] = Sample32(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); - ifgshade[1] = SampleShade32(posU, posV, texPixels, texWidth, texHeight, fuzzpos); + ifgcolor[1] = Sample32_SSE2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); + ifgshade[1] = SampleShade32_SSE2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); posU += stepU; posV += stepV; @@ -547,8 +547,8 @@ private: // Shade and blend __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - fgcolor = Shade32(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light); - __m128i outcolor = Blend32(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha); + fgcolor = Shade32_SSE2(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light); + __m128i outcolor = Blend32_SSE2(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha); // Store result _mm_storel_epi64((__m128i*)desttmp, outcolor); @@ -606,13 +606,13 @@ private: // Sample fgcolor unsigned int ifgcolor[2], ifgshade[2]; - ifgcolor[0] = Sample32(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); - ifgshade[0] = SampleShade32(posU, posV, texPixels, texWidth, texHeight, fuzzpos); + ifgcolor[0] = Sample32_SSE2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); + ifgshade[0] = SampleShade32_SSE2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); posU += stepU; posV += stepV; - ifgcolor[1] = Sample32(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); - ifgshade[1] = SampleShade32(posU, posV, texPixels, texWidth, texHeight, fuzzpos); + ifgcolor[1] = Sample32_SSE2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); + ifgshade[1] = SampleShade32_SSE2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); posU += stepU; posV += stepV; @@ -636,8 +636,8 @@ private: // Shade and blend __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - fgcolor = Shade32(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light); - __m128i outcolor = Blend32(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha); + fgcolor = Shade32_SSE2(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light); + __m128i outcolor = Blend32_SSE2(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha); // Store result _mm_storel_epi64((__m128i*)desttmp, outcolor); @@ -658,7 +658,7 @@ private: }; template -class RectScreenDrawer32 +class RectScreenDrawer32_SSE2 { public: static void Execute(const void *destOrg, int destWidth, int destHeight, int destPitch, const RectDrawArgs *args, WorkerThreadData *thread) @@ -780,18 +780,18 @@ private: // Sample fgcolor unsigned int ifgcolor[2], ifgshade[2]; - ifgcolor[0] = Sample32(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); - ifgshade[0] = SampleShade32(posU, posV, texPixels, texWidth, texHeight, fuzzpos); + ifgcolor[0] = Sample32_SSE2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); + ifgshade[0] = SampleShade32_SSE2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); posU += stepU; - ifgcolor[1] = Sample32(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); - ifgshade[1] = SampleShade32(posU, posV, texPixels, texWidth, texHeight, fuzzpos); + ifgcolor[1] = Sample32_SSE2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); + ifgshade[1] = SampleShade32_SSE2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); posU += stepU; // Shade and blend __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - fgcolor = Shade32(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light); - __m128i outcolor = Blend32(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha); + fgcolor = Shade32_SSE2(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light); + __m128i outcolor = Blend32_SSE2(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha); // Store result _mm_storel_epi64((__m128i*)dest, outcolor); @@ -809,16 +809,16 @@ private: // Sample fgcolor unsigned int ifgcolor[2], ifgshade[2]; - ifgcolor[0] = Sample32(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); - ifgshade[0] = SampleShade32(posU, posV, texPixels, texWidth, texHeight, fuzzpos); + ifgcolor[0] = Sample32_SSE2(posU, posV, texPixels, texWidth, texHeight, oneU, oneV, color, translation); + ifgshade[0] = SampleShade32_SSE2(posU, posV, texPixels, texWidth, texHeight, fuzzpos); ifgcolor[1] = 0; ifgshade[1] = 0; posU += stepU; // Shade and blend __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); - fgcolor = Shade32(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light); - __m128i outcolor = Blend32(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha); + fgcolor = Shade32_SSE2(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light); + __m128i outcolor = Blend32_SSE2(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha); // Store result *dest = _mm_cvtsi128_si32(outcolor); diff --git a/src/polyrenderer/drawers/poly_triangle.cpp b/src/polyrenderer/drawers/poly_triangle.cpp index 3e7caec90..fdb7f6de9 100644 --- a/src/polyrenderer/drawers/poly_triangle.cpp +++ b/src/polyrenderer/drawers/poly_triangle.cpp @@ -151,14 +151,8 @@ ShadedTriVertex PolyTriangleDrawer::shade_vertex(const TriMatrix &objectToClip, return sv; } -void PolyTriangleDrawer::draw_shaded_triangle(const ShadedTriVertex *vert, bool ccw, TriDrawTriangleArgs *args, WorkerThreadData *thread) +void PolyTriangleDrawer::clip_to_viewport(TriVertex *clippedvert, int numclipvert) { - // Cull, clip and generate additional vertices as needed - TriVertex clippedvert[max_additional_vertices]; - int numclipvert = clipedge(vert, clippedvert); - -#ifdef NO_SSE - // Map to 2D viewport: for (int j = 0; j < numclipvert; j++) { auto &v = clippedvert[j]; @@ -173,8 +167,11 @@ void PolyTriangleDrawer::draw_shaded_triangle(const ShadedTriVertex *vert, bool v.x = viewport_x + viewport_width * (1.0f + v.x) * 0.5f; v.y = viewport_y + viewport_height * (1.0f - v.y) * 0.5f; } -#else - // Map to 2D viewport: +} + +#ifndef NO_SSE +void PolyTriangleDrawer::clip_to_viewport_sse2(TriVertex *clippedvert, int numclipvert) +{ __m128 mviewport_x = _mm_set1_ps((float)viewport_x); __m128 mviewport_y = _mm_set1_ps((float)viewport_y); __m128 mviewport_halfwidth = _mm_set1_ps(viewport_width * 0.5f); @@ -205,8 +202,21 @@ void PolyTriangleDrawer::draw_shaded_triangle(const ShadedTriVertex *vert, bool _mm_storeu_ps(&clippedvert[j + 2].x, vz); _mm_storeu_ps(&clippedvert[j + 3].x, vw); } +} #endif +void PolyTriangleDrawer::draw_shaded_triangle(const ShadedTriVertex *vert, bool ccw, TriDrawTriangleArgs *args, WorkerThreadData *thread) +{ + // Cull, clip and generate additional vertices as needed + TriVertex clippedvert[max_additional_vertices]; + int numclipvert = CPU.bSSE2 ? clipedge_sse2(vert, clippedvert) : clipedge(vert, clippedvert); + + // Map to 2D viewport: + if (CPU.bSSE2) + clip_to_viewport_sse2(clippedvert, numclipvert); + else + clip_to_viewport(clippedvert, numclipvert); + // Keep varyings in -128 to 128 range if possible if (numclipvert > 0) { @@ -255,7 +265,6 @@ int PolyTriangleDrawer::clipedge(const ShadedTriVertex *verts, TriVertex *clippe // halfspace clip distances static const int numclipdistances = 7; -#ifdef NO_SSE float clipdistance[numclipdistances * 3]; bool needsclipping = false; float *clipd = clipdistance; @@ -282,43 +291,6 @@ int PolyTriangleDrawer::clipedge(const ShadedTriVertex *verts, TriVertex *clippe } return 3; } -#else - __m128 mx = _mm_loadu_ps(&verts[0].x); - __m128 my = _mm_loadu_ps(&verts[1].x); - __m128 mz = _mm_loadu_ps(&verts[2].x); - __m128 mw = _mm_setzero_ps(); - _MM_TRANSPOSE4_PS(mx, my, mz, mw); - __m128 clipd0 = _mm_add_ps(mx, mw); - __m128 clipd1 = _mm_sub_ps(mw, mx); - __m128 clipd2 = _mm_add_ps(my, mw); - __m128 clipd3 = _mm_sub_ps(mw, my); - __m128 clipd4 = _mm_add_ps(mz, mw); - __m128 clipd5 = _mm_sub_ps(mw, mz); - __m128 clipd6 = _mm_setr_ps(verts[0].clipDistance0, verts[1].clipDistance0, verts[2].clipDistance0, 0.0f); - __m128 mneedsclipping = _mm_cmplt_ps(clipd0, _mm_setzero_ps()); - mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd1, _mm_setzero_ps())); - mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd2, _mm_setzero_ps())); - mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd3, _mm_setzero_ps())); - mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd4, _mm_setzero_ps())); - mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd5, _mm_setzero_ps())); - mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd6, _mm_setzero_ps())); - if (_mm_movemask_ps(mneedsclipping) == 0) - { - for (int i = 0; i < 3; i++) - { - memcpy(clippedvert + i, verts + i, sizeof(TriVertex)); - } - return 3; - } - float clipdistance[numclipdistances * 4]; - _mm_storeu_ps(clipdistance, clipd0); - _mm_storeu_ps(clipdistance + 4, clipd1); - _mm_storeu_ps(clipdistance + 8, clipd2); - _mm_storeu_ps(clipdistance + 12, clipd3); - _mm_storeu_ps(clipdistance + 16, clipd4); - _mm_storeu_ps(clipdistance + 20, clipd5); - _mm_storeu_ps(clipdistance + 24, clipd6); -#endif // use barycentric weights while clipping vertices float weights[max_additional_vertices * 3 * 2]; @@ -341,7 +313,6 @@ int PolyTriangleDrawer::clipedge(const ShadedTriVertex *verts, TriVertex *clippe for (int i = 0; i < inputverts; i++) { int j = (i + 1) % inputverts; -#ifdef NO_SSE float clipdistance1 = clipdistance[0 * numclipdistances + p] * input[i * 3 + 0] + clipdistance[1 * numclipdistances + p] * input[i * 3 + 1] + @@ -351,17 +322,6 @@ int PolyTriangleDrawer::clipedge(const ShadedTriVertex *verts, TriVertex *clippe clipdistance[0 * numclipdistances + p] * input[j * 3 + 0] + clipdistance[1 * numclipdistances + p] * input[j * 3 + 1] + clipdistance[2 * numclipdistances + p] * input[j * 3 + 2]; -#else - float clipdistance1 = - clipdistance[0 + p * 4] * input[i * 3 + 0] + - clipdistance[1 + p * 4] * input[i * 3 + 1] + - clipdistance[2 + p * 4] * input[i * 3 + 2]; - - float clipdistance2 = - clipdistance[0 + p * 4] * input[j * 3 + 0] + - clipdistance[1 + p * 4] * input[j * 3 + 1] + - clipdistance[2 + p * 4] * input[j * 3 + 2]; -#endif // Clip halfspace if ((clipdistance1 >= 0.0f || clipdistance2 >= 0.0f) && outputverts + 1 < max_additional_vertices) @@ -408,6 +368,129 @@ int PolyTriangleDrawer::clipedge(const ShadedTriVertex *verts, TriVertex *clippe return inputverts; } +#ifndef NO_SSE +int PolyTriangleDrawer::clipedge_sse2(const ShadedTriVertex *verts, TriVertex *clippedvert) +{ + // Clip and cull so that the following is true for all vertices: + // -v.w <= v.x <= v.w + // -v.w <= v.y <= v.w + // -v.w <= v.z <= v.w + + // halfspace clip distances + static const int numclipdistances = 7; + __m128 mx = _mm_loadu_ps(&verts[0].x); + __m128 my = _mm_loadu_ps(&verts[1].x); + __m128 mz = _mm_loadu_ps(&verts[2].x); + __m128 mw = _mm_setzero_ps(); + _MM_TRANSPOSE4_PS(mx, my, mz, mw); + __m128 clipd0 = _mm_add_ps(mx, mw); + __m128 clipd1 = _mm_sub_ps(mw, mx); + __m128 clipd2 = _mm_add_ps(my, mw); + __m128 clipd3 = _mm_sub_ps(mw, my); + __m128 clipd4 = _mm_add_ps(mz, mw); + __m128 clipd5 = _mm_sub_ps(mw, mz); + __m128 clipd6 = _mm_setr_ps(verts[0].clipDistance0, verts[1].clipDistance0, verts[2].clipDistance0, 0.0f); + __m128 mneedsclipping = _mm_cmplt_ps(clipd0, _mm_setzero_ps()); + mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd1, _mm_setzero_ps())); + mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd2, _mm_setzero_ps())); + mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd3, _mm_setzero_ps())); + mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd4, _mm_setzero_ps())); + mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd5, _mm_setzero_ps())); + mneedsclipping = _mm_or_ps(mneedsclipping, _mm_cmplt_ps(clipd6, _mm_setzero_ps())); + if (_mm_movemask_ps(mneedsclipping) == 0) + { + for (int i = 0; i < 3; i++) + { + memcpy(clippedvert + i, verts + i, sizeof(TriVertex)); + } + return 3; + } + float clipdistance[numclipdistances * 4]; + _mm_storeu_ps(clipdistance, clipd0); + _mm_storeu_ps(clipdistance + 4, clipd1); + _mm_storeu_ps(clipdistance + 8, clipd2); + _mm_storeu_ps(clipdistance + 12, clipd3); + _mm_storeu_ps(clipdistance + 16, clipd4); + _mm_storeu_ps(clipdistance + 20, clipd5); + _mm_storeu_ps(clipdistance + 24, clipd6); + + // use barycentric weights while clipping vertices + float weights[max_additional_vertices * 3 * 2]; + for (int i = 0; i < 3; i++) + { + weights[i * 3 + 0] = 0.0f; + weights[i * 3 + 1] = 0.0f; + weights[i * 3 + 2] = 0.0f; + weights[i * 3 + i] = 1.0f; + } + + // Clip against each halfspace + float *input = weights; + float *output = weights + max_additional_vertices * 3; + int inputverts = 3; + for (int p = 0; p < numclipdistances; p++) + { + // Clip each edge + int outputverts = 0; + for (int i = 0; i < inputverts; i++) + { + int j = (i + 1) % inputverts; + float clipdistance1 = + clipdistance[0 + p * 4] * input[i * 3 + 0] + + clipdistance[1 + p * 4] * input[i * 3 + 1] + + clipdistance[2 + p * 4] * input[i * 3 + 2]; + + float clipdistance2 = + clipdistance[0 + p * 4] * input[j * 3 + 0] + + clipdistance[1 + p * 4] * input[j * 3 + 1] + + clipdistance[2 + p * 4] * input[j * 3 + 2]; + + // Clip halfspace + if ((clipdistance1 >= 0.0f || clipdistance2 >= 0.0f) && outputverts + 1 < max_additional_vertices) + { + float t1 = (clipdistance1 < 0.0f) ? MAX(-clipdistance1 / (clipdistance2 - clipdistance1), 0.0f) : 0.0f; + float t2 = (clipdistance2 < 0.0f) ? MIN(1.0f + clipdistance2 / (clipdistance1 - clipdistance2), 1.0f) : 1.0f; + + // add t1 vertex + for (int k = 0; k < 3; k++) + output[outputverts * 3 + k] = input[i * 3 + k] * (1.0f - t1) + input[j * 3 + k] * t1; + outputverts++; + + if (t2 != 1.0f && t2 > t1) + { + // add t2 vertex + for (int k = 0; k < 3; k++) + output[outputverts * 3 + k] = input[i * 3 + k] * (1.0f - t2) + input[j * 3 + k] * t2; + outputverts++; + } + } + } + std::swap(input, output); + inputverts = outputverts; + if (inputverts == 0) + break; + } + + // Convert barycentric weights to actual vertices + for (int i = 0; i < inputverts; i++) + { + auto &v = clippedvert[i]; + memset(&v, 0, sizeof(TriVertex)); + for (int w = 0; w < 3; w++) + { + float weight = input[i * 3 + w]; + v.x += verts[w].x * weight; + v.y += verts[w].y * weight; + v.z += verts[w].z * weight; + v.w += verts[w].w * weight; + v.u += verts[w].u * weight; + v.v += verts[w].v * weight; + } + } + return inputverts; +} +#endif + ///////////////////////////////////////////////////////////////////////////// DrawPolyTrianglesCommand::DrawPolyTrianglesCommand(const PolyDrawArgs &args, bool mirror) diff --git a/src/polyrenderer/drawers/poly_triangle.h b/src/polyrenderer/drawers/poly_triangle.h index c939149d3..ca92ac735 100644 --- a/src/polyrenderer/drawers/poly_triangle.h +++ b/src/polyrenderer/drawers/poly_triangle.h @@ -47,8 +47,12 @@ private: static ShadedTriVertex shade_vertex(const TriMatrix &objectToClip, const float *clipPlane, const TriVertex &v); static void draw_arrays(const PolyDrawArgs &args, WorkerThreadData *thread); static void draw_shaded_triangle(const ShadedTriVertex *vertices, bool ccw, TriDrawTriangleArgs *args, WorkerThreadData *thread); - + static void clip_to_viewport(TriVertex *clippedvert, int numclipvert); static int clipedge(const ShadedTriVertex *verts, TriVertex *clippedvert); +#ifndef NO_SSE + static void clip_to_viewport_sse2(TriVertex *clippedvert, int numclipvert); + static int clipedge_sse2(const ShadedTriVertex *verts, TriVertex *clippedvert); +#endif static int viewport_x, viewport_y, viewport_width, viewport_height, dest_pitch, dest_width, dest_height; static bool dest_bgra; diff --git a/src/polyrenderer/drawers/screen_triangle.cpp b/src/polyrenderer/drawers/screen_triangle.cpp index 8547335ed..95bf7826b 100644 --- a/src/polyrenderer/drawers/screen_triangle.cpp +++ b/src/polyrenderer/drawers/screen_triangle.cpp @@ -41,6 +41,13 @@ #endif #include "poly_drawer8.h" +namespace +{ + class SSE2CPU { public: static const int HasSSE2 = 1; }; + class GenericCPU { public: static const int HasSSE2 = 0; }; +} + +template class TriangleBlock { public: @@ -114,9 +121,17 @@ private: void ClipTest(); void StencilWrite(); void SubsectorWrite(); + +#ifndef NO_SSE + void CoverageTestSSE2(); + void StencilEqualTestSSE2(); + void SubsectorTestSSE2(); + void SubsectorWriteSSE2(); +#endif }; -TriangleBlock::TriangleBlock(const TriDrawTriangleArgs *args) +template +TriangleBlock::TriangleBlock(const TriDrawTriangleArgs *args) { const TriVertex &v1 = *args->v1; const TriVertex &v2 = *args->v2; @@ -145,19 +160,32 @@ TriangleBlock::TriangleBlock(const TriDrawTriangleArgs *args) const int X2 = (int)round(16.0f * v2.x); const int X3 = (int)round(16.0f * v3.x); #else - int tempround[4 * 3]; - __m128 m16 = _mm_set1_ps(16.0f); - __m128 mhalf = _mm_set1_ps(65536.5f); - __m128i m65536 = _mm_set1_epi32(65536); - _mm_storeu_si128((__m128i*)tempround, _mm_sub_epi32(_mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_loadu_ps((const float*)&v1), m16), mhalf)), m65536)); - _mm_storeu_si128((__m128i*)(tempround + 4), _mm_sub_epi32(_mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_loadu_ps((const float*)&v2), m16), mhalf)), m65536)); - _mm_storeu_si128((__m128i*)(tempround + 8), _mm_sub_epi32(_mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_loadu_ps((const float*)&v3), m16), mhalf)), m65536)); - const int X1 = tempround[0]; - const int X2 = tempround[4]; - const int X3 = tempround[8]; - const int Y1 = tempround[1]; - const int Y2 = tempround[5]; - const int Y3 = tempround[9]; + int Y1, Y2, Y3, X1, X2, X3; + if (CPUType::HasSSE2 == 1) + { + int tempround[4 * 3]; + __m128 m16 = _mm_set1_ps(16.0f); + __m128 mhalf = _mm_set1_ps(65536.5f); + __m128i m65536 = _mm_set1_epi32(65536); + _mm_storeu_si128((__m128i*)tempround, _mm_sub_epi32(_mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_loadu_ps((const float*)&v1), m16), mhalf)), m65536)); + _mm_storeu_si128((__m128i*)(tempround + 4), _mm_sub_epi32(_mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_loadu_ps((const float*)&v2), m16), mhalf)), m65536)); + _mm_storeu_si128((__m128i*)(tempround + 8), _mm_sub_epi32(_mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_loadu_ps((const float*)&v3), m16), mhalf)), m65536)); + X1 = tempround[0]; + X2 = tempround[4]; + X3 = tempround[8]; + Y1 = tempround[1]; + Y2 = tempround[5]; + Y3 = tempround[9]; + } + else + { + Y1 = (int)round(16.0f * v1.y); + Y2 = (int)round(16.0f * v2.y); + Y3 = (int)round(16.0f * v3.y); + X1 = (int)round(16.0f * v1.x); + X2 = (int)round(16.0f * v2.x); + X3 = (int)round(16.0f * v3.x); + } #endif // Deltas @@ -203,28 +231,32 @@ TriangleBlock::TriangleBlock(const TriDrawTriangleArgs *args) if (DY31 < 0 || (DY31 == 0 && DX31 > 0)) C3++; #ifndef NO_SSE - mFDY12Offset = _mm_setr_epi32(0, FDY12, FDY12 * 2, FDY12 * 3); - mFDY23Offset = _mm_setr_epi32(0, FDY23, FDY23 * 2, FDY23 * 3); - mFDY31Offset = _mm_setr_epi32(0, FDY31, FDY31 * 2, FDY31 * 3); - mFDY12x4 = _mm_set1_epi32(FDY12 * 4); - mFDY23x4 = _mm_set1_epi32(FDY23 * 4); - mFDY31x4 = _mm_set1_epi32(FDY31 * 4); - mFDX12 = _mm_set1_epi32(FDX12); - mFDX23 = _mm_set1_epi32(FDX23); - mFDX31 = _mm_set1_epi32(FDX31); - mC1 = _mm_set1_epi32(C1); - mC2 = _mm_set1_epi32(C2); - mC3 = _mm_set1_epi32(C3); - mDX12 = _mm_set1_epi32(DX12); - mDY12 = _mm_set1_epi32(DY12); - mDX23 = _mm_set1_epi32(DX23); - mDY23 = _mm_set1_epi32(DY23); - mDX31 = _mm_set1_epi32(DX31); - mDY31 = _mm_set1_epi32(DY31); + if (CPUType::HasSSE2 == 1) + { + mFDY12Offset = _mm_setr_epi32(0, FDY12, FDY12 * 2, FDY12 * 3); + mFDY23Offset = _mm_setr_epi32(0, FDY23, FDY23 * 2, FDY23 * 3); + mFDY31Offset = _mm_setr_epi32(0, FDY31, FDY31 * 2, FDY31 * 3); + mFDY12x4 = _mm_set1_epi32(FDY12 * 4); + mFDY23x4 = _mm_set1_epi32(FDY23 * 4); + mFDY31x4 = _mm_set1_epi32(FDY31 * 4); + mFDX12 = _mm_set1_epi32(FDX12); + mFDX23 = _mm_set1_epi32(FDX23); + mFDX31 = _mm_set1_epi32(FDX31); + mC1 = _mm_set1_epi32(C1); + mC2 = _mm_set1_epi32(C2); + mC3 = _mm_set1_epi32(C3); + mDX12 = _mm_set1_epi32(DX12); + mDY12 = _mm_set1_epi32(DY12); + mDX23 = _mm_set1_epi32(DX23); + mDY23 = _mm_set1_epi32(DY23); + mDX31 = _mm_set1_epi32(DX31); + mDY31 = _mm_set1_epi32(DY31); + } #endif } -void TriangleBlock::Loop(const TriDrawTriangleArgs *args, WorkerThreadData *thread) +template +void TriangleBlock::Loop(const TriDrawTriangleArgs *args, WorkerThreadData *thread) { // First block line for this thread int core = thread->core; @@ -236,9 +268,18 @@ void TriangleBlock::Loop(const TriDrawTriangleArgs *args, WorkerThreadData *thre bool writeColor = args->uniforms->WriteColor(); bool writeStencil = args->uniforms->WriteStencil(); bool writeSubsector = args->uniforms->WriteSubsector(); - int bmode = (int)args->uniforms->BlendMode(); + + // Find the drawer function for the given blend mode +#ifndef NO_SSE + void(*drawFunc)(int, int, uint32_t, uint32_t, const TriDrawTriangleArgs *); + if (CPUType::HasSSE2 == 1) + drawFunc = args->destBgra ? ScreenTriangle::TriDrawers32_SSE2[bmode] : ScreenTriangle::TriDrawers8[bmode]; + else + drawFunc = args->destBgra ? ScreenTriangle::TriDrawers32[bmode] : ScreenTriangle::TriDrawers8[bmode]; +#else auto drawFunc = args->destBgra ? ScreenTriangle::TriDrawers32[bmode] : ScreenTriangle::TriDrawers8[bmode]; +#endif // Loop through blocks for (int y = start_miny; y < maxy; y += q * num_cores) @@ -248,7 +289,11 @@ void TriangleBlock::Loop(const TriDrawTriangleArgs *args, WorkerThreadData *thre X = x; Y = y; - CoverageTest(); + if (CPUType::HasSSE2 == 1) + CoverageTestSSE2(); + else + CoverageTest(); + if (Mask0 == 0 && Mask1 == 0) continue; @@ -259,7 +304,11 @@ void TriangleBlock::Loop(const TriDrawTriangleArgs *args, WorkerThreadData *thre // To do: make the stencil test use its own flag for comparison mode instead of abusing the subsector test.. if (!subsectorTest) { - StencilEqualTest(); + if (CPUType::HasSSE2 == 1) + StencilEqualTestSSE2(); + else + StencilEqualTest(); + if (Mask0 == 0 && Mask1 == 0) continue; } @@ -269,7 +318,11 @@ void TriangleBlock::Loop(const TriDrawTriangleArgs *args, WorkerThreadData *thre if (Mask0 == 0 && Mask1 == 0) continue; - SubsectorTest(); + if (CPUType::HasSSE2 == 1) + SubsectorTestSSE2(); + else + SubsectorTest(); + if (Mask0 == 0 && Mask1 == 0) continue; } @@ -279,14 +332,18 @@ void TriangleBlock::Loop(const TriDrawTriangleArgs *args, WorkerThreadData *thre if (writeStencil) StencilWrite(); if (writeSubsector) - SubsectorWrite(); + { + if (CPUType::HasSSE2 == 1) + SubsectorWriteSSE2(); + else + SubsectorWrite(); + } } } } -#ifdef NO_SSE - -void TriangleBlock::SubsectorTest() +template +void TriangleBlock::SubsectorTest() { int block = (X >> 3) + (Y >> 3) * subsectorPitch; uint32_t *subsector = subsectorGBuffer + block * 64; @@ -312,9 +369,10 @@ void TriangleBlock::SubsectorTest() Mask1 = Mask1 & mask1; } -#else +#ifndef NO_SSE -void TriangleBlock::SubsectorTest() +template +void TriangleBlock::SubsectorTestSSE2() { int block = (X >> 3) + (Y >> 3) * subsectorPitch; uint32_t *subsector = subsectorGBuffer + block * 64; @@ -342,7 +400,8 @@ void TriangleBlock::SubsectorTest() #endif -void TriangleBlock::ClipTest() +template +void TriangleBlock::ClipTest() { static const uint32_t clipxmask[8] = { @@ -376,9 +435,8 @@ void TriangleBlock::ClipTest() Mask1 = Mask1 & xmask & ymask1; } -#ifdef NO_SSE - -void TriangleBlock::StencilEqualTest() +template +void TriangleBlock::StencilEqualTest() { // Stencil test the whole block, if possible int block = (X >> 3) + (Y >> 3) * stencilPitch; @@ -421,9 +479,10 @@ void TriangleBlock::StencilEqualTest() } } -#else +#ifndef NO_SSE -void TriangleBlock::StencilEqualTest() +template +void TriangleBlock::StencilEqualTestSSE2() { // Stencil test the whole block, if possible int block = (X >> 3) + (Y >> 3) * stencilPitch; @@ -489,7 +548,8 @@ void TriangleBlock::StencilEqualTest() #endif -void TriangleBlock::StencilGreaterEqualTest() +template +void TriangleBlock::StencilGreaterEqualTest() { // Stencil test the whole block, if possible int block = (X >> 3) + (Y >> 3) * stencilPitch; @@ -532,9 +592,8 @@ void TriangleBlock::StencilGreaterEqualTest() } } -#ifdef NO_SSE - -void TriangleBlock::CoverageTest() +template +void TriangleBlock::CoverageTest() { // Corners of block int x0 = X << 4; @@ -631,9 +690,10 @@ void TriangleBlock::CoverageTest() } } -#else +#ifndef NO_SSE -void TriangleBlock::CoverageTest() +template +void TriangleBlock::CoverageTestSSE2() { // Corners of block int x0 = X << 4; @@ -743,7 +803,8 @@ void TriangleBlock::CoverageTest() #endif -void TriangleBlock::StencilWrite() +template +void TriangleBlock::StencilWrite() { int block = (X >> 3) + (Y >> 3) * stencilPitch; uint8_t *stencilBlock = &stencilValues[block * 64]; @@ -793,9 +854,8 @@ void TriangleBlock::StencilWrite() } } -#ifdef NO_SSE - -void TriangleBlock::SubsectorWrite() +template +void TriangleBlock::SubsectorWrite() { int block = (X >> 3) + (Y >> 3) * subsectorPitch; uint32_t *subsector = subsectorGBuffer + block * 64; @@ -828,9 +888,10 @@ void TriangleBlock::SubsectorWrite() } } -#else +#ifndef NO_SSE -void TriangleBlock::SubsectorWrite() +template +void TriangleBlock::SubsectorWriteSSE2() { int block = (X >> 3) + (Y >> 3) * subsectorPitch; uint32_t *subsector = subsectorGBuffer + block * 64; @@ -887,8 +948,21 @@ void TriangleBlock::SubsectorWrite() void ScreenTriangle::Draw(const TriDrawTriangleArgs *args, WorkerThreadData *thread) { - TriangleBlock block(args); +#ifdef NO_SSE + TriangleBlock block(args); block.Loop(args, thread); +#else + if (CPU.bSSE2) + { + TriangleBlock block(args); + block.Loop(args, thread); + } + else + { + TriangleBlock block(args); + block.Loop(args, thread); + } +#endif } void(*ScreenTriangle::TriDrawers8[])(int, int, uint32_t, uint32_t, const TriDrawTriangleArgs *) = @@ -918,40 +992,38 @@ void(*ScreenTriangle::TriDrawers8[])(int, int, uint32_t, uint32_t, const TriDraw &TriScreenDrawer8::Execute // Fuzz }; -#ifdef NO_SSE - void(*ScreenTriangle::TriDrawers32[])(int, int, uint32_t, uint32_t, const TriDrawTriangleArgs *) = { nullptr }; -#else +#ifndef NO_SSE -void(*ScreenTriangle::TriDrawers32[])(int, int, uint32_t, uint32_t, const TriDrawTriangleArgs *) = +void(*ScreenTriangle::TriDrawers32_SSE2[])(int, int, uint32_t, uint32_t, const TriDrawTriangleArgs *) = { - &TriScreenDrawer32::Execute, // TextureOpaque - &TriScreenDrawer32::Execute, // TextureMasked - &TriScreenDrawer32::Execute, // TextureAdd - &TriScreenDrawer32::Execute, // TextureSub - &TriScreenDrawer32::Execute, // TextureRevSub - &TriScreenDrawer32::Execute, // TextureAddSrcColor - &TriScreenDrawer32::Execute, // TranslatedOpaque - &TriScreenDrawer32::Execute, // TranslatedMasked - &TriScreenDrawer32::Execute, // TranslatedAdd - &TriScreenDrawer32::Execute, // TranslatedSub - &TriScreenDrawer32::Execute, // TranslatedRevSub - &TriScreenDrawer32::Execute, // TranslatedAddSrcColor - &TriScreenDrawer32::Execute, // Shaded - &TriScreenDrawer32::Execute, // AddShaded - &TriScreenDrawer32::Execute, // Stencil - &TriScreenDrawer32::Execute, // AddStencil - &TriScreenDrawer32::Execute, // FillOpaque - &TriScreenDrawer32::Execute, // FillAdd - &TriScreenDrawer32::Execute, // FillSub - &TriScreenDrawer32::Execute, // FillRevSub - &TriScreenDrawer32::Execute, // FillAddSrcColor - &TriScreenDrawer32::Execute, // Skycap - &TriScreenDrawer32::Execute // Fuzz + &TriScreenDrawer32_SSE2::Execute, // TextureOpaque + &TriScreenDrawer32_SSE2::Execute, // TextureMasked + &TriScreenDrawer32_SSE2::Execute, // TextureAdd + &TriScreenDrawer32_SSE2::Execute, // TextureSub + &TriScreenDrawer32_SSE2::Execute, // TextureRevSub + &TriScreenDrawer32_SSE2::Execute, // TextureAddSrcColor + &TriScreenDrawer32_SSE2::Execute, // TranslatedOpaque + &TriScreenDrawer32_SSE2::Execute, // TranslatedMasked + &TriScreenDrawer32_SSE2::Execute, // TranslatedAdd + &TriScreenDrawer32_SSE2::Execute, // TranslatedSub + &TriScreenDrawer32_SSE2::Execute, // TranslatedRevSub + &TriScreenDrawer32_SSE2::Execute, // TranslatedAddSrcColor + &TriScreenDrawer32_SSE2::Execute, // Shaded + &TriScreenDrawer32_SSE2::Execute, // AddShaded + &TriScreenDrawer32_SSE2::Execute, // Stencil + &TriScreenDrawer32_SSE2::Execute, // AddStencil + &TriScreenDrawer32_SSE2::Execute, // FillOpaque + &TriScreenDrawer32_SSE2::Execute, // FillAdd + &TriScreenDrawer32_SSE2::Execute, // FillSub + &TriScreenDrawer32_SSE2::Execute, // FillRevSub + &TriScreenDrawer32_SSE2::Execute, // FillAddSrcColor + &TriScreenDrawer32_SSE2::Execute, // Skycap + &TriScreenDrawer32_SSE2::Execute // Fuzz }; #endif @@ -983,40 +1055,38 @@ void(*ScreenTriangle::RectDrawers8[])(const void *, int, int, int, const RectDra &RectScreenDrawer8::Execute // Fuzz }; -#ifdef NO_SSE - void(*ScreenTriangle::RectDrawers32[])(const void *, int, int, int, const RectDrawArgs *, WorkerThreadData *) = { nullptr }; -#else +#ifndef NO_SSE -void(*ScreenTriangle::RectDrawers32[])(const void *, int, int, int, const RectDrawArgs *, WorkerThreadData *) = +void(*ScreenTriangle::RectDrawers32_SSE2[])(const void *, int, int, int, const RectDrawArgs *, WorkerThreadData *) = { - &RectScreenDrawer32::Execute, // TextureOpaque - &RectScreenDrawer32::Execute, // TextureMasked - &RectScreenDrawer32::Execute, // TextureAdd - &RectScreenDrawer32::Execute, // TextureSub - &RectScreenDrawer32::Execute, // TextureRevSub - &RectScreenDrawer32::Execute, // TextureAddSrcColor - &RectScreenDrawer32::Execute, // TranslatedOpaque - &RectScreenDrawer32::Execute, // TranslatedMasked - &RectScreenDrawer32::Execute, // TranslatedAdd - &RectScreenDrawer32::Execute, // TranslatedSub - &RectScreenDrawer32::Execute, // TranslatedRevSub - &RectScreenDrawer32::Execute, // TranslatedAddSrcColor - &RectScreenDrawer32::Execute, // Shaded - &RectScreenDrawer32::Execute, // AddShaded - &RectScreenDrawer32::Execute, // Stencil - &RectScreenDrawer32::Execute, // AddStencil - &RectScreenDrawer32::Execute, // FillOpaque - &RectScreenDrawer32::Execute, // FillAdd - &RectScreenDrawer32::Execute, // FillSub - &RectScreenDrawer32::Execute, // FillRevSub - &RectScreenDrawer32::Execute, // FillAddSrcColor - &RectScreenDrawer32::Execute, // Skycap - &RectScreenDrawer32::Execute // Fuzz + &RectScreenDrawer32_SSE2::Execute, // TextureOpaque + &RectScreenDrawer32_SSE2::Execute, // TextureMasked + &RectScreenDrawer32_SSE2::Execute, // TextureAdd + &RectScreenDrawer32_SSE2::Execute, // TextureSub + &RectScreenDrawer32_SSE2::Execute, // TextureRevSub + &RectScreenDrawer32_SSE2::Execute, // TextureAddSrcColor + &RectScreenDrawer32_SSE2::Execute, // TranslatedOpaque + &RectScreenDrawer32_SSE2::Execute, // TranslatedMasked + &RectScreenDrawer32_SSE2::Execute, // TranslatedAdd + &RectScreenDrawer32_SSE2::Execute, // TranslatedSub + &RectScreenDrawer32_SSE2::Execute, // TranslatedRevSub + &RectScreenDrawer32_SSE2::Execute, // TranslatedAddSrcColor + &RectScreenDrawer32_SSE2::Execute, // Shaded + &RectScreenDrawer32_SSE2::Execute, // AddShaded + &RectScreenDrawer32_SSE2::Execute, // Stencil + &RectScreenDrawer32_SSE2::Execute, // AddStencil + &RectScreenDrawer32_SSE2::Execute, // FillOpaque + &RectScreenDrawer32_SSE2::Execute, // FillAdd + &RectScreenDrawer32_SSE2::Execute, // FillSub + &RectScreenDrawer32_SSE2::Execute, // FillRevSub + &RectScreenDrawer32_SSE2::Execute, // FillAddSrcColor + &RectScreenDrawer32_SSE2::Execute, // Skycap + &RectScreenDrawer32_SSE2::Execute // Fuzz }; #endif diff --git a/src/polyrenderer/drawers/screen_triangle.h b/src/polyrenderer/drawers/screen_triangle.h index 3dd4c24eb..615a0c631 100644 --- a/src/polyrenderer/drawers/screen_triangle.h +++ b/src/polyrenderer/drawers/screen_triangle.h @@ -131,6 +131,11 @@ public: static void(*RectDrawers8[])(const void *, int, int, int, const RectDrawArgs *, WorkerThreadData *); static void(*RectDrawers32[])(const void *, int, int, int, const RectDrawArgs *, WorkerThreadData *); +#ifndef NO_SSE + static void(*TriDrawers32_SSE2[])(int, int, uint32_t, uint32_t, const TriDrawTriangleArgs *); + static void(*RectDrawers32_SSE2[])(const void *, int, int, int, const RectDrawArgs *, WorkerThreadData *); +#endif + static int FuzzStart; };