From ca2ef805b83ca632b9a4bca47e72e31f288f391f Mon Sep 17 00:00:00 2001
From: Magnus Norddahl
Date: Fri, 22 Sep 2017 00:57:51 +0200
Subject: [PATCH] - Implement the affine part of the dynamic lights in the softpoly TC SSE2 drawer

---
Review note (text in this position is ignored by `git am`):
The mask that pre-advances the upper 64 bits of dynlight (the second pixel of
each processed pair) by one step was _mm_set_epi32(0xffff, 0xffff, 0, 0). That
sets only the low 16 bits of each upper 32-bit element, so only two of the four
16-bit lanes in the upper half received the step. The three affected lines now
use _mm_set_epi32(-1, -1, 0, 0), which selects all four lanes.
NOTE(review): line breaks and indentation below were reconstructed from the
hunk line counts; the bare `template` context lines look like they lost their
parameter lists in extraction; the index hash predates the mask change.
Confirm against the repository before applying.

 src/polyrenderer/drawers/poly_drawer32_sse2.h | 66 ++++++++++++++-----
 1 file changed, 49 insertions(+), 17 deletions(-)

diff --git a/src/polyrenderer/drawers/poly_drawer32_sse2.h b/src/polyrenderer/drawers/poly_drawer32_sse2.h
index 1cef53719..0f69f5a4d 100644
--- a/src/polyrenderer/drawers/poly_drawer32_sse2.h
+++ b/src/polyrenderer/drawers/poly_drawer32_sse2.h
@@ -142,7 +142,14 @@ namespace TriScreenDrawerModes
 		}
 	}
 
-	FORCEINLINE __m128i VECTORCALL AddLights(__m128i material, __m128i fgcolor, const PolyLight *lights, int num_lights, __m128 worldpos, __m128 worldnormal)
+	FORCEINLINE __m128i VECTORCALL AddLights(__m128i material, __m128i fgcolor, __m128i dynlight)
+	{
+		fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, dynlight), 8));
+		fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255));
+		return fgcolor;
+	}
+
+	FORCEINLINE __m128i VECTORCALL CalcDynamicLight(const PolyLight *lights, int num_lights, __m128 worldpos, __m128 worldnormal)
 	{
 		__m128i lit = _mm_setzero_si128();
 
@@ -189,15 +196,11 @@ namespace TriScreenDrawerModes
 			lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8));
 		}
 
-		lit = _mm_min_epi16(lit, _mm_set1_epi16(256));
-
-		fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8));
-		fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255));
-		return fgcolor;
+		return _mm_min_epi16(lit, _mm_set1_epi16(256));
 	}
 
 	template
-	FORCEINLINE __m128i VECTORCALL Shade32(__m128i fgcolor, __m128i mlight, unsigned int ifgcolor0, unsigned int ifgcolor1, int desaturate, __m128i inv_desaturate, __m128i shade_fade, __m128i shade_light, const PolyLight *lights, int num_lights, __m128 worldpos, __m128 worldnormal)
+	FORCEINLINE __m128i VECTORCALL Shade32(__m128i fgcolor, __m128i mlight, unsigned int ifgcolor0, unsigned int ifgcolor1, int desaturate, __m128i inv_desaturate, __m128i shade_fade, __m128i shade_light, __m128i dynlight)
 	{
 		__m128i material = fgcolor;
 		if (ShadeModeT::Mode == (int)ShadeMode::Simple)
@@ -224,7 +227,7 @@ namespace TriScreenDrawerModes
 			fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8);
 		}
 
-		return AddLights(material, fgcolor, lights, num_lights, worldpos, worldnormal);
+		return AddLights(material, fgcolor, dynlight);
 	}
 
 	template
@@ -391,7 +394,6 @@ private:
 
 		auto lights = args->uniforms->Lights();
 		auto num_lights = args->uniforms->NumLights();
-		__m128 worldpos = _mm_setzero_ps();
 		__m128 worldnormal = _mm_setzero_ps();
 
 		// Calculate gradients
@@ -472,7 +474,8 @@ private:
 				lightpos = (lightpos & lightmask) | ((light << 8) & ~lightmask);
 
 				__m128 mrcpW = _mm_set1_ps(1.0f / blockPosY.W);
-				worldpos = _mm_mul_ps(_mm_loadu_ps(&blockPosY.WorldX), mrcpW);
+				__m128 worldpos = _mm_mul_ps(_mm_loadu_ps(&blockPosY.WorldX), mrcpW);
+				__m128i dynlight = CalcDynamicLight(lights, num_lights, worldpos, worldnormal);
 
 				ScreenTriangleStepVariables blockPosX = blockPosY;
 				blockPosX.W += gradientX.W;
@@ -492,6 +495,13 @@ private:
 				fixed_t lightstep = (lightnext - lightpos) / 8;
 				lightstep = lightstep & lightmask;
 
+				mrcpW = _mm_set1_ps(1.0f / blockPosX.W);
+				worldpos = _mm_mul_ps(_mm_loadu_ps(&blockPosX.WorldX), mrcpW);
+				__m128i dynlightnext = CalcDynamicLight(lights, num_lights, worldpos, worldnormal);
+				__m128i dynlightstep = _mm_srai_epi16(_mm_sub_epi16(dynlightnext, dynlight), 3);
+				dynlight = _mm_max_epi16(_mm_min_epi16(_mm_add_epi16(dynlight, _mm_and_si128(dynlightstep, _mm_set_epi32(-1, -1, 0, 0))), _mm_set1_epi16(256)), _mm_setzero_si128());
+				dynlightstep = _mm_slli_epi16(dynlightstep, 1);
+
 				for (int ix = 0; ix < 4; ix++)
 				{
 					// Load bgcolor
@@ -535,11 +545,13 @@ private:
 
 					// Shade and blend
 					__m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128());
-					fgcolor = Shade32(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light, lights, num_lights, worldpos, worldnormal);
+					fgcolor = Shade32(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light, dynlight);
 					__m128i outcolor = Blend32(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha);
 
 					// Store result
 					_mm_storel_epi64((__m128i*)(dest + ix * 2), outcolor);
+
+					dynlight = _mm_max_epi16(_mm_min_epi16(_mm_add_epi16(dynlight, dynlightstep), _mm_set1_epi16(256)), _mm_setzero_si128());
 				}
 
 				blockPosY.W += gradientY.W;
@@ -565,7 +577,8 @@ private:
 				lightpos = (lightpos & lightmask) | ((light << 8) & ~lightmask);
 
 				__m128 mrcpW = _mm_set1_ps(1.0f / blockPosY.W);
-				worldpos = _mm_mul_ps(_mm_loadu_ps(&blockPosY.WorldX), mrcpW);
+				__m128 worldpos = _mm_mul_ps(_mm_loadu_ps(&blockPosY.WorldX), mrcpW);
+				__m128i dynlight = CalcDynamicLight(lights, num_lights, worldpos, worldnormal);
 
 				ScreenTriangleStepVariables blockPosX = blockPosY;
 				blockPosX.W += gradientX.W;
@@ -585,6 +598,13 @@ private:
 				fixed_t lightstep = (lightnext - lightpos) / 8;
 				lightstep = lightstep & lightmask;
 
+				mrcpW = _mm_set1_ps(1.0f / blockPosX.W);
+				worldpos = _mm_mul_ps(_mm_loadu_ps(&blockPosX.WorldX), mrcpW);
+				__m128i dynlightnext = CalcDynamicLight(lights, num_lights, worldpos, worldnormal);
+				__m128i dynlightstep = _mm_srai_epi16(_mm_sub_epi16(dynlightnext, dynlight), 3);
+				dynlight = _mm_max_epi16(_mm_min_epi16(_mm_add_epi16(dynlight, _mm_and_si128(dynlightstep, _mm_set_epi32(-1, -1, 0, 0))), _mm_set1_epi16(256)), _mm_setzero_si128());
+				dynlightstep = _mm_slli_epi16(dynlightstep, 1);
+
 				for (int x = 0; x < 4; x++)
 				{
 					// Load bgcolor
@@ -633,7 +653,7 @@ private:
 
 					// Shade and blend
 					__m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128());
-					fgcolor = Shade32(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light, lights, num_lights, worldpos, worldnormal);
+					fgcolor = Shade32(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light, dynlight);
 					__m128i outcolor = Blend32(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha);
 
 					// Store result
@@ -641,6 +661,8 @@ private:
 					if (mask0 & (1 << 31)) dest[x * 2] = desttmp[0];
 					if (mask0 & (1 << 30)) dest[x * 2 + 1] = desttmp[1];
 
+					dynlight = _mm_max_epi16(_mm_min_epi16(_mm_add_epi16(dynlight, dynlightstep), _mm_set1_epi16(256)), _mm_setzero_si128());
+
 					mask0 <<= 2;
 				}
 
@@ -665,7 +687,8 @@ private:
 				lightpos = (lightpos & lightmask) | ((light << 8) & ~lightmask);
 
 				__m128 mrcpW = _mm_set1_ps(1.0f / blockPosY.W);
-				worldpos = _mm_mul_ps(_mm_loadu_ps(&blockPosY.WorldX), mrcpW);
+				__m128 worldpos = _mm_mul_ps(_mm_loadu_ps(&blockPosY.WorldX), mrcpW);
+				__m128i dynlight = CalcDynamicLight(lights, num_lights, worldpos, worldnormal);
 
 				ScreenTriangleStepVariables blockPosX = blockPosY;
 				blockPosX.W += gradientX.W;
@@ -685,6 +708,13 @@ private:
 				fixed_t lightstep = (lightnext - lightpos) / 8;
 				lightstep = lightstep & lightmask;
 
+				mrcpW = _mm_set1_ps(1.0f / blockPosX.W);
+				worldpos = _mm_mul_ps(_mm_loadu_ps(&blockPosX.WorldX), mrcpW);
+				__m128i dynlightnext = CalcDynamicLight(lights, num_lights, worldpos, worldnormal);
+				__m128i dynlightstep = _mm_srai_epi16(_mm_sub_epi16(dynlightnext, dynlight), 3);
+				dynlight = _mm_max_epi16(_mm_min_epi16(_mm_add_epi16(dynlight, _mm_and_si128(dynlightstep, _mm_set_epi32(-1, -1, 0, 0))), _mm_set1_epi16(256)), _mm_setzero_si128());
+				dynlightstep = _mm_slli_epi16(dynlightstep, 1);
+
 				for (int x = 0; x < 4; x++)
 				{
 					// Load bgcolor
@@ -733,7 +763,7 @@ private:
 
 					// Shade and blend
 					__m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128());
-					fgcolor = Shade32(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light, lights, num_lights, worldpos, worldnormal);
+					fgcolor = Shade32(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light, dynlight);
 					__m128i outcolor = Blend32(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha);
 
 					// Store result
@@ -741,6 +771,8 @@ private:
 					if (mask1 & (1 << 31)) dest[x * 2] = desttmp[0];
 					if (mask1 & (1 << 30)) dest[x * 2 + 1] = desttmp[1];
 
+					dynlight = _mm_max_epi16(_mm_min_epi16(_mm_add_epi16(dynlight, dynlightstep), _mm_set1_epi16(256)), _mm_setzero_si128());
+
 					mask1 <<= 2;
 				}
 
@@ -892,7 +924,7 @@ private:
 
 				// Shade and blend
 				__m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128());
-				fgcolor = Shade32(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light, nullptr, 0, _mm_setzero_ps(), _mm_setzero_ps());
+				fgcolor = Shade32(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light, _mm_setzero_si128());
 				__m128i outcolor = Blend32(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha);
 
 				// Store result
@@ -920,7 +952,7 @@ private:
 
 				// Shade and blend
 				__m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128());
-				fgcolor = Shade32(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light, nullptr, 0, _mm_setzero_ps(), _mm_setzero_ps());
+				fgcolor = Shade32(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light, _mm_setzero_si128());
 				__m128i outcolor = Blend32(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha);
 
 				// Store result