- Implement the affine part of the dynamic lights in the softpoly TC SSE2 drawer

This commit is contained in:
Magnus Norddahl 2017-09-22 00:57:51 +02:00
parent 94a33229cc
commit ca2ef805b8

View file

@ -142,7 +142,14 @@ namespace TriScreenDrawerModes
}
}
FORCEINLINE __m128i VECTORCALL AddLights(__m128i material, __m128i fgcolor, const PolyLight *lights, int num_lights, __m128 worldpos, __m128 worldnormal)
// Adds a precomputed dynamic light term on top of an already shaded color.
//
// All three arguments are treated as vectors of 16-bit lanes:
//   material - color to be modulated by the dynamic light
//   fgcolor  - shaded color the light contribution is added to
//   dynlight - per-lane light factor; the product is divided by 256 (>> 8)
//
// Returns fgcolor + ((material * dynlight) >> 8), with every lane capped at 255.
// NOTE(review): the cap uses a signed 16-bit min, so this presumably relies on
// the lane sums staying below 32768 - confirm against the callers' value ranges.
FORCEINLINE __m128i VECTORCALL AddLights(__m128i material, __m128i fgcolor, __m128i dynlight)
{
	// Scale the material by the light factor (low 16 bits of the product, then / 256).
	const __m128i lit_material = _mm_srli_epi16(_mm_mullo_epi16(material, dynlight), 8);

	// Accumulate onto the shaded color and saturate each lane to the 8-bit maximum.
	const __m128i channel_max = _mm_set1_epi16(255);
	return _mm_min_epi16(_mm_add_epi16(fgcolor, lit_material), channel_max);
}
FORCEINLINE __m128i VECTORCALL CalcDynamicLight(const PolyLight *lights, int num_lights, __m128 worldpos, __m128 worldnormal)
{
__m128i lit = _mm_setzero_si128();
@ -189,15 +196,11 @@ namespace TriScreenDrawerModes
lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8));
}
lit = _mm_min_epi16(lit, _mm_set1_epi16(256));
fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8));
fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255));
return fgcolor;
return _mm_min_epi16(lit, _mm_set1_epi16(256));
}
template<typename ShadeModeT>
FORCEINLINE __m128i VECTORCALL Shade32(__m128i fgcolor, __m128i mlight, unsigned int ifgcolor0, unsigned int ifgcolor1, int desaturate, __m128i inv_desaturate, __m128i shade_fade, __m128i shade_light, const PolyLight *lights, int num_lights, __m128 worldpos, __m128 worldnormal)
FORCEINLINE __m128i VECTORCALL Shade32(__m128i fgcolor, __m128i mlight, unsigned int ifgcolor0, unsigned int ifgcolor1, int desaturate, __m128i inv_desaturate, __m128i shade_fade, __m128i shade_light, __m128i dynlight)
{
__m128i material = fgcolor;
if (ShadeModeT::Mode == (int)ShadeMode::Simple)
@ -224,7 +227,7 @@ namespace TriScreenDrawerModes
fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8);
}
return AddLights(material, fgcolor, lights, num_lights, worldpos, worldnormal);
return AddLights(material, fgcolor, dynlight);
}
template<typename BlendT>
@ -391,7 +394,6 @@ private:
auto lights = args->uniforms->Lights();
auto num_lights = args->uniforms->NumLights();
__m128 worldpos = _mm_setzero_ps();
__m128 worldnormal = _mm_setzero_ps();
// Calculate gradients
@ -472,7 +474,8 @@ private:
lightpos = (lightpos & lightmask) | ((light << 8) & ~lightmask);
__m128 mrcpW = _mm_set1_ps(1.0f / blockPosY.W);
worldpos = _mm_mul_ps(_mm_loadu_ps(&blockPosY.WorldX), mrcpW);
__m128 worldpos = _mm_mul_ps(_mm_loadu_ps(&blockPosY.WorldX), mrcpW);
__m128i dynlight = CalcDynamicLight(lights, num_lights, worldpos, worldnormal);
ScreenTriangleStepVariables blockPosX = blockPosY;
blockPosX.W += gradientX.W;
@ -492,6 +495,13 @@ private:
fixed_t lightstep = (lightnext - lightpos) / 8;
lightstep = lightstep & lightmask;
mrcpW = _mm_set1_ps(1.0f / blockPosX.W);
worldpos = _mm_mul_ps(_mm_loadu_ps(&blockPosX.WorldX), mrcpW);
__m128i dynlightnext = CalcDynamicLight(lights, num_lights, worldpos, worldnormal);
__m128i dynlightstep = _mm_srai_epi16(_mm_sub_epi16(dynlightnext, dynlight), 3);
dynlight = _mm_max_epi16(_mm_min_epi16(_mm_add_epi16(dynlight, _mm_and_si128(dynlightstep, _mm_set_epi32(0xffff,0xffff,0,0))), _mm_set1_epi16(256)), _mm_setzero_si128());
dynlightstep = _mm_slli_epi16(dynlightstep, 1);
for (int ix = 0; ix < 4; ix++)
{
// Load bgcolor
@ -535,11 +545,13 @@ private:
// Shade and blend
__m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128());
fgcolor = Shade32<ShadeModeT>(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light, lights, num_lights, worldpos, worldnormal);
fgcolor = Shade32<ShadeModeT>(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light, dynlight);
__m128i outcolor = Blend32<BlendT>(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha);
// Store result
_mm_storel_epi64((__m128i*)(dest + ix * 2), outcolor);
dynlight = _mm_max_epi16(_mm_min_epi16(_mm_add_epi16(dynlight, dynlightstep), _mm_set1_epi16(256)), _mm_setzero_si128());
}
blockPosY.W += gradientY.W;
@ -565,7 +577,8 @@ private:
lightpos = (lightpos & lightmask) | ((light << 8) & ~lightmask);
__m128 mrcpW = _mm_set1_ps(1.0f / blockPosY.W);
worldpos = _mm_mul_ps(_mm_loadu_ps(&blockPosY.WorldX), mrcpW);
__m128 worldpos = _mm_mul_ps(_mm_loadu_ps(&blockPosY.WorldX), mrcpW);
__m128i dynlight = CalcDynamicLight(lights, num_lights, worldpos, worldnormal);
ScreenTriangleStepVariables blockPosX = blockPosY;
blockPosX.W += gradientX.W;
@ -585,6 +598,13 @@ private:
fixed_t lightstep = (lightnext - lightpos) / 8;
lightstep = lightstep & lightmask;
mrcpW = _mm_set1_ps(1.0f / blockPosX.W);
worldpos = _mm_mul_ps(_mm_loadu_ps(&blockPosX.WorldX), mrcpW);
__m128i dynlightnext = CalcDynamicLight(lights, num_lights, worldpos, worldnormal);
__m128i dynlightstep = _mm_srai_epi16(_mm_sub_epi16(dynlightnext, dynlight), 3);
dynlight = _mm_max_epi16(_mm_min_epi16(_mm_add_epi16(dynlight, _mm_and_si128(dynlightstep, _mm_set_epi32(0xffff, 0xffff, 0, 0))), _mm_set1_epi16(256)), _mm_setzero_si128());
dynlightstep = _mm_slli_epi16(dynlightstep, 1);
for (int x = 0; x < 4; x++)
{
// Load bgcolor
@ -633,7 +653,7 @@ private:
// Shade and blend
__m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128());
fgcolor = Shade32<ShadeModeT>(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light, lights, num_lights, worldpos, worldnormal);
fgcolor = Shade32<ShadeModeT>(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light, dynlight);
__m128i outcolor = Blend32<BlendT>(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha);
// Store result
@ -641,6 +661,8 @@ private:
if (mask0 & (1 << 31)) dest[x * 2] = desttmp[0];
if (mask0 & (1 << 30)) dest[x * 2 + 1] = desttmp[1];
dynlight = _mm_max_epi16(_mm_min_epi16(_mm_add_epi16(dynlight, dynlightstep), _mm_set1_epi16(256)), _mm_setzero_si128());
mask0 <<= 2;
}
@ -665,7 +687,8 @@ private:
lightpos = (lightpos & lightmask) | ((light << 8) & ~lightmask);
__m128 mrcpW = _mm_set1_ps(1.0f / blockPosY.W);
worldpos = _mm_mul_ps(_mm_loadu_ps(&blockPosY.WorldX), mrcpW);
__m128 worldpos = _mm_mul_ps(_mm_loadu_ps(&blockPosY.WorldX), mrcpW);
__m128i dynlight = CalcDynamicLight(lights, num_lights, worldpos, worldnormal);
ScreenTriangleStepVariables blockPosX = blockPosY;
blockPosX.W += gradientX.W;
@ -685,6 +708,13 @@ private:
fixed_t lightstep = (lightnext - lightpos) / 8;
lightstep = lightstep & lightmask;
mrcpW = _mm_set1_ps(1.0f / blockPosX.W);
worldpos = _mm_mul_ps(_mm_loadu_ps(&blockPosX.WorldX), mrcpW);
__m128i dynlightnext = CalcDynamicLight(lights, num_lights, worldpos, worldnormal);
__m128i dynlightstep = _mm_srai_epi16(_mm_sub_epi16(dynlightnext, dynlight), 3);
dynlight = _mm_max_epi16(_mm_min_epi16(_mm_add_epi16(dynlight, _mm_and_si128(dynlightstep, _mm_set_epi32(0xffff, 0xffff, 0, 0))), _mm_set1_epi16(256)), _mm_setzero_si128());
dynlightstep = _mm_slli_epi16(dynlightstep, 1);
for (int x = 0; x < 4; x++)
{
// Load bgcolor
@ -733,7 +763,7 @@ private:
// Shade and blend
__m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128());
fgcolor = Shade32<ShadeModeT>(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light, lights, num_lights, worldpos, worldnormal);
fgcolor = Shade32<ShadeModeT>(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light, dynlight);
__m128i outcolor = Blend32<BlendT>(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha);
// Store result
@ -741,6 +771,8 @@ private:
if (mask1 & (1 << 31)) dest[x * 2] = desttmp[0];
if (mask1 & (1 << 30)) dest[x * 2 + 1] = desttmp[1];
dynlight = _mm_max_epi16(_mm_min_epi16(_mm_add_epi16(dynlight, dynlightstep), _mm_set1_epi16(256)), _mm_setzero_si128());
mask1 <<= 2;
}
@ -892,7 +924,7 @@ private:
// Shade and blend
__m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128());
fgcolor = Shade32<ShadeModeT>(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light, nullptr, 0, _mm_setzero_ps(), _mm_setzero_ps());
fgcolor = Shade32<ShadeModeT>(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light, _mm_setzero_si128());
__m128i outcolor = Blend32<BlendT>(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha);
// Store result
@ -920,7 +952,7 @@ private:
// Shade and blend
__m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128());
fgcolor = Shade32<ShadeModeT>(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light, nullptr, 0, _mm_setzero_ps(), _mm_setzero_ps());
fgcolor = Shade32<ShadeModeT>(fgcolor, mlight, ifgcolor[0], ifgcolor[1], desaturate, inv_desaturate, shade_fade_lit, shade_light, _mm_setzero_si128());
__m128i outcolor = Blend32<BlendT>(fgcolor, bgcolor, ifgcolor[0], ifgcolor[1], ifgshade[0], ifgshade[1], srcalpha, destalpha);
// Store result