From 52b7a777715664aa39bff70da44fa81e2088d19b Mon Sep 17 00:00:00 2001 From: Magnus Norddahl Date: Mon, 26 Nov 2018 00:49:07 +0100 Subject: [PATCH 1/4] - use SSE for the dynlights --- src/polyrenderer/drawers/screen_triangle.cpp | 56 ++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/src/polyrenderer/drawers/screen_triangle.cpp b/src/polyrenderer/drawers/screen_triangle.cpp index f897684b51..02c9fcbf00 100644 --- a/src/polyrenderer/drawers/screen_triangle.cpp +++ b/src/polyrenderer/drawers/screen_triangle.cpp @@ -539,6 +539,61 @@ void DrawSpanOpt32(int y, int x0, int x1, const TriDrawTriangleArgs *args, PolyT worldnormalZ *= rcplen; } +#ifndef NO_SSE + __m128 mworldnormalX = _mm_set1_ps(worldnormalX); + __m128 mworldnormalY = _mm_set1_ps(worldnormalY); + __m128 mworldnormalZ = _mm_set1_ps(worldnormalZ); + for (int x = x0; x < x1; x += 4) + { + __m128i litlo = _mm_shuffle_epi32(_mm_unpacklo_epi8(_mm_cvtsi32_si128(dynlightcolor), _mm_setzero_si128()), _MM_SHUFFLE(1, 0, 1, 0)); + __m128i lithi = litlo; + + for (int i = 0; i < num_lights; i++) + { + __m128 lightposX = _mm_set1_ps(lights[i].x); + __m128 lightposY = _mm_set1_ps(lights[i].y); + __m128 lightposZ = _mm_set1_ps(lights[i].z); + __m128 light_radius = _mm_set1_ps(lights[i].radius); + __m128i light_color = _mm_shuffle_epi32(_mm_unpacklo_epi8(_mm_cvtsi32_si128(lights[i].color), _mm_setzero_si128()), _MM_SHUFFLE(1, 0, 1, 0)); + + __m128 is_attenuated = _mm_cmplt_ps(light_radius, _mm_setzero_ps()); + light_radius = _mm_andnot_ps(_mm_set1_ps(-0.0f), light_radius); // clear sign bit + + // L = light-pos + // dist = sqrt(dot(L, L)) + // distance_attenuation = 1 - MIN(dist * (1/radius), 1) + __m128 Lx = _mm_sub_ps(lightposX, _mm_loadu_ps(&worldposX[x])); + __m128 Ly = _mm_sub_ps(lightposY, _mm_loadu_ps(&worldposY[x])); + __m128 Lz = _mm_sub_ps(lightposZ, _mm_loadu_ps(&worldposZ[x])); + __m128 dist2 = _mm_add_ps(_mm_mul_ps(Lx, Lx), _mm_add_ps(_mm_mul_ps(Ly, Ly), _mm_mul_ps(Lz, Lz))); + __m128 rcp_dist = _mm_rsqrt_ps(dist2); + __m128 dist = _mm_mul_ps(dist2, rcp_dist); + __m128 distance_attenuation = _mm_sub_ps(_mm_set1_ps(256.0f), _mm_min_ps(_mm_mul_ps(dist, light_radius), _mm_set1_ps(256.0f))); + + // The simple light type + __m128 simple_attenuation = distance_attenuation; + + // The point light type + // diffuse = max(dot(N,normalize(L)),0) * attenuation + Lx = _mm_mul_ps(Lx, rcp_dist); + Ly = _mm_mul_ps(Ly, rcp_dist); + Lz = _mm_mul_ps(Lz, rcp_dist); + __m128 dotNL = _mm_add_ps(_mm_add_ps(_mm_mul_ps(mworldnormalX, Lx), _mm_mul_ps(mworldnormalY, Ly)), _mm_mul_ps(mworldnormalZ, Lz)); + __m128 point_attenuation = _mm_mul_ps(_mm_max_ps(dotNL, _mm_setzero_ps()), distance_attenuation); + + __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, point_attenuation), _mm_andnot_ps(is_attenuated, simple_attenuation))); + + attenuation = _mm_shufflehi_epi16(_mm_shufflelo_epi16(attenuation, _MM_SHUFFLE(2, 2, 0, 0)), _MM_SHUFFLE(2, 2, 0, 0)); + __m128i attenlo = _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1, 1, 0, 0)); + __m128i attenhi = _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(3, 3, 2, 2)); + + litlo = _mm_add_epi16(litlo, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenlo), 8)); + lithi = _mm_add_epi16(lithi, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenhi), 8)); + } + + _mm_storeu_si128((__m128i*)&dynlights[x], _mm_packus_epi16(litlo, lithi)); + } +#else for (int x = x0; x < x1; x++) { uint32_t lit_r = RPART(dynlightcolor); @@ -596,6 +651,7 @@ void DrawSpanOpt32(int y, int x0, int x1, const TriDrawTriangleArgs *args, PolyT lit_b = MIN(lit_b, 255); dynlights[x] = MAKEARGB(255, lit_r, lit_g, lit_b); } +#endif } if (OptT::Flags & SWOPT_FixedLight) From 2db433e68f855b979c3428f699f31219cc2b7938 Mon Sep 17 00:00:00 2001 From: Magnus Norddahl Date: Mon, 26 Nov 2018 12:18:07 +0100 Subject: [PATCH 2/4] - only do shade clamps if needed --- src/polyrenderer/drawers/poly_triangle.h | 2 +- src/polyrenderer/drawers/screen_triangle.cpp | 36 ++++++++++++++------ 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/src/polyrenderer/drawers/poly_triangle.h b/src/polyrenderer/drawers/poly_triangle.h index 0d369fa000..132823af4e 100644 --- a/src/polyrenderer/drawers/poly_triangle.h +++ b/src/polyrenderer/drawers/poly_triangle.h @@ -83,7 +83,7 @@ public: float worldposZ[MAXWIDTH]; uint32_t texel[MAXWIDTH]; int32_t texelV[MAXWIDTH]; - fixed_t lightarray[MAXWIDTH]; + uint16_t lightarray[MAXWIDTH]; uint32_t dynlights[MAXWIDTH]; static PolyTriangleThreadData *Get(DrawerThread *thread); diff --git a/src/polyrenderer/drawers/screen_triangle.cpp b/src/polyrenderer/drawers/screen_triangle.cpp index 02c9fcbf00..af91292e37 100644 --- a/src/polyrenderer/drawers/screen_triangle.cpp +++ b/src/polyrenderer/drawers/screen_triangle.cpp @@ -403,7 +403,7 @@ void DrawSpanOpt32(int y, int x0, int x1, const TriDrawTriangleArgs *args, PolyT float *worldposZ = thread->worldposZ; uint32_t *texel = thread->texel; int32_t *texelV = thread->texelV; - fixed_t *lightarray = thread->lightarray; + uint16_t *lightarray = thread->lightarray; uint32_t *dynlights = thread->dynlights; if (!(ModeT::SWFlags & SWSTYLEF_Fill) && !(ModeT::SWFlags & SWSTYLEF_FogBoundary)) @@ -463,6 +463,29 @@ void DrawSpanOpt32(int y, int x0, int x1, const TriDrawTriangleArgs *args, PolyT shade = (fixed_t)((2.0f - (light + 12.0f) / 128.0f) * (float)FRACUNIT); lightpos = (fixed_t)(globVis * posW * (float)FRACUNIT); lightstep = (fixed_t)(globVis * stepW * (float)FRACUNIT); + + fixed_t maxvis = 24 * FRACUNIT / 32; + fixed_t maxlight = 31 * FRACUNIT / 32; + + fixed_t lightend = lightpos + lightstep * (x1 - x0); + if (lightpos < maxvis && shade >= lightpos && shade - lightpos <= maxlight && + lightend < maxvis && shade >= lightend && shade - lightend <= maxlight) + { + lightpos += FRACUNIT - shade; + for (int x = x0; x < x1; x++) + { + lightarray[x] = lightpos >> 8; + lightpos += lightstep; + } + } + else + { + for (int x = x0; x < x1; x++) + { + lightarray[x] = (FRACUNIT - clamp(shade - MIN(maxvis, lightpos), 0, maxlight)) >> 8; + lightpos += lightstep; + } + } } for (int x = x0; x < x1; x++) @@ -477,15 +500,6 @@ void DrawSpanOpt32(int y, int x0, int x1, const TriDrawTriangleArgs *args, PolyT posWorldY += stepWorldY; posWorldZ += stepWorldZ; } - - if (!(OptT::Flags & SWOPT_FixedLight)) - { - fixed_t maxvis = 24 * FRACUNIT / 32; - fixed_t maxlight = 31 * FRACUNIT / 32; - lightarray[x] = (FRACUNIT - clamp(shade - MIN(maxvis, lightpos), 0, maxlight)) >> 8; - lightpos += lightstep; - } - if (!(ModeT::SWFlags & SWSTYLEF_Fill) && !(ModeT::SWFlags & SWSTYLEF_FogBoundary)) { float rcpW = 0x01000000 / posW; @@ -706,7 +720,7 @@ void DrawSpanOpt32(int y, int x0, int x1, const TriDrawTriangleArgs *args, PolyT __m128i mfg = _mm_unpacklo_epi8(_mm_setr_epi32(texPixels[texel[x]], texPixels[texel[x + 1]], 0, 0), _mm_setzero_si128()); if (!(OptT::Flags & SWOPT_FixedLight)) - mlightshade = _mm_shuffle_epi32(_mm_shufflelo_epi16(_mm_loadl_epi64((const __m128i*)&lightarray[x]), _MM_SHUFFLE(2, 2, 0, 0)), _MM_SHUFFLE(1, 1, 0, 0)); + mlightshade = _mm_shuffle_epi32(_mm_shufflelo_epi16(_mm_cvtsi32_si128(*(int*)&lightarray[x]), _MM_SHUFFLE(1, 1, 0, 0)), _MM_SHUFFLE(1, 1, 0, 0)); if (OptT::Flags & SWOPT_DynLights) { From dbb7df998d37ea3c0dc8a93188329f50a4b00564 Mon Sep 17 00:00:00 2001 From: Magnus Norddahl Date: Mon, 26 Nov 2018 14:32:56 +0100 Subject: [PATCH 3/4] - step with SSE --- src/polyrenderer/drawers/screen_triangle.cpp | 75 ++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/src/polyrenderer/drawers/screen_triangle.cpp b/src/polyrenderer/drawers/screen_triangle.cpp index af91292e37..5d991267d0 100644 --- a/src/polyrenderer/drawers/screen_triangle.cpp +++ b/src/polyrenderer/drawers/screen_triangle.cpp @@ -488,6 +488,80 @@ void DrawSpanOpt32(int y, int x0, int x1, const TriDrawTriangleArgs *args, PolyT } } +#ifndef NO_SSE + __m128 mposW, mposU, mposV, mstepW, mstepU, mstepV; + __m128 mposWorldX, mposWorldY, mposWorldZ, mstepWorldX, mstepWorldY, mstepWorldZ; + __m128i mtexMul1, mtexMul2; + + #define SETUP_STEP_SSE(mpos,mstep,pos,step) \ + mstep = _mm_load_ss(&step); \ + mpos = _mm_load_ss(&pos); \ + mpos = _mm_shuffle_ps(mpos, mpos, _MM_SHUFFLE(2, 1, 0, 0)); \ + mpos = _mm_add_ss(mpos, mstep); \ + mpos = _mm_shuffle_ps(mpos, mpos, _MM_SHUFFLE(2, 1, 0, 0)); \ + mpos = _mm_add_ss(mpos, mstep); \ + mpos = _mm_shuffle_ps(mpos, mpos, _MM_SHUFFLE(2, 1, 0, 0)); \ + mpos = _mm_add_ss(mpos, mstep); \ + mpos = _mm_shuffle_ps(mpos, mpos, _MM_SHUFFLE(0, 1, 2, 3)); \ + mstep = _mm_mul_ss(mstep, _mm_set1_ps(4.0f)); \ + mstep = _mm_shuffle_ps(mstep, mstep, _MM_SHUFFLE(0, 0, 0, 0)); + + SETUP_STEP_SSE(mposW, mstepW, posW, stepW); + + if (OptT::Flags & SWOPT_DynLights) + { + SETUP_STEP_SSE(mposWorldX, mstepWorldX, posWorldX, stepWorldX); + SETUP_STEP_SSE(mposWorldY, mstepWorldY, posWorldY, stepWorldY); + SETUP_STEP_SSE(mposWorldZ, mstepWorldZ, posWorldZ, stepWorldZ); + } + + if (!(ModeT::SWFlags & SWSTYLEF_Fill) && !(ModeT::SWFlags & SWSTYLEF_FogBoundary)) + { + SETUP_STEP_SSE(mposU, mstepU, posU, stepU); + SETUP_STEP_SSE(mposV, mstepV, posV, stepV); + + mtexMul1 = _mm_setr_epi16(texWidth, texWidth, texWidth, texWidth, texHeight, texHeight, texHeight, texHeight); + mtexMul2 = _mm_setr_epi16(texHeight, texHeight, texHeight, texHeight, 1, 1, 1, 1); + } + + #undef SETUP_STEP_SSE + + for (int x = x0; x < x1; x += 4) + { + __m128 rcp_posW = _mm_rcp_ps(mposW); + + if (OptT::Flags & SWOPT_DynLights) + { + _mm_storeu_ps(&worldposX[x], _mm_mul_ps(mposWorldX, rcp_posW)); + _mm_storeu_ps(&worldposY[x], _mm_mul_ps(mposWorldY, rcp_posW)); + _mm_storeu_ps(&worldposZ[x], _mm_mul_ps(mposWorldZ, rcp_posW)); + mposWorldX = _mm_add_ps(mposWorldX, mstepWorldX); + mposWorldY = _mm_add_ps(mposWorldY, mstepWorldY); + mposWorldZ = _mm_add_ps(mposWorldZ, mstepWorldZ); + } + if (!(ModeT::SWFlags & SWSTYLEF_Fill) && !(ModeT::SWFlags & SWSTYLEF_FogBoundary)) + { + __m128 rcpW = _mm_mul_ps(_mm_set1_ps(0x01000000), rcp_posW); + __m128i u = _mm_cvtps_epi32(_mm_mul_ps(mposU, rcpW)); + __m128i v = _mm_cvtps_epi32(_mm_mul_ps(mposV, rcpW)); + _mm_storeu_si128((__m128i*)&texelV[x], v); + + __m128i texelX = _mm_srli_epi32(_mm_slli_epi32(u, 8), 17); + __m128i texelY = _mm_srli_epi32(_mm_slli_epi32(v, 8), 17); + __m128i texelXY = _mm_mulhi_epu16(_mm_slli_epi16(_mm_packs_epi32(texelX, texelY), 1), mtexMul1); + __m128i texlo = _mm_mullo_epi16(texelXY, mtexMul2); + __m128i texhi = _mm_mulhi_epi16(texelXY, mtexMul2); + texelX = _mm_unpacklo_epi16(texlo, texhi); + texelY = _mm_unpackhi_epi16(texlo, texhi); + _mm_storeu_si128((__m128i*)&texel[x], _mm_add_epi32(texelX, texelY)); + + mposU = _mm_add_ps(mposU, mstepU); + mposV = _mm_add_ps(mposV, mstepV); + } + + mposW = _mm_add_ps(mposW, mstepW); + } +#else for (int x = x0; x < x1; x++) { if (OptT::Flags & SWOPT_DynLights) @@ -515,6 +589,7 @@ void DrawSpanOpt32(int y, int x0, int x1, const TriDrawTriangleArgs *args, PolyT posW += stepW; } +#endif if (OptT::Flags & SWOPT_DynLights) { From 5e01a874beb6b390113dc1d96e5ddff53f0b2757 Mon Sep 17 00:00:00 2001 From: Magnus Norddahl Date: Mon, 26 Nov 2018 17:02:01 +0100 Subject: [PATCH 4/4] - merge the span step code used by both the 8 bit and 32 bit drawers --- src/polyrenderer/drawers/screen_triangle.cpp | 422 ++++++------------- 1 file changed, 128 insertions(+), 294 deletions(-) diff --git a/src/polyrenderer/drawers/screen_triangle.cpp b/src/polyrenderer/drawers/screen_triangle.cpp index 5d991267d0..1323da2709 100644 --- a/src/polyrenderer/drawers/screen_triangle.cpp +++ b/src/polyrenderer/drawers/screen_triangle.cpp @@ -378,23 +378,16 @@ void ScreenTriangle::Draw(const TriDrawTriangleArgs *args, PolyTriangleThreadDat } } -template -void DrawSpanOpt32(int y, int x0, int x1, const TriDrawTriangleArgs *args, PolyTriangleThreadData *thread) +template +void StepSpan(int y, int x0, int x1, const TriDrawTriangleArgs *args, PolyTriangleThreadData *thread) { using namespace TriScreenDrawerModes; - uint32_t fixedlight; - uint32_t shade_fade_r, shade_fade_g, shade_fade_b, shade_light_r, shade_light_g, shade_light_b, desaturate, inv_desaturate; - fixed_t fuzzscale; - int _fuzzpos; - const uint32_t *texPixels, *translation; - int texWidth, texHeight; - uint32_t fillcolor; - int actoralpha; float v1X, v1Y, v1W, v1U, v1V, v1WorldX, v1WorldY, v1WorldZ; float startX, startY; float stepW, stepU, stepV, stepWorldX, stepWorldY, stepWorldZ; float posW, posU, posV, posWorldX, posWorldY, posWorldZ; + int texWidth, texHeight; uint32_t light; fixed_t shade, lightpos, lightstep; @@ -406,28 +399,6 @@ void DrawSpanOpt32(int y, int x0, int x1, const TriDrawTriangleArgs *args, PolyT uint16_t *lightarray = thread->lightarray; uint32_t *dynlights = thread->dynlights; - if (!(ModeT::SWFlags & SWSTYLEF_Fill) && !(ModeT::SWFlags & SWSTYLEF_FogBoundary)) - { - texPixels = (const uint32_t*)args->uniforms->TexturePixels(); - texWidth = args->uniforms->TextureWidth(); - texHeight = args->uniforms->TextureHeight(); - } - - if (ModeT::SWFlags & SWSTYLEF_Translated) - { - translation = (const uint32_t*)args->uniforms->Translation(); - } - - if ((ModeT::SWFlags & SWSTYLEF_Fill) || (ModeT::SWFlags & SWSTYLEF_Skycap) || (ModeT::Flags & STYLEF_ColorIsFixed)) - { - fillcolor = args->uniforms->Color(); - } - - if (!(ModeT::Flags & STYLEF_Alpha1)) - { - actoralpha = args->uniforms->Alpha(); - } - v1X = args->v1->x; v1Y = args->v1->y; v1W = args->v1->w; @@ -442,6 +413,12 @@ void DrawSpanOpt32(int y, int x0, int x1, const TriDrawTriangleArgs *args, PolyT posU = v1U + stepU * startX + args->gradientY.U * startY; posV = v1V + stepV * startX + args->gradientY.V * startY; + if (!(ModeT::SWFlags & SWSTYLEF_Fill) && !(ModeT::SWFlags & SWSTYLEF_FogBoundary)) + { + texWidth = args->uniforms->TextureWidth(); + texHeight = args->uniforms->TextureHeight(); + } + if (OptT::Flags & SWOPT_DynLights) { v1WorldX = args->v1->worldX * v1W; @@ -471,19 +448,42 @@ void DrawSpanOpt32(int y, int x0, int x1, const TriDrawTriangleArgs *args, PolyT if (lightpos < maxvis && shade >= lightpos && shade - lightpos <= maxlight && lightend < maxvis && shade >= lightend && shade - lightend <= maxlight) { - lightpos += FRACUNIT - shade; - for (int x = x0; x < x1; x++) + if (BitsPerPixel == 32) { - lightarray[x] = lightpos >> 8; - lightpos += lightstep; + lightpos += FRACUNIT - shade; + for (int x = x0; x < x1; x++) + { + lightarray[x] = lightpos >> 8; + lightpos += lightstep; + } + } + else + { + lightpos = shade - lightpos; + for (int x = x0; x < x1; x++) + { + lightarray[x] = (lightpos >> 3) & 0xffffff00; + lightpos -= lightstep; + } } } else { - for (int x = x0; x < x1; x++) + if (BitsPerPixel == 32) { - lightarray[x] = (FRACUNIT - clamp(shade - MIN(maxvis, lightpos), 0, maxlight)) >> 8; - lightpos += lightstep; + for (int x = x0; x < x1; x++) + { + lightarray[x] = (FRACUNIT - clamp(shade - MIN(maxvis, lightpos), 0, maxlight)) >> 8; + lightpos += lightstep; + } + } + else + { + for (int x = x0; x < x1; x++) + { + lightarray[x] = (clamp(shade - MIN(maxvis, lightpos), 0, maxlight) >> 3) & 0xffffff00; + lightpos += lightstep; + } } } } @@ -742,6 +742,47 @@ void DrawSpanOpt32(int y, int x0, int x1, const TriDrawTriangleArgs *args, PolyT } #endif } +} + +template +void DrawSpanOpt32(int y, int x0, int x1, const TriDrawTriangleArgs *args, PolyTriangleThreadData *thread) +{ + using namespace TriScreenDrawerModes; + + StepSpan(y, x0, x1, args, thread); + + uint32_t fixedlight; + uint32_t shade_fade_r, shade_fade_g, shade_fade_b, shade_light_r, shade_light_g, shade_light_b, desaturate, inv_desaturate; + fixed_t fuzzscale; + int _fuzzpos; + const uint32_t *texPixels, *translation; + uint32_t fillcolor; + int actoralpha; + + uint32_t *texel = thread->texel; + int32_t *texelV = thread->texelV; + uint16_t *lightarray = thread->lightarray; + uint32_t *dynlights = thread->dynlights; + + if (!(ModeT::SWFlags & SWSTYLEF_Fill) && !(ModeT::SWFlags & SWSTYLEF_FogBoundary)) + { + texPixels = (const uint32_t*)args->uniforms->TexturePixels(); + } + + if (ModeT::SWFlags & SWSTYLEF_Translated) + { + translation = (const uint32_t*)args->uniforms->Translation(); + } + + if ((ModeT::SWFlags & SWSTYLEF_Fill) || (ModeT::SWFlags & SWSTYLEF_Skycap) || (ModeT::Flags & STYLEF_ColorIsFixed)) + { + fillcolor = args->uniforms->Color(); + } + + if (!(ModeT::Flags & STYLEF_Alpha1)) + { + actoralpha = args->uniforms->Alpha(); + } if (OptT::Flags & SWOPT_FixedLight) { @@ -1142,226 +1183,52 @@ void DrawSpanOpt8(int y, int x0, int x1, const TriDrawTriangleArgs *args, PolyTr { using namespace TriScreenDrawerModes; - float v1X, v1Y, v1W, v1U, v1V, v1WorldX, v1WorldY, v1WorldZ; - float startX, startY; - float stepW, stepU, stepV, stepWorldX, stepWorldY, stepWorldZ; - float posW, posU, posV, posWorldX, posWorldY, posWorldZ; + StepSpan(y, x0, x1, args, thread); - PolyLight *lights; - int num_lights; - float worldnormalX, worldnormalY, worldnormalZ; - uint32_t dynlightcolor; + uint32_t fixedlight, capcolor; + fixed_t fuzzscale; + int _fuzzpos; const uint8_t *colormaps, *texPixels, *translation; - int texWidth, texHeight; - uint32_t fillcolor, capcolor; - int alpha; - uint32_t light; - fixed_t shade, lightpos, lightstep; - int16_t dynlights_r[MAXWIDTH / 16], dynlights_g[MAXWIDTH / 16], dynlights_b[MAXWIDTH / 16]; - int16_t posdynlight_r, posdynlight_g, posdynlight_b; - fixed_t lightarray[MAXWIDTH / 16]; + uint32_t fillcolor; + int actoralpha; - v1X = args->v1->x; - v1Y = args->v1->y; - v1W = args->v1->w; - v1U = args->v1->u * v1W; - v1V = args->v1->v * v1W; - startX = x0 + (0.5f - v1X); - startY = y + (0.5f - v1Y); - stepW = args->gradientX.W; - stepU = args->gradientX.U; - stepV = args->gradientX.V; - posW = v1W + stepW * startX + args->gradientY.W * startY; - posU = v1U + stepU * startX + args->gradientY.U * startY; - posV = v1V + stepV * startX + args->gradientY.V * startY; + uint32_t *texel = thread->texel; + int32_t *texelV = thread->texelV; + uint16_t *lightarray = thread->lightarray; + uint32_t *dynlights = thread->dynlights; - texPixels = args->uniforms->TexturePixels(); - translation = args->uniforms->Translation(); - texWidth = args->uniforms->TextureWidth(); - texHeight = args->uniforms->TextureHeight(); - fillcolor = args->uniforms->Color(); - alpha = args->uniforms->Alpha(); colormaps = args->uniforms->BaseColormap(); - light = args->uniforms->Light(); + + if (!(ModeT::SWFlags & SWSTYLEF_Fill) && !(ModeT::SWFlags & SWSTYLEF_FogBoundary)) + { + texPixels = args->uniforms->TexturePixels(); + } + + if (ModeT::SWFlags & SWSTYLEF_Translated) + { + translation = args->uniforms->Translation(); + } + + if ((ModeT::SWFlags & SWSTYLEF_Fill) || (ModeT::SWFlags & SWSTYLEF_Skycap) || (ModeT::Flags & STYLEF_ColorIsFixed)) + { + fillcolor = args->uniforms->Color(); + } + + if (!(ModeT::Flags & STYLEF_Alpha1)) + { + actoralpha = args->uniforms->Alpha(); + } if (ModeT::SWFlags & SWSTYLEF_Skycap) capcolor = GPalette.BaseColors[fillcolor].d; if (OptT::Flags & SWOPT_FixedLight) { - light += light >> 7; // 255 -> 256 - light = ((256 - light) * NUMCOLORMAPS) & 0xffffff00; - } - else - { - float globVis = args->uniforms->GlobVis() * (1.0f / 32.0f); - - shade = (fixed_t)((2.0f - (light + 12.0f) / 128.0f) * (float)FRACUNIT); - lightpos = (fixed_t)(globVis * posW * (float)FRACUNIT); - lightstep = (fixed_t)(globVis * stepW * (float)FRACUNIT); - - int affineOffset = x0 / 16 * 16 - x0; - lightpos = lightpos + lightstep * affineOffset; - lightstep = lightstep * 16; - - fixed_t maxvis = 24 * FRACUNIT / 32; - fixed_t maxlight = 31 * FRACUNIT / 32; - - for (int x = x0 / 16; x <= x1 / 16 + 1; x++) - { - lightarray[x] = (clamp(shade - MIN(maxvis, lightpos), 0, maxlight) >> 8) << 5; - lightpos += lightstep; - } - - int offset = x0 >> 4; - int t1 = x0 & 15; - int t0 = 16 - t1; - lightpos = (lightarray[offset] * t0 + lightarray[offset + 1] * t1); - - for (int x = x0 / 16; x <= x1 / 16; x++) - { - lightarray[x] = lightarray[x + 1] - lightarray[x]; - } + fixedlight = args->uniforms->Light(); + fixedlight += fixedlight >> 7; // 255 -> 256 + fixedlight = ((256 - fixedlight) * NUMCOLORMAPS) & 0xffffff00; } - if (OptT::Flags & SWOPT_DynLights) - { - v1WorldX = args->v1->worldX * v1W; - v1WorldY = args->v1->worldY * v1W; - v1WorldZ = args->v1->worldZ * v1W; - stepWorldX = args->gradientX.WorldX; - stepWorldY = args->gradientX.WorldY; - stepWorldZ = args->gradientX.WorldZ; - posWorldX = v1WorldX + stepWorldX * startX + args->gradientY.WorldX * startY; - posWorldY = v1WorldY + stepWorldY * startX + args->gradientY.WorldY * startY; - posWorldZ = v1WorldZ + stepWorldZ * startX + args->gradientY.WorldZ * startY; - - lights = args->uniforms->Lights(); - num_lights = args->uniforms->NumLights(); - worldnormalX = args->uniforms->Normal().X; - worldnormalY = args->uniforms->Normal().Y; - worldnormalZ = args->uniforms->Normal().Z; - dynlightcolor = args->uniforms->DynLightColor(); - - // The normal vector cannot be uniform when drawing models. Calculate and use the face normal: - if (worldnormalX == 0.0f && worldnormalY == 0.0f && worldnormalZ == 0.0f) - { - float dx1 = args->v2->worldX - args->v1->worldX; - float dy1 = args->v2->worldY - args->v1->worldY; - float dz1 = args->v2->worldZ - args->v1->worldZ; - float dx2 = args->v3->worldX - args->v1->worldX; - float dy2 = args->v3->worldY - args->v1->worldY; - float dz2 = args->v3->worldZ - args->v1->worldZ; - worldnormalX = dy1 * dz2 - dz1 * dy2; - worldnormalY = dz1 * dx2 - dx1 * dz2; - worldnormalZ = dx1 * dy2 - dy1 * dx2; - float lensqr = worldnormalX * worldnormalX + worldnormalY * worldnormalY + worldnormalZ * worldnormalZ; -#ifndef NO_SSE - float rcplen = _mm_cvtss_f32(_mm_rsqrt_ss(_mm_set_ss(lensqr))); -#else - float rcplen = 1.0f / sqrt(lensqr); -#endif - worldnormalX *= rcplen; - worldnormalY *= rcplen; - worldnormalZ *= rcplen; - } - - int affineOffset = x0 / 16 * 16 - x0; - float posLightW = posW + stepW * affineOffset; - posWorldX = posWorldX + stepWorldX * affineOffset; - posWorldY = posWorldY + stepWorldY * affineOffset; - posWorldZ = posWorldZ + stepWorldZ * affineOffset; - float stepLightW = stepW * 16.0f; - stepWorldX *= 16.0f; - stepWorldY *= 16.0f; - stepWorldZ *= 16.0f; - - for (int x = x0 / 16; x <= x1 / 16 + 1; x++) - { - uint32_t lit_r = RPART(dynlightcolor); - uint32_t lit_g = GPART(dynlightcolor); - uint32_t lit_b = BPART(dynlightcolor); - - float rcp_posW = 1.0f / posLightW; - float worldposX = posWorldX * rcp_posW; - float worldposY = posWorldY * rcp_posW; - float worldposZ = posWorldZ * rcp_posW; - for (int i = 0; i < num_lights; i++) - { - float lightposX = lights[i].x; - float lightposY = lights[i].y; - float lightposZ = lights[i].z; - float light_radius = lights[i].radius; - uint32_t light_color = lights[i].color; - - bool is_attenuated = light_radius < 0.0f; - if (is_attenuated) - light_radius = -light_radius; - - // L = light-pos - // dist = sqrt(dot(L, L)) - // distance_attenuation = 1 - MIN(dist * (1/radius), 1) - float Lx = lightposX - worldposX; - float Ly = lightposY - worldposY; - float Lz = lightposZ - worldposZ; - float dist2 = Lx * Lx + Ly * Ly + Lz * Lz; -#ifdef NO_SSE - //float rcp_dist = 1.0f / sqrt(dist2); - float rcp_dist = 1.0f / (dist2 * 0.01f); -#else - float rcp_dist = _mm_cvtss_f32(_mm_rsqrt_ss(_mm_set_ss(dist2))); -#endif - float dist = dist2 * rcp_dist; - float distance_attenuation = 256.0f - MIN(dist * light_radius, 256.0f); - - // The simple light type - float simple_attenuation = distance_attenuation; - - // The point light type - // diffuse = max(dot(N,normalize(L)),0) * attenuation - Lx *= rcp_dist; - Ly *= rcp_dist; - Lz *= rcp_dist; - float dotNL = worldnormalX * Lx + worldnormalY * Ly + worldnormalZ * Lz; - float point_attenuation = MAX(dotNL, 0.0f) * distance_attenuation; - - uint32_t attenuation = (uint32_t)(is_attenuated ? (int32_t)point_attenuation : (int32_t)simple_attenuation); - - lit_r += (RPART(light_color) * attenuation) >> 8; - lit_g += (GPART(light_color) * attenuation) >> 8; - lit_b += (BPART(light_color) * attenuation) >> 8; - } - - lit_r = MIN(lit_r, 255); - lit_g = MIN(lit_g, 255); - lit_b = MIN(lit_b, 255); - dynlights_r[x] = lit_r; - dynlights_g[x] = lit_g; - dynlights_b[x] = lit_b; - - posLightW += stepLightW; - posWorldX += stepWorldX; - posWorldY += stepWorldY; - posWorldZ += stepWorldZ; - } - - int offset = x0 >> 4; - int t1 = x0 & 15; - int t0 = 16 - t1; - posdynlight_r = (dynlights_r[offset] * t0 + dynlights_r[offset + 1] * t1); - posdynlight_g = (dynlights_g[offset] * t0 + dynlights_g[offset + 1] * t1); - posdynlight_b = (dynlights_b[offset] * t0 + dynlights_b[offset + 1] * t1); - - for (int x = x0 / 16; x <= x1 / 16; x++) - { - dynlights_r[x] = dynlights_r[x + 1] - dynlights_r[x]; - dynlights_g[x] = dynlights_g[x + 1] - dynlights_g[x]; - dynlights_b[x] = dynlights_b[x + 1] - dynlights_b[x]; - } - } - - fixed_t fuzzscale; - int _fuzzpos; if (ModeT::BlendOp == STYLEOP_Fuzz) { fuzzscale = (200 << FRACBITS) / viewheight; @@ -1371,19 +1238,13 @@ void DrawSpanOpt8(int y, int x0, int x1, const TriDrawTriangleArgs *args, PolyTr uint8_t *dest = (uint8_t*)args->dest; uint8_t *destLine = dest + args->pitch * y; - int x = x0; - while (x < x1) + for (int x = x0; x < x1; x++) { if (ModeT::BlendOp == STYLEOP_Fuzz) { using namespace swrenderer; - float rcpW = 0x01000000 / posW; - int32_t u = (int32_t)(posU * rcpW); - int32_t v = (int32_t)(posV * rcpW); - uint32_t texelX = ((((uint32_t)u << 8) >> 16) * texWidth) >> 16; - uint32_t texelY = ((((uint32_t)v << 8) >> 16) * texHeight) >> 16; - unsigned int sampleshadeout = (texPixels[texelX * texHeight + texelY] != 0) ? 256 : 0; + unsigned int sampleshadeout = (texPixels[texel[x]] != 0) ? 256 : 0; int scaled_x = (x * fuzzscale) >> FRACBITS; int fuzz_x = fuzz_random_x_offset[scaled_x % FUZZ_RANDOM_X_SIZE] + _fuzzpos; @@ -1404,12 +1265,8 @@ void DrawSpanOpt8(int y, int x0, int x1, const TriDrawTriangleArgs *args, PolyTr } else if (ModeT::SWFlags & SWSTYLEF_Skycap) { - float rcpW = 0x01000000 / posW; - int32_t u = (int32_t)(posU * rcpW); - int32_t v = (int32_t)(posV * rcpW); - uint32_t texelX = ((((uint32_t)u << 8) >> 16) * texWidth) >> 16; - uint32_t texelY = ((((uint32_t)v << 8) >> 16) * texHeight) >> 16; - int fg = texPixels[texelX * texHeight + texelY]; + int32_t v = texelV[x]; + int fg = texPixels[texel[x]]; int start_fade = 2; // How fast it should fade out int alpha_top = clamp(v >> (16 - start_fade), 0, 256); @@ -1446,12 +1303,11 @@ void DrawSpanOpt8(int y, int x0, int x1, const TriDrawTriangleArgs *args, PolyTr uint8_t shadedfg; if (OptT::Flags & SWOPT_FixedLight) { - shadedfg = colormaps[light + fg]; + shadedfg = colormaps[fixedlight + fg]; } else { - int lightshade = (lightpos >> 4) & 0xffffff00; - shadedfg = colormaps[lightshade + fg]; + shadedfg = colormaps[lightarray[x] + fg]; } destLine[x] = shadedfg; @@ -1465,12 +1321,7 @@ void DrawSpanOpt8(int y, int x0, int x1, const TriDrawTriangleArgs *args, PolyTr } else { - float rcpW = 0x01000000 / posW; - int32_t u = (int32_t)(posU * rcpW); - int32_t v = (int32_t)(posV * rcpW); - uint32_t texelX = ((((uint32_t)u << 8) >> 16) * texWidth) >> 16; - uint32_t texelY = ((((uint32_t)v << 8) >> 16) * texHeight) >> 16; - fg = texPixels[texelX * texHeight + texelY]; + fg = texPixels[texel[x]]; } int fgalpha = 255; @@ -1490,7 +1341,7 @@ void DrawSpanOpt8(int y, int x0, int x1, const TriDrawTriangleArgs *args, PolyTr if (!(ModeT::Flags & STYLEF_Alpha1)) { - fgalpha = (fgalpha * alpha) >> 8; + fgalpha = (fgalpha * actoralpha) >> 8; } if (ModeT::SWFlags & SWSTYLEF_Translated) @@ -1499,28 +1350,24 @@ void DrawSpanOpt8(int y, int x0, int x1, const TriDrawTriangleArgs *args, PolyTr uint8_t shadedfg; if (OptT::Flags & SWOPT_FixedLight) { - shadedfg = colormaps[light + fg]; + shadedfg = colormaps[fixedlight + fg]; } else { - int lightshade = (lightpos >> 4) & 0xffffff00; - shadedfg = colormaps[lightshade + fg]; + shadedfg = colormaps[lightarray[x] + fg]; } if (OptT::Flags & SWOPT_DynLights) { - if (posdynlight_r | posdynlight_g | posdynlight_b) + uint32_t lit = dynlights[x]; + if (lit & 0x00ffffff) { - uint32_t lit_r = posdynlight_r >> 4; - uint32_t lit_g = posdynlight_g >> 4; - uint32_t lit_b = posdynlight_b >> 4; - uint32_t fgrgb = GPalette.BaseColors[fg]; uint32_t shadedfgrgb = GPalette.BaseColors[shadedfg]; - uint32_t out_r = MIN(((RPART(fgrgb) * lit_r) >> 8) + RPART(shadedfgrgb), (uint32_t)255); - uint32_t out_g = MIN(((GPART(fgrgb) * lit_g) >> 8) + GPART(shadedfgrgb), (uint32_t)255); - uint32_t out_b = MIN(((BPART(fgrgb) * lit_b) >> 8) + BPART(shadedfgrgb), (uint32_t)255); + uint32_t out_r = MIN(((RPART(fgrgb) * RPART(lit)) >> 8) + RPART(shadedfgrgb), (uint32_t)255); + uint32_t out_g = MIN(((GPART(fgrgb) * GPART(lit)) >> 8) + GPART(shadedfgrgb), (uint32_t)255); + uint32_t out_b = MIN(((BPART(fgrgb) * BPART(lit)) >> 8) + BPART(shadedfgrgb), (uint32_t)255); shadedfg = RGB256k.All[((out_r >> 2) << 12) | ((out_g >> 2) << 6) | (out_b >> 2)]; } } @@ -1637,19 +1484,6 @@ void DrawSpanOpt8(int y, int x0, int x1, const TriDrawTriangleArgs *args, PolyTr destLine[x] = RGB256k.All[((out_r >> 2) << 12) | ((out_g >> 2) << 6) | (out_b >> 2)]; } } - - posW += stepW; - posU += stepU; - posV += stepV; - if (OptT::Flags & SWOPT_DynLights) - { - posdynlight_r += dynlights_r[x >> 4]; - posdynlight_g += dynlights_g[x >> 4]; - posdynlight_b += dynlights_b[x >> 4]; - } - if (!(OptT::Flags & SWOPT_FixedLight)) - lightpos += lightarray[x >> 4]; - x++; } }