diff --git a/src/polyrenderer/drawers/screen_triangle.cpp b/src/polyrenderer/drawers/screen_triangle.cpp index f3aad8f47..b4d2168ae 100644 --- a/src/polyrenderer/drawers/screen_triangle.cpp +++ b/src/polyrenderer/drawers/screen_triangle.cpp @@ -1177,21 +1177,30 @@ void ScreenTriangle::DrawSWRender(const TriDrawTriangleArgs *args, WorkerThreadD } } - // Make variables local so the compiler can optimize without worrying about pointer aliasing + // Draw the triangle: - bool depthTest = args->uniforms->DepthTest(); - bool writeColor = args->uniforms->WriteColor(); - bool writeStencil = args->uniforms->WriteStencil(); - bool writeDepth = args->uniforms->WriteDepth(); + if (args->destBgra) + { + int num_cores = thread->num_cores; + for (int y = topY + thread->skipped_by_thread(topY); y < bottomY; y += num_cores) + { + DrawSpan32(y, leftEdge[y], rightEdge[y], args); + } + } + else + { + int num_cores = thread->num_cores; + for (int y = topY + thread->skipped_by_thread(topY); y < bottomY; y += num_cores) + { + DrawSpan8(y, leftEdge[y], rightEdge[y], args); + } + } +} - int bmode = (int)args->uniforms->BlendMode(); - auto drawFunc = args->destBgra ? ScreenTriangle::TriDrawers32[bmode] : ScreenTriangle::TriDrawers8[bmode]; - - float *zbuffer = args->zbuffer; - int pitch = args->pitch; - int stencilpitch = args->stencilPitch * 8; - int color = ((int)(ptrdiff_t)args->uniforms->TexturePixels()) >> 2; +#ifndef NO_SSE +void ScreenTriangle::DrawSpan32(int y, int x0, int x1, const TriDrawTriangleArgs *args) +{ float v1X = args->v1->x; float v1Y = args->v1->y; float v1W = args->v1->w; @@ -1200,12 +1209,15 @@ void ScreenTriangle::DrawSWRender(const TriDrawTriangleArgs *args, WorkerThreadD float stepXW = args->gradientX.W; float stepXU = args->gradientX.U; float stepXV = args->gradientX.V; - float stepYW = args->gradientY.W; - float stepYU = args->gradientY.U; - float stepYV = args->gradientY.V; + float startX = x0 + (0.5f - v1X); + float startY = y + (0.5f - v1Y); + float posXW = v1W + stepXW * startX + args->gradientY.W * startY; + float posXU = v1U + stepXU * startX + args->gradientY.U * startY; + float posXV = v1V + stepXV * startX + args->gradientY.V * startY; + + const uint32_t *texPixels = (const uint32_t*)args->uniforms->TexturePixels(); int texWidth = args->uniforms->TextureWidth(); int texHeight = args->uniforms->TextureHeight(); - auto colormaps = args->uniforms->BaseColormap(); bool is_fixed_light = args->uniforms->FixedLight(); uint32_t lightmask = is_fixed_light ? 0 : 0xffffffff; @@ -1214,69 +1226,100 @@ void ScreenTriangle::DrawSWRender(const TriDrawTriangleArgs *args, WorkerThreadD float globVis = args->uniforms->GlobVis() * (1.0f / 32.0f); light += light >> 7; // 255 -> 256 - // Draw the triangle: + float *zbufferLine = args->zbuffer + args->stencilPitch * 8 * y; - if (args->destBgra) + uint32_t *dest = (uint32_t*)args->dest; + uint32_t *destLine = dest + args->pitch * y; + + int x = x0; + int sseEnd = x0 + (x1 - x0) / 2 * 2; + while (x < sseEnd) { - uint32_t *dest = (uint32_t*)args->dest; - const uint32_t *texPixels = (const uint32_t*)args->uniforms->TexturePixels(); - - int num_cores = thread->num_cores; - for (int y = topY + thread->skipped_by_thread(topY); y < bottomY; y += num_cores) + if (zbufferLine[x] <= posXW && zbufferLine[x + 1] <= posXW + stepXW) { - int x0 = leftEdge[y]; - int x1 = rightEdge[y]; + zbufferLine[x] = posXW; + zbufferLine[x + 1] = posXW + stepXW; - uint32_t *destLine = dest + pitch * y; - float *zbufferLine = zbuffer + stencilpitch * y; + uint32_t fgcolor[2]; + int32_t lightshade[2]; - float posXW = v1W + stepXW * (x0 + (0.5f - v1X)) + stepYW * (y + (0.5f - v1Y)); - float posXU = v1U + stepXU * (x0 + (0.5f - v1X)) + stepYU * (y + (0.5f - v1Y)); - float posXV = v1V + stepXV * (x0 + (0.5f - v1X)) + stepYV * (y + (0.5f - v1Y)); + float rcpW = 0x01000000 / posXW; + int32_t u = (int32_t)(posXU * rcpW); + int32_t v = (int32_t)(posXV * rcpW); + uint32_t texelX = ((((uint32_t)u << 8) >> 16) * texWidth) >> 16; + uint32_t texelY = ((((uint32_t)v << 8) >> 16) * texHeight) >> 16; + fgcolor[0] = texPixels[texelX * texHeight + texelY]; - int x = x0; - while (x < x1) + fixed_t lightpos = FRACUNIT - (int)(clamp(shade - MIN(24.0f / 32.0f, globVis * posXW), 0.0f, 31.0f / 32.0f) * (float)FRACUNIT); + lightpos = (lightpos & lightmask) | ((light << 8) & ~lightmask); + lightshade[0] = lightpos >> 8; + + posXW += stepXW; + posXU += stepXU; + posXV += stepXV; + + rcpW = 0x01000000 / posXW; + u = (int32_t)(posXU * rcpW); + v = (int32_t)(posXV * rcpW); + texelX = ((((uint32_t)u << 8) >> 16) * texWidth) >> 16; + texelY = ((((uint32_t)v << 8) >> 16) * texHeight) >> 16; + fgcolor[1] = texPixels[texelX * texHeight + texelY]; + + lightpos = FRACUNIT - (int)(clamp(shade - MIN(24.0f / 32.0f, globVis * posXW), 0.0f, 31.0f / 32.0f) * (float)FRACUNIT); + lightpos = (lightpos & lightmask) | ((light << 8) & ~lightmask); + lightshade[1] = lightpos >> 8; + + posXW += stepXW; + posXU += stepXU; + posXV += stepXV; + + __m128i mfgcolor = _mm_loadl_epi64((const __m128i*)fgcolor); + mfgcolor = _mm_unpacklo_epi8(mfgcolor, _mm_setzero_si128()); + + __m128i mlightshade = _mm_loadl_epi64((const __m128i*)lightshade); + mlightshade = _mm_shuffle_epi32(mlightshade, _MM_SHUFFLE(1, 0, 1, 0)); + mlightshade = _mm_packs_epi32(mlightshade, mlightshade); + + __m128i mdestcolor = _mm_srli_epi16(_mm_mullo_epi16(mfgcolor, mlightshade), 8); + mdestcolor = _mm_packus_epi16(mdestcolor, _mm_setzero_si128()); + mdestcolor = _mm_or_si128(mdestcolor, _mm_set1_epi32(0xff000000)); + + _mm_storel_epi64((__m128i*)(destLine + x), mdestcolor); + + x += 2; + } + else + { + for (int i = 0; i < 2; i++) { - bool processPixel = depthTest ? zbufferLine[x] <= posXW : true; - if (processPixel) // Pixel is visible (passed stencil and depth tests) + if (zbufferLine[x] <= posXW) { - if (writeColor) + float rcpW = 0x01000000 / posXW; + int32_t u = (int32_t)(posXU * rcpW); + int32_t v = (int32_t)(posXV * rcpW); + + uint32_t texelX = ((((uint32_t)u << 8) >> 16) * texWidth) >> 16; + uint32_t texelY = ((((uint32_t)v << 8) >> 16) * texHeight) >> 16; + uint32_t fgcolor = texPixels[texelX * texHeight + texelY]; + + uint32_t fgcolor_r = RPART(fgcolor); + uint32_t fgcolor_g = GPART(fgcolor); + uint32_t fgcolor_b = BPART(fgcolor); + uint32_t fgcolor_a = APART(fgcolor); + if (fgcolor_a > 127) { - if (texPixels) - { - float rcpW = 0x01000000 / posXW; - int32_t u = (int32_t)(posXU * rcpW); - int32_t v = (int32_t)(posXV * rcpW); + fixed_t lightpos = FRACUNIT - (int)(clamp(shade - MIN(24.0f / 32.0f, globVis * posXW), 0.0f, 31.0f / 32.0f) * (float)FRACUNIT); + lightpos = (lightpos & lightmask) | ((light << 8) & ~lightmask); + int lightshade = lightpos >> 8; - uint32_t texelX = ((((uint32_t)u << 8) >> 16) * texWidth) >> 16; - uint32_t texelY = ((((uint32_t)v << 8) >> 16) * texHeight) >> 16; - uint32_t fgcolor = texPixels[texelX * texHeight + texelY]; + fgcolor_r = (fgcolor_r * lightshade) >> 8; + fgcolor_g = (fgcolor_g * lightshade) >> 8; + fgcolor_b = (fgcolor_b * lightshade) >> 8; - uint32_t fgcolor_r = RPART(fgcolor); - uint32_t fgcolor_g = GPART(fgcolor); - uint32_t fgcolor_b = BPART(fgcolor); - uint32_t fgcolor_a = APART(fgcolor); - if (fgcolor_a > 127) - { - fixed_t lightpos = FRACUNIT - (int)(clamp(shade - MIN(24.0f / 32.0f, globVis * posXW), 0.0f, 31.0f / 32.0f) * (float)FRACUNIT); - lightpos = (lightpos & lightmask) | ((light << 8) & ~lightmask); - int lightshade = lightpos >> 8; - - fgcolor_r = (fgcolor_r * lightshade) >> 8; - fgcolor_g = (fgcolor_g * lightshade) >> 8; - fgcolor_b = (fgcolor_b * lightshade) >> 8; - - destLine[x] = 0xff000000 | (fgcolor_r << 16) | (fgcolor_g << 8) | fgcolor_b; - } - } - else - { - destLine[x] = color; - } + destLine[x] = 0xff000000 | (fgcolor_r << 16) | (fgcolor_g << 8) | fgcolor_b; } - if (writeDepth) - zbufferLine[x] = posXW; + zbufferLine[x] = posXW; } posXW += stepXW; @@ -1286,68 +1329,184 @@ void ScreenTriangle::DrawSWRender(const TriDrawTriangleArgs *args, WorkerThreadD } } } - else + + while (x < x1) { - uint8_t *dest = args->dest; - const uint8_t *texPixels = args->uniforms->TexturePixels(); - - int num_cores = thread->num_cores; - for (int y = topY + thread->skipped_by_thread(topY); y < bottomY; y += num_cores) + if (zbufferLine[x] <= posXW) { - int x0 = leftEdge[y]; - int x1 = rightEdge[y]; + float rcpW = 0x01000000 / posXW; + int32_t u = (int32_t)(posXU * rcpW); + int32_t v = (int32_t)(posXV * rcpW); - uint8_t *destLine = dest + pitch * y; - float *zbufferLine = zbuffer + stencilpitch * y; + uint32_t texelX = ((((uint32_t)u << 8) >> 16) * texWidth) >> 16; + uint32_t texelY = ((((uint32_t)v << 8) >> 16) * texHeight) >> 16; + uint32_t fgcolor = texPixels[texelX * texHeight + texelY]; - float posXW = v1W + stepXW * (x0 + (0.5f - v1X)) + stepYW * (y + (0.5f - v1Y)); - float posXU = v1U + stepXU * (x0 + (0.5f - v1X)) + stepYU * (y + (0.5f - v1Y)); - float posXV = v1V + stepXV * (x0 + (0.5f - v1X)) + stepYV * (y + (0.5f - v1Y)); - - int x = x0; - while (x < x1) + uint32_t fgcolor_r = RPART(fgcolor); + uint32_t fgcolor_g = GPART(fgcolor); + uint32_t fgcolor_b = BPART(fgcolor); + uint32_t fgcolor_a = APART(fgcolor); + if (fgcolor_a > 127) { - bool processPixel = depthTest ? zbufferLine[x] <= posXW : true; - if (processPixel) // Pixel is visible (passed stencil and depth tests) - { - if (writeColor) - { - if (texPixels) - { - float rcpW = 0x01000000 / posXW; - int32_t u = (int32_t)(posXU * rcpW); - int32_t v = (int32_t)(posXV * rcpW); + fixed_t lightpos = FRACUNIT - (int)(clamp(shade - MIN(24.0f / 32.0f, globVis * posXW), 0.0f, 31.0f / 32.0f) * (float)FRACUNIT); + lightpos = (lightpos & lightmask) | ((light << 8) & ~lightmask); + int lightshade = lightpos >> 8; - uint32_t texelX = ((((uint32_t)u << 8) >> 16) * texWidth) >> 16; - uint32_t texelY = ((((uint32_t)v << 8) >> 16) * texHeight) >> 16; - uint8_t fgcolor = texPixels[texelX * texHeight + texelY]; + fgcolor_r = (fgcolor_r * lightshade) >> 8; + fgcolor_g = (fgcolor_g * lightshade) >> 8; + fgcolor_b = (fgcolor_b * lightshade) >> 8; - fixed_t lightpos = FRACUNIT - (int)(clamp(shade - MIN(24.0f / 32.0f, globVis * posXW), 0.0f, 31.0f / 32.0f) * (float)FRACUNIT); - lightpos = (lightpos & lightmask) | ((light << 8) & ~lightmask); - int lightshade = lightpos >> 8; - - lightshade = ((256 - lightshade) * NUMCOLORMAPS) & 0xffffff00; - uint8_t shadedfg = colormaps[lightshade + fgcolor]; - - if (fgcolor != 0) - destLine[x] = shadedfg; - } - else - { - destLine[x] = color; - } - } - - if (writeDepth) - zbufferLine[x] = posXW; - } - - posXW += stepXW; - posXU += stepXU; - posXV += stepXV; - x++; + destLine[x] = 0xff000000 | (fgcolor_r << 16) | (fgcolor_g << 8) | fgcolor_b; } + + zbufferLine[x] = posXW; } + + posXW += stepXW; + posXU += stepXU; + posXV += stepXV; + x++; + } +} + +#else + +void ScreenTriangle::DrawSpan32(int y, int x0, int x1, const TriDrawTriangleArgs *args) +{ + float v1X = args->v1->x; + float v1Y = args->v1->y; + float v1W = args->v1->w; + float v1U = args->v1->u * v1W; + float v1V = args->v1->v * v1W; + float stepXW = args->gradientX.W; + float stepXU = args->gradientX.U; + float stepXV = args->gradientX.V; + float startX = x0 + (0.5f - v1X); + float startY = y + (0.5f - v1Y); + float posXW = v1W + stepXW * startX + args->gradientY.W * startY; + float posXU = v1U + stepXU * startX + args->gradientY.U * startY; + float posXV = v1V + stepXV * startX + args->gradientY.V * startY; + + const uint32_t *texPixels = (const uint32_t*)args->uniforms->TexturePixels(); + int texWidth = args->uniforms->TextureWidth(); + int texHeight = args->uniforms->TextureHeight(); + + bool is_fixed_light = args->uniforms->FixedLight(); + uint32_t lightmask = is_fixed_light ? 0 : 0xffffffff; + uint32_t light = args->uniforms->Light(); + float shade = 2.0f - (light + 12.0f) / 128.0f; + float globVis = args->uniforms->GlobVis() * (1.0f / 32.0f); + light += light >> 7; // 255 -> 256 + + float *zbufferLine = args->zbuffer + args->stencilPitch * 8 * y; + + uint32_t *dest = (uint32_t*)args->dest; + uint32_t *destLine = dest + args->pitch * y; + + int x = x0; + while (x < x1) + { + if (zbufferLine[x] <= posXW) + { + float rcpW = 0x01000000 / posXW; + int32_t u = (int32_t)(posXU * rcpW); + int32_t v = (int32_t)(posXV * rcpW); + + uint32_t texelX = ((((uint32_t)u << 8) >> 16) * texWidth) >> 16; + uint32_t texelY = ((((uint32_t)v << 8) >> 16) * texHeight) >> 16; + uint32_t fgcolor = texPixels[texelX * texHeight + texelY]; + + uint32_t fgcolor_r = RPART(fgcolor); + uint32_t fgcolor_g = GPART(fgcolor); + uint32_t fgcolor_b = BPART(fgcolor); + uint32_t fgcolor_a = APART(fgcolor); + if (fgcolor_a > 127) + { + fixed_t lightpos = FRACUNIT - (int)(clamp(shade - MIN(24.0f / 32.0f, globVis * posXW), 0.0f, 31.0f / 32.0f) * (float)FRACUNIT); + lightpos = (lightpos & lightmask) | ((light << 8) & ~lightmask); + int lightshade = lightpos >> 8; + + fgcolor_r = (fgcolor_r * lightshade) >> 8; + fgcolor_g = (fgcolor_g * lightshade) >> 8; + fgcolor_b = (fgcolor_b * lightshade) >> 8; + + destLine[x] = 0xff000000 | (fgcolor_r << 16) | (fgcolor_g << 8) | fgcolor_b; + } + + zbufferLine[x] = posXW; + } + + posXW += stepXW; + posXU += stepXU; + posXV += stepXV; + x++; + } +} + +#endif + +void ScreenTriangle::DrawSpan8(int y, int x0, int x1, const TriDrawTriangleArgs *args) +{ + float v1X = args->v1->x; + float v1Y = args->v1->y; + float v1W = args->v1->w; + float v1U = args->v1->u * v1W; + float v1V = args->v1->v * v1W; + float stepXW = args->gradientX.W; + float stepXU = args->gradientX.U; + float stepXV = args->gradientX.V; + float startX = x0 + (0.5f - v1X); + float startY = y + (0.5f - v1Y); + float posXW = v1W + stepXW * startX + args->gradientY.W * startY; + float posXU = v1U + stepXU * startX + args->gradientY.U * startY; + float posXV = v1V + stepXV * startX + args->gradientY.V * startY; + + auto colormaps = args->uniforms->BaseColormap(); + + const uint32_t *texPixels = (const uint32_t*)args->uniforms->TexturePixels(); + int texWidth = args->uniforms->TextureWidth(); + int texHeight = args->uniforms->TextureHeight(); + + bool is_fixed_light = args->uniforms->FixedLight(); + uint32_t lightmask = is_fixed_light ? 0 : 0xffffffff; + uint32_t light = args->uniforms->Light(); + float shade = 2.0f - (light + 12.0f) / 128.0f; + float globVis = args->uniforms->GlobVis() * (1.0f / 32.0f); + light += light >> 7; // 255 -> 256 + + float *zbufferLine = args->zbuffer + args->stencilPitch * 8 * y; + + uint32_t *dest = (uint32_t*)args->dest; + uint32_t *destLine = dest + args->pitch * y; + + int x = x0; + while (x < x1) + { + if (zbufferLine[x] <= posXW) + { + float rcpW = 0x01000000 / posXW; + int32_t u = (int32_t)(posXU * rcpW); + int32_t v = (int32_t)(posXV * rcpW); + + uint32_t texelX = ((((uint32_t)u << 8) >> 16) * texWidth) >> 16; + uint32_t texelY = ((((uint32_t)v << 8) >> 16) * texHeight) >> 16; + uint8_t fgcolor = texPixels[texelX * texHeight + texelY]; + + fixed_t lightpos = FRACUNIT - (int)(clamp(shade - MIN(24.0f / 32.0f, globVis * posXW), 0.0f, 31.0f / 32.0f) * (float)FRACUNIT); + lightpos = (lightpos & lightmask) | ((light << 8) & ~lightmask); + int lightshade = lightpos >> 8; + + lightshade = ((256 - lightshade) * NUMCOLORMAPS) & 0xffffff00; + uint8_t shadedfg = colormaps[lightshade + fgcolor]; + + if (fgcolor != 0) + destLine[x] = shadedfg; + } + + posXW += stepXW; + posXU += stepXU; + posXV += stepXV; + x++; } } diff --git a/src/polyrenderer/drawers/screen_triangle.h b/src/polyrenderer/drawers/screen_triangle.h index 4b0666ccf..62f59ba44 100644 --- a/src/polyrenderer/drawers/screen_triangle.h +++ b/src/polyrenderer/drawers/screen_triangle.h @@ -150,6 +150,8 @@ class ScreenTriangle public: static void Draw(const TriDrawTriangleArgs *args, WorkerThreadData *thread); static void DrawSWRender(const TriDrawTriangleArgs *args, WorkerThreadData *thread); + static void DrawSpan8(int y, int x0, int x1, const TriDrawTriangleArgs *args); + static void DrawSpan32(int y, int x0, int x1, const TriDrawTriangleArgs *args); static void(*TriDrawers8[])(int, int, uint32_t, uint32_t, const TriDrawTriangleArgs *); static void(*TriDrawers32[])(int, int, uint32_t, uint32_t, const TriDrawTriangleArgs *);