From 01a8df7eb3e51ac30997d9c69c0bb3a965cd7be8 Mon Sep 17 00:00:00 2001 From: Magnus Norddahl Date: Wed, 22 Feb 2017 08:05:23 +0100 Subject: [PATCH] Add dynlights to wall drawer --- src/swrenderer/drawers/r_draw_wall32.h | 1880 +++++++++++++++++++++- src/swrenderer/drawers/r_draw_wall32.php | 62 +- 2 files changed, 1919 insertions(+), 23 deletions(-) diff --git a/src/swrenderer/drawers/r_draw_wall32.h b/src/swrenderer/drawers/r_draw_wall32.h index b6c446a53..d4ba5d51e 100644 --- a/src/swrenderer/drawers/r_draw_wall32.h +++ b/src/swrenderer/drawers/r_draw_wall32.h @@ -56,7 +56,7 @@ namespace swrenderer int light = 256 - (args.Light() >> (FRACBITS - 8)); __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - + int count = args.Count(); int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); uint32_t fracstep = args.TextureVStep(); @@ -64,6 +64,15 @@ namespace swrenderer uint32_t texturefracx = args.TextureUPos(); uint32_t *dest = (uint32_t*)args.Dest(); int dest_y = args.DestY(); + + auto lights = args.dc_lights; + auto num_lights = args.dc_num_lights; + float vpz = args.dc_viewpos.Z; + float stepvpz = args.dc_viewpos_step.Z; + vpz += thread->skipped_by_thread(dest_y) * stepvpz; + stepvpz *= thread->num_cores; + __m128 viewpos_z = _mm_set_ps(vpz, vpz + stepvpz, 0.0f, 0.0f); + __m128 step_viewpos_z = _mm_set1_ps(stepvpz * 2.0f); count = thread->count_for_thread(dest_y, count); if (count <= 0) return; @@ -99,7 +108,48 @@ namespace swrenderer __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade + __m128i material = fgcolor; fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + __m128i lit = _mm_setzero_si128(); + + for (int i = 0; i != num_lights; i++) + { + __m128 light_x = _mm_set1_ps(lights[i].x); + __m128 light_y = _mm_set1_ps(lights[i].y); + __m128 light_z = _mm_set1_ps(lights[i].z); + __m128 light_radius = _mm_set1_ps(lights[i].radius); + __m128 m256 = _mm_set1_ps(256.0f); + + // L = light-pos + // dist = sqrt(dot(L, L)) + // distance_attenuation = 1 - MIN(dist * (1/radius), 1) + __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y + __m128 Lz = _mm_sub_ps(light_z, viewpos_z); + __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); + __m128 rcp_dist = _mm_rsqrt_ps(dist2); + __m128 dist = _mm_mul_ps(dist2, rcp_dist); + __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); + + // The simple light type + __m128 simple_attenuation = distance_attenuation; + + // The point light type + // diffuse = dot(N,L) * attenuation + __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); + + __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); + __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); + attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); + + __m128i light_color = _mm_cvtsi32_si128(lights[i].color); + light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); + light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); + + lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); + } + + fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); + fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); // Blend __m128i outcolor = fgcolor; @@ -108,6 +158,7 @@ namespace swrenderer _mm_storel_epi64((__m128i*)desttmp, outcolor); dest[offset] = desttmp[0]; dest[offset + pitch] = desttmp[1]; + viewpos_z = _mm_add_ps(viewpos_z, step_viewpos_z); } if (ssecount * 2 != count) @@ -124,7 +175,48 @@ namespace swrenderer __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade + __m128i material = fgcolor; fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + __m128i lit = _mm_setzero_si128(); + + for (int i = 0; i != num_lights; i++) + { + __m128 light_x = _mm_set1_ps(lights[i].x); + __m128 light_y = _mm_set1_ps(lights[i].y); + __m128 light_z = _mm_set1_ps(lights[i].z); + __m128 light_radius = _mm_set1_ps(lights[i].radius); + __m128 m256 = _mm_set1_ps(256.0f); + + // L = light-pos + // dist = sqrt(dot(L, L)) + // distance_attenuation = 1 - MIN(dist * (1/radius), 1) + __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y + __m128 Lz = _mm_sub_ps(light_z, viewpos_z); + __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); + __m128 rcp_dist = _mm_rsqrt_ps(dist2); + __m128 dist = _mm_mul_ps(dist2, rcp_dist); + __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); + + // The simple light type + __m128 simple_attenuation = distance_attenuation; + + // The point light type + // diffuse = dot(N,L) * attenuation + __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); + + __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); + __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); + attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); + + __m128i light_color = _mm_cvtsi32_si128(lights[i].color); + light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); + light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); + + lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); + } + + fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); + fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); // Blend __m128i outcolor = fgcolor; @@ -142,7 +234,7 @@ namespace swrenderer int light = 256 - (args.Light() >> (FRACBITS - 8)); __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - + int count = args.Count(); int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); uint32_t fracstep = args.TextureVStep(); @@ -150,6 +242,15 @@ namespace swrenderer uint32_t texturefracx = args.TextureUPos(); uint32_t *dest = (uint32_t*)args.Dest(); int dest_y = args.DestY(); + + auto lights = args.dc_lights; + auto num_lights = args.dc_num_lights; + float vpz = args.dc_viewpos.Z; + float stepvpz = args.dc_viewpos_step.Z; + vpz += thread->skipped_by_thread(dest_y) * stepvpz; + stepvpz *= thread->num_cores; + __m128 viewpos_z = _mm_set_ps(vpz, vpz + stepvpz, 0.0f, 0.0f); + __m128 step_viewpos_z = _mm_set1_ps(stepvpz * 2.0f); count = thread->count_for_thread(dest_y, count); if (count <= 0) return; @@ -224,7 +325,48 @@ namespace swrenderer __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade + __m128i material = fgcolor; fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + __m128i lit = _mm_setzero_si128(); + + for (int i = 0; i != num_lights; i++) + { + __m128 light_x = _mm_set1_ps(lights[i].x); + __m128 light_y = _mm_set1_ps(lights[i].y); + __m128 light_z = _mm_set1_ps(lights[i].z); + __m128 light_radius = _mm_set1_ps(lights[i].radius); + __m128 m256 = _mm_set1_ps(256.0f); + + // L = light-pos + // dist = sqrt(dot(L, L)) + // distance_attenuation = 1 - MIN(dist * (1/radius), 1) + __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y + __m128 Lz = _mm_sub_ps(light_z, viewpos_z); + __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); + __m128 rcp_dist = _mm_rsqrt_ps(dist2); + __m128 dist = _mm_mul_ps(dist2, rcp_dist); + __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); + + // The simple light type + __m128 simple_attenuation = distance_attenuation; + + // The point light type + // diffuse = dot(N,L) * attenuation + __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); + + __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); + __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); + attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); + + __m128i light_color = _mm_cvtsi32_si128(lights[i].color); + light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); + light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); + + lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); + } + + fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); + fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); // Blend __m128i outcolor = fgcolor; @@ -233,6 +375,7 @@ namespace swrenderer _mm_storel_epi64((__m128i*)desttmp, outcolor); dest[offset] = desttmp[0]; dest[offset + pitch] = desttmp[1]; + viewpos_z = _mm_add_ps(viewpos_z, step_viewpos_z); } if (ssecount * 2 != count) @@ -268,7 +411,48 @@ namespace swrenderer __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade + __m128i material = fgcolor; fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + __m128i lit = _mm_setzero_si128(); + + for (int i = 0; i != num_lights; i++) + { + __m128 light_x = _mm_set1_ps(lights[i].x); + __m128 light_y = _mm_set1_ps(lights[i].y); + __m128 light_z = _mm_set1_ps(lights[i].z); + __m128 light_radius = _mm_set1_ps(lights[i].radius); + __m128 m256 = _mm_set1_ps(256.0f); + + // L = light-pos + // dist = sqrt(dot(L, L)) + // distance_attenuation = 1 - MIN(dist * (1/radius), 1) + __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y + __m128 Lz = _mm_sub_ps(light_z, viewpos_z); + __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); + __m128 rcp_dist = _mm_rsqrt_ps(dist2); + __m128 dist = _mm_mul_ps(dist2, rcp_dist); + __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); + + // The simple light type + __m128 simple_attenuation = distance_attenuation; + + // The point light type + // diffuse = dot(N,L) * attenuation + __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); + + __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); + __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); + attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); + + __m128i light_color = _mm_cvtsi32_si128(lights[i].color); + light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); + light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); + + lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); + } + + fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); + fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); // Blend __m128i outcolor = fgcolor; @@ -297,7 +481,7 @@ namespace swrenderer shade_fade = _mm_mullo_epi16(shade_fade, inv_light); __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); int desaturate = shade_constants.desaturate; - + int count = args.Count(); int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); uint32_t fracstep = args.TextureVStep(); @@ -305,6 +489,15 @@ namespace swrenderer uint32_t texturefracx = args.TextureUPos(); uint32_t *dest = (uint32_t*)args.Dest(); int dest_y = args.DestY(); + + auto lights = args.dc_lights; + auto num_lights = args.dc_num_lights; + float vpz = args.dc_viewpos.Z; + float stepvpz = args.dc_viewpos_step.Z; + vpz += thread->skipped_by_thread(dest_y) * stepvpz; + stepvpz *= thread->num_cores; + __m128 viewpos_z = _mm_set_ps(vpz, vpz + stepvpz, 0.0f, 0.0f); + __m128 step_viewpos_z = _mm_set1_ps(stepvpz * 2.0f); count = thread->count_for_thread(dest_y, count); if (count <= 0) return; @@ -340,6 +533,7 @@ namespace swrenderer __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade + __m128i material = fgcolor; int blue0 = BPART(ifgcolor[0]); int green0 = GPART(ifgcolor[0]); int red0 = RPART(ifgcolor[0]); @@ -356,6 +550,46 @@ namespace swrenderer fgcolor = _mm_mullo_epi16(fgcolor, mlight); fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + __m128i lit = _mm_setzero_si128(); + + for (int i = 0; i != num_lights; i++) + { + __m128 light_x = _mm_set1_ps(lights[i].x); + __m128 light_y = _mm_set1_ps(lights[i].y); + __m128 light_z = _mm_set1_ps(lights[i].z); + __m128 light_radius = _mm_set1_ps(lights[i].radius); + __m128 m256 = _mm_set1_ps(256.0f); + + // L = light-pos + // dist = sqrt(dot(L, L)) + // distance_attenuation = 1 - MIN(dist * (1/radius), 1) + __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y + __m128 Lz = _mm_sub_ps(light_z, viewpos_z); + __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); + __m128 rcp_dist = _mm_rsqrt_ps(dist2); + __m128 dist = _mm_mul_ps(dist2, rcp_dist); + __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); + + // The simple light type + __m128 simple_attenuation = distance_attenuation; + + // The point light type + // diffuse = dot(N,L) * attenuation + __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); + + __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); + __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); + attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); + + __m128i light_color = _mm_cvtsi32_si128(lights[i].color); + light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); + light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); + + lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); + } + + fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); + fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); // Blend __m128i outcolor = fgcolor; @@ -364,6 +598,7 @@ namespace swrenderer _mm_storel_epi64((__m128i*)desttmp, outcolor); dest[offset] = desttmp[0]; dest[offset + pitch] = desttmp[1]; + viewpos_z = _mm_add_ps(viewpos_z, step_viewpos_z); } if (ssecount * 2 != count) @@ -380,6 +615,7 @@ namespace swrenderer __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade + __m128i material = fgcolor; int blue0 = BPART(ifgcolor[0]); int green0 = GPART(ifgcolor[0]); int red0 = RPART(ifgcolor[0]); @@ -396,6 +632,46 @@ namespace swrenderer fgcolor = _mm_mullo_epi16(fgcolor, mlight); fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + __m128i lit = _mm_setzero_si128(); + + for (int i = 0; i != num_lights; i++) + { + __m128 light_x = _mm_set1_ps(lights[i].x); + __m128 light_y = _mm_set1_ps(lights[i].y); + __m128 light_z = _mm_set1_ps(lights[i].z); + __m128 light_radius = _mm_set1_ps(lights[i].radius); + __m128 m256 = _mm_set1_ps(256.0f); + + // L = light-pos + // dist = sqrt(dot(L, L)) + // distance_attenuation = 1 - MIN(dist * (1/radius), 1) + __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y + __m128 Lz = _mm_sub_ps(light_z, viewpos_z); + __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); + __m128 rcp_dist = _mm_rsqrt_ps(dist2); + __m128 dist = _mm_mul_ps(dist2, rcp_dist); + __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); + + // The simple light type + __m128 simple_attenuation = distance_attenuation; + + // The point light type + // diffuse = dot(N,L) * attenuation + __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); + + __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); + __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); + attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); + + __m128i light_color = _mm_cvtsi32_si128(lights[i].color); + light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); + light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); + + lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); + } + + fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); + fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); // Blend __m128i outcolor = fgcolor; @@ -418,7 +694,7 @@ namespace swrenderer shade_fade = _mm_mullo_epi16(shade_fade, inv_light); __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); int desaturate = shade_constants.desaturate; - + int count = args.Count(); int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); uint32_t fracstep = args.TextureVStep(); @@ -426,6 +702,15 @@ namespace swrenderer uint32_t texturefracx = args.TextureUPos(); uint32_t *dest = (uint32_t*)args.Dest(); int dest_y = args.DestY(); + + auto lights = args.dc_lights; + auto num_lights = args.dc_num_lights; + float vpz = args.dc_viewpos.Z; + float stepvpz = args.dc_viewpos_step.Z; + vpz += thread->skipped_by_thread(dest_y) * stepvpz; + stepvpz *= thread->num_cores; + __m128 viewpos_z = _mm_set_ps(vpz, vpz + stepvpz, 0.0f, 0.0f); + __m128 step_viewpos_z = _mm_set1_ps(stepvpz * 2.0f); count = thread->count_for_thread(dest_y, count); if (count <= 0) return; @@ -500,6 +785,7 @@ namespace swrenderer __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade + __m128i material = fgcolor; int blue0 = BPART(ifgcolor[0]); int green0 = GPART(ifgcolor[0]); int red0 = RPART(ifgcolor[0]); @@ -516,6 +802,46 @@ namespace swrenderer fgcolor = _mm_mullo_epi16(fgcolor, mlight); fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + __m128i lit = _mm_setzero_si128(); + + for (int i = 0; i != num_lights; i++) + { + __m128 light_x = _mm_set1_ps(lights[i].x); + __m128 light_y = _mm_set1_ps(lights[i].y); + __m128 light_z = _mm_set1_ps(lights[i].z); + __m128 light_radius = _mm_set1_ps(lights[i].radius); + __m128 m256 = _mm_set1_ps(256.0f); + + // L = light-pos + // dist = sqrt(dot(L, L)) + // distance_attenuation = 1 - MIN(dist * (1/radius), 1) + __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y + __m128 Lz = _mm_sub_ps(light_z, viewpos_z); + __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); + __m128 rcp_dist = _mm_rsqrt_ps(dist2); + __m128 dist = _mm_mul_ps(dist2, rcp_dist); + __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); + + // The simple light type + __m128 simple_attenuation = distance_attenuation; + + // The point light type + // diffuse = dot(N,L) * attenuation + __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); + + __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); + __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); + attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); + + __m128i light_color = _mm_cvtsi32_si128(lights[i].color); + light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); + light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); + + lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); + } + + fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); + fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); // Blend __m128i outcolor = fgcolor; @@ -524,6 +850,7 @@ namespace swrenderer _mm_storel_epi64((__m128i*)desttmp, outcolor); dest[offset] = desttmp[0]; dest[offset + pitch] = desttmp[1]; + viewpos_z = _mm_add_ps(viewpos_z, step_viewpos_z); } if (ssecount * 2 != count) @@ -559,6 +886,7 @@ namespace swrenderer __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade + __m128i material = fgcolor; int blue0 = BPART(ifgcolor[0]); int green0 = GPART(ifgcolor[0]); int red0 = RPART(ifgcolor[0]); @@ -575,6 +903,46 @@ namespace swrenderer fgcolor = _mm_mullo_epi16(fgcolor, mlight); fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + __m128i lit = _mm_setzero_si128(); + + for (int i = 0; i != num_lights; i++) + { + __m128 light_x = _mm_set1_ps(lights[i].x); + __m128 light_y = _mm_set1_ps(lights[i].y); + __m128 light_z = _mm_set1_ps(lights[i].z); + __m128 light_radius = _mm_set1_ps(lights[i].radius); + __m128 m256 = _mm_set1_ps(256.0f); + + // L = light-pos + // dist = sqrt(dot(L, L)) + // distance_attenuation = 1 - MIN(dist * (1/radius), 1) + __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y + __m128 Lz = _mm_sub_ps(light_z, viewpos_z); + __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); + __m128 rcp_dist = _mm_rsqrt_ps(dist2); + __m128 dist = _mm_mul_ps(dist2, rcp_dist); + __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); + + // The simple light type + __m128 simple_attenuation = distance_attenuation; + + // The point light type + // diffuse = dot(N,L) * attenuation + __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); + + __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); + __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); + attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); + + __m128i light_color = _mm_cvtsi32_si128(lights[i].color); + light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); + light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); + + lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); + } + + fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); + fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); // Blend __m128i outcolor = fgcolor; @@ -614,7 +982,7 @@ namespace swrenderer int light = 256 - (args.Light() >> (FRACBITS - 8)); __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - + int count = args.Count(); int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); uint32_t fracstep = args.TextureVStep(); @@ -622,6 +990,15 @@ namespace swrenderer uint32_t texturefracx = args.TextureUPos(); uint32_t *dest = (uint32_t*)args.Dest(); int dest_y = args.DestY(); + + auto lights = args.dc_lights; + auto num_lights = args.dc_num_lights; + float vpz = args.dc_viewpos.Z; + float stepvpz = args.dc_viewpos_step.Z; + vpz += thread->skipped_by_thread(dest_y) * stepvpz; + stepvpz *= thread->num_cores; + __m128 viewpos_z = _mm_set_ps(vpz, vpz + stepvpz, 0.0f, 0.0f); + __m128 step_viewpos_z = _mm_set1_ps(stepvpz * 2.0f); count = thread->count_for_thread(dest_y, count); if (count <= 0) return; @@ -658,7 +1035,48 @@ namespace swrenderer __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade + __m128i material = fgcolor; fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + __m128i lit = _mm_setzero_si128(); + + for (int i = 0; i != num_lights; i++) + { + __m128 light_x = _mm_set1_ps(lights[i].x); + __m128 light_y = _mm_set1_ps(lights[i].y); + __m128 light_z = _mm_set1_ps(lights[i].z); + __m128 light_radius = _mm_set1_ps(lights[i].radius); + __m128 m256 = _mm_set1_ps(256.0f); + + // L = light-pos + // dist = sqrt(dot(L, L)) + // distance_attenuation = 1 - MIN(dist * (1/radius), 1) + __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y + __m128 Lz = _mm_sub_ps(light_z, viewpos_z); + __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); + __m128 rcp_dist = _mm_rsqrt_ps(dist2); + __m128 dist = _mm_mul_ps(dist2, rcp_dist); + __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); + + // The simple light type + __m128 simple_attenuation = distance_attenuation; + + // The point light type + // diffuse = dot(N,L) * attenuation + __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); + + __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); + __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); + attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); + + __m128i light_color = _mm_cvtsi32_si128(lights[i].color); + light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); + light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); + + lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); + } + + fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); + fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); // Blend __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); @@ -675,6 +1093,7 @@ namespace swrenderer _mm_storel_epi64((__m128i*)desttmp, outcolor); dest[offset] = desttmp[0]; dest[offset + pitch] = desttmp[1]; + viewpos_z = _mm_add_ps(viewpos_z, step_viewpos_z); } if (ssecount * 2 != count) @@ -692,7 +1111,48 @@ namespace swrenderer __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade + __m128i material = fgcolor; fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + __m128i lit = _mm_setzero_si128(); + + for (int i = 0; i != num_lights; i++) + { + __m128 light_x = _mm_set1_ps(lights[i].x); + __m128 light_y = _mm_set1_ps(lights[i].y); + __m128 light_z = _mm_set1_ps(lights[i].z); + __m128 light_radius = _mm_set1_ps(lights[i].radius); + __m128 m256 = _mm_set1_ps(256.0f); + + // L = light-pos + // dist = sqrt(dot(L, L)) + // distance_attenuation = 1 - MIN(dist * (1/radius), 1) + __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y + __m128 Lz = _mm_sub_ps(light_z, viewpos_z); + __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); + __m128 rcp_dist = _mm_rsqrt_ps(dist2); + __m128 dist = _mm_mul_ps(dist2, rcp_dist); + __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); + + // The simple light type + __m128 simple_attenuation = distance_attenuation; + + // The point light type + // diffuse = dot(N,L) * attenuation + __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); + + __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); + __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); + attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); + + __m128i light_color = _mm_cvtsi32_si128(lights[i].color); + light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); + light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); + + lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); + } + + fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); + fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); // Blend __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); @@ -718,7 +1178,7 @@ namespace swrenderer int light = 256 - (args.Light() >> (FRACBITS - 8)); __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - + int count = args.Count(); int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); uint32_t fracstep = args.TextureVStep(); @@ -726,6 +1186,15 @@ namespace swrenderer uint32_t texturefracx = args.TextureUPos(); uint32_t *dest = (uint32_t*)args.Dest(); int dest_y = args.DestY(); + + auto lights = args.dc_lights; + auto num_lights = args.dc_num_lights; + float vpz = args.dc_viewpos.Z; + float stepvpz = args.dc_viewpos_step.Z; + vpz += thread->skipped_by_thread(dest_y) * stepvpz; + stepvpz *= thread->num_cores; + __m128 viewpos_z = _mm_set_ps(vpz, vpz + stepvpz, 0.0f, 0.0f); + __m128 step_viewpos_z = _mm_set1_ps(stepvpz * 2.0f); count = thread->count_for_thread(dest_y, count); if (count <= 0) return; @@ -801,7 +1270,48 @@ namespace swrenderer __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade + __m128i material = fgcolor; fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + __m128i lit = _mm_setzero_si128(); + + for (int i = 0; i != num_lights; i++) + { + __m128 light_x = _mm_set1_ps(lights[i].x); + __m128 light_y = _mm_set1_ps(lights[i].y); + __m128 light_z = _mm_set1_ps(lights[i].z); + __m128 light_radius = _mm_set1_ps(lights[i].radius); + __m128 m256 = _mm_set1_ps(256.0f); + + // L = light-pos + // dist = sqrt(dot(L, L)) + // distance_attenuation = 1 - MIN(dist * (1/radius), 1) + __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y + __m128 Lz = _mm_sub_ps(light_z, viewpos_z); + __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); + __m128 rcp_dist = _mm_rsqrt_ps(dist2); + __m128 dist = _mm_mul_ps(dist2, rcp_dist); + __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); + + // The simple light type + __m128 simple_attenuation = distance_attenuation; + + // The point light type + // diffuse = dot(N,L) * attenuation + __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); + + __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); + __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); + attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); + + __m128i light_color = _mm_cvtsi32_si128(lights[i].color); + light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); + light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); + + lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); + } + + fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); + fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); // Blend __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); @@ -818,6 +1328,7 @@ namespace swrenderer _mm_storel_epi64((__m128i*)desttmp, outcolor); dest[offset] = desttmp[0]; dest[offset + pitch] = desttmp[1]; + viewpos_z = _mm_add_ps(viewpos_z, step_viewpos_z); } if (ssecount * 2 != count) @@ -854,7 +1365,48 @@ namespace swrenderer __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade + __m128i material = fgcolor; fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + __m128i lit = _mm_setzero_si128(); + + for (int i = 0; i != num_lights; i++) + { + __m128 light_x = _mm_set1_ps(lights[i].x); + __m128 light_y = _mm_set1_ps(lights[i].y); + __m128 light_z = _mm_set1_ps(lights[i].z); + __m128 light_radius = _mm_set1_ps(lights[i].radius); + __m128 m256 = _mm_set1_ps(256.0f); + + // L = light-pos + // dist = sqrt(dot(L, L)) + // distance_attenuation = 1 - MIN(dist * (1/radius), 1) + __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y + __m128 Lz = _mm_sub_ps(light_z, viewpos_z); + __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); + __m128 rcp_dist = _mm_rsqrt_ps(dist2); + __m128 dist = _mm_mul_ps(dist2, rcp_dist); + __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); + + // The simple light type + __m128 simple_attenuation = distance_attenuation; + + // The point light type + // diffuse = dot(N,L) * attenuation + __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); + + __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); + __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); + attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); + + __m128i light_color = _mm_cvtsi32_si128(lights[i].color); + light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); + light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); + + lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); + } + + fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); + fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); // Blend __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); @@ -891,7 +1443,7 @@ namespace swrenderer shade_fade = _mm_mullo_epi16(shade_fade, inv_light); __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); int desaturate = shade_constants.desaturate; - + int count = args.Count(); int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); uint32_t fracstep = args.TextureVStep(); @@ -899,6 +1451,15 @@ namespace swrenderer uint32_t texturefracx = args.TextureUPos(); uint32_t *dest = (uint32_t*)args.Dest(); int dest_y = args.DestY(); + + auto lights = args.dc_lights; + auto num_lights = args.dc_num_lights; + float vpz = args.dc_viewpos.Z; + float stepvpz = args.dc_viewpos_step.Z; + vpz += thread->skipped_by_thread(dest_y) * stepvpz; + stepvpz *= thread->num_cores; + __m128 viewpos_z = _mm_set_ps(vpz, vpz + stepvpz, 0.0f, 0.0f); + __m128 step_viewpos_z = _mm_set1_ps(stepvpz * 2.0f); count = thread->count_for_thread(dest_y, count); if (count <= 0) return; @@ -935,6 +1496,7 @@ namespace swrenderer __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade + __m128i material = fgcolor; int blue0 = BPART(ifgcolor[0]); int green0 = GPART(ifgcolor[0]); int red0 = RPART(ifgcolor[0]); @@ -951,6 +1513,46 @@ namespace swrenderer fgcolor = _mm_mullo_epi16(fgcolor, mlight); fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + __m128i lit = _mm_setzero_si128(); + + for (int i = 0; i != num_lights; i++) + { + __m128 light_x = _mm_set1_ps(lights[i].x); + __m128 light_y = _mm_set1_ps(lights[i].y); + __m128 light_z = _mm_set1_ps(lights[i].z); + __m128 light_radius = _mm_set1_ps(lights[i].radius); + __m128 m256 = _mm_set1_ps(256.0f); + + // L = light-pos + // dist = sqrt(dot(L, L)) + // distance_attenuation = 1 - MIN(dist * (1/radius), 1) + __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y + __m128 Lz = _mm_sub_ps(light_z, viewpos_z); + __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); + __m128 rcp_dist = _mm_rsqrt_ps(dist2); + __m128 dist = _mm_mul_ps(dist2, rcp_dist); + __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); + + // The simple light type + __m128 simple_attenuation = distance_attenuation; + + // The point light type + // diffuse = dot(N,L) * attenuation + __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); + + __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); + __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); + attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); + + __m128i light_color = _mm_cvtsi32_si128(lights[i].color); + light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); + light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); + + lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); + } + + fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); + fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); // Blend __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); @@ -967,6 +1569,7 @@ namespace swrenderer _mm_storel_epi64((__m128i*)desttmp, outcolor); dest[offset] = desttmp[0]; dest[offset + pitch] = desttmp[1]; + viewpos_z = _mm_add_ps(viewpos_z, step_viewpos_z); } if (ssecount * 2 != count) @@ -984,6 +1587,7 @@ namespace swrenderer __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade + __m128i material = fgcolor; int blue0 = BPART(ifgcolor[0]); int green0 = GPART(ifgcolor[0]); int red0 = RPART(ifgcolor[0]); @@ -1000,6 +1604,46 @@ namespace swrenderer fgcolor = _mm_mullo_epi16(fgcolor, mlight); fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + __m128i lit = _mm_setzero_si128(); + + for (int i = 0; i != num_lights; i++) + { + __m128 light_x = _mm_set1_ps(lights[i].x); + __m128 light_y = _mm_set1_ps(lights[i].y); + __m128 light_z = _mm_set1_ps(lights[i].z); + __m128 light_radius = _mm_set1_ps(lights[i].radius); + __m128 m256 = _mm_set1_ps(256.0f); + + // L = light-pos + // dist = sqrt(dot(L, L)) + // distance_attenuation = 1 - MIN(dist * (1/radius), 1) + __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y + __m128 Lz = _mm_sub_ps(light_z, viewpos_z); + __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); + __m128 rcp_dist = _mm_rsqrt_ps(dist2); + __m128 dist = _mm_mul_ps(dist2, rcp_dist); + __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); + + // The simple light type + __m128 simple_attenuation = distance_attenuation; + + // The point light type + // diffuse = dot(N,L) * attenuation + __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); + + __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); + __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); + attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); + + __m128i light_color = _mm_cvtsi32_si128(lights[i].color); + light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); + light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); + + lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); + } + + fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); + fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); // Blend __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); @@ -1030,7 +1674,7 @@ namespace swrenderer shade_fade = _mm_mullo_epi16(shade_fade, inv_light); __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); int desaturate = shade_constants.desaturate; - + int count = args.Count(); int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); uint32_t fracstep = args.TextureVStep(); @@ -1038,6 +1682,15 @@ namespace swrenderer uint32_t texturefracx = args.TextureUPos(); uint32_t *dest = (uint32_t*)args.Dest(); int dest_y = args.DestY(); + + auto lights = args.dc_lights; + auto num_lights = args.dc_num_lights; + float vpz = args.dc_viewpos.Z; + float stepvpz = args.dc_viewpos_step.Z; + vpz += thread->skipped_by_thread(dest_y) * stepvpz; + stepvpz *= thread->num_cores; + __m128 viewpos_z = _mm_set_ps(vpz, vpz + stepvpz, 0.0f, 0.0f); + __m128 step_viewpos_z = _mm_set1_ps(stepvpz * 2.0f); count = thread->count_for_thread(dest_y, count); if (count <= 0) return; @@ -1113,6 +1766,7 @@ namespace swrenderer __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade + __m128i material = fgcolor; int blue0 = BPART(ifgcolor[0]); int green0 = GPART(ifgcolor[0]); int red0 = RPART(ifgcolor[0]); @@ -1129,6 +1783,46 @@ namespace swrenderer fgcolor = _mm_mullo_epi16(fgcolor, mlight); fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + __m128i lit = _mm_setzero_si128(); + + for (int i = 0; i != num_lights; i++) + { + __m128 light_x = _mm_set1_ps(lights[i].x); + __m128 light_y = _mm_set1_ps(lights[i].y); + __m128 light_z = _mm_set1_ps(lights[i].z); + __m128 light_radius = _mm_set1_ps(lights[i].radius); + __m128 m256 = _mm_set1_ps(256.0f); + + // L = light-pos + // dist = sqrt(dot(L, L)) + // distance_attenuation = 1 - MIN(dist * (1/radius), 1) + __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y + __m128 Lz = _mm_sub_ps(light_z, viewpos_z); + __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); + __m128 rcp_dist = _mm_rsqrt_ps(dist2); + __m128 dist = _mm_mul_ps(dist2, rcp_dist); + __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); + + // The simple light type + __m128 simple_attenuation = distance_attenuation; + + // The point light type + // diffuse = dot(N,L) * attenuation + __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); + + __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); + __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); + attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); + + __m128i light_color = _mm_cvtsi32_si128(lights[i].color); + light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); + light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); + + lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); + } + + fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); + fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); // Blend __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); @@ -1145,6 +1839,7 @@ namespace swrenderer _mm_storel_epi64((__m128i*)desttmp, outcolor); dest[offset] = desttmp[0]; dest[offset + pitch] = desttmp[1]; + viewpos_z = _mm_add_ps(viewpos_z, step_viewpos_z); } if (ssecount * 2 != count) @@ -1181,6 +1876,7 @@ namespace swrenderer __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade + __m128i material = fgcolor; int blue0 = BPART(ifgcolor[0]); int green0 = GPART(ifgcolor[0]); int red0 = RPART(ifgcolor[0]); @@ -1197,6 +1893,46 @@ namespace swrenderer fgcolor = _mm_mullo_epi16(fgcolor, mlight); fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + __m128i lit = _mm_setzero_si128(); + + for (int i = 0; i != num_lights; i++) + { + __m128 light_x = _mm_set1_ps(lights[i].x); + __m128 light_y = _mm_set1_ps(lights[i].y); + __m128 light_z = _mm_set1_ps(lights[i].z); + __m128 light_radius = _mm_set1_ps(lights[i].radius); + __m128 m256 = _mm_set1_ps(256.0f); + + // L = light-pos + // dist = sqrt(dot(L, L)) + // distance_attenuation = 1 - MIN(dist * (1/radius), 1) + __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y + __m128 Lz = _mm_sub_ps(light_z, viewpos_z); + __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); + __m128 rcp_dist = _mm_rsqrt_ps(dist2); + __m128 dist = _mm_mul_ps(dist2, rcp_dist); + __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); + + // The simple light type + __m128 simple_attenuation = distance_attenuation; + + // The point light type + // diffuse = dot(N,L) * attenuation + __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); + + __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); + __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); + attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); + + __m128i light_color = _mm_cvtsi32_si128(lights[i].color); + light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); + light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); + + lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); + } + + fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); + fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); // Blend __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); @@ -1244,7 +1980,7 @@ namespace swrenderer int light = 256 - (args.Light() >> (FRACBITS - 8)); __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - + int count = args.Count(); int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); uint32_t fracstep = args.TextureVStep(); @@ -1252,6 +1988,15 @@ namespace swrenderer uint32_t texturefracx = args.TextureUPos(); uint32_t *dest = (uint32_t*)args.Dest(); int dest_y = args.DestY(); + + auto lights = args.dc_lights; + auto num_lights = args.dc_num_lights; + float vpz = args.dc_viewpos.Z; + float stepvpz = args.dc_viewpos_step.Z; + vpz += thread->skipped_by_thread(dest_y) * stepvpz; + stepvpz *= thread->num_cores; + __m128 viewpos_z = _mm_set_ps(vpz, vpz + stepvpz, 0.0f, 0.0f); + __m128 step_viewpos_z = _mm_set1_ps(stepvpz * 2.0f); count = thread->count_for_thread(dest_y, count); if (count <= 0) return; @@ -1288,7 +2033,48 @@ namespace swrenderer __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade + __m128i material = fgcolor; fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + __m128i lit = _mm_setzero_si128(); + + for (int i = 0; i != num_lights; i++) + { + __m128 light_x = _mm_set1_ps(lights[i].x); + __m128 light_y = _mm_set1_ps(lights[i].y); + __m128 light_z = _mm_set1_ps(lights[i].z); + __m128 light_radius = _mm_set1_ps(lights[i].radius); + __m128 m256 = _mm_set1_ps(256.0f); + + // L = light-pos + // dist = sqrt(dot(L, L)) + // distance_attenuation = 1 - MIN(dist * (1/radius), 1) + __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y + __m128 Lz = _mm_sub_ps(light_z, viewpos_z); + __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); + __m128 rcp_dist = _mm_rsqrt_ps(dist2); + __m128 dist = _mm_mul_ps(dist2, rcp_dist); + __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); + + // The simple light type + __m128 simple_attenuation = distance_attenuation; + + // The point light type + // diffuse = dot(N,L) * attenuation + __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); + + __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); + __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); + attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); + + __m128i light_color = _mm_cvtsi32_si128(lights[i].color); + light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); + light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); + + lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); + } + + fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); + fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); // Blend uint32_t alpha0 = APART(ifgcolor[0]); @@ -1326,6 +2112,7 @@ namespace swrenderer _mm_storel_epi64((__m128i*)desttmp, outcolor); dest[offset] = desttmp[0]; dest[offset + pitch] = desttmp[1]; + viewpos_z = _mm_add_ps(viewpos_z, step_viewpos_z); } if (ssecount * 2 != count) @@ -1343,7 +2130,48 @@ namespace swrenderer __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade + __m128i material = fgcolor; fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + __m128i lit = _mm_setzero_si128(); + + for (int i = 0; i != num_lights; i++) + { + __m128 light_x = _mm_set1_ps(lights[i].x); + __m128 light_y = _mm_set1_ps(lights[i].y); + __m128 light_z = _mm_set1_ps(lights[i].z); + __m128 light_radius = _mm_set1_ps(lights[i].radius); + __m128 m256 = _mm_set1_ps(256.0f); + + // L = light-pos + // dist = sqrt(dot(L, L)) + // distance_attenuation = 1 - MIN(dist * (1/radius), 1) + __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y + __m128 Lz = _mm_sub_ps(light_z, viewpos_z); + __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); + __m128 rcp_dist = _mm_rsqrt_ps(dist2); + __m128 dist = _mm_mul_ps(dist2, rcp_dist); + __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); + + // The simple light type + __m128 simple_attenuation = distance_attenuation; + + // The point light type + // diffuse = dot(N,L) * attenuation + __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); + + __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); + __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); + attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); + + __m128i light_color = _mm_cvtsi32_si128(lights[i].color); + light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); + light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); + + lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); + } + + fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); + fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); // Blend uint32_t alpha0 = APART(ifgcolor[0]); @@ -1390,7 +2218,7 @@ namespace swrenderer int light = 256 - (args.Light() >> (FRACBITS - 8)); __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - + int count = args.Count(); int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); uint32_t fracstep = args.TextureVStep(); @@ -1398,6 +2226,15 @@ namespace swrenderer uint32_t texturefracx = args.TextureUPos(); uint32_t *dest = (uint32_t*)args.Dest(); int dest_y = args.DestY(); + + auto lights = args.dc_lights; + auto num_lights = args.dc_num_lights; + float vpz = args.dc_viewpos.Z; + float stepvpz = args.dc_viewpos_step.Z; + vpz += thread->skipped_by_thread(dest_y) * stepvpz; + stepvpz *= thread->num_cores; + __m128 viewpos_z = _mm_set_ps(vpz, vpz + stepvpz, 0.0f, 0.0f); + __m128 step_viewpos_z = _mm_set1_ps(stepvpz * 2.0f); count = thread->count_for_thread(dest_y, count); if (count <= 0) return; @@ -1473,7 +2310,48 @@ namespace swrenderer __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade + __m128i material = fgcolor; fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + __m128i lit = _mm_setzero_si128(); + + for (int i = 0; i != num_lights; i++) + { + __m128 light_x = _mm_set1_ps(lights[i].x); + __m128 light_y = _mm_set1_ps(lights[i].y); + __m128 light_z = _mm_set1_ps(lights[i].z); + __m128 light_radius = _mm_set1_ps(lights[i].radius); + __m128 m256 = _mm_set1_ps(256.0f); + + // L = light-pos + // dist = sqrt(dot(L, L)) + // distance_attenuation = 1 - MIN(dist * (1/radius), 1) + __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y + __m128 Lz = _mm_sub_ps(light_z, viewpos_z); + __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); + __m128 rcp_dist = _mm_rsqrt_ps(dist2); + __m128 dist = _mm_mul_ps(dist2, rcp_dist); + __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); + + // The simple light type + __m128 simple_attenuation = distance_attenuation; + + // The point light type + // diffuse = dot(N,L) * attenuation + __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); + + __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); + __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); + attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); + + __m128i light_color = _mm_cvtsi32_si128(lights[i].color); + light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); + light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); + + lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); + } + + fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); + fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); // Blend uint32_t alpha0 = APART(ifgcolor[0]); @@ -1511,6 +2389,7 @@ namespace swrenderer _mm_storel_epi64((__m128i*)desttmp, outcolor); dest[offset] = desttmp[0]; dest[offset + pitch] = desttmp[1]; + viewpos_z = _mm_add_ps(viewpos_z, step_viewpos_z); } if (ssecount * 2 != count) @@ -1547,7 +2426,48 @@ namespace swrenderer __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade + __m128i material = fgcolor; fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + __m128i lit = _mm_setzero_si128(); + + for (int i = 0; i != num_lights; i++) + { + __m128 light_x = _mm_set1_ps(lights[i].x); + __m128 light_y = _mm_set1_ps(lights[i].y); + __m128 light_z = _mm_set1_ps(lights[i].z); + __m128 light_radius = _mm_set1_ps(lights[i].radius); + __m128 m256 = _mm_set1_ps(256.0f); + + // L = light-pos + // dist = sqrt(dot(L, L)) + // distance_attenuation = 1 - MIN(dist * (1/radius), 1) + __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y + __m128 Lz = _mm_sub_ps(light_z, viewpos_z); + __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); + __m128 rcp_dist = _mm_rsqrt_ps(dist2); + __m128 dist = _mm_mul_ps(dist2, rcp_dist); + __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); + + // The simple light type + __m128 simple_attenuation = distance_attenuation; + + // The point light type + // diffuse = dot(N,L) * attenuation + __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); + + __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); + __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); + attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); + + __m128i light_color = _mm_cvtsi32_si128(lights[i].color); + light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); + light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); + + lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); + } + + fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); + fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); // Blend uint32_t alpha0 = APART(ifgcolor[0]); @@ -1605,7 +2525,7 @@ namespace swrenderer shade_fade = _mm_mullo_epi16(shade_fade, inv_light); __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); int desaturate = shade_constants.desaturate; - + int count = args.Count(); int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); uint32_t fracstep = args.TextureVStep(); @@ -1613,6 +2533,15 @@ namespace swrenderer uint32_t texturefracx = args.TextureUPos(); uint32_t *dest = (uint32_t*)args.Dest(); int dest_y = args.DestY(); + + auto lights = args.dc_lights; + auto num_lights = args.dc_num_lights; + float vpz = args.dc_viewpos.Z; + float stepvpz = args.dc_viewpos_step.Z; + vpz += thread->skipped_by_thread(dest_y) * stepvpz; + stepvpz *= thread->num_cores; + __m128 viewpos_z = _mm_set_ps(vpz, vpz + stepvpz, 0.0f, 0.0f); + __m128 step_viewpos_z = _mm_set1_ps(stepvpz * 2.0f); count = thread->count_for_thread(dest_y, count); if (count <= 0) return; @@ -1649,6 +2578,7 @@ namespace swrenderer __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade + __m128i material = fgcolor; int blue0 = BPART(ifgcolor[0]); int green0 = GPART(ifgcolor[0]); int red0 = RPART(ifgcolor[0]); @@ -1665,6 +2595,46 @@ namespace swrenderer fgcolor = _mm_mullo_epi16(fgcolor, mlight); fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + __m128i lit = _mm_setzero_si128(); + + for (int i = 0; i != num_lights; i++) + { + __m128 light_x = _mm_set1_ps(lights[i].x); + __m128 light_y = _mm_set1_ps(lights[i].y); + __m128 light_z = _mm_set1_ps(lights[i].z); + __m128 light_radius = _mm_set1_ps(lights[i].radius); + __m128 m256 = _mm_set1_ps(256.0f); + + // L = light-pos + // dist = sqrt(dot(L, L)) + // distance_attenuation = 1 - MIN(dist * (1/radius), 1) + __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y + __m128 Lz = _mm_sub_ps(light_z, viewpos_z); + __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); + __m128 rcp_dist = _mm_rsqrt_ps(dist2); + __m128 dist = _mm_mul_ps(dist2, rcp_dist); + __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); + + // The simple light type + __m128 simple_attenuation = distance_attenuation; + + // The point light type + // diffuse = dot(N,L) * attenuation + __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); + + __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); + __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); + attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); + + __m128i light_color = _mm_cvtsi32_si128(lights[i].color); + light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); + light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); + + lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); + } + + fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); + fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); // Blend uint32_t alpha0 = APART(ifgcolor[0]); @@ -1702,6 +2672,7 @@ namespace swrenderer _mm_storel_epi64((__m128i*)desttmp, outcolor); dest[offset] = desttmp[0]; dest[offset + pitch] = desttmp[1]; + viewpos_z = _mm_add_ps(viewpos_z, step_viewpos_z); } if (ssecount * 2 != count) @@ -1719,6 +2690,7 @@ namespace swrenderer __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade + __m128i material = fgcolor; int blue0 = BPART(ifgcolor[0]); int green0 = GPART(ifgcolor[0]); int red0 = RPART(ifgcolor[0]); @@ -1735,6 +2707,46 @@ namespace swrenderer fgcolor = _mm_mullo_epi16(fgcolor, mlight); fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + __m128i lit = _mm_setzero_si128(); + + for (int i = 0; i != num_lights; i++) + { + __m128 light_x = _mm_set1_ps(lights[i].x); + __m128 light_y = _mm_set1_ps(lights[i].y); + __m128 light_z = _mm_set1_ps(lights[i].z); + __m128 light_radius = _mm_set1_ps(lights[i].radius); + __m128 m256 = _mm_set1_ps(256.0f); + + // L = light-pos + // dist = sqrt(dot(L, L)) + // distance_attenuation = 1 - MIN(dist * (1/radius), 1) + __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y + __m128 Lz = _mm_sub_ps(light_z, viewpos_z); + __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); + __m128 rcp_dist = _mm_rsqrt_ps(dist2); + __m128 dist = _mm_mul_ps(dist2, rcp_dist); + __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); + + // The simple light type + __m128 simple_attenuation = distance_attenuation; + + // The point light type + // diffuse = dot(N,L) * attenuation + __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); + + __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); + __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); + attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); + + __m128i light_color = _mm_cvtsi32_si128(lights[i].color); + light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); + light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); + + lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); + } + + fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); + fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); // Blend uint32_t alpha0 = APART(ifgcolor[0]); @@ -1786,7 +2798,7 @@ namespace swrenderer shade_fade = _mm_mullo_epi16(shade_fade, inv_light); __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); int desaturate = shade_constants.desaturate; - + int count = args.Count(); int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); uint32_t fracstep = args.TextureVStep(); @@ -1794,6 +2806,15 @@ namespace swrenderer uint32_t texturefracx = args.TextureUPos(); uint32_t *dest = (uint32_t*)args.Dest(); int dest_y = args.DestY(); + + auto lights = args.dc_lights; + auto num_lights = args.dc_num_lights; + float vpz = args.dc_viewpos.Z; + float stepvpz = args.dc_viewpos_step.Z; + vpz += thread->skipped_by_thread(dest_y) * stepvpz; + stepvpz *= thread->num_cores; + __m128 viewpos_z = _mm_set_ps(vpz, vpz + stepvpz, 0.0f, 0.0f); + __m128 step_viewpos_z = _mm_set1_ps(stepvpz * 2.0f); count = thread->count_for_thread(dest_y, count); if (count <= 0) return; @@ -1869,6 +2890,7 @@ namespace swrenderer __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade + __m128i material = fgcolor; int blue0 = BPART(ifgcolor[0]); int green0 = GPART(ifgcolor[0]); int red0 = RPART(ifgcolor[0]); @@ -1885,6 +2907,46 @@ namespace swrenderer fgcolor = _mm_mullo_epi16(fgcolor, mlight); fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + __m128i lit = _mm_setzero_si128(); + + for (int i = 0; i != num_lights; i++) + { + __m128 light_x = _mm_set1_ps(lights[i].x); + __m128 light_y = _mm_set1_ps(lights[i].y); + __m128 light_z = _mm_set1_ps(lights[i].z); + __m128 light_radius = _mm_set1_ps(lights[i].radius); + __m128 m256 = _mm_set1_ps(256.0f); + + // L = light-pos + // dist = sqrt(dot(L, L)) + // distance_attenuation = 1 - MIN(dist * (1/radius), 1) + __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y + __m128 Lz = _mm_sub_ps(light_z, viewpos_z); + __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); + __m128 rcp_dist = _mm_rsqrt_ps(dist2); + __m128 dist = _mm_mul_ps(dist2, rcp_dist); + __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); + + // The simple light type + __m128 simple_attenuation = distance_attenuation; + + // The point light type + // diffuse = dot(N,L) * attenuation + __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); + + __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); + __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); + attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); + + __m128i light_color = _mm_cvtsi32_si128(lights[i].color); + light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); + light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); + + lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); + } + + fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); + fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); // Blend uint32_t alpha0 = APART(ifgcolor[0]); @@ -1922,6 +2984,7 @@ namespace swrenderer _mm_storel_epi64((__m128i*)desttmp, outcolor); dest[offset] = desttmp[0]; dest[offset + pitch] = desttmp[1]; + viewpos_z = _mm_add_ps(viewpos_z, step_viewpos_z); } if (ssecount * 2 != count) @@ -1958,6 +3021,7 @@ namespace swrenderer __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade + __m128i material = fgcolor; int blue0 = BPART(ifgcolor[0]); int green0 = GPART(ifgcolor[0]); int red0 = RPART(ifgcolor[0]); @@ -1974,6 +3038,46 @@ namespace swrenderer fgcolor = _mm_mullo_epi16(fgcolor, mlight); fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + __m128i lit = _mm_setzero_si128(); + + for (int i = 0; i != num_lights; i++) + { + __m128 light_x = _mm_set1_ps(lights[i].x); + __m128 light_y = _mm_set1_ps(lights[i].y); + __m128 light_z = _mm_set1_ps(lights[i].z); + __m128 light_radius = _mm_set1_ps(lights[i].radius); + __m128 m256 = _mm_set1_ps(256.0f); + + // L = light-pos + // dist = sqrt(dot(L, L)) + // distance_attenuation = 1 - MIN(dist * (1/radius), 1) + __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y + __m128 Lz = _mm_sub_ps(light_z, viewpos_z); + __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); + __m128 rcp_dist = _mm_rsqrt_ps(dist2); + __m128 dist = _mm_mul_ps(dist2, rcp_dist); + __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); + + // The simple light type + __m128 simple_attenuation = distance_attenuation; + + // The point light type + // diffuse = dot(N,L) * attenuation + __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); + + __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); + __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); + attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); + + __m128i light_color = _mm_cvtsi32_si128(lights[i].color); + light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); + light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); + + lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); + } + + fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); + fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); // Blend uint32_t alpha0 = APART(ifgcolor[0]); @@ -2042,7 +3146,7 @@ namespace swrenderer int light = 256 - (args.Light() >> (FRACBITS - 8)); __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - + int count = args.Count(); int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); uint32_t fracstep = args.TextureVStep(); @@ -2050,6 +3154,15 @@ namespace swrenderer uint32_t texturefracx = args.TextureUPos(); uint32_t *dest = (uint32_t*)args.Dest(); int dest_y = args.DestY(); + + auto lights = args.dc_lights; + auto num_lights = args.dc_num_lights; + float vpz = args.dc_viewpos.Z; + float stepvpz = args.dc_viewpos_step.Z; + vpz += thread->skipped_by_thread(dest_y) * stepvpz; + stepvpz *= thread->num_cores; + __m128 viewpos_z = _mm_set_ps(vpz, vpz + stepvpz, 0.0f, 0.0f); + __m128 step_viewpos_z = _mm_set1_ps(stepvpz * 2.0f); count = thread->count_for_thread(dest_y, count); if (count <= 0) return; @@ -2086,7 +3199,48 @@ namespace swrenderer __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade + __m128i material = fgcolor; fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + __m128i lit = _mm_setzero_si128(); + + for (int i = 0; i != num_lights; i++) + { + __m128 light_x = _mm_set1_ps(lights[i].x); + __m128 light_y = _mm_set1_ps(lights[i].y); + __m128 light_z = _mm_set1_ps(lights[i].z); + __m128 light_radius = _mm_set1_ps(lights[i].radius); + __m128 m256 = _mm_set1_ps(256.0f); + + // L = light-pos + // dist = sqrt(dot(L, L)) + // distance_attenuation = 1 - MIN(dist * (1/radius), 1) + __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y + __m128 Lz = _mm_sub_ps(light_z, viewpos_z); + __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); + __m128 rcp_dist = _mm_rsqrt_ps(dist2); + __m128 dist = _mm_mul_ps(dist2, rcp_dist); + __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); + + // The simple light type + __m128 simple_attenuation = distance_attenuation; + + // The point light type + // diffuse = dot(N,L) * attenuation + __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); + + __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); + __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); + attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); + + __m128i light_color = _mm_cvtsi32_si128(lights[i].color); + light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); + light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); + + lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); + } + + fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); + fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); // Blend uint32_t alpha0 = APART(ifgcolor[0]); @@ -2124,6 +3278,7 @@ namespace swrenderer _mm_storel_epi64((__m128i*)desttmp, outcolor); dest[offset] = desttmp[0]; dest[offset + pitch] = desttmp[1]; + viewpos_z = _mm_add_ps(viewpos_z, step_viewpos_z); } if (ssecount * 2 != count) @@ -2141,7 +3296,48 @@ namespace swrenderer __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade + __m128i material = fgcolor; fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + __m128i lit = _mm_setzero_si128(); + + for (int i = 0; i != num_lights; i++) + { + __m128 light_x = _mm_set1_ps(lights[i].x); + __m128 light_y = _mm_set1_ps(lights[i].y); + __m128 light_z = _mm_set1_ps(lights[i].z); + __m128 light_radius = _mm_set1_ps(lights[i].radius); + __m128 m256 = _mm_set1_ps(256.0f); + + // L = light-pos + // dist = sqrt(dot(L, L)) + // distance_attenuation = 1 - MIN(dist * (1/radius), 1) + __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y + __m128 Lz = _mm_sub_ps(light_z, viewpos_z); + __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); + __m128 rcp_dist = _mm_rsqrt_ps(dist2); + __m128 dist = _mm_mul_ps(dist2, rcp_dist); + __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); + + // The simple light type + __m128 simple_attenuation = distance_attenuation; + + // The point light type + // diffuse = dot(N,L) * attenuation + __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); + + __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); + __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); + attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); + + __m128i light_color = _mm_cvtsi32_si128(lights[i].color); + light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); + light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); + + lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); + } + + fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); + fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); // Blend uint32_t alpha0 = APART(ifgcolor[0]); @@ -2188,7 +3384,7 @@ namespace swrenderer int light = 256 - (args.Light() >> (FRACBITS - 8)); __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - + int count = args.Count(); int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); uint32_t fracstep = args.TextureVStep(); @@ -2196,6 +3392,15 @@ namespace swrenderer uint32_t texturefracx = args.TextureUPos(); uint32_t *dest = (uint32_t*)args.Dest(); int dest_y = args.DestY(); + + auto lights = args.dc_lights; + auto num_lights = args.dc_num_lights; + float vpz = args.dc_viewpos.Z; + float stepvpz = args.dc_viewpos_step.Z; + vpz += thread->skipped_by_thread(dest_y) * stepvpz; + stepvpz *= thread->num_cores; + __m128 viewpos_z = _mm_set_ps(vpz, vpz + stepvpz, 0.0f, 0.0f); + __m128 step_viewpos_z = _mm_set1_ps(stepvpz * 2.0f); count = thread->count_for_thread(dest_y, count); if (count <= 0) return; @@ -2271,7 +3476,48 @@ namespace swrenderer __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade + __m128i material = fgcolor; fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + __m128i lit = _mm_setzero_si128(); + + for (int i = 0; i != num_lights; i++) + { + __m128 light_x = _mm_set1_ps(lights[i].x); + __m128 light_y = _mm_set1_ps(lights[i].y); + __m128 light_z = _mm_set1_ps(lights[i].z); + __m128 light_radius = _mm_set1_ps(lights[i].radius); + __m128 m256 = _mm_set1_ps(256.0f); + + // L = light-pos + // dist = sqrt(dot(L, L)) + // distance_attenuation = 1 - MIN(dist * (1/radius), 1) + __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y + __m128 Lz = _mm_sub_ps(light_z, viewpos_z); + __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); + __m128 rcp_dist = _mm_rsqrt_ps(dist2); + __m128 dist = _mm_mul_ps(dist2, rcp_dist); + __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); + + // The simple light type + __m128 simple_attenuation = distance_attenuation; + + // The point light type + // diffuse = dot(N,L) * attenuation + __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); + + __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); + __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); + attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); + + __m128i light_color = _mm_cvtsi32_si128(lights[i].color); + light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); + light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); + + lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); + } + + fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); + fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); // Blend uint32_t alpha0 = APART(ifgcolor[0]); @@ -2309,6 +3555,7 @@ namespace swrenderer _mm_storel_epi64((__m128i*)desttmp, outcolor); dest[offset] = desttmp[0]; dest[offset + pitch] = desttmp[1]; + viewpos_z = _mm_add_ps(viewpos_z, step_viewpos_z); } if (ssecount * 2 != count) @@ -2345,7 +3592,48 @@ namespace swrenderer __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade + __m128i material = fgcolor; fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + __m128i lit = _mm_setzero_si128(); + + for (int i = 0; i != num_lights; i++) + { + __m128 light_x = _mm_set1_ps(lights[i].x); + __m128 light_y = _mm_set1_ps(lights[i].y); + __m128 light_z = _mm_set1_ps(lights[i].z); + __m128 light_radius = _mm_set1_ps(lights[i].radius); + __m128 m256 = _mm_set1_ps(256.0f); + + // L = light-pos + // dist = sqrt(dot(L, L)) + // distance_attenuation = 1 - MIN(dist * (1/radius), 1) + __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y + __m128 Lz = _mm_sub_ps(light_z, viewpos_z); + __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); + __m128 rcp_dist = _mm_rsqrt_ps(dist2); + __m128 dist = _mm_mul_ps(dist2, rcp_dist); + __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); + + // The simple light type + __m128 simple_attenuation = distance_attenuation; + + // The point light type + // diffuse = dot(N,L) * attenuation + __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); + + __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); + __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); + attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); + + __m128i light_color = _mm_cvtsi32_si128(lights[i].color); + light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); + light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); + + lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); + } + + fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); + fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); // Blend uint32_t alpha0 = APART(ifgcolor[0]); @@ -2403,7 +3691,7 @@ namespace swrenderer shade_fade = _mm_mullo_epi16(shade_fade, inv_light); __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); int desaturate = shade_constants.desaturate; - + int count = args.Count(); int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); uint32_t fracstep = args.TextureVStep(); @@ -2411,6 +3699,15 @@ namespace swrenderer uint32_t texturefracx = args.TextureUPos(); uint32_t *dest = (uint32_t*)args.Dest(); int dest_y = args.DestY(); + + auto lights = args.dc_lights; + auto num_lights = args.dc_num_lights; + float vpz = args.dc_viewpos.Z; + float stepvpz = args.dc_viewpos_step.Z; + vpz += thread->skipped_by_thread(dest_y) * stepvpz; + stepvpz *= thread->num_cores; + __m128 viewpos_z = _mm_set_ps(vpz, vpz + stepvpz, 0.0f, 0.0f); + __m128 step_viewpos_z = _mm_set1_ps(stepvpz * 2.0f); count = thread->count_for_thread(dest_y, count); if (count <= 0) return; @@ -2447,6 +3744,7 @@ namespace swrenderer __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade + __m128i material = fgcolor; int blue0 = BPART(ifgcolor[0]); int green0 = GPART(ifgcolor[0]); int red0 = RPART(ifgcolor[0]); @@ -2463,6 +3761,46 @@ namespace swrenderer fgcolor = _mm_mullo_epi16(fgcolor, mlight); fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + __m128i lit = _mm_setzero_si128(); + + for (int i = 0; i != num_lights; i++) + { + __m128 light_x = _mm_set1_ps(lights[i].x); + __m128 light_y = _mm_set1_ps(lights[i].y); + __m128 light_z = _mm_set1_ps(lights[i].z); + __m128 light_radius = _mm_set1_ps(lights[i].radius); + __m128 m256 = _mm_set1_ps(256.0f); + + // L = light-pos + // dist = sqrt(dot(L, L)) + // distance_attenuation = 1 - MIN(dist * (1/radius), 1) + __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y + __m128 Lz = _mm_sub_ps(light_z, viewpos_z); + __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); + __m128 rcp_dist = _mm_rsqrt_ps(dist2); + __m128 dist = _mm_mul_ps(dist2, rcp_dist); + __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); + + // The simple light type + __m128 simple_attenuation = distance_attenuation; + + // The point light type + // diffuse = dot(N,L) * attenuation + __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); + + __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); + __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); + attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); + + __m128i light_color = _mm_cvtsi32_si128(lights[i].color); + light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); + light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); + + lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); + } + + fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); + fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); // Blend uint32_t alpha0 = APART(ifgcolor[0]); @@ -2500,6 +3838,7 @@ namespace swrenderer _mm_storel_epi64((__m128i*)desttmp, outcolor); dest[offset] = desttmp[0]; dest[offset + pitch] = desttmp[1]; + viewpos_z = _mm_add_ps(viewpos_z, step_viewpos_z); } if (ssecount * 2 != count) @@ -2517,6 +3856,7 @@ namespace swrenderer __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade + __m128i material = fgcolor; int blue0 = BPART(ifgcolor[0]); int green0 = GPART(ifgcolor[0]); int red0 = RPART(ifgcolor[0]); @@ -2533,6 +3873,46 @@ namespace swrenderer fgcolor = _mm_mullo_epi16(fgcolor, mlight); fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + __m128i lit = _mm_setzero_si128(); + + for (int i = 0; i != num_lights; i++) + { + __m128 light_x = _mm_set1_ps(lights[i].x); + __m128 light_y = _mm_set1_ps(lights[i].y); + __m128 light_z = _mm_set1_ps(lights[i].z); + __m128 light_radius = _mm_set1_ps(lights[i].radius); + __m128 m256 = _mm_set1_ps(256.0f); + + // L = light-pos + // dist = sqrt(dot(L, L)) + // distance_attenuation = 1 - MIN(dist * (1/radius), 1) + __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y + __m128 Lz = _mm_sub_ps(light_z, viewpos_z); + __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); + __m128 rcp_dist = _mm_rsqrt_ps(dist2); + __m128 dist = _mm_mul_ps(dist2, rcp_dist); + __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); + + // The simple light type + __m128 simple_attenuation = distance_attenuation; + + // The point light type + // diffuse = dot(N,L) * attenuation + __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); + + __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); + __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); + attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); + + __m128i light_color = _mm_cvtsi32_si128(lights[i].color); + light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); + light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); + + lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); + } + + fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); + fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); // Blend uint32_t alpha0 = APART(ifgcolor[0]); @@ -2584,7 +3964,7 @@ namespace swrenderer shade_fade = _mm_mullo_epi16(shade_fade, inv_light); __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); int desaturate = shade_constants.desaturate; - + int count = args.Count(); int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); uint32_t fracstep = args.TextureVStep(); @@ -2592,6 +3972,15 @@ namespace swrenderer uint32_t texturefracx = args.TextureUPos(); uint32_t *dest = (uint32_t*)args.Dest(); int dest_y = args.DestY(); + + auto lights = args.dc_lights; + auto num_lights = args.dc_num_lights; + float vpz = args.dc_viewpos.Z; + float stepvpz = args.dc_viewpos_step.Z; + vpz += thread->skipped_by_thread(dest_y) * stepvpz; + stepvpz *= thread->num_cores; + __m128 viewpos_z = _mm_set_ps(vpz, vpz + stepvpz, 0.0f, 0.0f); + __m128 step_viewpos_z = _mm_set1_ps(stepvpz * 2.0f); count = thread->count_for_thread(dest_y, count); if (count <= 0) return; @@ -2667,6 +4056,7 @@ namespace swrenderer __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade + __m128i material = fgcolor; int blue0 = BPART(ifgcolor[0]); int green0 = GPART(ifgcolor[0]); int red0 = RPART(ifgcolor[0]); @@ -2683,6 +4073,46 @@ namespace swrenderer fgcolor = _mm_mullo_epi16(fgcolor, mlight); fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + __m128i lit = _mm_setzero_si128(); + + for (int i = 0; i != num_lights; i++) + { + __m128 light_x = _mm_set1_ps(lights[i].x); + __m128 light_y = _mm_set1_ps(lights[i].y); + __m128 light_z = _mm_set1_ps(lights[i].z); + __m128 light_radius = _mm_set1_ps(lights[i].radius); + __m128 m256 = _mm_set1_ps(256.0f); + + // L = light-pos + // dist = sqrt(dot(L, L)) + // distance_attenuation = 1 - MIN(dist * (1/radius), 1) + __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y + __m128 Lz = _mm_sub_ps(light_z, viewpos_z); + __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); + __m128 rcp_dist = _mm_rsqrt_ps(dist2); + __m128 dist = _mm_mul_ps(dist2, rcp_dist); + __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); + + // The simple light type + __m128 simple_attenuation = distance_attenuation; + + // The point light type + // diffuse = dot(N,L) * attenuation + __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); + + __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); + __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); + attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); + + __m128i light_color = _mm_cvtsi32_si128(lights[i].color); + light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); + light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); + + lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); + } + + fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); + fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); // Blend uint32_t alpha0 = APART(ifgcolor[0]); @@ -2720,6 +4150,7 @@ namespace swrenderer _mm_storel_epi64((__m128i*)desttmp, outcolor); dest[offset] = desttmp[0]; dest[offset + pitch] = desttmp[1]; + viewpos_z = _mm_add_ps(viewpos_z, step_viewpos_z); } if (ssecount * 2 != count) @@ -2756,6 +4187,7 @@ namespace swrenderer __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade + __m128i material = fgcolor; int blue0 = BPART(ifgcolor[0]); int green0 = GPART(ifgcolor[0]); int red0 = RPART(ifgcolor[0]); @@ -2772,6 +4204,46 @@ namespace swrenderer fgcolor = _mm_mullo_epi16(fgcolor, mlight); fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + __m128i lit = _mm_setzero_si128(); + + for (int i = 0; i != num_lights; i++) + { + __m128 light_x = _mm_set1_ps(lights[i].x); + __m128 light_y = _mm_set1_ps(lights[i].y); + __m128 light_z = _mm_set1_ps(lights[i].z); + __m128 light_radius = _mm_set1_ps(lights[i].radius); + __m128 m256 = _mm_set1_ps(256.0f); + + // L = light-pos + // dist = sqrt(dot(L, L)) + // distance_attenuation = 1 - MIN(dist * (1/radius), 1) + __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y + __m128 Lz = _mm_sub_ps(light_z, viewpos_z); + __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); + __m128 rcp_dist = _mm_rsqrt_ps(dist2); + __m128 dist = _mm_mul_ps(dist2, rcp_dist); + __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); + + // The simple light type + __m128 simple_attenuation = distance_attenuation; + + // The point light type + // diffuse = dot(N,L) * attenuation + __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); + + __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); + __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); + attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); + + __m128i light_color = _mm_cvtsi32_si128(lights[i].color); + light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); + light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); + + lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); + } + + fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); + fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); // Blend uint32_t alpha0 = APART(ifgcolor[0]); @@ -2840,7 +4312,7 @@ namespace swrenderer int light = 256 - (args.Light() >> (FRACBITS - 8)); __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - + int count = args.Count(); int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); uint32_t fracstep = args.TextureVStep(); @@ -2848,6 +4320,15 @@ namespace swrenderer uint32_t texturefracx = args.TextureUPos(); uint32_t *dest = (uint32_t*)args.Dest(); int dest_y = args.DestY(); + + auto lights = args.dc_lights; + auto num_lights = args.dc_num_lights; + float vpz = args.dc_viewpos.Z; + float stepvpz = args.dc_viewpos_step.Z; + vpz += thread->skipped_by_thread(dest_y) * stepvpz; + stepvpz *= thread->num_cores; + __m128 viewpos_z = _mm_set_ps(vpz, vpz + stepvpz, 0.0f, 0.0f); + __m128 step_viewpos_z = _mm_set1_ps(stepvpz * 2.0f); count = thread->count_for_thread(dest_y, count); if (count <= 0) return; @@ -2884,7 +4365,48 @@ namespace swrenderer __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade + __m128i material = fgcolor; fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + __m128i lit = _mm_setzero_si128(); + + for (int i = 0; i != num_lights; i++) + { + __m128 light_x = _mm_set1_ps(lights[i].x); + __m128 light_y = _mm_set1_ps(lights[i].y); + __m128 light_z = _mm_set1_ps(lights[i].z); + __m128 light_radius = _mm_set1_ps(lights[i].radius); + __m128 m256 = _mm_set1_ps(256.0f); + + // L = light-pos + // dist = sqrt(dot(L, L)) + // distance_attenuation = 1 - MIN(dist * (1/radius), 1) + __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y + __m128 Lz = _mm_sub_ps(light_z, viewpos_z); + __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); + __m128 rcp_dist = _mm_rsqrt_ps(dist2); + __m128 dist = _mm_mul_ps(dist2, rcp_dist); + __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); + + // The simple light type + __m128 simple_attenuation = distance_attenuation; + + // The point light type + // diffuse = dot(N,L) * attenuation + __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); + + __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); + __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); + attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); + + __m128i light_color = _mm_cvtsi32_si128(lights[i].color); + light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); + light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); + + lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); + } + + fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); + fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); // Blend uint32_t alpha0 = APART(ifgcolor[0]); @@ -2922,6 +4444,7 @@ namespace swrenderer _mm_storel_epi64((__m128i*)desttmp, outcolor); dest[offset] = desttmp[0]; dest[offset + pitch] = desttmp[1]; + viewpos_z = _mm_add_ps(viewpos_z, step_viewpos_z); } if (ssecount * 2 != count) @@ -2939,7 +4462,48 @@ namespace swrenderer __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade + __m128i material = fgcolor; fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + __m128i lit = _mm_setzero_si128(); + + for (int i = 0; i != num_lights; i++) + { + __m128 light_x = _mm_set1_ps(lights[i].x); + __m128 light_y = _mm_set1_ps(lights[i].y); + __m128 light_z = _mm_set1_ps(lights[i].z); + __m128 light_radius = _mm_set1_ps(lights[i].radius); + __m128 m256 = _mm_set1_ps(256.0f); + + // L = light-pos + // dist = sqrt(dot(L, L)) + // distance_attenuation = 1 - MIN(dist * (1/radius), 1) + __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y + __m128 Lz = _mm_sub_ps(light_z, viewpos_z); + __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); + __m128 rcp_dist = _mm_rsqrt_ps(dist2); + __m128 dist = _mm_mul_ps(dist2, rcp_dist); + __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); + + // The simple light type + __m128 simple_attenuation = distance_attenuation; + + // The point light type + // diffuse = dot(N,L) * attenuation + __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); + + __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); + __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); + attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); + + __m128i light_color = _mm_cvtsi32_si128(lights[i].color); + light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); + light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); + + lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); + } + + fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); + fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); // Blend uint32_t alpha0 = APART(ifgcolor[0]); @@ -2986,7 +4550,7 @@ namespace swrenderer int light = 256 - (args.Light() >> (FRACBITS - 8)); __m128i mlight = _mm_set_epi16(256, light, light, light, 256, light, light, light); __m128i inv_light = _mm_set_epi16(0, 256 - light, 256 - light, 256 - light, 0, 256 - light, 256 - light, 256 - light); - + int count = args.Count(); int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); uint32_t fracstep = args.TextureVStep(); @@ -2994,6 +4558,15 @@ namespace swrenderer uint32_t texturefracx = args.TextureUPos(); uint32_t *dest = (uint32_t*)args.Dest(); int dest_y = args.DestY(); + + auto lights = args.dc_lights; + auto num_lights = args.dc_num_lights; + float vpz = args.dc_viewpos.Z; + float stepvpz = args.dc_viewpos_step.Z; + vpz += thread->skipped_by_thread(dest_y) * stepvpz; + stepvpz *= thread->num_cores; + __m128 viewpos_z = _mm_set_ps(vpz, vpz + stepvpz, 0.0f, 0.0f); + __m128 step_viewpos_z = _mm_set1_ps(stepvpz * 2.0f); count = thread->count_for_thread(dest_y, count); if (count <= 0) return; @@ -3069,7 +4642,48 @@ namespace swrenderer __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade + __m128i material = fgcolor; fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + __m128i lit = _mm_setzero_si128(); + + for (int i = 0; i != num_lights; i++) + { + __m128 light_x = _mm_set1_ps(lights[i].x); + __m128 light_y = _mm_set1_ps(lights[i].y); + __m128 light_z = _mm_set1_ps(lights[i].z); + __m128 light_radius = _mm_set1_ps(lights[i].radius); + __m128 m256 = _mm_set1_ps(256.0f); + + // L = light-pos + // dist = sqrt(dot(L, L)) + // distance_attenuation = 1 - MIN(dist * (1/radius), 1) + __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y + __m128 Lz = _mm_sub_ps(light_z, viewpos_z); + __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); + __m128 rcp_dist = _mm_rsqrt_ps(dist2); + __m128 dist = _mm_mul_ps(dist2, rcp_dist); + __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); + + // The simple light type + __m128 simple_attenuation = distance_attenuation; + + // The point light type + // diffuse = dot(N,L) * attenuation + __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); + + __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); + __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); + attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); + + __m128i light_color = _mm_cvtsi32_si128(lights[i].color); + light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); + light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); + + lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); + } + + fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); + fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); // Blend uint32_t alpha0 = APART(ifgcolor[0]); @@ -3107,6 +4721,7 @@ namespace swrenderer _mm_storel_epi64((__m128i*)desttmp, outcolor); dest[offset] = desttmp[0]; dest[offset + pitch] = desttmp[1]; + viewpos_z = _mm_add_ps(viewpos_z, step_viewpos_z); } if (ssecount * 2 != count) @@ -3143,7 +4758,48 @@ namespace swrenderer __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade + __m128i material = fgcolor; fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + __m128i lit = _mm_setzero_si128(); + + for (int i = 0; i != num_lights; i++) + { + __m128 light_x = _mm_set1_ps(lights[i].x); + __m128 light_y = _mm_set1_ps(lights[i].y); + __m128 light_z = _mm_set1_ps(lights[i].z); + __m128 light_radius = _mm_set1_ps(lights[i].radius); + __m128 m256 = _mm_set1_ps(256.0f); + + // L = light-pos + // dist = sqrt(dot(L, L)) + // distance_attenuation = 1 - MIN(dist * (1/radius), 1) + __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y + __m128 Lz = _mm_sub_ps(light_z, viewpos_z); + __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); + __m128 rcp_dist = _mm_rsqrt_ps(dist2); + __m128 dist = _mm_mul_ps(dist2, rcp_dist); + __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); + + // The simple light type + __m128 simple_attenuation = distance_attenuation; + + // The point light type + // diffuse = dot(N,L) * attenuation + __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); + + __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); + __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); + attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); + + __m128i light_color = _mm_cvtsi32_si128(lights[i].color); + light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); + light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); + + lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); + } + + fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); + fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); // Blend uint32_t alpha0 = APART(ifgcolor[0]); @@ -3201,7 +4857,7 @@ namespace swrenderer shade_fade = _mm_mullo_epi16(shade_fade, inv_light); __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); int desaturate = shade_constants.desaturate; - + int count = args.Count(); int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); uint32_t fracstep = args.TextureVStep(); @@ -3209,6 +4865,15 @@ namespace swrenderer uint32_t texturefracx = args.TextureUPos(); uint32_t *dest = (uint32_t*)args.Dest(); int dest_y = args.DestY(); + + auto lights = args.dc_lights; + auto num_lights = args.dc_num_lights; + float vpz = args.dc_viewpos.Z; + float stepvpz = args.dc_viewpos_step.Z; + vpz += thread->skipped_by_thread(dest_y) * stepvpz; + stepvpz *= thread->num_cores; + __m128 viewpos_z = _mm_set_ps(vpz, vpz + stepvpz, 0.0f, 0.0f); + __m128 step_viewpos_z = _mm_set1_ps(stepvpz * 2.0f); count = thread->count_for_thread(dest_y, count); if (count <= 0) return; @@ -3245,6 +4910,7 @@ namespace swrenderer __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade + __m128i material = fgcolor; int blue0 = BPART(ifgcolor[0]); int green0 = GPART(ifgcolor[0]); int red0 = RPART(ifgcolor[0]); @@ -3261,6 +4927,46 @@ namespace swrenderer fgcolor = _mm_mullo_epi16(fgcolor, mlight); fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + __m128i lit = _mm_setzero_si128(); + + for (int i = 0; i != num_lights; i++) + { + __m128 light_x = _mm_set1_ps(lights[i].x); + __m128 light_y = _mm_set1_ps(lights[i].y); + __m128 light_z = _mm_set1_ps(lights[i].z); + __m128 light_radius = _mm_set1_ps(lights[i].radius); + __m128 m256 = _mm_set1_ps(256.0f); + + // L = light-pos + // dist = sqrt(dot(L, L)) + // distance_attenuation = 1 - MIN(dist * (1/radius), 1) + __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y + __m128 Lz = _mm_sub_ps(light_z, viewpos_z); + __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); + __m128 rcp_dist = _mm_rsqrt_ps(dist2); + __m128 dist = _mm_mul_ps(dist2, rcp_dist); + __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); + + // The simple light type + __m128 simple_attenuation = distance_attenuation; + + // The point light type + // diffuse = dot(N,L) * attenuation + __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); + + __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); + __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); + attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); + + __m128i light_color = _mm_cvtsi32_si128(lights[i].color); + light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); + light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); + + lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); + } + + fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); + fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); // Blend uint32_t alpha0 = APART(ifgcolor[0]); @@ -3298,6 +5004,7 @@ namespace swrenderer _mm_storel_epi64((__m128i*)desttmp, outcolor); dest[offset] = desttmp[0]; dest[offset + pitch] = desttmp[1]; + viewpos_z = _mm_add_ps(viewpos_z, step_viewpos_z); } if (ssecount * 2 != count) @@ -3315,6 +5022,7 @@ namespace swrenderer __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade + __m128i material = fgcolor; int blue0 = BPART(ifgcolor[0]); int green0 = GPART(ifgcolor[0]); int red0 = RPART(ifgcolor[0]); @@ -3331,6 +5039,46 @@ namespace swrenderer fgcolor = _mm_mullo_epi16(fgcolor, mlight); fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + __m128i lit = _mm_setzero_si128(); + + for (int i = 0; i != num_lights; i++) + { + __m128 light_x = _mm_set1_ps(lights[i].x); + __m128 light_y = _mm_set1_ps(lights[i].y); + __m128 light_z = _mm_set1_ps(lights[i].z); + __m128 light_radius = _mm_set1_ps(lights[i].radius); + __m128 m256 = _mm_set1_ps(256.0f); + + // L = light-pos + // dist = sqrt(dot(L, L)) + // distance_attenuation = 1 - MIN(dist * (1/radius), 1) + __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y + __m128 Lz = _mm_sub_ps(light_z, viewpos_z); + __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); + __m128 rcp_dist = _mm_rsqrt_ps(dist2); + __m128 dist = _mm_mul_ps(dist2, rcp_dist); + __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); + + // The simple light type + __m128 simple_attenuation = distance_attenuation; + + // The point light type + // diffuse = dot(N,L) * attenuation + __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); + + __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); + __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); + attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); + + __m128i light_color = _mm_cvtsi32_si128(lights[i].color); + light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); + light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); + + lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); + } + + fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); + fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); // Blend uint32_t alpha0 = APART(ifgcolor[0]); @@ -3382,7 +5130,7 @@ namespace swrenderer shade_fade = _mm_mullo_epi16(shade_fade, inv_light); __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); int desaturate = shade_constants.desaturate; - + int count = args.Count(); int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); uint32_t fracstep = args.TextureVStep(); @@ -3390,6 +5138,15 @@ namespace swrenderer uint32_t texturefracx = args.TextureUPos(); uint32_t *dest = (uint32_t*)args.Dest(); int dest_y = args.DestY(); + + auto lights = args.dc_lights; + auto num_lights = args.dc_num_lights; + float vpz = args.dc_viewpos.Z; + float stepvpz = args.dc_viewpos_step.Z; + vpz += thread->skipped_by_thread(dest_y) * stepvpz; + stepvpz *= thread->num_cores; + __m128 viewpos_z = _mm_set_ps(vpz, vpz + stepvpz, 0.0f, 0.0f); + __m128 step_viewpos_z = _mm_set1_ps(stepvpz * 2.0f); count = thread->count_for_thread(dest_y, count); if (count <= 0) return; @@ -3465,6 +5222,7 @@ namespace swrenderer __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade + __m128i material = fgcolor; int blue0 = BPART(ifgcolor[0]); int green0 = GPART(ifgcolor[0]); int red0 = RPART(ifgcolor[0]); @@ -3481,6 +5239,46 @@ namespace swrenderer fgcolor = _mm_mullo_epi16(fgcolor, mlight); fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + __m128i lit = _mm_setzero_si128(); + + for (int i = 0; i != num_lights; i++) + { + __m128 light_x = _mm_set1_ps(lights[i].x); + __m128 light_y = _mm_set1_ps(lights[i].y); + __m128 light_z = _mm_set1_ps(lights[i].z); + __m128 light_radius = _mm_set1_ps(lights[i].radius); + __m128 m256 = _mm_set1_ps(256.0f); + + // L = light-pos + // dist = sqrt(dot(L, L)) + // distance_attenuation = 1 - MIN(dist * (1/radius), 1) + __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y + __m128 Lz = _mm_sub_ps(light_z, viewpos_z); + __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); + __m128 rcp_dist = _mm_rsqrt_ps(dist2); + __m128 dist = _mm_mul_ps(dist2, rcp_dist); + __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); + + // The simple light type + __m128 simple_attenuation = distance_attenuation; + + // The point light type + // diffuse = dot(N,L) * attenuation + __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); + + __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); + __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); + attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); + + __m128i light_color = _mm_cvtsi32_si128(lights[i].color); + light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); + light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); + + lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); + } + + fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); + fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); // Blend uint32_t alpha0 = APART(ifgcolor[0]); @@ -3518,6 +5316,7 @@ namespace swrenderer _mm_storel_epi64((__m128i*)desttmp, outcolor); dest[offset] = desttmp[0]; dest[offset + pitch] = desttmp[1]; + viewpos_z = _mm_add_ps(viewpos_z, step_viewpos_z); } if (ssecount * 2 != count) @@ -3554,6 +5353,7 @@ namespace swrenderer __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade + __m128i material = fgcolor; int blue0 = BPART(ifgcolor[0]); int green0 = GPART(ifgcolor[0]); int red0 = RPART(ifgcolor[0]); @@ -3570,6 +5370,46 @@ namespace swrenderer fgcolor = _mm_mullo_epi16(fgcolor, mlight); fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + __m128i lit = _mm_setzero_si128(); + + for (int i = 0; i != num_lights; i++) + { + __m128 light_x = _mm_set1_ps(lights[i].x); + __m128 light_y = _mm_set1_ps(lights[i].y); + __m128 light_z = _mm_set1_ps(lights[i].z); + __m128 light_radius = _mm_set1_ps(lights[i].radius); + __m128 m256 = _mm_set1_ps(256.0f); + + // L = light-pos + // dist = sqrt(dot(L, L)) + // distance_attenuation = 1 - MIN(dist * (1/radius), 1) + __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y + __m128 Lz = _mm_sub_ps(light_z, viewpos_z); + __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); + __m128 rcp_dist = _mm_rsqrt_ps(dist2); + __m128 dist = _mm_mul_ps(dist2, rcp_dist); + __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); + + // The simple light type + __m128 simple_attenuation = distance_attenuation; + + // The point light type + // diffuse = dot(N,L) * attenuation + __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); + + __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); + __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); + attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); + + __m128i light_color = _mm_cvtsi32_si128(lights[i].color); + light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); + light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); + + lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); + } + + fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); + fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); // Blend uint32_t alpha0 = APART(ifgcolor[0]); diff --git a/src/swrenderer/drawers/r_draw_wall32.php b/src/swrenderer/drawers/r_draw_wall32.php index a414ee7d7..2944fa407 100644 --- a/src/swrenderer/drawers/r_draw_wall32.php +++ b/src/swrenderer/drawers/r_draw_wall32.php @@ -101,7 +101,7 @@ namespace swrenderer __m128i shade_light = _mm_set_epi16(shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, shade_constants.light_alpha, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); int desaturate = shade_constants.desaturate; - + int count = args.Count(); int pitch = RenderViewport::Instance()->RenderTarget->GetPitch(); uint32_t fracstep = args.TextureVStep(); @@ -109,6 +109,15 @@ namespace swrenderer uint32_t texturefracx = args.TextureUPos(); uint32_t *dest = (uint32_t*)args.Dest(); int dest_y = args.DestY(); + + auto lights = args.dc_lights; + auto num_lights = args.dc_num_lights; + float vpz = args.dc_viewpos.Z; + float stepvpz = args.dc_viewpos_step.Z; + vpz += thread->skipped_by_thread(dest_y) * stepvpz; + stepvpz *= thread->num_cores; + __m128 viewpos_z = _mm_set_ps(vpz, vpz + stepvpz, 0.0f, 0.0f); + __m128 step_viewpos_z = _mm_set1_ps(stepvpz * 2.0f); count = thread->count_for_thread(dest_y, count); if (count <= 0) return; @@ -157,6 +166,7 @@ namespace swrenderer _mm_storel_epi64((__m128i*)desttmp, outcolor); dest[offset] = desttmp[0]; dest[offset + pitch] = desttmp[1]; + viewpos_z = _mm_add_ps(viewpos_z, step_viewpos_z); } if (ssecount * 2 != count) @@ -218,8 +228,9 @@ namespace swrenderer } function Shade($isSimpleShade) - { - if ($isSimpleShade == true) + { ?> + __m128i material = fgcolor; + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + __m128i lit = _mm_setzero_si128(); + + for (int i = 0; i != num_lights; i++) + { + __m128 light_x = _mm_set1_ps(lights[i].x); + __m128 light_y = _mm_set1_ps(lights[i].y); + __m128 light_z = _mm_set1_ps(lights[i].z); + __m128 light_radius = _mm_set1_ps(lights[i].radius); + __m128 m256 = _mm_set1_ps(256.0f); + + // L = light-pos + // dist = sqrt(dot(L, L)) + // distance_attenuation = 1 - MIN(dist * (1/radius), 1) + __m128 Lxy2 = light_x; // L.x*L.x + L.y*L.y + __m128 Lz = _mm_sub_ps(light_z, viewpos_z); + __m128 dist2 = _mm_add_ps(Lxy2, _mm_mul_ps(Lz, Lz)); + __m128 rcp_dist = _mm_rsqrt_ps(dist2); + __m128 dist = _mm_mul_ps(dist2, rcp_dist); + __m128 distance_attenuation = _mm_sub_ps(m256, _mm_min_ps(_mm_mul_ps(dist, light_radius), m256)); + + // The simple light type + __m128 simple_attenuation = distance_attenuation; + + // The point light type + // diffuse = dot(N,L) * attenuation + __m128 point_attenuation = _mm_mul_ps(_mm_mul_ps(light_y, rcp_dist), distance_attenuation); + + __m128 is_attenuated = _mm_cmpeq_ps(light_y, _mm_setzero_ps()); + __m128i attenuation = _mm_cvtps_epi32(_mm_or_ps(_mm_and_ps(is_attenuated, simple_attenuation), _mm_andnot_ps(is_attenuated, point_attenuation))); + attenuation = _mm_packs_epi32(_mm_shuffle_epi32(attenuation, _MM_SHUFFLE(0,0,0,0)), _mm_shuffle_epi32(attenuation, _MM_SHUFFLE(1,1,1,1))); + + __m128i light_color = _mm_cvtsi32_si128(lights[i].color); + light_color = _mm_unpacklo_epi8(light_color, _mm_setzero_si128()); + light_color = _mm_shuffle_epi32(light_color, _MM_SHUFFLE(1,0,1,0)); + + lit = _mm_add_epi16(lit, _mm_srli_epi16(_mm_mullo_epi16(light_color, attenuation), 8)); + } + + fgcolor = _mm_add_epi16(fgcolor, _mm_srli_epi16(_mm_mullo_epi16(material, lit), 8)); + fgcolor = _mm_min_epi16(fgcolor, _mm_set1_epi16(255)); + }