diff --git a/src/swrenderer/drawers/r_draw_sprite32.h b/src/swrenderer/drawers/r_draw_sprite32.h index 43802f795..e3baa6c08 100644 --- a/src/swrenderer/drawers/r_draw_sprite32.h +++ b/src/swrenderer/drawers/r_draw_sprite32.h @@ -46,8 +46,6 @@ namespace swrenderer { const uint32_t *source = (const uint32_t*)args.TexturePixels(); const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); - const uint8_t *colormap = args.Colormap(); - const uint32_t *translation = (const uint32_t*)args.TranslationMap(); bool is_nearest_filter = (source2 == nullptr); if (is_nearest_filter) { @@ -73,10 +71,10 @@ namespace swrenderer dest = thread->dest_for_thread(dest_y, pitch, dest); fracstep *= thread->num_cores; pitch *= thread->num_cores; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = args.SolidColor(); + uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -87,17 +85,21 @@ namespace swrenderer desttmp[1] = dest[offset + pitch]; // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; { int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; unsigned int sampleout = source[sample_index]; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; + ifgshade[0] = sampleshadeout; frac += fracstep; } { int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; unsigned int sampleout = source[sample_index]; + unsigned int sampleshadeout = 0; ifgcolor[1] = sampleout; + ifgshade[1] = sampleshadeout; frac += fracstep; } __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); @@ -119,11 +121,14 @@ namespace swrenderer int offset = index * pitch; // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; unsigned int sampleout = source[sample_index]; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; ifgcolor[1] = 0; + ifgshade[0] = sampleshadeout; + ifgshade[1] = 0; __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade @@ -160,10 +165,10 @@ namespace swrenderer fracstep *= thread->num_cores; pitch *= thread->num_cores; frac -= one / 2; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = args.SolidColor(); + uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -174,7 +179,7 @@ namespace swrenderer desttmp[1] = dest[offset + pitch]; // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; { // Clamp to edge unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; @@ -198,7 +203,9 @@ namespace swrenderer unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; + ifgshade[0] = sampleshadeout; frac += fracstep; } { @@ -224,7 +231,9 @@ namespace swrenderer unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + unsigned int sampleshadeout = 0; ifgcolor[1] = sampleout; + ifgshade[1] = sampleshadeout; frac += fracstep; } __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); @@ -246,7 +255,7 @@ namespace swrenderer int offset = index * pitch; // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; // Clamp to edge unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; unsigned int frac_y1 = (clamp(frac + one, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; @@ -269,8 +278,11 @@ namespace swrenderer unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; ifgcolor[1] = 0; + ifgshade[0] = sampleshadeout; + ifgshade[1] = 0; __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade @@ -287,8 +299,6 @@ namespace swrenderer { const uint32_t *source = (const uint32_t*)args.TexturePixels(); const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); - const uint8_t *colormap = args.Colormap(); - const uint32_t *translation = (const uint32_t*)args.TranslationMap(); bool is_nearest_filter = (source2 == nullptr); if (is_nearest_filter) { @@ -319,10 +329,10 @@ namespace swrenderer dest = thread->dest_for_thread(dest_y, pitch, dest); fracstep *= thread->num_cores; pitch *= thread->num_cores; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = args.SolidColor(); + uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -333,17 +343,21 @@ namespace swrenderer desttmp[1] = dest[offset + pitch]; // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; { int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; unsigned int sampleout = source[sample_index]; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; + ifgshade[0] = sampleshadeout; frac += fracstep; } { int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; unsigned int sampleout = source[sample_index]; + unsigned int sampleshadeout = 0; ifgcolor[1] = sampleout; + ifgshade[1] = sampleshadeout; frac += fracstep; } __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); @@ -365,11 +379,14 @@ namespace swrenderer int offset = index * pitch; // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; unsigned int sampleout = source[sample_index]; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; ifgcolor[1] = 0; + ifgshade[0] = sampleshadeout; + ifgshade[1] = 0; __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade @@ -411,10 +428,10 @@ namespace swrenderer fracstep *= thread->num_cores; pitch *= thread->num_cores; frac -= one / 2; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = args.SolidColor(); + uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -425,7 +442,7 @@ namespace swrenderer desttmp[1] = dest[offset + pitch]; // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; { // Clamp to edge unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; @@ -449,7 +466,9 @@ namespace swrenderer unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; + ifgshade[0] = sampleshadeout; frac += fracstep; } { @@ -475,7 +494,9 @@ namespace swrenderer unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + unsigned int sampleshadeout = 0; ifgcolor[1] = sampleout; + ifgshade[1] = sampleshadeout; frac += fracstep; } __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); @@ -497,7 +518,7 @@ namespace swrenderer int offset = index * pitch; // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; // Clamp to edge unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; unsigned int frac_y1 = (clamp(frac + one, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; @@ -520,8 +541,11 @@ namespace swrenderer unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; ifgcolor[1] = 0; + ifgshade[0] = sampleshadeout; + ifgshade[1] = 0; __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade @@ -554,8 +578,6 @@ namespace swrenderer { const uint32_t *source = (const uint32_t*)args.TexturePixels(); const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); - const uint8_t *colormap = args.Colormap(); - const uint32_t *translation = (const uint32_t*)args.TranslationMap(); bool is_nearest_filter = (source2 == nullptr); if (is_nearest_filter) { @@ -581,10 +603,10 @@ namespace swrenderer dest = thread->dest_for_thread(dest_y, pitch, dest); fracstep *= thread->num_cores; pitch *= thread->num_cores; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = args.SolidColor(); + uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -595,17 +617,21 @@ namespace swrenderer desttmp[1] = dest[offset + pitch]; // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; { int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; unsigned int sampleout = source[sample_index]; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; + ifgshade[0] = sampleshadeout; frac += fracstep; } { int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; unsigned int sampleout = source[sample_index]; + unsigned int sampleshadeout = 0; ifgcolor[1] = sampleout; + ifgshade[1] = sampleshadeout; frac += fracstep; } __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); @@ -628,11 +654,14 @@ namespace swrenderer int offset = index * pitch; // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; unsigned int sampleout = source[sample_index]; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; ifgcolor[1] = 0; + ifgshade[0] = sampleshadeout; + ifgshade[1] = 0; __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade @@ -670,10 +699,10 @@ namespace swrenderer fracstep *= thread->num_cores; pitch *= thread->num_cores; frac -= one / 2; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = args.SolidColor(); + uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -684,7 +713,7 @@ namespace swrenderer desttmp[1] = dest[offset + pitch]; // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; { // Clamp to edge unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; @@ -708,7 +737,9 @@ namespace swrenderer unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; + ifgshade[0] = sampleshadeout; frac += fracstep; } { @@ -734,7 +765,9 @@ namespace swrenderer unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + unsigned int sampleshadeout = 0; ifgcolor[1] = sampleout; + ifgshade[1] = sampleshadeout; frac += fracstep; } __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); @@ -757,7 +790,7 @@ namespace swrenderer int offset = index * pitch; // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; // Clamp to edge unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; unsigned int frac_y1 = (clamp(frac + one, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; @@ -780,8 +813,11 @@ namespace swrenderer unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; ifgcolor[1] = 0; + ifgshade[0] = sampleshadeout; + ifgshade[1] = 0; __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade @@ -799,8 +835,6 @@ namespace swrenderer { const uint32_t *source = (const uint32_t*)args.TexturePixels(); const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); - const uint8_t *colormap = args.Colormap(); - const uint32_t *translation = (const uint32_t*)args.TranslationMap(); bool is_nearest_filter = (source2 == nullptr); if (is_nearest_filter) { @@ -831,10 +865,10 @@ namespace swrenderer dest = thread->dest_for_thread(dest_y, pitch, dest); fracstep *= thread->num_cores; pitch *= thread->num_cores; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = args.SolidColor(); + uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -845,23 +879,42 @@ namespace swrenderer desttmp[1] = dest[offset + pitch]; // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; { int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; unsigned int sampleout = source[sample_index]; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; + ifgshade[0] = sampleshadeout; frac += fracstep; } { int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; unsigned int sampleout = source[sample_index]; + unsigned int sampleshadeout = 0; ifgcolor[1] = sampleout; + ifgshade[1] = sampleshadeout; frac += fracstep; } __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); // Blend __m128i outcolor = fgcolor; @@ -878,15 +931,33 @@ namespace swrenderer int offset = index * pitch; // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; unsigned int sampleout = source[sample_index]; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; ifgcolor[1] = 0; + ifgshade[0] = sampleshadeout; + ifgshade[1] = 0; __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); // Blend __m128i outcolor = fgcolor; @@ -925,10 +996,10 @@ namespace swrenderer fracstep *= thread->num_cores; pitch *= thread->num_cores; frac -= one / 2; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = args.SolidColor(); + uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -939,7 +1010,7 @@ namespace swrenderer desttmp[1] = dest[offset + pitch]; // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; { // Clamp to edge unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; @@ -963,7 +1034,9 @@ namespace swrenderer unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; + ifgshade[0] = sampleshadeout; frac += fracstep; } { @@ -989,13 +1062,30 @@ namespace swrenderer unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + unsigned int sampleshadeout = 0; ifgcolor[1] = sampleout; + ifgshade[1] = sampleshadeout; frac += fracstep; } __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); // Blend __m128i outcolor = fgcolor; @@ -1012,7 +1102,7 @@ namespace swrenderer int offset = index * pitch; // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; // Clamp to edge unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; unsigned int frac_y1 = (clamp(frac + one, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; @@ -1035,12 +1125,30 @@ namespace swrenderer unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; ifgcolor[1] = 0; + ifgshade[0] = sampleshadeout; + ifgshade[1] = 0; __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); // Blend __m128i outcolor = fgcolor; @@ -1070,8 +1178,6 @@ namespace swrenderer { const uint32_t *source = (const uint32_t*)args.TexturePixels(); const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); - const uint8_t *colormap = args.Colormap(); - const uint32_t *translation = (const uint32_t*)args.TranslationMap(); bool is_nearest_filter = (source2 == nullptr); if (is_nearest_filter) { @@ -1097,10 +1203,10 @@ namespace swrenderer dest = thread->dest_for_thread(dest_y, pitch, dest); fracstep *= thread->num_cores; pitch *= thread->num_cores; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = args.SolidColor(); + uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -1112,17 +1218,21 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; { int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; unsigned int sampleout = source[sample_index]; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; + ifgshade[0] = sampleshadeout; frac += fracstep; } { int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; unsigned int sampleout = source[sample_index]; + unsigned int sampleshadeout = 0; ifgcolor[1] = sampleout; + ifgshade[1] = sampleshadeout; frac += fracstep; } __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); @@ -1131,16 +1241,20 @@ namespace swrenderer fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -1149,10 +1263,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_add_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_add_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -1168,27 +1285,34 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; unsigned int sampleout = source[sample_index]; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; ifgcolor[1] = 0; + ifgshade[0] = sampleshadeout; + ifgshade[1] = 0; __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -1197,10 +1321,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_add_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_add_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -1232,10 +1359,10 @@ namespace swrenderer fracstep *= thread->num_cores; pitch *= thread->num_cores; frac -= one / 2; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = args.SolidColor(); + uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -1247,7 +1374,7 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; { // Clamp to edge unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; @@ -1271,7 +1398,9 @@ namespace swrenderer unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; + ifgshade[0] = sampleshadeout; frac += fracstep; } { @@ -1297,7 +1426,9 @@ namespace swrenderer unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + unsigned int sampleshadeout = 0; ifgcolor[1] = sampleout; + ifgshade[1] = sampleshadeout; frac += fracstep; } __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); @@ -1306,16 +1437,20 @@ namespace swrenderer fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -1324,10 +1459,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_add_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_add_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -1343,7 +1481,7 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; // Clamp to edge unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; unsigned int frac_y1 = (clamp(frac + one, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; @@ -1366,24 +1504,31 @@ namespace swrenderer unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; ifgcolor[1] = 0; + ifgshade[0] = sampleshadeout; + ifgshade[1] = 0; __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -1392,10 +1537,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_add_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_add_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -1407,8 +1555,6 @@ namespace swrenderer { const uint32_t *source = (const uint32_t*)args.TexturePixels(); const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); - const uint8_t *colormap = args.Colormap(); - const uint32_t *translation = (const uint32_t*)args.TranslationMap(); bool is_nearest_filter = (source2 == nullptr); if (is_nearest_filter) { @@ -1439,10 +1585,10 @@ namespace swrenderer dest = thread->dest_for_thread(dest_y, pitch, dest); fracstep *= thread->num_cores; pitch *= thread->num_cores; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = args.SolidColor(); + uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -1454,35 +1600,58 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; { int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; unsigned int sampleout = source[sample_index]; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; + ifgshade[0] = sampleshadeout; frac += fracstep; } { int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; unsigned int sampleout = source[sample_index]; + unsigned int sampleshadeout = 0; ifgcolor[1] = sampleout; + ifgshade[1] = sampleshadeout; frac += fracstep; } __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -1491,10 +1660,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_add_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_add_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -1510,27 +1682,49 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; unsigned int sampleout = source[sample_index]; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; ifgcolor[1] = 0; + ifgshade[0] = sampleshadeout; + ifgshade[1] = 0; __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -1539,10 +1733,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_add_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_add_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -1579,10 +1776,10 @@ namespace swrenderer fracstep *= thread->num_cores; pitch *= thread->num_cores; frac -= one / 2; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = args.SolidColor(); + uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -1594,7 +1791,7 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; { // Clamp to edge unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; @@ -1618,7 +1815,9 @@ namespace swrenderer unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; + ifgshade[0] = sampleshadeout; frac += fracstep; } { @@ -1644,25 +1843,46 @@ namespace swrenderer unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + unsigned int sampleshadeout = 0; ifgcolor[1] = sampleout; + ifgshade[1] = sampleshadeout; frac += fracstep; } __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -1671,10 +1891,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_add_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_add_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -1690,7 +1913,7 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; // Clamp to edge unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; unsigned int frac_y1 = (clamp(frac + one, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; @@ -1713,24 +1936,46 @@ namespace swrenderer unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; ifgcolor[1] = 0; + ifgshade[0] = sampleshadeout; + ifgshade[1] = 0; __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -1739,10 +1984,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_add_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_add_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -1770,8 +2018,6 @@ namespace swrenderer { const uint32_t *source = (const uint32_t*)args.TexturePixels(); const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); - const uint8_t *colormap = args.Colormap(); - const uint32_t *translation = (const uint32_t*)args.TranslationMap(); bool is_nearest_filter = (source2 == nullptr); if (is_nearest_filter) { @@ -1797,10 +2043,10 @@ namespace swrenderer dest = thread->dest_for_thread(dest_y, pitch, dest); fracstep *= thread->num_cores; pitch *= thread->num_cores; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = args.SolidColor(); + uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -1812,17 +2058,21 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; { int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; unsigned int sampleout = source[sample_index]; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; + ifgshade[0] = sampleshadeout; frac += fracstep; } { int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; unsigned int sampleout = source[sample_index]; + unsigned int sampleshadeout = 0; ifgcolor[1] = sampleout; + ifgshade[1] = sampleshadeout; frac += fracstep; } __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); @@ -1831,16 +2081,20 @@ namespace swrenderer fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -1849,10 +2103,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -1868,27 +2125,34 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; unsigned int sampleout = source[sample_index]; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; ifgcolor[1] = 0; + ifgshade[0] = sampleshadeout; + ifgshade[1] = 0; __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -1897,10 +2161,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -1932,10 +2199,10 @@ namespace swrenderer fracstep *= thread->num_cores; pitch *= thread->num_cores; frac -= one / 2; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = args.SolidColor(); + uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -1947,7 +2214,7 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; { // Clamp to edge unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; @@ -1971,7 +2238,9 @@ namespace swrenderer unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; + ifgshade[0] = sampleshadeout; frac += fracstep; } { @@ -1997,7 +2266,9 @@ namespace swrenderer unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + unsigned int sampleshadeout = 0; ifgcolor[1] = sampleout; + ifgshade[1] = sampleshadeout; frac += fracstep; } __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); @@ -2006,16 +2277,20 @@ namespace swrenderer fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -2024,10 +2299,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -2043,7 +2321,7 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; // Clamp to edge unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; unsigned int frac_y1 = (clamp(frac + one, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; @@ -2066,24 +2344,31 @@ namespace swrenderer unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; ifgcolor[1] = 0; + ifgshade[0] = sampleshadeout; + ifgshade[1] = 0; __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -2092,10 +2377,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -2107,8 +2395,6 @@ namespace swrenderer { const uint32_t *source = (const uint32_t*)args.TexturePixels(); const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); - const uint8_t *colormap = args.Colormap(); - const uint32_t *translation = (const uint32_t*)args.TranslationMap(); bool is_nearest_filter = (source2 == nullptr); if (is_nearest_filter) { @@ -2139,10 +2425,10 @@ namespace swrenderer dest = thread->dest_for_thread(dest_y, pitch, dest); fracstep *= thread->num_cores; pitch *= thread->num_cores; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = args.SolidColor(); + uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -2154,35 +2440,58 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; { int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; unsigned int sampleout = source[sample_index]; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; + ifgshade[0] = sampleshadeout; frac += fracstep; } { int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; unsigned int sampleout = source[sample_index]; + unsigned int sampleshadeout = 0; ifgcolor[1] = sampleout; + ifgshade[1] = sampleshadeout; frac += fracstep; } __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -2191,10 +2500,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -2210,27 +2522,49 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; unsigned int sampleout = source[sample_index]; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; ifgcolor[1] = 0; + ifgshade[0] = sampleshadeout; + ifgshade[1] = 0; __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -2239,10 +2573,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -2279,10 +2616,10 @@ namespace swrenderer fracstep *= thread->num_cores; pitch *= thread->num_cores; frac -= one / 2; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = args.SolidColor(); + uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -2294,7 +2631,7 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; { // Clamp to edge unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; @@ -2318,7 +2655,9 @@ namespace swrenderer unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; + ifgshade[0] = sampleshadeout; frac += fracstep; } { @@ -2344,25 +2683,46 @@ namespace swrenderer unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + unsigned int sampleshadeout = 0; ifgcolor[1] = sampleout; + ifgshade[1] = sampleshadeout; frac += fracstep; } __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -2371,10 +2731,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -2390,7 +2753,7 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; // Clamp to edge unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; unsigned int frac_y1 = (clamp(frac + one, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; @@ -2413,24 +2776,46 @@ namespace swrenderer unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; ifgcolor[1] = 0; + ifgshade[0] = sampleshadeout; + ifgshade[1] = 0; __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -2439,10 +2824,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -2470,8 +2858,6 @@ namespace swrenderer { const uint32_t *source = (const uint32_t*)args.TexturePixels(); const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); - const uint8_t *colormap = args.Colormap(); - const uint32_t *translation = (const uint32_t*)args.TranslationMap(); bool is_nearest_filter = (source2 == nullptr); if (is_nearest_filter) { @@ -2497,10 +2883,10 @@ namespace swrenderer dest = thread->dest_for_thread(dest_y, pitch, dest); fracstep *= thread->num_cores; pitch *= thread->num_cores; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = args.SolidColor(); + uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -2512,17 +2898,21 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; { int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; unsigned int sampleout = source[sample_index]; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; + ifgshade[0] = sampleshadeout; frac += fracstep; } { int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; unsigned int sampleout = source[sample_index]; + unsigned int sampleshadeout = 0; ifgcolor[1] = sampleout; + ifgshade[1] = sampleshadeout; frac += fracstep; } __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); @@ -2531,16 +2921,20 @@ namespace swrenderer fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -2549,10 +2943,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); + __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(bg_lo, fg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(bg_hi, fg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -2568,27 +2965,34 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; unsigned int sampleout = source[sample_index]; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; ifgcolor[1] = 0; + ifgshade[0] = sampleshadeout; + ifgshade[1] = 0; __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -2597,10 +3001,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); + __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(bg_lo, fg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(bg_hi, fg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -2632,10 +3039,10 @@ namespace swrenderer fracstep *= thread->num_cores; pitch *= thread->num_cores; frac -= one / 2; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = args.SolidColor(); + uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -2647,7 +3054,7 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; { // Clamp to edge unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; @@ -2671,7 +3078,9 @@ namespace swrenderer unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; + ifgshade[0] = sampleshadeout; frac += fracstep; } { @@ -2697,7 +3106,9 @@ namespace swrenderer unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + unsigned int sampleshadeout = 0; ifgcolor[1] = sampleout; + ifgshade[1] = sampleshadeout; frac += fracstep; } __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); @@ -2706,16 +3117,20 @@ namespace swrenderer fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -2724,10 +3139,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); + __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(bg_lo, fg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(bg_hi, fg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -2743,7 +3161,7 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; // Clamp to edge unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; unsigned int frac_y1 = (clamp(frac + one, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; @@ -2766,24 +3184,31 @@ namespace swrenderer unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; ifgcolor[1] = 0; + ifgshade[0] = sampleshadeout; + ifgshade[1] = 0; __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -2792,10 +3217,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); + __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(bg_lo, fg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(bg_hi, fg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -2807,8 +3235,6 @@ namespace swrenderer { const uint32_t *source = (const uint32_t*)args.TexturePixels(); const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); - const uint8_t *colormap = args.Colormap(); - const uint32_t *translation = (const uint32_t*)args.TranslationMap(); bool is_nearest_filter = (source2 == nullptr); if (is_nearest_filter) { @@ -2839,10 +3265,10 @@ namespace swrenderer dest = thread->dest_for_thread(dest_y, pitch, dest); fracstep *= thread->num_cores; pitch *= thread->num_cores; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = args.SolidColor(); + uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -2854,35 +3280,58 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; { int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; unsigned int sampleout = source[sample_index]; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; + ifgshade[0] = sampleshadeout; frac += fracstep; } { int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; unsigned int sampleout = source[sample_index]; + unsigned int sampleshadeout = 0; ifgcolor[1] = sampleout; + ifgshade[1] = sampleshadeout; frac += fracstep; } __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -2891,10 +3340,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); + __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(bg_lo, fg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(bg_hi, fg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -2910,27 +3362,49 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; unsigned int sampleout = source[sample_index]; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; ifgcolor[1] = 0; + ifgshade[0] = sampleshadeout; + ifgshade[1] = 0; __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -2939,10 +3413,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); + __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(bg_lo, fg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(bg_hi, fg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -2979,10 +3456,10 @@ namespace swrenderer fracstep *= thread->num_cores; pitch *= thread->num_cores; frac -= one / 2; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = args.SolidColor(); + uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -2994,7 +3471,7 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; { // Clamp to edge unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; @@ -3018,7 +3495,9 @@ namespace swrenderer unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; + ifgshade[0] = sampleshadeout; frac += fracstep; } { @@ -3044,25 +3523,46 @@ namespace swrenderer unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + unsigned int sampleshadeout = 0; ifgcolor[1] = sampleout; + ifgshade[1] = sampleshadeout; frac += fracstep; } __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -3071,10 +3571,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); + __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(bg_lo, fg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(bg_hi, fg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -3090,7 +3593,7 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; // Clamp to edge unsigned int frac_y0 = (clamp(frac, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; unsigned int frac_y1 = (clamp(frac + one, 0, 1 << 30) >> (FRACBITS - 2)) * textureheight; @@ -3113,24 +3616,46 @@ namespace swrenderer unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; ifgcolor[1] = 0; + ifgshade[0] = sampleshadeout; + ifgshade[1] = 0; __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -3139,10 +3664,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); + __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(bg_lo, fg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(bg_hi, fg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -3170,8 +3698,6 @@ namespace swrenderer { const uint32_t *source = (const uint32_t*)args.TexturePixels(); const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); - const uint8_t *colormap = args.Colormap(); - const uint32_t *translation = (const uint32_t*)args.TranslationMap(); int textureheight = args.TextureHeight(); uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; @@ -3194,10 +3720,10 @@ namespace swrenderer dest = thread->dest_for_thread(dest_y, pitch, dest); fracstep *= thread->num_cores; pitch *= thread->num_cores; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = args.SolidColor(); + uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -3208,15 +3734,19 @@ namespace swrenderer desttmp[1] = dest[offset + pitch]; // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; { unsigned int sampleout = srccolor; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; + ifgshade[0] = sampleshadeout; frac += fracstep; } { unsigned int sampleout = srccolor; + unsigned int sampleshadeout = 0; ifgcolor[1] = sampleout; + ifgshade[1] = sampleshadeout; frac += fracstep; } __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); @@ -3239,10 +3769,13 @@ namespace swrenderer int offset = index * pitch; // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; unsigned int sampleout = srccolor; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; ifgcolor[1] = 0; + ifgshade[0] = sampleshadeout; + ifgshade[1] = 0; __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade @@ -3259,8 +3792,6 @@ namespace swrenderer { const uint32_t *source = (const uint32_t*)args.TexturePixels(); const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); - const uint8_t *colormap = args.Colormap(); - const uint32_t *translation = (const uint32_t*)args.TranslationMap(); int textureheight = args.TextureHeight(); uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; @@ -3288,10 +3819,10 @@ namespace swrenderer dest = thread->dest_for_thread(dest_y, pitch, dest); fracstep *= thread->num_cores; pitch *= thread->num_cores; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = args.SolidColor(); + uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -3302,21 +3833,40 @@ namespace swrenderer desttmp[1] = dest[offset + pitch]; // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; { unsigned int sampleout = srccolor; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; + ifgshade[0] = sampleshadeout; frac += fracstep; } { unsigned int sampleout = srccolor; + unsigned int sampleshadeout = 0; ifgcolor[1] = sampleout; + ifgshade[1] = sampleshadeout; frac += fracstep; } __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); // Blend __m128i outcolor = fgcolor; @@ -3333,14 +3883,32 @@ namespace swrenderer int offset = index * pitch; // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; unsigned int sampleout = srccolor; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; ifgcolor[1] = 0; + ifgshade[0] = sampleshadeout; + ifgshade[1] = 0; __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); // Blend __m128i outcolor = fgcolor; @@ -3369,8 +3937,6 @@ namespace swrenderer { const uint32_t *source = (const uint32_t*)args.TexturePixels(); const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); - const uint8_t *colormap = args.Colormap(); - const uint32_t *translation = (const uint32_t*)args.TranslationMap(); int textureheight = args.TextureHeight(); uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; @@ -3393,10 +3959,10 @@ namespace swrenderer dest = thread->dest_for_thread(dest_y, pitch, dest); fracstep *= thread->num_cores; pitch *= thread->num_cores; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = args.SolidColor(); + uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -3408,15 +3974,19 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; { unsigned int sampleout = srccolor; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; + ifgshade[0] = sampleshadeout; frac += fracstep; } { unsigned int sampleout = srccolor; + unsigned int sampleshadeout = 0; ifgcolor[1] = sampleout; + ifgshade[1] = sampleshadeout; frac += fracstep; } __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); @@ -3425,16 +3995,20 @@ namespace swrenderer fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -3443,10 +4017,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_add_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_add_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -3462,26 +4039,33 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; unsigned int sampleout = srccolor; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; ifgcolor[1] = 0; + ifgshade[0] = sampleshadeout; + ifgshade[1] = 0; __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -3490,10 +4074,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_add_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_add_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -3504,8 +4091,6 @@ namespace swrenderer { const uint32_t *source = (const uint32_t*)args.TexturePixels(); const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); - const uint8_t *colormap = args.Colormap(); - const uint32_t *translation = (const uint32_t*)args.TranslationMap(); int textureheight = args.TextureHeight(); uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; @@ -3533,10 +4118,10 @@ namespace swrenderer dest = thread->dest_for_thread(dest_y, pitch, dest); fracstep *= thread->num_cores; pitch *= thread->num_cores; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = args.SolidColor(); + uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -3548,33 +4133,56 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; { unsigned int sampleout = srccolor; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; + ifgshade[0] = sampleshadeout; frac += fracstep; } { unsigned int sampleout = srccolor; + unsigned int sampleshadeout = 0; ifgcolor[1] = sampleout; + ifgshade[1] = sampleshadeout; frac += fracstep; } __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -3583,10 +4191,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_add_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_add_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -3602,26 +4213,48 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; unsigned int sampleout = srccolor; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; ifgcolor[1] = 0; + ifgshade[0] = sampleshadeout; + ifgshade[1] = 0; __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -3630,10 +4263,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_add_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_add_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -3660,8 +4296,6 @@ namespace swrenderer { const uint32_t *source = (const uint32_t*)args.TexturePixels(); const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); - const uint8_t *colormap = args.Colormap(); - const uint32_t *translation = (const uint32_t*)args.TranslationMap(); int textureheight = args.TextureHeight(); uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; @@ -3684,10 +4318,10 @@ namespace swrenderer dest = thread->dest_for_thread(dest_y, pitch, dest); fracstep *= thread->num_cores; pitch *= thread->num_cores; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = args.SolidColor(); + uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -3699,15 +4333,19 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; { unsigned int sampleout = srccolor; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; + ifgshade[0] = sampleshadeout; frac += fracstep; } { unsigned int sampleout = srccolor; + unsigned int sampleshadeout = 0; ifgcolor[1] = sampleout; + ifgshade[1] = sampleshadeout; frac += fracstep; } __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); @@ -3716,16 +4354,20 @@ namespace swrenderer fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -3734,10 +4376,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -3753,26 +4398,33 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; unsigned int sampleout = srccolor; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; ifgcolor[1] = 0; + ifgshade[0] = sampleshadeout; + ifgshade[1] = 0; __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -3781,10 +4433,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -3795,8 +4450,6 @@ namespace swrenderer { const uint32_t *source = (const uint32_t*)args.TexturePixels(); const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); - const uint8_t *colormap = args.Colormap(); - const uint32_t *translation = (const uint32_t*)args.TranslationMap(); int textureheight = args.TextureHeight(); uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; @@ -3824,10 +4477,10 @@ namespace swrenderer dest = thread->dest_for_thread(dest_y, pitch, dest); fracstep *= thread->num_cores; pitch *= thread->num_cores; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = args.SolidColor(); + uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -3839,33 +4492,56 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; { unsigned int sampleout = srccolor; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; + ifgshade[0] = sampleshadeout; frac += fracstep; } { unsigned int sampleout = srccolor; + unsigned int sampleshadeout = 0; ifgcolor[1] = sampleout; + ifgshade[1] = sampleshadeout; frac += fracstep; } __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -3874,10 +4550,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -3893,26 +4572,48 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; unsigned int sampleout = srccolor; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; ifgcolor[1] = 0; + ifgshade[0] = sampleshadeout; + ifgshade[1] = 0; __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -3921,10 +4622,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -3951,8 +4655,6 @@ namespace swrenderer { const uint32_t *source = (const uint32_t*)args.TexturePixels(); const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); - const uint8_t *colormap = args.Colormap(); - const uint32_t *translation = (const uint32_t*)args.TranslationMap(); int textureheight = args.TextureHeight(); uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; @@ -3975,10 +4677,10 @@ namespace swrenderer dest = thread->dest_for_thread(dest_y, pitch, dest); fracstep *= thread->num_cores; pitch *= thread->num_cores; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = args.SolidColor(); + uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -3990,15 +4692,19 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; { unsigned int sampleout = srccolor; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; + ifgshade[0] = sampleshadeout; frac += fracstep; } { unsigned int sampleout = srccolor; + unsigned int sampleshadeout = 0; ifgcolor[1] = sampleout; + ifgshade[1] = sampleshadeout; frac += fracstep; } __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); @@ -4007,16 +4713,20 @@ namespace swrenderer fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -4025,10 +4735,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); + __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(bg_lo, fg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(bg_hi, fg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -4044,26 +4757,33 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; unsigned int sampleout = srccolor; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; ifgcolor[1] = 0; + ifgshade[0] = sampleshadeout; + ifgshade[1] = 0; __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -4072,10 +4792,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); + __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(bg_lo, fg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(bg_hi, fg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -4086,8 +4809,6 @@ namespace swrenderer { const uint32_t *source = (const uint32_t*)args.TexturePixels(); const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); - const uint8_t *colormap = args.Colormap(); - const uint32_t *translation = (const uint32_t*)args.TranslationMap(); int textureheight = args.TextureHeight(); uint32_t one = ((0x80000000 + textureheight - 1) / textureheight) * 2 + 1; @@ -4115,10 +4836,10 @@ namespace swrenderer dest = thread->dest_for_thread(dest_y, pitch, dest); fracstep *= thread->num_cores; pitch *= thread->num_cores; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = args.SolidColor(); + uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -4130,33 +4851,56 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; { unsigned int sampleout = srccolor; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; + ifgshade[0] = sampleshadeout; frac += fracstep; } { unsigned int sampleout = srccolor; + unsigned int sampleshadeout = 0; ifgcolor[1] = sampleout; + ifgshade[1] = sampleshadeout; frac += fracstep; } __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -4165,10 +4909,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); + __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(bg_lo, fg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(bg_hi, fg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -4184,26 +4931,48 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; unsigned int sampleout = srccolor; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; ifgcolor[1] = 0; + ifgshade[0] = sampleshadeout; + ifgshade[1] = 0; __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -4212,10 +4981,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); + __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(bg_lo, fg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(bg_hi, fg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -4240,8 +5012,7 @@ namespace swrenderer auto shade_constants = args.ColormapConstants(); if (shade_constants.simple_shade) { - const uint32_t *source = (const uint32_t*)args.TexturePixels(); - const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); + const uint8_t *source = args.TexturePixels(); const uint8_t *colormap = args.Colormap(); const uint32_t *translation = (const uint32_t*)args.TranslationMap(); int textureheight = args.TextureHeight(); @@ -4266,10 +5037,10 @@ namespace swrenderer dest = thread->dest_for_thread(dest_y, pitch, dest); fracstep *= thread->num_cores; pitch *= thread->num_cores; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = args.SolidColor(); + uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -4281,30 +5052,30 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; { unsigned int sampleout = color; - unsigned int samplealphaout = colormap[source[frac >> FRACBITS]]; - samplealphaout = clamp(samplealphaout, 0, 64) * 4; + unsigned int sampleshadeout = colormap[source[frac >> FRACBITS]]; + sampleshadeout = clamp(sampleshadeout, 0, 64) * 4; ifgcolor[0] = sampleout; + ifgshade[0] = sampleshadeout; frac += fracstep; } { unsigned int sampleout = color; - unsigned int samplealphaout = colormap[source[frac >> FRACBITS]]; - samplealphaout = clamp(samplealphaout, 0, 64) * 4; + unsigned int sampleshadeout = colormap[source[frac >> FRACBITS]]; + sampleshadeout = clamp(sampleshadeout, 0, 64) * 4; ifgcolor[1] = sampleout; + ifgshade[1] = sampleshadeout; frac += fracstep; } __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); // Blend - int shadealpha = 256; // To do: this comes from a sampled source (samplealphaout) - __m128i alpha = _mm_set1_epi16(shadealpha); - __m128i inv_alpha = _mm_set1_epi16(256 - shadealpha); + __m128i alpha = _mm_set_epi16(ifgshade[1], ifgshade[1], ifgshade[1], ifgshade[1], ifgshade[0], ifgshade[0], ifgshade[0], ifgshade[0]); + __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); fgcolor = _mm_mullo_epi16(fgcolor, alpha); bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); @@ -4324,21 +5095,21 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; unsigned int sampleout = color; - unsigned int samplealphaout = colormap[source[frac >> FRACBITS]]; - samplealphaout = clamp(samplealphaout, 0, 64) * 4; + unsigned int sampleshadeout = colormap[source[frac >> FRACBITS]]; + sampleshadeout = clamp(sampleshadeout, 0, 64) * 4; ifgcolor[0] = sampleout; ifgcolor[1] = 0; + ifgshade[0] = sampleshadeout; + ifgshade[1] = 0; __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); // Blend - int shadealpha = 256; // To do: this comes from a sampled source (samplealphaout) - __m128i alpha = _mm_set1_epi16(shadealpha); - __m128i inv_alpha = _mm_set1_epi16(256 - shadealpha); + __m128i alpha = _mm_set_epi16(ifgshade[1], ifgshade[1], ifgshade[1], ifgshade[1], ifgshade[0], ifgshade[0], ifgshade[0], ifgshade[0]); + __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); fgcolor = _mm_mullo_epi16(fgcolor, alpha); bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); @@ -4351,8 +5122,7 @@ namespace swrenderer } else { - const uint32_t *source = (const uint32_t*)args.TexturePixels(); - const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); + const uint8_t *source = args.TexturePixels(); const uint8_t *colormap = args.Colormap(); const uint32_t *translation = (const uint32_t*)args.TranslationMap(); int textureheight = args.TextureHeight(); @@ -4382,10 +5152,10 @@ namespace swrenderer dest = thread->dest_for_thread(dest_y, pitch, dest); fracstep *= thread->num_cores; pitch *= thread->num_cores; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = args.SolidColor(); + uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -4397,30 +5167,30 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; { unsigned int sampleout = color; - unsigned int samplealphaout = colormap[source[frac >> FRACBITS]]; - samplealphaout = clamp(samplealphaout, 0, 64) * 4; + unsigned int sampleshadeout = colormap[source[frac >> FRACBITS]]; + sampleshadeout = clamp(sampleshadeout, 0, 64) * 4; ifgcolor[0] = sampleout; + ifgshade[0] = sampleshadeout; frac += fracstep; } { unsigned int sampleout = color; - unsigned int samplealphaout = colormap[source[frac >> FRACBITS]]; - samplealphaout = clamp(samplealphaout, 0, 64) * 4; + unsigned int sampleshadeout = colormap[source[frac >> FRACBITS]]; + sampleshadeout = clamp(sampleshadeout, 0, 64) * 4; ifgcolor[1] = sampleout; + ifgshade[1] = sampleshadeout; frac += fracstep; } __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); // Blend - int shadealpha = 256; // To do: this comes from a sampled source (samplealphaout) - __m128i alpha = _mm_set1_epi16(shadealpha); - __m128i inv_alpha = _mm_set1_epi16(256 - shadealpha); + __m128i alpha = _mm_set_epi16(ifgshade[1], ifgshade[1], ifgshade[1], ifgshade[1], ifgshade[0], ifgshade[0], ifgshade[0], ifgshade[0]); + __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); fgcolor = _mm_mullo_epi16(fgcolor, alpha); bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); @@ -4440,21 +5210,21 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; unsigned int sampleout = color; - unsigned int samplealphaout = colormap[source[frac >> FRACBITS]]; - samplealphaout = clamp(samplealphaout, 0, 64) * 4; + unsigned int sampleshadeout = colormap[source[frac >> FRACBITS]]; + sampleshadeout = clamp(sampleshadeout, 0, 64) * 4; ifgcolor[0] = sampleout; ifgcolor[1] = 0; + ifgshade[0] = sampleshadeout; + ifgshade[1] = 0; __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); // Blend - int shadealpha = 256; // To do: this comes from a sampled source (samplealphaout) - __m128i alpha = _mm_set1_epi16(shadealpha); - __m128i inv_alpha = _mm_set1_epi16(256 - shadealpha); + __m128i alpha = _mm_set_epi16(ifgshade[1], ifgshade[1], ifgshade[1], ifgshade[1], ifgshade[0], ifgshade[0], ifgshade[0], ifgshade[0]); + __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); fgcolor = _mm_mullo_epi16(fgcolor, alpha); bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); @@ -4483,8 +5253,7 @@ namespace swrenderer auto shade_constants = args.ColormapConstants(); if (shade_constants.simple_shade) { - const uint32_t *source = (const uint32_t*)args.TexturePixels(); - const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); + const uint8_t *source = args.TexturePixels(); const uint8_t *colormap = args.Colormap(); const uint32_t *translation = (const uint32_t*)args.TranslationMap(); int textureheight = args.TextureHeight(); @@ -4509,10 +5278,10 @@ namespace swrenderer dest = thread->dest_for_thread(dest_y, pitch, dest); fracstep *= thread->num_cores; pitch *= thread->num_cores; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = args.SolidColor(); + uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -4523,15 +5292,19 @@ namespace swrenderer desttmp[1] = dest[offset + pitch]; // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; { unsigned int sampleout = translation[source[frac >> FRACBITS]]; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; + ifgshade[0] = sampleshadeout; frac += fracstep; } { unsigned int sampleout = translation[source[frac >> FRACBITS]]; + unsigned int sampleshadeout = 0; ifgcolor[1] = sampleout; + ifgshade[1] = sampleshadeout; frac += fracstep; } __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); @@ -4554,10 +5327,13 @@ namespace swrenderer int offset = index * pitch; // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; unsigned int sampleout = translation[source[frac >> FRACBITS]]; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; ifgcolor[1] = 0; + ifgshade[0] = sampleshadeout; + ifgshade[1] = 0; __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade @@ -4572,8 +5348,7 @@ namespace swrenderer } else { - const uint32_t *source = (const uint32_t*)args.TexturePixels(); - const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); + const uint8_t *source = args.TexturePixels(); const uint8_t *colormap = args.Colormap(); const uint32_t *translation = (const uint32_t*)args.TranslationMap(); int textureheight = args.TextureHeight(); @@ -4603,10 +5378,10 @@ namespace swrenderer dest = thread->dest_for_thread(dest_y, pitch, dest); fracstep *= thread->num_cores; pitch *= thread->num_cores; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = args.SolidColor(); + uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -4617,21 +5392,40 @@ namespace swrenderer desttmp[1] = dest[offset + pitch]; // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; { unsigned int sampleout = translation[source[frac >> FRACBITS]]; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; + ifgshade[0] = sampleshadeout; frac += fracstep; } { unsigned int sampleout = translation[source[frac >> FRACBITS]]; + unsigned int sampleshadeout = 0; ifgcolor[1] = sampleout; + ifgshade[1] = sampleshadeout; frac += fracstep; } __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); // Blend __m128i outcolor = fgcolor; @@ -4648,14 +5442,32 @@ namespace swrenderer int offset = index * pitch; // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; unsigned int sampleout = translation[source[frac >> FRACBITS]]; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; ifgcolor[1] = 0; + ifgshade[0] = sampleshadeout; + ifgshade[1] = 0; __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); // Blend __m128i outcolor = fgcolor; @@ -4682,8 +5494,7 @@ namespace swrenderer auto shade_constants = args.ColormapConstants(); if (shade_constants.simple_shade) { - const uint32_t *source = (const uint32_t*)args.TexturePixels(); - const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); + const uint8_t *source = args.TexturePixels(); const uint8_t *colormap = args.Colormap(); const uint32_t *translation = (const uint32_t*)args.TranslationMap(); int textureheight = args.TextureHeight(); @@ -4708,10 +5519,10 @@ namespace swrenderer dest = thread->dest_for_thread(dest_y, pitch, dest); fracstep *= thread->num_cores; pitch *= thread->num_cores; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = args.SolidColor(); + uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -4723,15 +5534,19 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; { unsigned int sampleout = translation[source[frac >> FRACBITS]]; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; + ifgshade[0] = sampleshadeout; frac += fracstep; } { unsigned int sampleout = translation[source[frac >> FRACBITS]]; + unsigned int sampleshadeout = 0; ifgcolor[1] = sampleout; + ifgshade[1] = sampleshadeout; frac += fracstep; } __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); @@ -4740,16 +5555,20 @@ namespace swrenderer fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -4758,10 +5577,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_add_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_add_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -4777,26 +5599,33 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; unsigned int sampleout = translation[source[frac >> FRACBITS]]; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; ifgcolor[1] = 0; + ifgshade[0] = sampleshadeout; + ifgshade[1] = 0; __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -4805,10 +5634,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_add_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_add_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -4817,8 +5649,7 @@ namespace swrenderer } else { - const uint32_t *source = (const uint32_t*)args.TexturePixels(); - const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); + const uint8_t *source = args.TexturePixels(); const uint8_t *colormap = args.Colormap(); const uint32_t *translation = (const uint32_t*)args.TranslationMap(); int textureheight = args.TextureHeight(); @@ -4848,10 +5679,10 @@ namespace swrenderer dest = thread->dest_for_thread(dest_y, pitch, dest); fracstep *= thread->num_cores; pitch *= thread->num_cores; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = args.SolidColor(); + uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -4863,33 +5694,56 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; { unsigned int sampleout = translation[source[frac >> FRACBITS]]; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; + ifgshade[0] = sampleshadeout; frac += fracstep; } { unsigned int sampleout = translation[source[frac >> FRACBITS]]; + unsigned int sampleshadeout = 0; ifgcolor[1] = sampleout; + ifgshade[1] = sampleshadeout; frac += fracstep; } __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -4898,10 +5752,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_add_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_add_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -4917,26 +5774,48 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; unsigned int sampleout = translation[source[frac >> FRACBITS]]; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; ifgcolor[1] = 0; + ifgshade[0] = sampleshadeout; + ifgshade[1] = 0; __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -4945,10 +5824,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_add_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_add_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -4973,8 +5855,7 @@ namespace swrenderer auto shade_constants = args.ColormapConstants(); if (shade_constants.simple_shade) { - const uint32_t *source = (const uint32_t*)args.TexturePixels(); - const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); + const uint8_t *source = args.TexturePixels(); const uint8_t *colormap = args.Colormap(); const uint32_t *translation = (const uint32_t*)args.TranslationMap(); int textureheight = args.TextureHeight(); @@ -4999,10 +5880,10 @@ namespace swrenderer dest = thread->dest_for_thread(dest_y, pitch, dest); fracstep *= thread->num_cores; pitch *= thread->num_cores; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = args.SolidColor(); + uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -5014,15 +5895,19 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; { unsigned int sampleout = translation[source[frac >> FRACBITS]]; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; + ifgshade[0] = sampleshadeout; frac += fracstep; } { unsigned int sampleout = translation[source[frac >> FRACBITS]]; + unsigned int sampleshadeout = 0; ifgcolor[1] = sampleout; + ifgshade[1] = sampleshadeout; frac += fracstep; } __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); @@ -5031,16 +5916,20 @@ namespace swrenderer fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -5049,10 +5938,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -5068,26 +5960,33 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; unsigned int sampleout = translation[source[frac >> FRACBITS]]; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; ifgcolor[1] = 0; + ifgshade[0] = sampleshadeout; + ifgshade[1] = 0; __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -5096,10 +5995,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -5108,8 +6010,7 @@ namespace swrenderer } else { - const uint32_t *source = (const uint32_t*)args.TexturePixels(); - const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); + const uint8_t *source = args.TexturePixels(); const uint8_t *colormap = args.Colormap(); const uint32_t *translation = (const uint32_t*)args.TranslationMap(); int textureheight = args.TextureHeight(); @@ -5139,10 +6040,10 @@ namespace swrenderer dest = thread->dest_for_thread(dest_y, pitch, dest); fracstep *= thread->num_cores; pitch *= thread->num_cores; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = args.SolidColor(); + uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -5154,33 +6055,56 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; { unsigned int sampleout = translation[source[frac >> FRACBITS]]; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; + ifgshade[0] = sampleshadeout; frac += fracstep; } { unsigned int sampleout = translation[source[frac >> FRACBITS]]; + unsigned int sampleshadeout = 0; ifgcolor[1] = sampleout; + ifgshade[1] = sampleshadeout; frac += fracstep; } __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -5189,10 +6113,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -5208,26 +6135,48 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; unsigned int sampleout = translation[source[frac >> FRACBITS]]; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; ifgcolor[1] = 0; + ifgshade[0] = sampleshadeout; + ifgshade[1] = 0; __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -5236,10 +6185,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -5264,8 +6216,7 @@ namespace swrenderer auto shade_constants = args.ColormapConstants(); if (shade_constants.simple_shade) { - const uint32_t *source = (const uint32_t*)args.TexturePixels(); - const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); + const uint8_t *source = args.TexturePixels(); const uint8_t *colormap = args.Colormap(); const uint32_t *translation = (const uint32_t*)args.TranslationMap(); int textureheight = args.TextureHeight(); @@ -5290,10 +6241,10 @@ namespace swrenderer dest = thread->dest_for_thread(dest_y, pitch, dest); fracstep *= thread->num_cores; pitch *= thread->num_cores; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = args.SolidColor(); + uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -5305,15 +6256,19 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; { unsigned int sampleout = translation[source[frac >> FRACBITS]]; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; + ifgshade[0] = sampleshadeout; frac += fracstep; } { unsigned int sampleout = translation[source[frac >> FRACBITS]]; + unsigned int sampleshadeout = 0; ifgcolor[1] = sampleout; + ifgshade[1] = sampleshadeout; frac += fracstep; } __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); @@ -5322,16 +6277,20 @@ namespace swrenderer fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -5340,10 +6299,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); + __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(bg_lo, fg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(bg_hi, fg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -5359,26 +6321,33 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; unsigned int sampleout = translation[source[frac >> FRACBITS]]; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; ifgcolor[1] = 0; + ifgshade[0] = sampleshadeout; + ifgshade[1] = 0; __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -5387,10 +6356,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); + __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(bg_lo, fg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(bg_hi, fg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -5399,8 +6371,7 @@ namespace swrenderer } else { - const uint32_t *source = (const uint32_t*)args.TexturePixels(); - const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); + const uint8_t *source = args.TexturePixels(); const uint8_t *colormap = args.Colormap(); const uint32_t *translation = (const uint32_t*)args.TranslationMap(); int textureheight = args.TextureHeight(); @@ -5430,10 +6401,10 @@ namespace swrenderer dest = thread->dest_for_thread(dest_y, pitch, dest); fracstep *= thread->num_cores; pitch *= thread->num_cores; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = args.SolidColor(); + uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -5445,33 +6416,56 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; { unsigned int sampleout = translation[source[frac >> FRACBITS]]; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; + ifgshade[0] = sampleshadeout; frac += fracstep; } { unsigned int sampleout = translation[source[frac >> FRACBITS]]; + unsigned int sampleshadeout = 0; ifgcolor[1] = sampleout; + ifgshade[1] = sampleshadeout; frac += fracstep; } __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -5480,10 +6474,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); + __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(bg_lo, fg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(bg_hi, fg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -5499,26 +6496,48 @@ namespace swrenderer __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; unsigned int sampleout = translation[source[frac >> FRACBITS]]; + unsigned int sampleshadeout = 0; ifgcolor[0] = sampleout; ifgcolor[1] = 0; + ifgshade[0] = sampleshadeout; + ifgshade[1] = 0; __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade - fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -5527,10 +6546,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); + __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(bg_lo, fg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(bg_hi, fg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); diff --git a/src/swrenderer/drawers/r_draw_sprite32.php b/src/swrenderer/drawers/r_draw_sprite32.php index 376b9ea4d..483c86762 100644 --- a/src/swrenderer/drawers/r_draw_sprite32.php +++ b/src/swrenderer/drawers/r_draw_sprite32.php @@ -82,12 +82,20 @@ namespace swrenderer } function LoopShade($blendVariant, $sampleVariant, $isSimpleShade) - { ?> - const uint32_t *source = (const uint32_t*)args.TexturePixels(); - const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); + { + if ($sampleVariant == "shaded" || $sampleVariant == "translated") + { ?> + const uint8_t *source = args.TexturePixels(); const uint8_t *colormap = args.Colormap(); const uint32_t *translation = (const uint32_t*)args.TranslationMap(); - + const uint32_t *source = (const uint32_t*)args.TexturePixels(); + const uint32_t *source2 = (const uint32_t*)args.TexturePixels2(); + bool is_nearest_filter = (source2 == nullptr); if (is_nearest_filter) @@ -141,10 +149,10 @@ namespace swrenderer { ?> frac -= one / 2; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); uint32_t srccolor = args.SrcColorBgra(); - uint32_t color = args.SolidColor(); + uint32_t color = LightBgra::shade_pal_index_simple(args.SolidColor(), light); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -158,21 +166,23 @@ namespace swrenderer // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; { ifgcolor[0] = sampleout; + ifgshade[0] = sampleshadeout; frac += fracstep; } { ifgcolor[1] = sampleout; + ifgshade[1] = sampleshadeout; frac += fracstep; } __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade - + // Blend @@ -191,14 +201,16 @@ namespace swrenderer // Sample - unsigned int ifgcolor[2]; + unsigned int ifgcolor[2], ifgshade[2]; ifgcolor[0] = sampleout; ifgcolor[1] = 0; + ifgshade[0] = sampleshadeout; + ifgshade[1] = 0; __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade - + // Blend @@ -212,21 +224,24 @@ namespace swrenderer if ($sampleVariant == "shaded") { ?> unsigned int sampleout = color; - unsigned int samplealphaout = colormap[source[frac >> FRACBITS]]; - samplealphaout = clamp(samplealphaout, 0, 64) * 4; + unsigned int sampleshadeout = colormap[source[frac >> FRACBITS]]; + sampleshadeout = clamp(sampleshadeout, 0, 64) * 4; unsigned int sampleout = translation[source[frac >> FRACBITS]]; + unsigned int sampleshadeout = 0; unsigned int sampleout = srccolor; + unsigned int sampleshadeout = 0; int sample_index = (((frac << 2) >> FRACBITS) * textureheight) >> FRACBITS; unsigned int sampleout = source[sample_index]; + unsigned int sampleshadeout = 0; @@ -252,11 +267,14 @@ namespace swrenderer unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + unsigned int sampleshadeout = 0; fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); @@ -291,9 +309,8 @@ namespace swrenderer - int shadealpha = 256; // To do: this comes from a sampled source (samplealphaout) - __m128i alpha = _mm_set1_epi16(shadealpha); - __m128i inv_alpha = _mm_set1_epi16(256 - shadealpha); + __m128i alpha = _mm_set_epi16(ifgshade[1], ifgshade[1], ifgshade[1], ifgshade[1], ifgshade[0], ifgshade[0], ifgshade[0], ifgshade[0]); + __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); fgcolor = _mm_mullo_epi16(fgcolor, alpha); bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); @@ -301,55 +318,22 @@ namespace swrenderer outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - __m128i out_lo = _mm_srai_epi16(_mm_add_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_add_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(bg_lo, fg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(bg_hi, fg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + else + { ?> + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -358,7 +342,29 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); + + __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); + + __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); + __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); + + + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + } diff --git a/src/swrenderer/drawers/r_draw_wall32.h b/src/swrenderer/drawers/r_draw_wall32.h index 1e4da006c..b6c446a53 100644 --- a/src/swrenderer/drawers/r_draw_wall32.h +++ b/src/swrenderer/drawers/r_draw_wall32.h @@ -71,8 +71,8 @@ namespace swrenderer dest = thread->dest_for_thread(dest_y, pitch, dest); fracstep *= thread->num_cores; pitch *= thread->num_cores; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -158,8 +158,8 @@ namespace swrenderer fracstep *= thread->num_cores; pitch *= thread->num_cores; frac -= one / 2; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -312,8 +312,8 @@ namespace swrenderer dest = thread->dest_for_thread(dest_y, pitch, dest); fracstep *= thread->num_cores; pitch *= thread->num_cores; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -434,8 +434,8 @@ namespace swrenderer fracstep *= thread->num_cores; pitch *= thread->num_cores; frac -= one / 2; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -629,8 +629,8 @@ namespace swrenderer dest = thread->dest_for_thread(dest_y, pitch, dest); fracstep *= thread->num_cores; pitch *= thread->num_cores; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -665,6 +665,7 @@ namespace swrenderer alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); + fgcolor = _mm_mullo_epi16(fgcolor, alpha); bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); @@ -698,6 +699,7 @@ namespace swrenderer alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); + fgcolor = _mm_mullo_epi16(fgcolor, alpha); bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); @@ -732,8 +734,8 @@ namespace swrenderer fracstep *= thread->num_cores; pitch *= thread->num_cores; frac -= one / 2; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -806,6 +808,7 @@ namespace swrenderer alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); + fgcolor = _mm_mullo_epi16(fgcolor, alpha); bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); @@ -858,6 +861,7 @@ namespace swrenderer alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); + fgcolor = _mm_mullo_epi16(fgcolor, alpha); bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); @@ -902,8 +906,8 @@ namespace swrenderer dest = thread->dest_for_thread(dest_y, pitch, dest); fracstep *= thread->num_cores; pitch *= thread->num_cores; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -953,6 +957,7 @@ namespace swrenderer alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); + fgcolor = _mm_mullo_epi16(fgcolor, alpha); bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); @@ -1001,6 +1006,7 @@ namespace swrenderer alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); + fgcolor = _mm_mullo_epi16(fgcolor, alpha); bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); @@ -1040,8 +1046,8 @@ namespace swrenderer fracstep *= thread->num_cores; pitch *= thread->num_cores; frac -= one / 2; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -1129,6 +1135,7 @@ namespace swrenderer alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); + fgcolor = _mm_mullo_epi16(fgcolor, alpha); bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); @@ -1196,6 +1203,7 @@ namespace swrenderer alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); + fgcolor = _mm_mullo_epi16(fgcolor, alpha); bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); @@ -1251,8 +1259,8 @@ namespace swrenderer dest = thread->dest_for_thread(dest_y, pitch, dest); fracstep *= thread->num_cores; pitch *= thread->num_cores; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -1283,16 +1291,20 @@ namespace swrenderer fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -1301,10 +1313,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_add_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_add_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -1331,16 +1346,20 @@ namespace swrenderer fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -1349,10 +1368,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_add_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_add_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -1384,8 +1406,8 @@ namespace swrenderer fracstep *= thread->num_cores; pitch *= thread->num_cores; frac -= one / 2; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -1454,16 +1476,20 @@ namespace swrenderer fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -1472,10 +1498,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_add_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_add_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -1521,16 +1550,20 @@ namespace swrenderer fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -1539,10 +1572,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_add_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_add_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -1584,8 +1620,8 @@ namespace swrenderer dest = thread->dest_for_thread(dest_y, pitch, dest); fracstep *= thread->num_cores; pitch *= thread->num_cores; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -1631,16 +1667,20 @@ namespace swrenderer fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -1649,10 +1689,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_add_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_add_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -1694,16 +1737,20 @@ namespace swrenderer fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -1712,10 +1759,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_add_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_add_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -1752,8 +1802,8 @@ namespace swrenderer fracstep *= thread->num_cores; pitch *= thread->num_cores; frac -= one / 2; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -1837,16 +1887,20 @@ namespace swrenderer fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -1855,10 +1909,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_add_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_add_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -1919,16 +1976,20 @@ namespace swrenderer fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -1937,10 +1998,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_add_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_add_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -1993,8 +2057,8 @@ namespace swrenderer dest = thread->dest_for_thread(dest_y, pitch, dest); fracstep *= thread->num_cores; pitch *= thread->num_cores; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -2025,16 +2089,20 @@ namespace swrenderer fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -2043,10 +2111,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -2073,16 +2144,20 @@ namespace swrenderer fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -2091,10 +2166,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -2126,8 +2204,8 @@ namespace swrenderer fracstep *= thread->num_cores; pitch *= thread->num_cores; frac -= one / 2; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -2196,16 +2274,20 @@ namespace swrenderer fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -2214,10 +2296,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -2263,16 +2348,20 @@ namespace swrenderer fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -2281,10 +2370,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -2326,8 +2418,8 @@ namespace swrenderer dest = thread->dest_for_thread(dest_y, pitch, dest); fracstep *= thread->num_cores; pitch *= thread->num_cores; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -2373,16 +2465,20 @@ namespace swrenderer fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -2391,10 +2487,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -2436,16 +2535,20 @@ namespace swrenderer fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -2454,10 +2557,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -2494,8 +2600,8 @@ namespace swrenderer fracstep *= thread->num_cores; pitch *= thread->num_cores; frac -= one / 2; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -2579,16 +2685,20 @@ namespace swrenderer fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -2597,10 +2707,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -2661,16 +2774,20 @@ namespace swrenderer fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -2679,10 +2796,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -2735,8 +2855,8 @@ namespace swrenderer dest = thread->dest_for_thread(dest_y, pitch, dest); fracstep *= thread->num_cores; pitch *= thread->num_cores; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -2767,16 +2887,20 @@ namespace swrenderer fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -2785,10 +2909,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); + __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(bg_lo, fg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(bg_hi, fg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -2815,16 +2942,20 @@ namespace swrenderer fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -2833,10 +2964,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); + __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(bg_lo, fg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(bg_hi, fg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -2868,8 +3002,8 @@ namespace swrenderer fracstep *= thread->num_cores; pitch *= thread->num_cores; frac -= one / 2; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -2938,16 +3072,20 @@ namespace swrenderer fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -2956,10 +3094,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); + __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(bg_lo, fg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(bg_hi, fg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -3005,16 +3146,20 @@ namespace swrenderer fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -3023,10 +3168,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); + __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(bg_lo, fg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(bg_hi, fg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -3068,8 +3216,8 @@ namespace swrenderer dest = thread->dest_for_thread(dest_y, pitch, dest); fracstep *= thread->num_cores; pitch *= thread->num_cores; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -3115,16 +3263,20 @@ namespace swrenderer fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -3133,10 +3285,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); + __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(bg_lo, fg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(bg_hi, fg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -3178,16 +3333,20 @@ namespace swrenderer fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -3196,10 +3355,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); + __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(bg_lo, fg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(bg_hi, fg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -3236,8 +3398,8 @@ namespace swrenderer fracstep *= thread->num_cores; pitch *= thread->num_cores; frac -= one / 2; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -3321,16 +3483,20 @@ namespace swrenderer fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -3339,10 +3505,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); + __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(bg_lo, fg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(bg_hi, fg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); @@ -3403,16 +3572,20 @@ namespace swrenderer fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); // Blend - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -3421,10 +3594,13 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); + __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(bg_lo, fg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(bg_hi, fg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); diff --git a/src/swrenderer/drawers/r_draw_wall32.php b/src/swrenderer/drawers/r_draw_wall32.php index 81abbfe69..a414ee7d7 100644 --- a/src/swrenderer/drawers/r_draw_wall32.php +++ b/src/swrenderer/drawers/r_draw_wall32.php @@ -120,8 +120,8 @@ namespace swrenderer { ?> frac -= one / 2; - __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); - __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); + uint32_t srcalpha = args.SrcAlpha() >> (FRACBITS - 8); + uint32_t destalpha = args.DestAlpha() >> (FRACBITS - 8); int ssecount = count / 2; for (int index = 0; index < ssecount; index++) @@ -252,63 +252,34 @@ namespace swrenderer outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + { ?> + __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 + __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); + fgcolor = _mm_mullo_epi16(fgcolor, alpha); bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - __m128i out_lo = _mm_srai_epi16(_mm_add_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_add_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(fg_lo, bg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(fg_hi, bg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - - __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(bg_lo, fg_lo), 8); - __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(bg_hi, fg_hi), 8); - __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); - outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); - outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); - - __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); - alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 - __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); - - - __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); - bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + else + { ?> + uint32_t alpha0 = APART(ifgcolor[0]); + uint32_t alpha1 = APART(ifgcolor[1]); + alpha0 += alpha0 >> 7; // 255->256 + alpha1 += alpha1 >> 7; // 255->256 + uint32_t inv_alpha0 = 256 - alpha0; + uint32_t inv_alpha1 = 256 - alpha1; - __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); - fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + uint32_t bgalpha0 = (destalpha * alpha0 + (inv_alpha0 << 8) + 128) >> 8; + uint32_t bgalpha1 = (destalpha * alpha1 + (inv_alpha1 << 8) + 128) >> 8; + uint32_t fgalpha0 = (srcalpha * alpha0 + 128) >> 8; + uint32_t fgalpha1 = (srcalpha * alpha1 + 128) >> 8; + + __m128i bgalpha = _mm_set_epi16(bgalpha1, bgalpha1, bgalpha1, bgalpha1, bgalpha0, bgalpha0, bgalpha0, bgalpha0); + __m128i fgalpha = _mm_set_epi16(fgalpha1, fgalpha1, fgalpha1, fgalpha1, fgalpha0, fgalpha0, fgalpha0, fgalpha0); fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); @@ -317,7 +288,29 @@ namespace swrenderer __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); - + __m128i out_lo = _mm_add_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_add_epi32(fg_hi, bg_hi); + + __m128i out_lo = _mm_sub_epi32(fg_lo, bg_lo); + __m128i out_hi = _mm_sub_epi32(fg_hi, bg_hi); + + __m128i out_lo = _mm_sub_epi32(bg_lo, fg_lo); + __m128i out_hi = _mm_sub_epi32(bg_hi, fg_hi); + + + out_lo = _mm_srai_epi32(out_lo, 8); + out_hi = _mm_srai_epi32(out_hi, 8); + __m128i outcolor = _mm_packs_epi32(out_lo, out_hi); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + }